## Importing Libraries

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
import pandas as pd

**3156. Employee Task Duration and Concurrent Tasks (Hard)**

**Table: Tasks**

| Column Name   | Type     |
|---------------|----------|
| task_id       | int      |
| employee_id   | int      |
| start_time    | datetime |
| end_time      | datetime |

(task_id, employee_id) is the primary key for this table.
Each row in this table contains the task identifier, the employee identifier, and the start and end times of each task.

**Write a solution to find the total duration of tasks for each employee and the maximum number of concurrent tasks an employee handled at any point in time. The total duration should be rounded down to the nearest number of full hours.**

Return the result table ordered by employee_id in ascending order.

The result format is in the following example.

**Example:**

**Input:**

**Tasks table:**

| task_id | employee_id | start_time          | end_time            |
|---------|-------------|---------------------|---------------------|
| 1       | 1001        | 2023-05-01 08:00:00 | 2023-05-01 09:00:00 |
| 2       | 1001        | 2023-05-01 08:30:00 | 2023-05-01 10:30:00 |
| 3       | 1001        | 2023-05-01 11:00:00 | 2023-05-01 12:00:00 |
| 7       | 1001        | 2023-05-01 13:00:00 | 2023-05-01 15:30:00 |
| 4       | 1002        | 2023-05-01 09:00:00 | 2023-05-01 10:00:00 |
| 5       | 1002        | 2023-05-01 09:30:00 | 2023-05-01 11:30:00 |
| 6       | 1003        | 2023-05-01 14:00:00 | 2023-05-01 16:00:00 |

**Output:**

| employee_id | total_task_hours | max_concurrent_tasks |
|-------------|------------------|----------------------|
| 1001        | 6                | 2                    |
| 1002        | 2                | 2                    |
| 1003        | 2                | 1                    |

**Explanation:**
- For employee ID 1001:
  - Task 1 and Task 2 overlap from 08:30 to 09:00 (30 minutes).
  - Task 7 has a duration of 150 minutes (2 hours and 30 minutes).
  - Total task time: 60 (Task 1) + 120 (Task 2) + 60 (Task 3) + 150 (Task 7) - 30 (overlap) = 360 minutes = 6 hours.
  - Maximum concurrent tasks: 2 (during the overlap period).
- For employee ID 1002:
  - Task 4 and Task 5 overlap from 09:30 to 10:00 (30 minutes).
  - Total task time: 60 (Task 4) + 120 (Task 5) - 30 (overlap) = 150 minutes = 2 hours and 30 minutes.
  - Total task hours (rounded down): 2 hours.
  - Maximum concurrent tasks: 2 (during the overlap period).
- For employee ID 1003:
  - No overlapping tasks.
  - Total task time: 120 minutes = 2 hours.
  - Maximum concurrent tasks: 1.

**Note:** Output table is ordered by employee_id in ascending order.

In [0]:
# Sample rows from the problem statement, one tuple per task:
# (task_id, employee_id, start_time, end_time). Times are plain strings here.
tasks_data_3156 = [
    (1, 1001, "2023-05-01 08:00:00", "2023-05-01 09:00:00"),
    (2, 1001, "2023-05-01 08:30:00", "2023-05-01 10:30:00"),
    (3, 1001, "2023-05-01 11:00:00", "2023-05-01 12:00:00"),
    (7, 1001, "2023-05-01 13:00:00", "2023-05-01 15:30:00"),
    (4, 1002, "2023-05-01 09:00:00", "2023-05-01 10:00:00"),
    (5, 1002, "2023-05-01 09:30:00", "2023-05-01 11:30:00"),
    (6, 1003, "2023-05-01 14:00:00", "2023-05-01 16:00:00"),
]

tasks_columns_3156 = ["task_id", "employee_id", "start_time", "end_time"]

# Build the DataFrame from the tuples and column names, then print it.
tasks_df_3156 = spark.createDataFrame(
    data=tasks_data_3156,
    schema=tasks_columns_3156,
)
tasks_df_3156.show()

+-------+-----------+-------------------+-------------------+
|task_id|employee_id|         start_time|           end_time|
+-------+-----------+-------------------+-------------------+
|      1|       1001|2023-05-01 08:00:00|2023-05-01 09:00:00|
|      2|       1001|2023-05-01 08:30:00|2023-05-01 10:30:00|
|      3|       1001|2023-05-01 11:00:00|2023-05-01 12:00:00|
|      7|       1001|2023-05-01 13:00:00|2023-05-01 15:30:00|
|      4|       1002|2023-05-01 09:00:00|2023-05-01 10:00:00|
|      5|       1002|2023-05-01 09:30:00|2023-05-01 11:30:00|
|      6|       1003|2023-05-01 14:00:00|2023-05-01 16:00:00|
+-------+-----------+-------------------+-------------------+



In [0]:
# Convert the string time columns to timestamps so they can be compared and
# subtracted in the cells below.
for ts_col in ("start_time", "end_time"):
    tasks_df_3156 = tasks_df_3156.withColumn(ts_col, col(ts_col).cast(TimestampType()))

In [0]:
# Per-employee window that walks each employee's tasks in start-time order.
window_spec = Window.partitionBy(col("employee_id")).orderBy(col("start_time").asc())

In [0]:
# Clip each task against the time the employee has already been busy, so that
# overlapping time is counted only once.
#
# `prev_end` is the latest end_time among ALL earlier-starting tasks of the same
# employee (a running maximum), not just the end of the immediately preceding
# task. A short task nested inside a long one must not shrink the covered range:
#   08:00-12:00, 09:00-10:00, 11:00-13:00  -> union is 5h,
# whereas comparing the third task only with the second one's end (10:00)
# would count 11:00-12:00 twice and report 6h.
earlier_tasks_window = window_spec.rowsBetween(Window.unboundedPreceding, -1)

tasks_df_3156 = (
    tasks_df_3156
    # Null for an employee's first task (the frame of earlier rows is empty).
    .withColumn("prev_end", max("end_time").over(earlier_tasks_window))
    # greatest() skips nulls, so a first task keeps its own start_time and no
    # sentinel timestamp is needed.
    .withColumn("effective_start", greatest(col("start_time"), col("prev_end")))
    # Hours of this task not already covered by earlier tasks; 0 when the task
    # lies entirely inside the already-covered range.
    .withColumn(
        "effective_duration_hours",
        when(
            col("end_time") > col("effective_start"),
            (col("end_time").cast("long") - col("effective_start").cast("long")) / 3600,
        ).otherwise(lit(0.0)),
    )
)

In [0]:
# Total non-overlapping work time per employee, rounded down to whole hours.
#
# The per-task values are fractional doubles (seconds / 3600). Summing them
# directly can land just below an integer (ten 6-minute tasks can add up to
# 0.9999999999999999), and floor() would then drop a full hour. Convert each
# value back to whole seconds, sum those exactly as integers, and divide once.
SECONDS_PER_HOUR = 3600

total_hours_df_3156 = (
    tasks_df_3156
    .groupBy("employee_id")
    .agg(
        floor(
            sum(round(col("effective_duration_hours") * SECONDS_PER_HOUR).cast("long"))
            / SECONDS_PER_HOUR
        ).alias("total_task_hours")
    )
)

In [0]:
# One event per task start: the employee gains one open task (+1) at that time.
start_events_3156 = tasks_df_3156.select(
    col("employee_id"),
    col("start_time").alias("time"),
    lit(1).alias("delta"),
)

In [0]:
# One event per task end: the employee loses one open task (-1) at that time.
end_events_3156 = tasks_df_3156.select(
    col("employee_id"),
    col("end_time").alias("time"),
    lit(-1).alias("delta"),
)

In [0]:
# Stack the start (+1) and end (-1) events into a single event table.
# Both inputs expose the same columns (employee_id, time, delta).
events_df_3156 = start_events_3156.unionByName(end_events_3156)

In [0]:
# Running frame over each employee's events in chronological order, from the
# first event up to and including the current row.
#
# "delta" is a secondary sort key so that, at an identical timestamp, end
# events (-1) are applied before start events (+1). With "time" alone the order
# of tied rows is undefined, so a task that starts exactly when another ends
# could be counted as concurrent on some runs and not on others.
window_spec2 = (
    Window.partitionBy("employee_id")
    .orderBy("time", "delta")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

In [0]:
# Running sum of the +1/-1 deltas: the number of tasks open after each event.
events_df_3156 = events_df_3156.withColumn(
    "concurrent",
    sum(col("delta")).over(window_spec2),
)

In [0]:
# Peak of the running open-task count for each employee.
max_concurrent_df_3156 = (
    events_df_3156
    .groupBy(col("employee_id"))
    .agg(max(col("concurrent")).alias("max_concurrent_tasks"))
)

In [0]:
# Combine both per-employee metrics and show them ordered by employee_id.
(
    total_hours_df_3156
    .join(max_concurrent_df_3156, on="employee_id", how="inner")
    .orderBy(col("employee_id").asc())
    .display()
)

employee_id,total_task_hours,max_concurrent_tasks
1001,6,2
1002,2,2
1003,2,1
