In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_timestamp,when, datediff,avg, hour,count, sum as spark_sum, avg as spark_avg
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType, FloatType
spark = SparkSession.builder.appName("EmployeeAttendanceETL").getOrCreate()
spark

<pyspark.sql.connect.session.SparkSession at 0x7f0ae12229d0>

In [0]:
attendance_schema = StructType([
    StructField("Attendance_ID", IntegerType(), True),
    StructField("Employee_ID", IntegerType(), True),
    StructField("Date", StringType(), True),
    StructField("Clock_in", StringType(), True),
    StructField("Clock_out", StringType(), True),
    StructField("Department", StringType(), True)
])

tasks_schema = StructType([
    StructField("Task_ID", IntegerType(), True),
    StructField("Employee_ID", IntegerType(), True),
    StructField("Task_Name", StringType(), True),
    StructField("Assigned_Date", StringType(), True),
    StructField("Completed_Date", StringType(), True),
    StructField("Status", StringType(), True)
])

In [0]:
attendance_df = spark.read.csv("/FileStore/tables/attendance.csv", header=True, schema=attendance_schema)
tasks_df = spark.read.csv("/FileStore/tables/tasks.csv", header=True, schema=tasks_schema)

In [0]:
#cleaning data
attendance_clean = attendance_df.withColumn("Clock_in_timestamp", to_timestamp(col("Clock_in"), "dd-MM-yyyy HH:mm")).withColumn(
"Clock_out_timestamp", to_timestamp(col("Clock_out"), "dd-MM-yyyy HH:mm"))
display(attendance_clean)

attendance_clean1 = attendance_clean.withColumn("Work_hours",when(col("Clock_out_timestamp").isNotNull(),
(col("Clock_out_timestamp").cast("long") - col("Clock_in_timestamp").cast("long"))/3600).otherwise(0))
display(attendance_clean1)

Attendance_ID,Employee_ID,Date,Clock_in,Clock_out,Department,Clock_in_timestamp,Clock_out_timestamp
1,1,02-06-2025,02-06-2025 08:58,02-06-2025 17:05,IT,2025-06-02T08:58:00.000Z,2025-06-02T17:05:00.000Z
2,1,03-06-2025,03-06-2025 09:15,03-06-2025 17:30,IT,2025-06-03T09:15:00.000Z,2025-06-03T17:30:00.000Z
3,1,04-06-2025,04-06-2025 08:45,04-06-2025 16:55,IT,2025-06-04T08:45:00.000Z,2025-06-04T16:55:00.000Z
4,1,05-06-2025,05-06-2025 08:50,05-06-2025 17:10,IT,2025-06-05T08:50:00.000Z,2025-06-05T17:10:00.000Z
5,2,02-06-2025,02-06-2025 08:00,02-06-2025 17:00,HR,2025-06-02T08:00:00.000Z,2025-06-02T17:00:00.000Z
6,2,03-06-2025,03-06-2025 08:05,03-06-2025 17:10,HR,2025-06-03T08:05:00.000Z,2025-06-03T17:10:00.000Z
7,3,02-06-2025,02-06-2025 09:45,02-06-2025 16:30,Retail,2025-06-02T09:45:00.000Z,2025-06-02T16:30:00.000Z
8,3,04-06-2025,04-06-2025 11:00,04-06-2025 15:00,Retail,2025-06-04T11:00:00.000Z,2025-06-04T15:00:00.000Z
9,4,02-06-2025,02-06-2025 08:30,02-06-2025 12:00,Supply Chain,2025-06-02T08:30:00.000Z,2025-06-02T12:00:00.000Z
10,4,04-06-2025,,,Supply Chain,,


Attendance_ID,Employee_ID,Date,Clock_in,Clock_out,Department,Clock_in_timestamp,Clock_out_timestamp,Work_hours
1,1,02-06-2025,02-06-2025 08:58,02-06-2025 17:05,IT,2025-06-02T08:58:00.000Z,2025-06-02T17:05:00.000Z,8.116666666666667
2,1,03-06-2025,03-06-2025 09:15,03-06-2025 17:30,IT,2025-06-03T09:15:00.000Z,2025-06-03T17:30:00.000Z,8.25
3,1,04-06-2025,04-06-2025 08:45,04-06-2025 16:55,IT,2025-06-04T08:45:00.000Z,2025-06-04T16:55:00.000Z,8.166666666666666
4,1,05-06-2025,05-06-2025 08:50,05-06-2025 17:10,IT,2025-06-05T08:50:00.000Z,2025-06-05T17:10:00.000Z,8.333333333333334
5,2,02-06-2025,02-06-2025 08:00,02-06-2025 17:00,HR,2025-06-02T08:00:00.000Z,2025-06-02T17:00:00.000Z,9.0
6,2,03-06-2025,03-06-2025 08:05,03-06-2025 17:10,HR,2025-06-03T08:05:00.000Z,2025-06-03T17:10:00.000Z,9.083333333333334
7,3,02-06-2025,02-06-2025 09:45,02-06-2025 16:30,Retail,2025-06-02T09:45:00.000Z,2025-06-02T16:30:00.000Z,6.75
8,3,04-06-2025,04-06-2025 11:00,04-06-2025 15:00,Retail,2025-06-04T11:00:00.000Z,2025-06-04T15:00:00.000Z,4.0
9,4,02-06-2025,02-06-2025 08:30,02-06-2025 12:00,Supply Chain,2025-06-02T08:30:00.000Z,2025-06-02T12:00:00.000Z,3.5
10,4,04-06-2025,,,Supply Chain,,,0.0


In [0]:
# Process tasks data - count completed tasks per employee per day
tasks_processed = tasks_df.filter(col("Status") == "completed") \
    .groupBy("Employee_ID", "Assigned_Date") \
    .agg(count("*").alias("Tasks_completed"))
display(tasks_processed)

Employee_ID,Assigned_Date,Tasks_completed
1,05-06-2025,1
1,02-06-2025,1
1,04-06-2025,1
2,02-06-2025,1
1,03-06-2025,1
3,02-06-2025,1
4,02-06-2025,1


In [0]:
# Join attendance with tasks data
combined_df = attendance_clean1.join(tasks_processed,(attendance_clean1.Employee_ID == tasks_processed.Employee_ID) & 
(attendance_clean1.Date == tasks_processed.Assigned_Date),"left").drop(tasks_processed.Employee_ID).drop(tasks_processed.Assigned_Date)

display(combined_df)

Attendance_ID,Employee_ID,Date,Clock_in,Clock_out,Department,Clock_in_timestamp,Clock_out_timestamp,Work_hours,Tasks_completed
1,1,02-06-2025,02-06-2025 08:58,02-06-2025 17:05,IT,2025-06-02T08:58:00.000Z,2025-06-02T17:05:00.000Z,8.116666666666667,1.0
2,1,03-06-2025,03-06-2025 09:15,03-06-2025 17:30,IT,2025-06-03T09:15:00.000Z,2025-06-03T17:30:00.000Z,8.25,1.0
3,1,04-06-2025,04-06-2025 08:45,04-06-2025 16:55,IT,2025-06-04T08:45:00.000Z,2025-06-04T16:55:00.000Z,8.166666666666666,1.0
4,1,05-06-2025,05-06-2025 08:50,05-06-2025 17:10,IT,2025-06-05T08:50:00.000Z,2025-06-05T17:10:00.000Z,8.333333333333334,1.0
5,2,02-06-2025,02-06-2025 08:00,02-06-2025 17:00,HR,2025-06-02T08:00:00.000Z,2025-06-02T17:00:00.000Z,9.0,1.0
6,2,03-06-2025,03-06-2025 08:05,03-06-2025 17:10,HR,2025-06-03T08:05:00.000Z,2025-06-03T17:10:00.000Z,9.083333333333334,
7,3,02-06-2025,02-06-2025 09:45,02-06-2025 16:30,Retail,2025-06-02T09:45:00.000Z,2025-06-02T16:30:00.000Z,6.75,1.0
8,3,04-06-2025,04-06-2025 11:00,04-06-2025 15:00,Retail,2025-06-04T11:00:00.000Z,2025-06-04T15:00:00.000Z,4.0,
9,4,02-06-2025,02-06-2025 08:30,02-06-2025 12:00,Supply Chain,2025-06-02T08:30:00.000Z,2025-06-02T12:00:00.000Z,3.5,1.0
10,4,04-06-2025,,,Supply Chain,,,0.0,


In [0]:
# Fill nulls for days with no completed tasks
null_fill = combined_df.fillna(0, subset=["Tasks_completed"])
display(null_fill)

# Calculate productivity score (tasks per hour)
calc_df = combined_df.withColumn("Productivity_score",when(col("Work_hours") > 0, col("Tasks_completed")/col("Work_hours")).otherwise(0))
display(calc_df)

Attendance_ID,Employee_ID,Date,Clock_in,Clock_out,Department,Clock_in_timestamp,Clock_out_timestamp,Work_hours,Tasks_completed
1,1,02-06-2025,02-06-2025 08:58,02-06-2025 17:05,IT,2025-06-02T08:58:00.000Z,2025-06-02T17:05:00.000Z,8.116666666666667,1
2,1,03-06-2025,03-06-2025 09:15,03-06-2025 17:30,IT,2025-06-03T09:15:00.000Z,2025-06-03T17:30:00.000Z,8.25,1
3,1,04-06-2025,04-06-2025 08:45,04-06-2025 16:55,IT,2025-06-04T08:45:00.000Z,2025-06-04T16:55:00.000Z,8.166666666666666,1
4,1,05-06-2025,05-06-2025 08:50,05-06-2025 17:10,IT,2025-06-05T08:50:00.000Z,2025-06-05T17:10:00.000Z,8.333333333333334,1
5,2,02-06-2025,02-06-2025 08:00,02-06-2025 17:00,HR,2025-06-02T08:00:00.000Z,2025-06-02T17:00:00.000Z,9.0,1
6,2,03-06-2025,03-06-2025 08:05,03-06-2025 17:10,HR,2025-06-03T08:05:00.000Z,2025-06-03T17:10:00.000Z,9.083333333333334,0
7,3,02-06-2025,02-06-2025 09:45,02-06-2025 16:30,Retail,2025-06-02T09:45:00.000Z,2025-06-02T16:30:00.000Z,6.75,1
8,3,04-06-2025,04-06-2025 11:00,04-06-2025 15:00,Retail,2025-06-04T11:00:00.000Z,2025-06-04T15:00:00.000Z,4.0,0
9,4,02-06-2025,02-06-2025 08:30,02-06-2025 12:00,Supply Chain,2025-06-02T08:30:00.000Z,2025-06-02T12:00:00.000Z,3.5,1
10,4,04-06-2025,,,Supply Chain,,,0.0,0


Attendance_ID,Employee_ID,Date,Clock_in,Clock_out,Department,Clock_in_timestamp,Clock_out_timestamp,Work_hours,Tasks_completed,Productivity_score
1,1,02-06-2025,02-06-2025 08:58,02-06-2025 17:05,IT,2025-06-02T08:58:00.000Z,2025-06-02T17:05:00.000Z,8.116666666666667,1.0,0.1232032854209445
2,1,03-06-2025,03-06-2025 09:15,03-06-2025 17:30,IT,2025-06-03T09:15:00.000Z,2025-06-03T17:30:00.000Z,8.25,1.0,0.1212121212121212
3,1,04-06-2025,04-06-2025 08:45,04-06-2025 16:55,IT,2025-06-04T08:45:00.000Z,2025-06-04T16:55:00.000Z,8.166666666666666,1.0,0.1224489795918367
4,1,05-06-2025,05-06-2025 08:50,05-06-2025 17:10,IT,2025-06-05T08:50:00.000Z,2025-06-05T17:10:00.000Z,8.333333333333334,1.0,0.12
5,2,02-06-2025,02-06-2025 08:00,02-06-2025 17:00,HR,2025-06-02T08:00:00.000Z,2025-06-02T17:00:00.000Z,9.0,1.0,0.1111111111111111
6,2,03-06-2025,03-06-2025 08:05,03-06-2025 17:10,HR,2025-06-03T08:05:00.000Z,2025-06-03T17:10:00.000Z,9.083333333333334,,
7,3,02-06-2025,02-06-2025 09:45,02-06-2025 16:30,Retail,2025-06-02T09:45:00.000Z,2025-06-02T16:30:00.000Z,6.75,1.0,0.1481481481481481
8,3,04-06-2025,04-06-2025 11:00,04-06-2025 15:00,Retail,2025-06-04T11:00:00.000Z,2025-06-04T15:00:00.000Z,4.0,,
9,4,02-06-2025,02-06-2025 08:30,02-06-2025 12:00,Supply Chain,2025-06-02T08:30:00.000Z,2025-06-02T12:00:00.000Z,3.5,1.0,0.2857142857142857
10,4,04-06-2025,,,Supply Chain,,,0.0,,0.0


In [0]:
# Create department-level metrics
department_metrics = combined_df.groupBy("Department") \
    .agg(count("*").alias("Total_records"),avg("Work_hours").alias("Avg_work_hours"),spark_sum("Work_hours").alias("Total_work_hours"),spark_sum("Tasks_completed").alias("Total_tasks"))

display(department_metrics.orderBy("Department"))

Department,Total_records,Avg_work_hours,Total_work_hours,Total_tasks
HR,2,9.041666666666668,18.083333333333336,1
IT,4,8.216666666666667,32.86666666666667,4
Retail,2,5.375,10.75,1
Supply Chain,2,1.75,3.5,1


In [0]:
#delta and csv path
delta_path = "dbfs:/FileStore/output/department_metrics_delta"
csv_path = "/dbfs/FileStore/output/department_metrics.csv"

#Save Delta table and csv
(department_metrics.write.format("delta").mode("overwrite").save(delta_path))
department_metrics.write \
    .format("csv") \
    .option("header", "true") \
    .mode("overwrite") \
    .save("dbfs:/FileStore/output/department_metrics_csv")

display(dbutils.fs.ls("dbfs:/FileStore/output/"))

path,name,size,modificationTime
dbfs:/FileStore/output/department_metrics_csv/,department_metrics_csv/,0,1750001870000
dbfs:/FileStore/output/department_metrics_delta/,department_metrics_delta/,0,1750001270000
dbfs:/FileStore/output/department_metrics_spark.csv/,department_metrics_spark.csv/,0,1750001576000
