In [26]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
spark = SparkSession.builder.appName("AttendanceReport").getOrCreate()
spark


In [27]:
cleaned_df = spark.read.csv("cleaned_attendance_tasks.csv", header=True, inferSchema=True)
cleaned_df.printSchema()

root
 |-- Attendance_ID: integer (nullable = true)
 |-- Employee_ID: integer (nullable = true)
 |-- Date: string (nullable = true)
 |-- Clock_in: string (nullable = true)
 |-- Clock_out: string (nullable = true)
 |-- work_hours: double (nullable = true)
 |-- tasks_completed: integer (nullable = true)
 |-- productivity_score: double (nullable = true)
 |-- Department: string (nullable = true)
 |-- Status: string (nullable = true)



In [31]:
# 1. Top 5 Absentees
top_absentees = cleaned_df.filter(col("Clock_out").isNull()) \
    .groupBy("Employee_ID") \
    .agg(count("*").alias("Absence_Days")) \
    .orderBy(desc("Absence_Days")) \
    .limit(5)
top_absentees.show()

# 2. Lowest Performing Departments
dept_performance = cleaned_df.groupBy("Department") \
    .agg(avg("work_hours").alias("Avg_Hours"),
         avg(when(col("Status") == "completed", 1).otherwise(0)).alias("Completion_Rate")) \
    .orderBy("Completion_Rate") \
    .limit(5)
dept_performance.show()

top_absentees.toPandas().to_csv("top_absentees.csv", index=False)
dept_performance.toPandas().to_csv("lowest_performing_depts.csv", index=False)

+-----------+------------+
|Employee_ID|Absence_Days|
+-----------+------------+
|          4|           1|
+-----------+------------+

+------------+-------------+---------------+
|  Department|    Avg_Hours|Completion_Rate|
+------------+-------------+---------------+
|          HR| 9.0416666665|            0.5|
|      Retail|        5.375|            0.5|
|Supply Chain|          0.0|            1.0|
|          IT|8.21666666675|            1.0|
+------------+-------------+---------------+

