In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Create the notebook's Spark session, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("EmployeeAttendanceAnalysis")
    .getOrCreate()
)
# Trailing bare expression so the cell displays the session object.
spark

In [11]:
# Read the attendance export; the first row supplies the column names.
attendance_df = spark.read.csv("attendance.csv", header=True, inferSchema=True)

# Parse both clock columns with the "dd-MM-yyyy HH:mm" pattern.
attendance_df = attendance_df.withColumn(
    "Clock_in", to_timestamp(col("Clock_in"), "dd-MM-yyyy HH:mm")
)
attendance_df = attendance_df.withColumn(
    "Clock_out", to_timestamp(col("Clock_out"), "dd-MM-yyyy HH:mm")
)

# Hours worked = (clock-out epoch seconds - clock-in epoch seconds) / 3600.
# Rows without a clock-out get NULL instead of a computed value.
attendance_df = attendance_df.withColumn(
    "work_hours",
    when(
        col("Clock_out").isNotNull(),
        (unix_timestamp(col("Clock_out")) - unix_timestamp(col("Clock_in"))) / 3600,
    ).otherwise(None),
)

# **LATE LOGIN AND ABSENCES**

In [12]:
# Cut-off time of day, kept as a zero-padded "HH:mm:ss" string so it can be
# compared lexicographically against formatted clock-in times.
late_threshold = "09:00:00"

# Late logins: rows that have a clock-in whose time of day is strictly after
# the threshold (a clock-in at exactly 09:00:00 is not counted as late).
late_logins = (
    attendance_df
    .where(col("Clock_in").isNotNull())
    .where(date_format(col("Clock_in"), "HH:mm:ss") > late_threshold)
)

# Absences: rows with no clock-in recorded at all.
absences = attendance_df.where(col("Clock_in").isNull())

# **Group by department to get average work hours and productivity indicators (absences, late logins)**

In [13]:
# One row per department. when() without otherwise() yields NULL for rows that
# do not match, and count() ignores NULLs, so each count tallies matching rows only.
department_stats = (
    attendance_df
    .groupBy(col("Department"))
    .agg(
        # Mean of work_hours across the department's rows.
        avg(col("work_hours")).alias("avg_hours"),
        # Rows with no clock-in.
        count(when(col("Clock_in").isNull(), 1)).alias("absences"),
        # Rows whose clock-in time of day is strictly after late_threshold.
        count(
            when(date_format(col("Clock_in"), "HH:mm:ss") > late_threshold, 1)
        ).alias("late_count"),
    )
)

# **Result**

In [14]:
# Show the late-login rows with timestamps formatted as day-month-year strings.
print("=== Late Logins===")
late_logins.select(
    "Employee_ID",
    "Department",
    date_format("Clock_in", "dd-MM-yyyy HH:mm:ss").alias("Clock_in"),
    date_format("Clock_out", "dd-MM-yyyy HH:mm:ss").alias("Clock_out"),
    "work_hours"
).show(truncate=False)  # truncate=False keeps the full timestamp strings visible

# Show rows that have no clock-in.
print("\n=== Absences ===")
absences.show()

# Show the per-department aggregates.
print("\n=== Department Statistics ===")
department_stats.show()

# Save results, replacing any previous output at this path.
# header=True writes the column names as the first row of the CSV output;
# without it the saved files contain bare values and the metric columns
# (avg_hours, absences, late_count) cannot be told apart when read back.
department_stats.write.csv(
    "department_attendance_metrics",
    mode="overwrite",
    header=True,
)

=== Late Logins===
+-----------+----------+-------------------+-------------------+----------+
|Employee_ID|Department|Clock_in           |Clock_out          |work_hours|
+-----------+----------+-------------------+-------------------+----------+
|1          |IT        |03-06-2025 09:15:00|03-06-2025 17:30:00|8.25      |
|3          |Retail    |02-06-2025 09:45:00|02-06-2025 16:30:00|6.75      |
|3          |Retail    |04-06-2025 11:00:00|04-06-2025 15:00:00|4.0       |
+-----------+----------+-------------------+-------------------+----------+


=== Absences ===
+-------------+-----------+----------+--------+---------+------------+----------+
|Attendance_ID|Employee_ID|      Date|Clock_in|Clock_out|  Department|work_hours|
+-------------+-----------+----------+--------+---------+------------+----------+
|           10|          4|04-06-2025|    NULL|     NULL|Supply Chain|      NULL|
+-------------+-----------+----------+--------+---------+------------+----------+


=== Department Sta