# PySpark for Attendance Analysis

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Starting the Spark session

In [2]:
# Get the current Spark session, or create one, under the app name "week-3".
spark = (
    SparkSession.builder
    .appName("week-3")
    .getOrCreate()
)
# Bare last expression so the notebook renders the session summary.
spark

# Loading large datasets

In [5]:
# Run Colab's upload helper and keep its return value in `uploaded`.
# The saved output below shows attendance.csv, employees.csv and tasks.csv being stored.
from google.colab import files
uploaded = files.upload()

Saving attendance.csv to attendance.csv
Saving employees.csv to employees.csv
Saving tasks.csv to tasks.csv


In [8]:
# Read the three uploaded CSVs. Each one is read with header=True and
# inferSchema=True, set through the reader's options instead of csv() keywords.
dfAtt = spark.read.options(header=True, inferSchema=True).csv("attendance.csv")
dfEmp = spark.read.options(header=True, inferSchema=True).csv("employees.csv")
dfTas = spark.read.options(header=True, inferSchema=True).csv("tasks.csv")

# Check DataFrames by printing

In [7]:
# Preview the attendance DataFrame.
# NOTE(review): the saved output below lists task columns (taskID, taskName, ...), and this
# cell's execution count (7) is lower than the load cell's (8) — re-run after loading to refresh it.
dfAtt.show()

+------+----------+--------------------+----------+---------------+
|taskID|employeeID|            taskName|  taskDate|tasksCompeleted|
+------+----------+--------------------+----------+---------------+
|     1|         1|         Code Review|2024-07-01|              4|
|     2|         2|   Blog Post Writing|2024-07-01|              5|
|     3|         3|     Employee Survey|2024-07-01|              2|
|     4|         4|Server Configuration|2024-07-01|              3|
|     5|         5| Invoice Preparation|2024-07-01|              6|
|     6|         1|     Feature Testing|2024-07-02|              3|
|     7|         2|    SEO Optimization|2024-07-02|              4|
|     8|         3|Recruitment Scree...|2024-07-02|              1|
|     9|         4| Load Balancer Setup|2024-07-02|              5|
|    10|         5|Travel Reimbursem...|2024-07-02|              2|
|    11|         1|  Unit Test Coverage|2024-07-03|              4|
|    12|         2|Campaign Strategy...|2024-07-

In [9]:
# Preview the employees DataFrame.
dfEmp.show()

+----------+----------+-----------+-----------------+--------------------+-------------------+--------+
|employeeid|      name| department|             role|               email|           hiredate|  status|
+----------+----------+-----------+-----------------+--------------------+-------------------+--------+
|         1|arun kumar|engineering|software engineer|arun.kumar@exampl...|2023-02-10 00:00:00|  Active|
|         2|deepa rani|  marketing|   content writer|deepa.rani@exampl...|2022-12-05 00:00:00|  Active|
|         3| vijay raj|         hr|     hr executive|vijay.raj@example...|2021-10-15 00:00:00|  Active|
|         4| karthik s|engineering|  devops engineer|karthik.s@example...|2023-04-20 00:00:00|  Active|
|         5|   meena p|    finance|       accountant| meena.p@example.com|2022-07-25 00:00:00|Resigned|
+----------+----------+-----------+-----------------+--------------------+-------------------+--------+



In [10]:
# Preview the tasks DataFrame.
dfTas.show()

+------+----------+--------------------+----------+---------------+
|taskID|employeeID|            taskName|  taskDate|tasksCompeleted|
+------+----------+--------------------+----------+---------------+
|     1|         1|         Code Review|2024-07-01|              4|
|     2|         2|   Blog Post Writing|2024-07-01|              5|
|     3|         3|     Employee Survey|2024-07-01|              2|
|     4|         4|Server Configuration|2024-07-01|              3|
|     5|         5| Invoice Preparation|2024-07-01|              6|
|     6|         1|     Feature Testing|2024-07-02|              3|
|     7|         2|    SEO Optimization|2024-07-02|              4|
|     8|         3|Recruitment Scree...|2024-07-02|              1|
|     9|         4| Load Balancer Setup|2024-07-02|              5|
|    10|         5|Travel Reimbursem...|2024-07-02|              2|
|    11|         1|  Unit Test Coverage|2024-07-03|              4|
|    12|         2|Campaign Strategy...|2024-07-

# Filtering late logins and absences

In [11]:
# Keep attendance rows flagged late or absent, attach the employee name, and
# label each row "Late Login" when the late flag is 1, otherwise "Abscent".
(
    dfAtt
    .where((F.col("isLate") == 1) | (F.col("isAbscent") == 1))
    .join(dfEmp.select("name", "employeeID"), on="employeeID", how="inner")
    .withColumn(
        "Attendance",
        F.when(F.col("islate") == 1, "Late Login").otherwise("Abscent"),
    )
    .select("name", "Attendance", "date")
    .show()
)

+----------+----------+----------+
|      name|Attendance|      date|
+----------+----------+----------+
|arun kumar|   Abscent|07-06-2024|
| vijay raj|Late Login|07-06-2024|
|   meena p|Late Login|07-06-2024|
|deepa rani|   Abscent|07-06-2024|
| karthik s|Late Login|07-06-2024|
|arun kumar|Late Login|07-06-2024|
| vijay raj|   Abscent|07-06-2024|
|   meena p|Late Login|07-06-2024|
|deepa rani|Late Login|07-06-2024|
| karthik s|   Abscent|07-06-2024|
|arun kumar|Late Login|07-06-2024|
| vijay raj|Late Login|07-06-2024|
|   meena p|   Abscent|07-06-2024|
|deepa rani|Late Login|07-06-2024|
| karthik s|Late Login|07-06-2024|
+----------+----------+----------+



# Group by department to get average work hours and productivity

In [12]:
# Drop attendance rows whose clockIN holds the placeholder string "NULL".
dfAtt_cleaned = dfAtt.where(dfAtt["clockIN"] != "NULL")

In [13]:
# Combine attendance, employee and task rows, then derive two columns:
#   workHours         - clockOUT minus clockIN converted from seconds to hours, 2 decimals
#   productivityScore - tasksCompeleted divided by workHours, 4 decimals
# NOTE(review): both joins key on employeeID only, so every attendance row is paired
# with every task row of the same employee — confirm this is intended rather than
# also matching the attendance date to the task date.
dfJoined = (
    dfAtt_cleaned
    .join(dfEmp, on="employeeID", how="inner")
    .join(dfTas, on="employeeID", how="inner")
    .withColumn(
        "workHours",
        F.round(
            (
                F.unix_timestamp(F.col("clockOUT"), "dd-MM-yyyy HH:mm")
                - F.unix_timestamp(F.col("clockIN"), "dd-MM-yyyy HH:mm")
            ) / 3600,
            2,
        ),
    )
    .withColumn(
        "productivityScore",
        F.round(F.col("tasksCompeleted") / F.col("workHours"), 4),
    )
)

In [14]:
# Per-department averages of hours worked and productivity, rounded to 2 decimals.
(
    dfJoined
    .groupBy("department")
    .agg(
        F.round(F.avg("workHours"), 2).alias("averageWorkHours"),
        F.round(F.avg("productivityScore"), 2).alias("averageProductivityScore"),
    )
    .show()
)

+-----------+----------------+------------------------+
| department|averageWorkHours|averageProductivityScore|
+-----------+----------------+------------------------+
|    finance|            8.25|                    0.32|
|  marketing|            8.21|                    0.45|
|         hr|            8.33|                    0.14|
|engineering|            8.23|                     0.5|
+-----------+----------------+------------------------+



# Deliverables
## 1. PySpark script with filtering and groupBy aggregations

## Filtering

### Store the result in a DataFrame

In [23]:
from pyspark.sql import functions as F

# Same per-department averages as shown earlier, kept as a DataFrame for export.
dfSummary = (
    dfJoined
    .groupBy("department")
    .agg(
        F.round(F.avg("workHours"), 2).alias("averageWorkHours"),
        F.round(F.avg("productivityScore"), 2).alias("averageProductivityScore"),
    )
)

### Save as CSV

In [24]:
# Reduce to a single partition, then write CSV with a header row into the
# /content/department_summary folder, overwriting any previous output there.
(
    dfSummary
    .coalesce(1)
    .write
    .option("header", True)
    .mode("overwrite")
    .csv("/content/department_summary")
)

### Find the generated CSV file

In [25]:
import os
# Stop at the first CSV entry so that `file` still names the part file once the
# loop ends; the following cell builds its source path from this variable.
# Without the `break`, `file` would end up as whatever entry was listed last.
for file in os.listdir("/content/department_summary"):
    if file.endswith(".csv"):
        print("CSV File:", file)
        break
else:
    # No CSV was listed: fail here rather than let a later cell move some other entry.
    raise FileNotFoundError("no .csv part file in /content/department_summary")

CSV File: part-00000-a61142e9-5c18-4f07-a8ab-bc17976116d8-c000.csv


### Rename and move the file for download

In [26]:
import shutil

import os

# Look up the part file in this cell instead of reusing the `file` loop variable
# left over from the previous cell: after a loop without `break`, that variable
# holds the last directory entry listed, which need not be the CSV.
summary_dir = "/content/department_summary"
part_files = [entry for entry in os.listdir(summary_dir) if entry.endswith(".csv")]
if not part_files:
    raise FileNotFoundError(f"no .csv part file found in {summary_dir}")

# Kept as the last expression so the notebook displays the value returned by shutil.move.
shutil.move(
    os.path.join(summary_dir, part_files[0]),
    "/content/department_summary.csv"
)

'/content/department_summary.csv'

In [27]:
# Hand the renamed summary CSV to Colab's download helper.
from google.colab import files
files.download("/content/department_summary.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Group By

In [28]:
# Aggregate per department and write the result as CSV (single partition,
# header row, overwrite) into the /content/department_summary folder.
(
    dfJoined
    .groupBy("department")
    .agg(
        F.round(F.avg("workHours"), 2).alias("averageWorkHours"),
        F.round(F.avg("productivityScore"), 2).alias("averageProductivityScore"),
    )
    .coalesce(1)
    .write
    .option("header", True)
    .mode("overwrite")
    .csv("/content/department_summary")
)

In [29]:
import os
import shutil

# Move the first .csv entry of the output folder to a fixed file name.
folder_path = "/content/department_summary"
for file in os.listdir(folder_path):
    if not file.endswith(".csv"):
        continue  # skip entries that are not the CSV part file
    shutil.move(os.path.join(folder_path, file), "/content/department_summary.csv")
    break

In [30]:
# Hand the renamed summary CSV to Colab's download helper.
from google.colab import files
files.download("/content/department_summary.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

###  Assign .filter() result to a DataFrame

In [19]:
from pyspark.sql import functions as F

# Late-or-absent attendance rows with the employee name attached and an
# "Attendance" label ("Late Login" when isLate is 1, otherwise "Abscent").
filtered_df = (
    dfAtt
    .where((F.col("isLate") == 1) | (F.col("isAbscent") == 1))
    .join(dfEmp.select("name", "employeeID"), on="employeeID", how="inner")
    .withColumn(
        "Attendance",
        F.when(F.col("isLate") == 1, "Late Login").otherwise("Abscent"),
    )
    .select("name", "Attendance", "date")
)

### Save as CSV to a folder

In [20]:
# Write the filtered rows as CSV (single partition, header row, overwrite)
# into the relative folder "attendance_output".
(
    filtered_df
    .coalesce(1)
    .write
    .option("header", True)
    .mode("overwrite")
    .csv("attendance_output")
)

### Find the file and rename it for download

In [21]:
import os
import shutil
from google.colab import files

# Copy every "part-*.csv" entry of the output folder to attendance_filtered.csv.
for file in os.listdir("attendance_output"):
    if not (file.startswith("part-") and file.endswith(".csv")):
        continue  # not a CSV part file
    shutil.copy("attendance_output/" + file, "attendance_filtered.csv")

### Download the CSV

In [22]:
# Hand the copied attendance CSV to Colab's download helper.
files.download("attendance_filtered.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## 2. Output showing attendance issues by department

In [15]:
# 2. Attendance issues by department
# Join each attendance row to its employee record so `department` is available.
dfJoined_2 = dfAtt.join(dfEmp, on="employeeID", how="inner")

# For rows flagged late or absent: sum both flags per department and add the two
# sums together as issuesCount.
(
    dfJoined_2
    .select("department", "isLate", "isAbscent")
    .where((F.col("isLate") == 1) | (F.col("isAbscent") == 1))
    .groupBy("department")
    .agg(
        F.sum("isLate").alias("lateCount"),
        F.sum("isAbscent").alias("abscentCount"),
    )
    .withColumn("issuesCount", F.col("lateCount") + F.col("abscentCount"))
    .show()
)

+-----------+---------+------------+-----------+
| department|lateCount|abscentCount|issuesCount|
+-----------+---------+------------+-----------+
|    finance|        2|           1|          3|
|  marketing|        2|           1|          3|
|         hr|        2|           1|          3|
|engineering|        4|           2|          6|
+-----------+---------+------------+-----------+



## Store the result as a DataFrame

In [31]:
# Per-department totals of the late and absent flags (over rows where either
# flag is 1), plus their sum as issuesCount; kept as a DataFrame for export.
dfAttendanceIssues = (
    dfJoined_2
    .select("department", "isLate", "isAbscent")
    .where((F.col("isLate") == 1) | (F.col("isAbscent") == 1))
    .groupBy("department")
    .agg(
        F.sum("isLate").alias("lateCount"),
        F.sum("isAbscent").alias("abscentCount"),
    )
    .withColumn("issuesCount", F.col("lateCount") + F.col("abscentCount"))
)

## Write this DataFrame to CSV

In [32]:
# Write the issue counts as CSV (single partition, header row, overwrite)
# into the /content/attendance_issues folder.
(
    dfAttendanceIssues
    .coalesce(1)
    .write
    .option("header", True)
    .mode("overwrite")
    .csv("/content/attendance_issues")
)

## Rename the generated part file to attendance_issues.csv


In [33]:
import os
import shutil

# Move the first .csv entry of the output folder to a fixed file name.
for file in os.listdir("/content/attendance_issues"):
    if not file.endswith(".csv"):
        continue  # skip entries that are not the CSV part file
    shutil.move("/content/attendance_issues/" + file, "/content/attendance_issues.csv")
    break

In [34]:
# Hand the renamed attendance-issues CSV to Colab's download helper.
from google.colab import files
files.download("/content/attendance_issues.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>