In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

# Initialize Spark Session
spark = SparkSession.builder.appName("EmployeeTimesheetAnalysis").getOrCreate()
spark

<pyspark.sql.connect.session.SparkSession at 0x7c27ef93cd10>

In [0]:
# 1. Load with inferred schema
timesheet_inferred = spark.read.csv("/FileStore/tables/employee_timesheet.csv", header=True, inferSchema=True)
timesheet_inferred.printSchema()

# 2. Load with explicit schema
custom_schema = StructType([
    StructField("EmployeeID", StringType(), True),
    StructField("Name", StringType(), True),
    StructField("Department", StringType(), True),
    StructField("Project", StringType(), True),
    StructField("WorkHours", IntegerType(), True),
    StructField("WorkDate", DateType(), True),
    StructField("Location", StringType(), True),
    StructField("Mode", StringType(), True)
])

timesheet_explicit = spark.read.csv("/FileStore/tables/employee_timesheet.csv", header=True, schema=custom_schema)
timesheet_explicit.printSchema()

# 3. Add Weekday column
timesheet_df = timesheet_explicit.withColumn("Weekday", date_format(col("WorkDate"), "EEEE"))
timesheet_df.show()

root
 |-- EmployeeID: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- Project: string (nullable = true)
 |-- WorkHours: integer (nullable = true)
 |-- WorkDate: date (nullable = true)
 |-- Location: string (nullable = true)
 |-- Mode: string (nullable = true)

root
 |-- EmployeeID: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- Project: string (nullable = true)
 |-- WorkHours: integer (nullable = true)
 |-- WorkDate: date (nullable = true)
 |-- Location: string (nullable = true)
 |-- Mode: string (nullable = true)

+----------+-----+----------+-------+---------+----------+---------+------+---------+
|EmployeeID| Name|Department|Project|WorkHours|  WorkDate| Location|  Mode|  Weekday|
+----------+-----+----------+-------+---------+----------+---------+------+---------+
|      E101|Anita|        IT|  Alpha|        8|2024-05-01|Bangalore|Remote|Wednesday|
|      E103| J

In [0]:
# 4. Total work hours by employee
total_hours_by_emp = timesheet_df.groupBy("EmployeeID", "Name") \
    .agg(sum("WorkHours").alias("TotalHours")) \
    .orderBy("TotalHours", ascending=False)
total_hours_by_emp.show()

# 5. Average work hours per department
avg_hours_by_dept = timesheet_df.groupBy("Department") \
    .agg(avg("WorkHours").alias("AvgHours")) \
    .orderBy("AvgHours", ascending=False)
avg_hours_by_dept.show()

# 6. Top 2 employees by total hours using window
window_spec = Window.orderBy(col("TotalHours").desc())
top_employees = total_hours_by_emp.withColumn("rank", rank().over(window_spec)) \
    .filter(col("rank") <= 2) \
    .drop("rank")
top_employees.show()

+----------+-----+----------+
|EmployeeID| Name|TotalHours|
+----------+-----+----------+
|      E101|Anita|        17|
|      E102|  Raj|        15|
|      E104|Meena|         6|
|      E103| John|         5|
+----------+-----+----------+

+----------+-----------------+
|Department|         AvgHours|
+----------+-----------------+
|        IT|7.666666666666667|
|        HR|              7.5|
|   Finance|              5.0|
+----------+-----------------+





+----------+-----+----------+
|EmployeeID| Name|TotalHours|
+----------+-----+----------+
|      E101|Anita|        17|
|      E102|  Raj|        15|
+----------+-----+----------+



In [0]:
# 7. Filter weekend entries
weekend_df = timesheet_df.filter((dayofweek(col("WorkDate")) == 1) |  (dayofweek(col("WorkDate")) == 7)    )
display(weekend_df)

# 8. Running total of hours per employee
emp_window = Window.partitionBy("EmployeeID").orderBy("WorkDate")
running_total_df = timesheet_df.withColumn("RunningTotalHours", sum("WorkHours").over(emp_window))
display(running_total_df)

EmployeeID,Name,Department,Project,WorkHours,WorkDate,Location,Mode,Weekday
E102,Raj,HR,Beta,8,2024-05-04,Mumbai,Remote,Saturday


EmployeeID,Name,Department,Project,WorkHours,WorkDate,Location,Mode,Weekday,RunningTotalHours
E101,Anita,IT,Alpha,8,2024-05-01,Bangalore,Remote,Wednesday,8
E101,Anita,IT,Alpha,9,2024-05-03,Bangalore,Remote,Friday,17
E102,Raj,HR,Beta,7,2024-05-01,Mumbai,Onsite,Wednesday,7
E102,Raj,HR,Beta,8,2024-05-04,Mumbai,Remote,Saturday,15
E103,John,Finance,Alpha,5,2024-05-02,Delhi,Remote,Thursday,5
E104,Meena,IT,Gamma,6,2024-05-03,Hyderabad,Onsite,Friday,6


In [0]:
# 9. Create department_location DataFrame 
dept_df = spark.read.csv("/FileStore/tables/department_location.csv", header=True, inferSchema=True)

# 10. Join with timesheet data
joined_df = timesheet_df.join(dept_df, "Department", "left") \
    .select("EmployeeID", "Name", "Department", "DeptHead", "Project", "WorkHours")
joined_df.show()

+----------+-----+----------+--------+-------+---------+
|EmployeeID| Name|Department|DeptHead|Project|WorkHours|
+----------+-----+----------+--------+-------+---------+
|      E101|Anita|        IT|   Anand|  Alpha|        8|
|      E103| John|   Finance|   Kamal|  Alpha|        5|
|      E101|Anita|        IT|   Anand|  Alpha|        9|
|      E104|Meena|        IT|   Anand|  Gamma|        6|
|      E102|  Raj|        HR|  Shruti|   Beta|        7|
|      E102|  Raj|        HR|  Shruti|   Beta|        8|
+----------+-----+----------+--------+-------+---------+



In [0]:
# 11. Pivot: total hours per employee per project
pivot_df = timesheet_df.groupBy("EmployeeID", "Name") \
    .pivot("Project") \
    .agg(sum("WorkHours").alias("TotalHours")) \
    .fillna(0)
pivot_df.show()

# 12. Unpivot example
unpivot_expr = "stack(2, 'Remote', Remote, 'Onsite', Onsite) as (Mode, Hours)"
mode_hours_df = timesheet_df.groupBy("EmployeeID") \
    .pivot("Mode") \
    .agg(sum("WorkHours")) \
    .selectExpr("EmployeeID", unpivot_expr)
mode_hours_df.show()

+----------+-----+-----+----+-----+
|EmployeeID| Name|Alpha|Beta|Gamma|
+----------+-----+-----+----+-----+
|      E103| John|    5|   0|    0|
|      E104|Meena|    0|   0|    6|
|      E101|Anita|   17|   0|    0|
|      E102|  Raj|    0|  15|    0|
+----------+-----+-----+----+-----+

+----------+------+-----+
|EmployeeID|  Mode|Hours|
+----------+------+-----+
|      E104|Remote| NULL|
|      E104|Onsite|    6|
|      E101|Remote|   17|
|      E101|Onsite| NULL|
|      E102|Remote|    8|
|      E102|Onsite|    7|
|      E103|Remote|    5|
|      E103|Onsite| NULL|
+----------+------+-----+



In [0]:
# 13. Create workload_tag UDF
def workload_tag(hours):
    if hours >= 8: return "Full"
    elif hours >= 4: return "Partial"
    else: return "Light"

workload_udf = udf(workload_tag, StringType())

# 14. Add WorkloadCategory column
timesheet_df = timesheet_df.withColumn("WorkloadCategory", workload_udf(col("WorkHours")))
timesheet_df.show()

+----------+-----+----------+-------+---------+----------+---------+------+---------+----------------+
|EmployeeID| Name|Department|Project|WorkHours|  WorkDate| Location|  Mode|  Weekday|WorkloadCategory|
+----------+-----+----------+-------+---------+----------+---------+------+---------+----------------+
|      E101|Anita|        IT|  Alpha|        8|2024-05-01|Bangalore|Remote|Wednesday|            Full|
|      E103| John|   Finance|  Alpha|        5|2024-05-02|    Delhi|Remote| Thursday|         Partial|
|      E101|Anita|        IT|  Alpha|        9|2024-05-03|Bangalore|Remote|   Friday|            Full|
|      E104|Meena|        IT|  Gamma|        6|2024-05-03|Hyderabad|Onsite|   Friday|         Partial|
|      E102|  Raj|        HR|   Beta|        7|2024-05-01|   Mumbai|Onsite|Wednesday|         Partial|
|      E102|  Raj|        HR|   Beta|        8|2024-05-04|   Mumbai|Remote| Saturday|            Full|
+----------+-----+----------+-------+---------+----------+---------+-----

In [0]:
# 15. Introduce nulls in Mode
import random
timesheet_with_nulls = timesheet_df.withColumn("Mode", 
    when(rand() > 0.8, None).otherwise(col("Mode")))

# 16. Fill nulls with "Not Provided"
timesheet_filled = timesheet_with_nulls.fillna("Not Provided", subset=["Mode"])

# 17. Drop rows where WorkHours < 4
timesheet_clean = timesheet_filled.filter(col("WorkHours") >= 4)


In [0]:
# 18. Mark "Remote Worker" if >80% entries are Remote
remote_workers = timesheet_df.groupBy("EmployeeID", "Name") \
    .agg(
        count(when(col("Mode") == "Remote", 1)).alias("RemoteCount"),
        count("*").alias("TotalEntries")
    ) \
    .withColumn("RemotePercentage", col("RemoteCount")/col("TotalEntries")) \
    .withColumn("RemoteWorker", 
        when(col("RemotePercentage") > 0.8, "Yes").otherwise("No"))
remote_workers.show()

# 19. Add ExtraHours column
timesheet_df = timesheet_df.withColumn("ExtraHours", 
    when(col("WorkHours") > 8, col("WorkHours") - 8).otherwise(0))
timesheet_df.show()

+----------+-----+-----------+------------+----------------+------------+
|EmployeeID| Name|RemoteCount|TotalEntries|RemotePercentage|RemoteWorker|
+----------+-----+-----------+------------+----------------+------------+
|      E101|Anita|          2|           2|             1.0|         Yes|
|      E103| John|          1|           1|             1.0|         Yes|
|      E104|Meena|          0|           1|             0.0|          No|
|      E102|  Raj|          1|           2|             0.5|          No|
+----------+-----+-----------+------------+----------------+------------+

+----------+-----+----------+-------+---------+----------+---------+------+---------+----------------+----------+
|EmployeeID| Name|Department|Project|WorkHours|  WorkDate| Location|  Mode|  Weekday|WorkloadCategory|ExtraHours|
+----------+-----+----------+-------+---------+----------+---------+------+---------+----------------+----------+
|      E101|Anita|        IT|  Alpha|        8|2024-05-01|Bangalo

In [0]:
from datetime import date
from pyspark.sql.types import *

# Define the complete schema
timesheet_schema = StructType([
    StructField("EmployeeID", StringType(), True),
    StructField("Name", StringType(), True),
    StructField("Department", StringType(), True),
    StructField("Project", StringType(), True),
    StructField("WorkHours", IntegerType(), True),
    StructField("WorkDate", DateType(), True),
    StructField("Location", StringType(), True),
    StructField("Mode", StringType(), True),
    StructField("Weekday", StringType(), True)
])

# 20. Append dummy intern data with explicit schema
intern_data = [
    ("E105", "Intern1", "IT", "Alpha", 4, date(2024, 5, 5), "Bangalore", "Onsite", None),
    ("E106", "Intern2", "HR", "Beta", 5, date(2024, 5, 5), "Mumbai", "Remote", None)
]


intern_df = spark.createDataFrame(intern_data, schema=timesheet_schema)
timesheet_df = timesheet_df.select([col(c).cast(timesheet_schema[c].dataType) for c in timesheet_schema.names])
combined_df = timesheet_df.unionByName(intern_df)
display(combined_df)

# 21. Remove duplicates
deduped_df = combined_df.dropDuplicates()
print(f"Original count: {combined_df.count()}, Deduped count: {deduped_df.count()}")

EmployeeID,Name,Department,Project,WorkHours,WorkDate,Location,Mode,Weekday
E101,Anita,IT,Alpha,8,2024-05-01,Bangalore,Remote,Wednesday
E103,John,Finance,Alpha,5,2024-05-02,Delhi,Remote,Thursday
E101,Anita,IT,Alpha,9,2024-05-03,Bangalore,Remote,Friday
E104,Meena,IT,Gamma,6,2024-05-03,Hyderabad,Onsite,Friday
E102,Raj,HR,Beta,7,2024-05-01,Mumbai,Onsite,Wednesday
E102,Raj,HR,Beta,8,2024-05-04,Mumbai,Remote,Saturday
E105,Intern1,IT,Alpha,4,2024-05-05,Bangalore,Onsite,
E106,Intern2,HR,Beta,5,2024-05-05,Mumbai,Remote,


Original count: 8, Deduped count: 8
