In [0]:
from pyspark.sql import SparkSession
spark

In [0]:
#1. Load the CSV using inferred schema

df_auto = spark.read.option("header", "true").option("inferSchema", "true").csv("file:/Workspace/Shared/employee_timesheet.csv")
df_auto.show()
df_auto.printSchema()

+----------+-----+----------+-------+---------+----------+---------+------+
|EmployeeID| Name|Department|Project|WorkHours|  WorkDate| Location|  Mode|
+----------+-----+----------+-------+---------+----------+---------+------+
|      E101|Anita|        IT|  Alpha|        8|2024-05-01|Bangalore|Remote|
|      E102|  Raj|        HR|   Beta|        7|2024-05-01|   Mumbai|Onsite|
|      E103| John|   Finance|  Alpha|        5|2024-05-02|    Delhi|Remote|
|      E101|Anita|        IT|  Alpha|        9|2024-05-03|Bangalore|Remote|
|      E104|Meena|        IT|  Gamma|        6|2024-05-03|Hyderabad|Onsite|
|      E102|  Raj|        HR|   Beta|        8|2024-05-04|   Mumbai|Remote|
+----------+-----+----------+-------+---------+----------+---------+------+

root
 |-- EmployeeID: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- Project: string (nullable = true)
 |-- WorkHours: integer (nullable = true)
 |-- WorkDate: date (nullable = t

In [0]:
#2. Load the same file with schema explicitly defined
from pyspark.sql.types import *

schema = StructType([
    StructField("EmployeeID", StringType()),
    StructField("Name", StringType()),
    StructField("Department", StringType()),
    StructField("Project", StringType()),
    StructField("WorkHours", IntegerType()),
    StructField("WorkDate", DateType()),
    StructField("Location", StringType()),
    StructField("Mode", StringType())
])

df_manual = spark.read.option("header", "true").schema(schema).csv("file:/Workspace/Shared/employee_timesheet.csv")
df_manual.show()


+----------+-----+----------+-------+---------+----------+---------+------+
|EmployeeID| Name|Department|Project|WorkHours|  WorkDate| Location|  Mode|
+----------+-----+----------+-------+---------+----------+---------+------+
|      E101|Anita|        IT|  Alpha|        8|2024-05-01|Bangalore|Remote|
|      E102|  Raj|        HR|   Beta|        7|2024-05-01|   Mumbai|Onsite|
|      E103| John|   Finance|  Alpha|        5|2024-05-02|    Delhi|Remote|
|      E101|Anita|        IT|  Alpha|        9|2024-05-03|Bangalore|Remote|
|      E104|Meena|        IT|  Gamma|        6|2024-05-03|Hyderabad|Onsite|
|      E102|  Raj|        HR|   Beta|        8|2024-05-04|   Mumbai|Remote|
+----------+-----+----------+-------+---------+----------+---------+------+



In [0]:
#3. Add a new column Weekday extracted from WorkDate

from pyspark.sql.functions import date_format

df = df_manual.withColumn("Weekday", date_format("WorkDate", "EEEE"))
df.show()

+----------+-----+----------+-------+---------+----------+---------+------+---------+
|EmployeeID| Name|Department|Project|WorkHours|  WorkDate| Location|  Mode|  Weekday|
+----------+-----+----------+-------+---------+----------+---------+------+---------+
|      E101|Anita|        IT|  Alpha|        8|2024-05-01|Bangalore|Remote|Wednesday|
|      E102|  Raj|        HR|   Beta|        7|2024-05-01|   Mumbai|Onsite|Wednesday|
|      E103| John|   Finance|  Alpha|        5|2024-05-02|    Delhi|Remote| Thursday|
|      E101|Anita|        IT|  Alpha|        9|2024-05-03|Bangalore|Remote|   Friday|
|      E104|Meena|        IT|  Gamma|        6|2024-05-03|Hyderabad|Onsite|   Friday|
|      E102|  Raj|        HR|   Beta|        8|2024-05-04|   Mumbai|Remote| Saturday|
+----------+-----+----------+-------+---------+----------+---------+------+---------+



In [0]:
#4. Calculate total work hours by employee

from pyspark.sql.functions import sum

df.groupBy("EmployeeID", "Name").agg(sum("WorkHours").alias("TotalHours")).show()

+----------+-----+----------+
|EmployeeID| Name|TotalHours|
+----------+-----+----------+
|      E103| John|         5|
|      E104|Meena|         6|
|      E102|  Raj|        15|
|      E101|Anita|        17|
+----------+-----+----------+



In [0]:
#5. Calculate average work hours per department
from pyspark.sql.functions import avg

df.groupBy("Department").agg(avg("WorkHours").alias("AvgHours")).show()

+----------+-----------------+
|Department|         AvgHours|
+----------+-----------------+
|        HR|              7.5|
|   Finance|              5.0|
|        IT|7.666666666666667|
+----------+-----------------+



In [0]:
#6. Get top 2 employees by total hours using window function
from pyspark.sql.window import Window
from pyspark.sql.functions import rank

windowSpec = Window.orderBy(df["WorkHours"].desc())
df.withColumn("Rank", rank().over(windowSpec)).filter("Rank <= 2").show()


+----------+-----+----------+-------+---------+----------+---------+------+---------+----+
|EmployeeID| Name|Department|Project|WorkHours|  WorkDate| Location|  Mode|  Weekday|Rank|
+----------+-----+----------+-------+---------+----------+---------+------+---------+----+
|      E101|Anita|        IT|  Alpha|        9|2024-05-03|Bangalore|Remote|   Friday|   1|
|      E101|Anita|        IT|  Alpha|        8|2024-05-01|Bangalore|Remote|Wednesday|   2|
|      E102|  Raj|        HR|   Beta|        8|2024-05-04|   Mumbai|Remote| Saturday|   2|
+----------+-----+----------+-------+---------+----------+---------+------+---------+----+



In [0]:
#7. Filter entries where WorkDate falls on a weekend
from pyspark.sql.functions import dayofweek

df.filter(dayofweek("WorkDate").isin([1, 7])).show()  # Sunday=1, Saturday=7

+----------+----+----------+-------+---------+----------+--------+------+--------+
|EmployeeID|Name|Department|Project|WorkHours|  WorkDate|Location|  Mode| Weekday|
+----------+----+----------+-------+---------+----------+--------+------+--------+
|      E102| Raj|        HR|   Beta|        8|2024-05-04|  Mumbai|Remote|Saturday|
+----------+----+----------+-------+---------+----------+--------+------+--------+



In [0]:
#8. Running total of hours per employee using window

from pyspark.sql.functions import sum
from pyspark.sql.window import Window

windowSpec = Window.partitionBy("EmployeeID").orderBy("WorkDate").rowsBetween(Window.unboundedPreceding, 0)
df.withColumn("RunningTotal", sum("WorkHours").over(windowSpec)).show()

+----------+-----+----------+-------+---------+----------+---------+------+---------+------------+
|EmployeeID| Name|Department|Project|WorkHours|  WorkDate| Location|  Mode|  Weekday|RunningTotal|
+----------+-----+----------+-------+---------+----------+---------+------+---------+------------+
|      E101|Anita|        IT|  Alpha|        8|2024-05-01|Bangalore|Remote|Wednesday|           8|
|      E101|Anita|        IT|  Alpha|        9|2024-05-03|Bangalore|Remote|   Friday|          17|
|      E102|  Raj|        HR|   Beta|        7|2024-05-01|   Mumbai|Onsite|Wednesday|           7|
|      E102|  Raj|        HR|   Beta|        8|2024-05-04|   Mumbai|Remote| Saturday|          15|
|      E103| John|   Finance|  Alpha|        5|2024-05-02|    Delhi|Remote| Thursday|           5|
|      E104|Meena|        IT|  Gamma|        6|2024-05-03|Hyderabad|Onsite|   Friday|           6|
+----------+-----+----------+-------+---------+----------+---------+------+---------+------------+



In [0]:
#9. Create department_location.csv (manually using DataFrame)
from pyspark.sql import Row

dept_data = [Row(Department="IT", DeptHead="Anand"),
             Row(Department="HR", DeptHead="Shruti"),
             Row(Department="Finance", DeptHead="Kamal")]

df_dept = spark.createDataFrame(dept_data)
df_dept.show()

+----------+--------+
|Department|DeptHead|
+----------+--------+
|        IT|   Anand|
|        HR|  Shruti|
|   Finance|   Kamal|
+----------+--------+



In [0]:
#10. Join with timesheet data and list all employees with their DeptHead

df.join(df_dept, on="Department", how="left").select("EmployeeID", "Name", "Department", "DeptHead").show()


+----------+-----+----------+--------+
|EmployeeID| Name|Department|DeptHead|
+----------+-----+----------+--------+
|      E101|Anita|        IT|   Anand|
|      E102|  Raj|        HR|  Shruti|
|      E103| John|   Finance|   Kamal|
|      E101|Anita|        IT|   Anand|
|      E104|Meena|        IT|   Anand|
|      E102|  Raj|        HR|  Shruti|
+----------+-----+----------+--------+



In [0]:
#11. Pivot table: total hours per employee per project

df.groupBy("EmployeeID").pivot("Project").sum("WorkHours").show()

+----------+-----+----+-----+
|EmployeeID|Alpha|Beta|Gamma|
+----------+-----+----+-----+
|      E103|    5|NULL| NULL|
|      E104| NULL|NULL|    6|
|      E101|   17|NULL| NULL|
|      E102| NULL|  15| NULL|
+----------+-----+----+-----+



In [0]:
#12. Unpivot example: Convert mode-specific hours into rows

from pyspark.sql.functions import expr

# Assume these are the pivoted columns; create manually
pivoted = df.groupBy("EmployeeID").pivot("Mode").sum("WorkHours").fillna(0)
unpivoted = pivoted.selectExpr("EmployeeID", "stack(2, 'Remote', Remote, 'Onsite', Onsite) as (Mode, Hours)")
unpivoted.show()

+----------+------+-----+
|EmployeeID|  Mode|Hours|
+----------+------+-----+
|      E103|Remote|    5|
|      E103|Onsite|    0|
|      E104|Remote|    0|
|      E104|Onsite|    6|
|      E101|Remote|   17|
|      E101|Onsite|    0|
|      E102|Remote|    8|
|      E102|Onsite|    7|
+----------+------+-----+



In [0]:
#13. Create UDF to classify work hours
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def workload_tag(hours):
    if hours >= 8:
        return "Full"
    elif hours >= 4:
        return "Partial"
    else:
        return "Light"

workload_udf = udf(workload_tag, StringType())
df.withColumn("WorkloadCategory", workload_udf("WorkHours")).show()

+----------+-----+----------+-------+---------+----------+---------+------+---------+----------------+
|EmployeeID| Name|Department|Project|WorkHours|  WorkDate| Location|  Mode|  Weekday|WorkloadCategory|
+----------+-----+----------+-------+---------+----------+---------+------+---------+----------------+
|      E101|Anita|        IT|  Alpha|        8|2024-05-01|Bangalore|Remote|Wednesday|            Full|
|      E102|  Raj|        HR|   Beta|        7|2024-05-01|   Mumbai|Onsite|Wednesday|         Partial|
|      E103| John|   Finance|  Alpha|        5|2024-05-02|    Delhi|Remote| Thursday|         Partial|
|      E101|Anita|        IT|  Alpha|        9|2024-05-03|Bangalore|Remote|   Friday|            Full|
|      E104|Meena|        IT|  Gamma|        6|2024-05-03|Hyderabad|Onsite|   Friday|         Partial|
|      E102|  Raj|        HR|   Beta|        8|2024-05-04|   Mumbai|Remote| Saturday|            Full|
+----------+-----+----------+-------+---------+----------+---------+-----

In [0]:
#14. Add WorkloadCategory using this UDF
df.withColumn("WorkloadCategory", workload_udf("WorkHours")).show()

+----------+-----+----------+-------+---------+----------+---------+------+---------+----------------+
|EmployeeID| Name|Department|Project|WorkHours|  WorkDate| Location|  Mode|  Weekday|WorkloadCategory|
+----------+-----+----------+-------+---------+----------+---------+------+---------+----------------+
|      E101|Anita|        IT|  Alpha|        8|2024-05-01|Bangalore|Remote|Wednesday|            Full|
|      E102|  Raj|        HR|   Beta|        7|2024-05-01|   Mumbai|Onsite|Wednesday|         Partial|
|      E103| John|   Finance|  Alpha|        5|2024-05-02|    Delhi|Remote| Thursday|         Partial|
|      E101|Anita|        IT|  Alpha|        9|2024-05-03|Bangalore|Remote|   Friday|            Full|
|      E104|Meena|        IT|  Gamma|        6|2024-05-03|Hyderabad|Onsite|   Friday|         Partial|
|      E102|  Raj|        HR|   Beta|        8|2024-05-04|   Mumbai|Remote| Saturday|            Full|
+----------+-----+----------+-------+---------+----------+---------+-----

In [0]:
#15. Introduce nulls in Mode column (for demonstration)

df_null = df.withColumn("Mode", expr("CASE WHEN EmployeeID = 'E102' THEN NULL ELSE Mode END"))
df_null.show()

+----------+-----+----------+-------+---------+----------+---------+------+---------+
|EmployeeID| Name|Department|Project|WorkHours|  WorkDate| Location|  Mode|  Weekday|
+----------+-----+----------+-------+---------+----------+---------+------+---------+
|      E101|Anita|        IT|  Alpha|        8|2024-05-01|Bangalore|Remote|Wednesday|
|      E102|  Raj|        HR|   Beta|        7|2024-05-01|   Mumbai|  NULL|Wednesday|
|      E103| John|   Finance|  Alpha|        5|2024-05-02|    Delhi|Remote| Thursday|
|      E101|Anita|        IT|  Alpha|        9|2024-05-03|Bangalore|Remote|   Friday|
|      E104|Meena|        IT|  Gamma|        6|2024-05-03|Hyderabad|Onsite|   Friday|
|      E102|  Raj|        HR|   Beta|        8|2024-05-04|   Mumbai|  NULL| Saturday|
+----------+-----+----------+-------+---------+----------+---------+------+---------+



In [0]:
#16. Fill nulls with "Not Provided"
df_filled = df_null.fillna({"Mode": "Not Provided"})
df_filled.show()

+----------+-----+----------+-------+---------+----------+---------+------------+---------+
|EmployeeID| Name|Department|Project|WorkHours|  WorkDate| Location|        Mode|  Weekday|
+----------+-----+----------+-------+---------+----------+---------+------------+---------+
|      E101|Anita|        IT|  Alpha|        8|2024-05-01|Bangalore|      Remote|Wednesday|
|      E102|  Raj|        HR|   Beta|        7|2024-05-01|   Mumbai|Not Provided|Wednesday|
|      E103| John|   Finance|  Alpha|        5|2024-05-02|    Delhi|      Remote| Thursday|
|      E101|Anita|        IT|  Alpha|        9|2024-05-03|Bangalore|      Remote|   Friday|
|      E104|Meena|        IT|  Gamma|        6|2024-05-03|Hyderabad|      Onsite|   Friday|
|      E102|  Raj|        HR|   Beta|        8|2024-05-04|   Mumbai|Not Provided| Saturday|
+----------+-----+----------+-------+---------+----------+---------+------------+---------+



In [0]:
#17. Drop rows where WorkHours < 4

df_filtered = df.filter(df.WorkHours >= 4)
df_filtered.show()

+----------+-----+----------+-------+---------+----------+---------+------+---------+
|EmployeeID| Name|Department|Project|WorkHours|  WorkDate| Location|  Mode|  Weekday|
+----------+-----+----------+-------+---------+----------+---------+------+---------+
|      E101|Anita|        IT|  Alpha|        8|2024-05-01|Bangalore|Remote|Wednesday|
|      E102|  Raj|        HR|   Beta|        7|2024-05-01|   Mumbai|Onsite|Wednesday|
|      E103| John|   Finance|  Alpha|        5|2024-05-02|    Delhi|Remote| Thursday|
|      E101|Anita|        IT|  Alpha|        9|2024-05-03|Bangalore|Remote|   Friday|
|      E104|Meena|        IT|  Gamma|        6|2024-05-03|Hyderabad|Onsite|   Friday|
|      E102|  Raj|        HR|   Beta|        8|2024-05-04|   Mumbai|Remote| Saturday|
+----------+-----+----------+-------+---------+----------+---------+------+---------+



In [0]:
#18. Mark "Remote Worker" if >80% entries are Remote
from pyspark.sql.functions import count, when

df.groupBy("EmployeeID").agg(
    (sum(when(df.Mode == "Remote", 1).otherwise(0)) / count("*")).alias("RemoteRatio")
).withColumn("RemoteWorker", when(expr("RemoteRatio >= 0.8"), "Yes").otherwise("No")).show()


+----------+-----------+------------+
|EmployeeID|RemoteRatio|RemoteWorker|
+----------+-----------+------------+
|      E103|        1.0|         Yes|
|      E104|        0.0|          No|
|      E101|        1.0|         Yes|
|      E102|        0.5|          No|
+----------+-----------+------------+



In [0]:
#19. Add new column ExtraHours where hours > 8
from pyspark.sql.functions import when

df.withColumn("ExtraHours", when(df.WorkHours > 8, df.WorkHours - 8).otherwise(0)).show()

+----------+-----+----------+-------+---------+----------+---------+------+---------+----------+
|EmployeeID| Name|Department|Project|WorkHours|  WorkDate| Location|  Mode|  Weekday|ExtraHours|
+----------+-----+----------+-------+---------+----------+---------+------+---------+----------+
|      E101|Anita|        IT|  Alpha|        8|2024-05-01|Bangalore|Remote|Wednesday|         0|
|      E102|  Raj|        HR|   Beta|        7|2024-05-01|   Mumbai|Onsite|Wednesday|         0|
|      E103| John|   Finance|  Alpha|        5|2024-05-02|    Delhi|Remote| Thursday|         0|
|      E101|Anita|        IT|  Alpha|        9|2024-05-03|Bangalore|Remote|   Friday|         1|
|      E104|Meena|        IT|  Gamma|        6|2024-05-03|Hyderabad|Onsite|   Friday|         0|
|      E102|  Raj|        HR|   Beta|        8|2024-05-04|   Mumbai|Remote| Saturday|         0|
+----------+-----+----------+-------+---------+----------+---------+------+---------+----------+



In [0]:
#20. Append dummy timesheet using unionByName()
from pyspark.sql.functions import lit
from pyspark.sql import Row

new_data = [("E999", "Intern1", "IT", "Alpha", 5, "2024-05-05", "Chennai", "Onsite")]
columns = ["EmployeeID", "Name", "Department", "Project", "WorkHours", "WorkDate", "Location", "Mode"]

df_dummy = spark.createDataFrame(new_data, columns)
df_dummy = df_dummy.withColumn("WorkDate", df_dummy["WorkDate"].cast("date"))

df_combined = df.unionByName(df_dummy, allowMissingColumns=True)
df_combined.show()


+----------+-------+----------+-------+---------+----------+---------+------+---------+
|EmployeeID|   Name|Department|Project|WorkHours|  WorkDate| Location|  Mode|  Weekday|
+----------+-------+----------+-------+---------+----------+---------+------+---------+
|      E101|  Anita|        IT|  Alpha|        8|2024-05-01|Bangalore|Remote|Wednesday|
|      E102|    Raj|        HR|   Beta|        7|2024-05-01|   Mumbai|Onsite|Wednesday|
|      E103|   John|   Finance|  Alpha|        5|2024-05-02|    Delhi|Remote| Thursday|
|      E101|  Anita|        IT|  Alpha|        9|2024-05-03|Bangalore|Remote|   Friday|
|      E104|  Meena|        IT|  Gamma|        6|2024-05-03|Hyderabad|Onsite|   Friday|
|      E102|    Raj|        HR|   Beta|        8|2024-05-04|   Mumbai|Remote| Saturday|
|      E999|Intern1|        IT|  Alpha|        5|2024-05-05|  Chennai|Onsite|     NULL|
+----------+-------+----------+-------+---------+----------+---------+------+---------+



In [0]:
#21. Remove duplicate rows

df_combined.dropDuplicates().show()

+----------+-------+----------+-------+---------+----------+---------+------+---------+
|EmployeeID|   Name|Department|Project|WorkHours|  WorkDate| Location|  Mode|  Weekday|
+----------+-------+----------+-------+---------+----------+---------+------+---------+
|      E101|  Anita|        IT|  Alpha|        9|2024-05-03|Bangalore|Remote|   Friday|
|      E102|    Raj|        HR|   Beta|        8|2024-05-04|   Mumbai|Remote| Saturday|
|      E102|    Raj|        HR|   Beta|        7|2024-05-01|   Mumbai|Onsite|Wednesday|
|      E101|  Anita|        IT|  Alpha|        8|2024-05-01|Bangalore|Remote|Wednesday|
|      E103|   John|   Finance|  Alpha|        5|2024-05-02|    Delhi|Remote| Thursday|
|      E104|  Meena|        IT|  Gamma|        6|2024-05-03|Hyderabad|Onsite|   Friday|
|      E999|Intern1|        IT|  Alpha|        5|2024-05-05|  Chennai|Onsite|     NULL|
+----------+-------+----------+-------+---------+----------+---------+------+---------+

