In [40]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *


spark = SparkSession.builder.appName("PySparkTaskSet").getOrCreate()
spark


In [10]:
# Mount Drive
from google.colab import drive
drive.mount('/content/drive')
# Set data paths
base_path = "/content/drive/MyDrive"
emp_path = f"{base_path}/employees.csv"
att_path = f"{base_path}/attendance.csv"
bonus_path = f"{base_path}/bonuses.json"


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [11]:
#1. Ingestion & Exploration
#Read all 3 files (CSV + JSON) using PySpark
employees = spark.read.option("header", True).csv(emp_path, inferSchema=True)
attendance = spark.read.option("header", True).csv(att_path, inferSchema=True)
bonuses = spark.read.option("multiline", True).json(bonus_path)

In [12]:
#Show schemas and sample records
employees.printSchema()
attendance.printSchema()
bonuses.printSchema()

employees.show(truncate=False)
attendance.show(truncate=False)
bonuses.show(truncate=False)

root
 |-- EmpID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- JoinDate: date (nullable = true)
 |-- Salary: integer (nullable = true)
 |-- ManagerID: integer (nullable = true)

root
 |-- EmpID: integer (nullable = true)
 |-- Date: date (nullable = true)
 |-- Status: string (nullable = true)

root
 |-- Bonus: long (nullable = true)
 |-- EmpID: long (nullable = true)
 |-- Year: long (nullable = true)

+-----+------+-----------+----------+------+---------+
|EmpID|Name  |Department |JoinDate  |Salary|ManagerID|
+-----+------+-----------+----------+------+---------+
|1    |Anita |HR         |2021-05-01|55000 |NULL     |
|2    |Raj   |Engineering|2020-03-15|80000 |1        |
|3    |Simran|Engineering|2022-07-10|75000 |1        |
|4    |Aamir |Marketing  |2019-11-20|60000 |1        |
|5    |Nisha |HR         |2023-01-05|50000 |1        |
+-----+------+-----------+----------+------+---------+

+-----+----------+-------+
|EmpID|Dat

In [13]:
#Count distinct departments
employees.select("Department").distinct().show()

+-----------+
| Department|
+-----------+
|Engineering|
|         HR|
|  Marketing|
+-----------+



In [14]:
#2. DataFrame Operations
#Add a column TenureYears using datediff() and round()
from pyspark.sql.functions import datediff, current_date, round, col

emp_with_tenure = employees.withColumn("TenureYears", round(datediff(current_date(), col("JoinDate"))/365, 2))
emp_with_tenure.show()

+-----+------+-----------+----------+------+---------+-----------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|TenureYears|
+-----+------+-----------+----------+------+---------+-----------+
|    1| Anita|         HR|2021-05-01| 55000|     NULL|       4.11|
|    2|   Raj|Engineering|2020-03-15| 80000|        1|       5.24|
|    3|Simran|Engineering|2022-07-10| 75000|        1|       2.92|
|    4| Aamir|  Marketing|2019-11-20| 60000|        1|       5.56|
|    5| Nisha|         HR|2023-01-05| 50000|        1|       2.43|
+-----+------+-----------+----------+------+---------+-----------+



In [15]:
#Calculate TotalCompensation = Salary + Bonus
emp_bonus = emp_with_tenure.join(bonuses, "EmpID", "left")
emp_comp = emp_bonus.withColumn("TotalCompensation", col("Salary") + col("Bonus"))
emp_comp.select("EmpID", "Name", "Salary", "Bonus", "TotalCompensation").show()

+-----+------+------+-----+-----------------+
|EmpID|  Name|Salary|Bonus|TotalCompensation|
+-----+------+------+-----+-----------------+
|    1| Anita| 55000| 5000|            60000|
|    2|   Raj| 80000| 7000|            87000|
|    3|Simran| 75000| 6500|            81500|
|    4| Aamir| 60000| 6000|            66000|
|    5| Nisha| 50000| 4000|            54000|
+-----+------+------+-----+-----------------+



In [16]:
#Filter employees with more than 2 years
emp_comp.filter(col("TenureYears") > 2).show()

+-----+------+-----------+----------+------+---------+-----------+-----+----+-----------------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|TenureYears|Bonus|Year|TotalCompensation|
+-----+------+-----------+----------+------+---------+-----------+-----+----+-----------------+
|    1| Anita|         HR|2021-05-01| 55000|     NULL|       4.11| 5000|2023|            60000|
|    2|   Raj|Engineering|2020-03-15| 80000|        1|       5.24| 7000|2023|            87000|
|    3|Simran|Engineering|2022-07-10| 75000|        1|       2.92| 6500|2023|            81500|
|    4| Aamir|  Marketing|2019-11-20| 60000|        1|       5.56| 6000|2023|            66000|
|    5| Nisha|         HR|2023-01-05| 50000|        1|       2.43| 4000|2023|            54000|
+-----+------+-----------+----------+------+---------+-----------+-----+----+-----------------+



In [17]:
#Employees who report to a manager
employees.filter(col("ManagerID").isNotNull()).show()

+-----+------+-----------+----------+------+---------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|
+-----+------+-----------+----------+------+---------+
|    2|   Raj|Engineering|2020-03-15| 80000|        1|
|    3|Simran|Engineering|2022-07-10| 75000|        1|
|    4| Aamir|  Marketing|2019-11-20| 60000|        1|
|    5| Nisha|         HR|2023-01-05| 50000|        1|
+-----+------+-----------+----------+------+---------+



In [18]:
#3. Aggregation
#Average salary per department
employees.groupBy("Department").agg(avg("Salary").alias("AvgSalary")).show()

+-----------+---------+
| Department|AvgSalary|
+-----------+---------+
|Engineering|  77500.0|
|         HR|  52500.0|
|  Marketing|  60000.0|
+-----------+---------+



In [19]:
#Number of employees under each manager
employees.groupBy("ManagerID").count().withColumnRenamed("count", "EmployeesUnder").show()

+---------+--------------+
|ManagerID|EmployeesUnder|
+---------+--------------+
|     NULL|             1|
|        1|             4|
+---------+--------------+



In [20]:
#Count of absences per employee
attendance.filter(col("Status") == "Absent").groupBy("EmpID").count().withColumnRenamed("count", "Absences").show()

+-----+--------+
|EmpID|Absences|
+-----+--------+
|    4|       2|
|    2|       1|
+-----+--------+



In [21]:
#4. Joins
#Join employees + attendance → Attendance %
total_days = attendance.groupBy("EmpID").count().withColumnRenamed("count", "TotalDays")
present_days = attendance.filter(col("Status") == "Present").groupBy("EmpID").count().withColumnRenamed("count", "PresentDays")
attendance_pct = total_days.join(present_days, "EmpID").withColumn("AttendancePct", round(col("PresentDays") / col("TotalDays") * 100, 2))
attendance_pct.show()

+-----+---------+-----------+-------------+
|EmpID|TotalDays|PresentDays|AttendancePct|
+-----+---------+-----------+-------------+
|    1|        2|          2|        100.0|
|    3|        2|          2|        100.0|
|    5|        2|          2|        100.0|
|    2|        2|          1|         50.0|
+-----+---------+-----------+-------------+



In [22]:
#Join employees + bonuses → Top 3 by TotalCompensation
top3 = emp_comp.orderBy(col("TotalCompensation").desc()).limit(3)
top3.select("EmpID", "Name", "TotalCompensation").show()

+-----+------+-----------------+
|EmpID|  Name|TotalCompensation|
+-----+------+-----------------+
|    2|   Raj|            87000|
|    3|Simran|            81500|
|    4| Aamir|            66000|
+-----+------+-----------------+



In [39]:
#Multi-level join: employees + bonuses + attendance
multi_join = employees.join(bonuses, "EmpID", "left").join(attendance, "EmpID", "left")
multi_join.show()

+-----+------+-----------+----------+------+---------+-----+----+----------+-------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|Bonus|Year|      Date| Status|
+-----+------+-----------+----------+------+---------+-----+----+----------+-------+
|    1| Anita|         HR|2021-05-01| 55000|     NULL| 5000|2023|2024-04-02|Present|
|    1| Anita|         HR|2021-05-01| 55000|     NULL| 5000|2023|2024-04-01|Present|
|    2|   Raj|Engineering|2020-03-15| 80000|        1| 7000|2023|2024-04-02|Present|
|    2|   Raj|Engineering|2020-03-15| 80000|        1| 7000|2023|2024-04-01| Absent|
|    3|Simran|Engineering|2022-07-10| 75000|        1| 6500|2023|2024-04-02|Present|
|    3|Simran|Engineering|2022-07-10| 75000|        1| 6500|2023|2024-04-01|Present|
|    4| Aamir|  Marketing|2019-11-20| 60000|        1| 6000|2023|2024-04-02| Absent|
|    4| Aamir|  Marketing|2019-11-20| 60000|        1| 6000|2023|2024-04-01| Absent|
|    5| Nisha|         HR|2023-01-05| 50000|        1| 4000|2023|

In [24]:
#5. String & Date Functions
#Extract year and month from JoinDate
employees.withColumn("JoinYear", year("JoinDate")).withColumn("JoinMonth", month("JoinDate")).select("EmpID", "Name", "JoinYear", "JoinMonth").show()


+-----+------+--------+---------+
|EmpID|  Name|JoinYear|JoinMonth|
+-----+------+--------+---------+
|    1| Anita|    2021|        5|
|    2|   Raj|    2020|        3|
|    3|Simran|    2022|        7|
|    4| Aamir|    2019|       11|
|    5| Nisha|    2023|        1|
+-----+------+--------+---------+



In [25]:
#Mask names using regex_replace
employees.withColumn("MaskedName", regexp_replace("Name", r'[a-zA-Z]', '*')).show()

+-----+------+-----------+----------+------+---------+----------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|MaskedName|
+-----+------+-----------+----------+------+---------+----------+
|    1| Anita|         HR|2021-05-01| 55000|     NULL|     *****|
|    2|   Raj|Engineering|2020-03-15| 80000|        1|       ***|
|    3|Simran|Engineering|2022-07-10| 75000|        1|    ******|
|    4| Aamir|  Marketing|2019-11-20| 60000|        1|     *****|
|    5| Nisha|         HR|2023-01-05| 50000|        1|     *****|
+-----+------+-----------+----------+------+---------+----------+



In [26]:
#Create EmpCode like EMP001
employees.withColumn("EmpCode", format_string("EMP%03d", col("EmpID"))).show()

+-----+------+-----------+----------+------+---------+-------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|EmpCode|
+-----+------+-----------+----------+------+---------+-------+
|    1| Anita|         HR|2021-05-01| 55000|     NULL| EMP001|
|    2|   Raj|Engineering|2020-03-15| 80000|        1| EMP002|
|    3|Simran|Engineering|2022-07-10| 75000|        1| EMP003|
|    4| Aamir|  Marketing|2019-11-20| 60000|        1| EMP004|
|    5| Nisha|         HR|2023-01-05| 50000|        1| EMP005|
+-----+------+-----------+----------+------+---------+-------+



In [27]:
#6. Conditional & Null Handling
#Label performance
emp_perf = emp_bonus.withColumn(
    "Performance",
    when(col("Bonus") > 6000, "High")
    .when((col("Bonus") >= 4000) & (col("Bonus") <= 6000), "Medium")
    .otherwise("Low")
)
emp_perf.select("EmpID", "Name", "Bonus", "Performance").show()

+-----+------+-----+-----------+
|EmpID|  Name|Bonus|Performance|
+-----+------+-----+-----------+
|    1| Anita| 5000|     Medium|
|    2|   Raj| 7000|       High|
|    3|Simran| 6500|       High|
|    4| Aamir| 6000|     Medium|
|    5| Nisha| 4000|     Medium|
+-----+------+-----+-----------+



In [38]:
#Handle missing ManagerID using fillna("No Manager")
employees.withColumn("ManagerID_str", col("ManagerID").cast("string")).fillna("No Manager", subset=["ManagerID_str"]).show()


+-----+------+-----------+----------+------+---------+-------------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|ManagerID_str|
+-----+------+-----------+----------+------+---------+-------------+
|    1| Anita|         HR|2021-05-01| 55000|     NULL|   No Manager|
|    2|   Raj|Engineering|2020-03-15| 80000|        1|            1|
|    3|Simran|Engineering|2022-07-10| 75000|        1|            1|
|    4| Aamir|  Marketing|2019-11-20| 60000|        1|            1|
|    5| Nisha|         HR|2023-01-05| 50000|        1|            1|
+-----+------+-----------+----------+------+---------+-------------+



In [29]:
#7. Spark SQL
#Create database
spark.sql("CREATE DATABASE IF NOT EXISTS hr")
spark.sql("USE hr")

DataFrame[]

In [30]:
#Save as temp views (or as permanent tables in warehouse path if needed)
employees.createOrReplaceTempView("employees")
attendance.createOrReplaceTempView("attendance")
bonuses.createOrReplaceTempView("bonuses")

In [31]:
#Top paid employee in each department
spark.sql("""
SELECT Department, Name, Salary
FROM (
    SELECT *, ROW_NUMBER() OVER (PARTITION BY Department ORDER BY Salary DESC) as rnk
    FROM employees
) WHERE rnk = 1
""").show()


+-----------+-----+------+
| Department| Name|Salary|
+-----------+-----+------+
|Engineering|  Raj| 80000|
|         HR|Anita| 55000|
|  Marketing|Aamir| 60000|
+-----------+-----+------+



In [32]:
#Attendance rate by department
spark.sql("""
SELECT e.Department, ROUND(SUM(CASE WHEN a.Status='Present' THEN 1 ELSE 0 END)*100.0/COUNT(*), 2) AS AttendanceRate
FROM employees e
JOIN attendance a ON e.EmpID = a.EmpID
GROUP BY e.Department
""").show()

+-----------+--------------+
| Department|AttendanceRate|
+-----------+--------------+
|Engineering|         75.00|
|         HR|        100.00|
|  Marketing|          0.00|
+-----------+--------------+



In [33]:
#Employees joined after 2021 with salary > 70000
spark.sql("""
SELECT * FROM employees
WHERE year(JoinDate) > 2021 AND Salary > 70000
""").show()

+-----+------+-----------+----------+------+---------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|
+-----+------+-----------+----------+------+---------+
|    3|Simran|Engineering|2022-07-10| 75000|        1|
+-----+------+-----------+----------+------+---------+



In [37]:
#8. Advanced
# UDF to classify departments
from pyspark.sql.types import StringType

def dept_type(dept):
    return "Tech" if dept in ["Engineering", "IT"] else "Non-Tech"

spark.udf.register("dept_type_udf", dept_type, StringType())

employees.withColumn("DeptType", expr("dept_type_udf(Department)")).show()


+-----+------+-----------+----------+------+---------+--------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|DeptType|
+-----+------+-----------+----------+------+---------+--------+
|    1| Anita|         HR|2021-05-01| 55000|     NULL|Non-Tech|
|    2|   Raj|Engineering|2020-03-15| 80000|        1|    Tech|
|    3|Simran|Engineering|2022-07-10| 75000|        1|    Tech|
|    4| Aamir|  Marketing|2019-11-20| 60000|        1|Non-Tech|
|    5| Nisha|         HR|2023-01-05| 50000|        1|Non-Tech|
+-----+------+-----------+----------+------+---------+--------+



In [35]:
#Create a view emp_attendance_summary
emp_attendance_summary = employees.join(attendance_pct, "EmpID", "left")
emp_attendance_summary.createOrReplaceTempView("emp_attendance_summary")

In [36]:
#Save as Parquet partitioned by Department
emp_attendance_summary.write.mode("overwrite").partitionBy("Department").parquet("/content/drive/MyDrive/emp_attendance_summary_parquet")