In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import Window
spark = SparkSession.builder \
    .appName("HRAnalytics") \
    .getOrCreate()

**Task 1: Ingestion & Exploration**

In [3]:
# Read datasets
employees = spark.read.option("header", True).option("inferSchema", True).csv("employees.csv")
attendance = spark.read.option("header", True).option("inferSchema", True).csv("attendance.csv")
bonuses = spark.read.json("bonuses.json")

# Show schemas and sample records
print("Employees schema:")
employees.printSchema()
employees.show()

print("\nAttendance schema:")
attendance.printSchema()
attendance.show()

print("\nBonuses schema:")
bonuses.printSchema()
bonuses.show()

# Count distinct departments
print("\nDistinct departments:")
employees.select("Department").distinct().show()

Employees schema:
root
 |-- EmpID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- JoinDate: date (nullable = true)
 |-- Salary: integer (nullable = true)
 |-- ManagerID: integer (nullable = true)

+-----+------+-----------+----------+------+---------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|
+-----+------+-----------+----------+------+---------+
|    1| Anita|         HR|2021-05-01| 55000|     NULL|
|    2|   Raj|Engineering|2020-03-15| 80000|        1|
|    3|Simran|Engineering|2022-07-10| 75000|        1|
|    4| Aamir|  Marketing|2019-11-20| 60000|        1|
|    5| Nisha|         HR|2023-01-05| 50000|        1|
+-----+------+-----------+----------+------+---------+


Attendance schema:
root
 |-- EmpID: integer (nullable = true)
 |-- Date: date (nullable = true)
 |-- Status: string (nullable = true)

+-----+----------+-------+
|EmpID|      Date| Status|
+-----+----------+-------+
|    1|2024-04-01|Present|
|

**Task 2: DataFrame Operations**

In [5]:
# Add TenureYears column
employees = employees.withColumn("TenureYears", round(datediff(current_date(), col("JoinDate"))/365, 1))
employees.select("EmpID", "JoinDate", "TenureYears").show()

# Calculate TotalCompensation (after joining with bonuses)
emp_bonus_df = employees.join(bonuses, on="EmpID", how="left")
emp_bonus_df = emp_bonus_df.withColumn("TotalCompensation", col("Salary") + col("Bonus"))
emp_bonus_df.select("EmpID", "Salary", "Bonus", "TotalCompensation").show()

# Filter employees with >2 years tenure
tenured_employees = employees.filter(col("TenureYears") > 2)
tenured_employees.show()

# Show employees with managers
has_manager = employees.filter(col("ManagerID").isNotNull())
has_manager.show()

+-----+----------+-----------+
|EmpID|  JoinDate|TenureYears|
+-----+----------+-----------+
|    1|2021-05-01|        4.1|
|    2|2020-03-15|        5.2|
|    3|2022-07-10|        2.9|
|    4|2019-11-20|        5.6|
|    5|2023-01-05|        2.4|
+-----+----------+-----------+

+-----+------+-----+-----------------+
|EmpID|Salary|Bonus|TotalCompensation|
+-----+------+-----+-----------------+
|    1| 55000| 5000|            60000|
|    2| 80000| 7000|            87000|
|    3| 75000| 6500|            81500|
|    4| 60000| 6000|            66000|
|    5| 50000| 4000|            54000|
+-----+------+-----+-----------------+

+-----+------+-----------+----------+------+---------+-----------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|TenureYears|
+-----+------+-----------+----------+------+---------+-----------+
|    1| Anita|         HR|2021-05-01| 55000|     NULL|        4.1|
|    2|   Raj|Engineering|2020-03-15| 80000|        1|        5.2|
|    3|Simran|Engineering|2022-07

**Task 3: Aggregation**

In [7]:
# Average salary per department
avg_salary = employees.groupBy("Department") \
.agg(avg("Salary").alias("AvgSalary")) \
.orderBy("AvgSalary", ascending=False)
avg_salary.show()

# Number of employees per manager
employees_per_manager = employees.groupBy("ManagerID") \
.agg(count("*").alias("TeamSize")) \
.filter(col("ManagerID").isNotNull())
employees_per_manager.show()

# Count of absences per employee
absences = attendance.filter(col("Status") == "Absent") \
.groupBy("EmpID") \
.agg(count("*").alias("AbsenceCount"))
absences.show()

+-----------+---------+
| Department|AvgSalary|
+-----------+---------+
|Engineering|  77500.0|
|  Marketing|  60000.0|
|         HR|  52500.0|
+-----------+---------+

+---------+--------+
|ManagerID|TeamSize|
+---------+--------+
|        1|       4|
+---------+--------+

+-----+------------+
|EmpID|AbsenceCount|
+-----+------------+
|    4|           2|
|    2|           1|
+-----+------------+



**Task 4: Joins**

In [8]:
# Join employees and attendance for attendance percentage
stats = employees.join(attendance.groupBy("EmpID").agg(count("*").alias("TotalDays"),
sum(when(col("Status") == "Present", 1).otherwise(0)).alias("PresentDays")),"EmpID").withColumn("AttendancePct", round(col("PresentDays")/col("TotalDays")*100, 2))
stats.show()

# Join employees and bonuses for compensation
compensation = employees.join(bonuses, "EmpID") \
.withColumn("TotalCompensation", col("Salary") + col("Bonus")) \
.orderBy("TotalCompensation", ascending=False)
compensation.show()

# Multi-level join
full_join = employees.join(bonuses, "EmpID") \
.join(stats, "EmpID")
full_join.show()

+-----+------+-----------+----------+------+---------+-----------+---------+-----------+-------------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|TenureYears|TotalDays|PresentDays|AttendancePct|
+-----+------+-----------+----------+------+---------+-----------+---------+-----------+-------------+
|    1| Anita|         HR|2021-05-01| 55000|     NULL|        4.1|        2|          2|        100.0|
|    2|   Raj|Engineering|2020-03-15| 80000|        1|        5.2|        2|          1|         50.0|
|    3|Simran|Engineering|2022-07-10| 75000|        1|        2.9|        2|          2|        100.0|
|    4| Aamir|  Marketing|2019-11-20| 60000|        1|        5.6|        2|          0|          0.0|
|    5| Nisha|         HR|2023-01-05| 50000|        1|        2.4|        2|          2|        100.0|
+-----+------+-----------+----------+------+---------+-----------+---------+-----------+-------------+

+-----+------+-----------+----------+------+---------+-----------+-----+

**Task 5: String & Date Functions**

In [9]:
#Extract year and month from JoinDate
employees = employees.withColumn("JoinYear", year(col("JoinDate"))) \
.withColumn("JoinMonth", month(col("JoinDate")))
employees.select("EmpID", "JoinDate", "JoinYear", "JoinMonth").show()

# Mask employee names
employees = employees.withColumn("MaskedName",
regexp_replace(col("Name"), "(?<=.).", "*"))
employees.select("EmpID", "Name", "MaskedName").show()

# Create EmpCode
employees = employees.withColumn("EmpCode",
concat(lit("EMP"), lpad(col("EmpID"), 3, "0")))
employees.select("EmpID", "EmpCode").show()

+-----+----------+--------+---------+
|EmpID|  JoinDate|JoinYear|JoinMonth|
+-----+----------+--------+---------+
|    1|2021-05-01|    2021|        5|
|    2|2020-03-15|    2020|        3|
|    3|2022-07-10|    2022|        7|
|    4|2019-11-20|    2019|       11|
|    5|2023-01-05|    2023|        1|
+-----+----------+--------+---------+

+-----+------+----------+
|EmpID|  Name|MaskedName|
+-----+------+----------+
|    1| Anita|     A****|
|    2|   Raj|       R**|
|    3|Simran|    S*****|
|    4| Aamir|     A****|
|    5| Nisha|     N****|
+-----+------+----------+

+-----+-------+
|EmpID|EmpCode|
+-----+-------+
|    1| EMP001|
|    2| EMP002|
|    3| EMP003|
|    4| EMP004|
|    5| EMP005|
+-----+-------+



**Task 6: Conditional & Null Handling**

In [10]:
# Label performance based on bonus
bonuses = bonuses.withColumn("Performance",
when(col("Bonus") > 6000, "High").when((col("Bonus") >= 4000) & (col("Bonus") <= 6000), "Medium").otherwise("Low"))
bonuses.show()

# Handle missing ManagerID
employees = employees.na.fill({"ManagerID": "No Manager"})
employees.select("EmpID", "Name", "ManagerID").show()

+-----+-----+----+---------------+-----------+
|Bonus|EmpID|Year|_corrupt_record|Performance|
+-----+-----+----+---------------+-----------+
| NULL| NULL|NULL|              [|        Low|
| 5000|    1|2023|           NULL|     Medium|
| 7000|    2|2023|           NULL|       High|
| 6500|    3|2023|           NULL|       High|
| 6000|    4|2023|           NULL|     Medium|
| 4000|    5|2023|           NULL|     Medium|
| NULL| NULL|NULL|              ]|        Low|
+-----+-----+----+---------------+-----------+

+-----+------+---------+
|EmpID|  Name|ManagerID|
+-----+------+---------+
|    1| Anita|     NULL|
|    2|   Raj|        1|
|    3|Simran|        1|
|    4| Aamir|        1|
|    5| Nisha|        1|
+-----+------+---------+



**Task 7: Spark SQL**

In [13]:
# Create HR database
spark.sql("create database hr")
spark.catalog.setCurrentDatabase("hr")

# Save as tables
employees.write.mode("overwrite").saveAsTable("employees")
attendance.write.mode("overwrite").saveAsTable("attendance")
bonuses.write.mode("overwrite").saveAsTable("bonuses")

# SQL Queries
spark.sql("""select e.Department, e.Name, e.Salary
from employees e
join (select Department, max(Salary) as MaxSalary
from employees group by Department) max_sal
on e.Department = max_sal.Department and e.Salary = max_sal.MaxSalary;""").show()

spark.sql("""select e.Department,
round(sum(case when a.Status = 'Present' then 1 else 0 END) * 100.0/ count(a.Status), 2) as AttendanceRate
from employees e join attendance a ON e.EmpID = a.EmpID
group by e.Department;""").show()

spark.sql("""select * from employees
where year(JoinDate) > 2021 AND salary > 70000""").show()

+-----------+-----+------+
| Department| Name|Salary|
+-----------+-----+------+
|         HR|Anita| 55000|
|Engineering|  Raj| 80000|
|  Marketing|Aamir| 60000|
+-----------+-----+------+

+-----------+--------------+
| Department|AttendanceRate|
+-----------+--------------+
|Engineering|         75.00|
|         HR|        100.00|
|  Marketing|          0.00|
+-----------+--------------+

+-----+------+-----------+----------+------+---------+-----------+--------+---------+----------+-------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|TenureYears|JoinYear|JoinMonth|MaskedName|EmpCode|
+-----+------+-----------+----------+------+---------+-----------+--------+---------+----------+-------+
|    3|Simran|Engineering|2022-07-10| 75000|        1|        2.9|    2022|        7|    S*****| EMP003|
+-----+------+-----------+----------+------+---------+-----------+--------+---------+----------+-------+



**Task 8: Advanced**

In [18]:
# UDF for department classification
def dept_classifier(dept):
    return "Tech" if dept in ["Engineering", "IT"] else "Non-Tech"

dept_classifier_udf = udf(dept_classifier, StringType())
employees = employees.withColumn("DeptType", dept_classifier_udf(col("Department")))
employees.select("EmpID", "Department", "DeptType").show()

# Create view
stats.createOrReplaceTempView("emp_attendance_summary")

# Save as Parquet partitioned by Department
employees.write.partitionBy("Department") \
.mode("overwrite") \
.parquet("hr_employees_partitioned.parquet")

print("\nPerformance ratings:")
bonuses.select("EmpID", "Bonus", "Performance").show()

+-----+-----------+--------+
|EmpID| Department|DeptType|
+-----+-----------+--------+
|    1|         HR|Non-Tech|
|    2|Engineering|    Tech|
|    3|Engineering|    Tech|
|    4|  Marketing|Non-Tech|
|    5|         HR|Non-Tech|
+-----+-----------+--------+


Performance ratings:
+-----+-----+-----------+
|EmpID|Bonus|Performance|
+-----+-----+-----------+
| NULL| NULL|        Low|
|    1| 5000|     Medium|
|    2| 7000|       High|
|    3| 6500|       High|
|    4| 6000|     Medium|
|    5| 4000|     Medium|
| NULL| NULL|        Low|
+-----+-----+-----------+

