In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, when, udf, current_date, months_between, to_date, lit
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

spark = SparkSession.builder.getOrCreate()
spark

In [0]:
# Schema for employee_data
employee_schema = StructType([
    StructField("Name", StringType(), True),
    StructField("Department", StringType(), True),
    StructField("Salary", IntegerType(), True)
])

employee_data = [
    ("Ananya", "HR", 52000),
    ("Rahul", "Engineering", 65000),
    ("Priya", "Engineering", 60000),
    ("Zoya", "Marketing", 48000),
    ("Karan", "HR", 53000),
    ("Naveen", "Engineering", 70000),
    ("Fatima", "Marketing", 45000)
]
df_emp = spark.createDataFrame(employee_data, schema=employee_schema)

# Schema for performance_data
performance_schema = StructType([
    StructField("Name", StringType(), True),
    StructField("Year", IntegerType(), True),
    StructField("Rating", DoubleType(), True)
])

performance_data = [
    ("Ananya", 2023, 4.5),
    ("Rahul", 2023, 4.9),
    ("Priya", 2023, 4.3),
    ("Zoya", 2023, 3.8),
    ("Karan", 2023, 4.1),
    ("Naveen", 2023, 4.7),
    ("Fatima", 2023, 3.9)
]
df_perf = spark.createDataFrame(performance_data, schema=performance_schema)

# Schema for project_data
project_schema = StructType([
    StructField("Name", StringType(), True),
    StructField("Project", StringType(), True),
    StructField("HoursWorked", IntegerType(), True)
])

project_data = [
    ("Ananya", "HR Portal", 120),
    ("Rahul", "Data Platform", 200),
    ("Priya", "Data Platform", 180),
    ("Zoya", "Campaign Tracker", 100),
    ("Karan", "HR Portal", 130),
    ("Naveen", "ML Pipeline", 220),
    ("Fatima", "Campaign Tracker", 90)
]
df_proj = spark.createDataFrame(project_data, schema=project_schema)


In [0]:
# 1. Join employee_data, performance_data, and project_data
from pyspark.sql.functions import *

df_joined = df_emp.join(df_perf, "Name").join(df_proj, "Name")
df_joined.show()



+------+-----------+------+----+------+----------------+-----------+
|  Name| Department|Salary|Year|Rating|         Project|HoursWorked|
+------+-----------+------+----+------+----------------+-----------+
|Ananya|         HR| 52000|2023|   4.5|       HR Portal|        120|
| Priya|Engineering| 60000|2023|   4.3|   Data Platform|        180|
| Rahul|Engineering| 65000|2023|   4.9|   Data Platform|        200|
|  Zoya|  Marketing| 48000|2023|   3.8|Campaign Tracker|        100|
| Karan|         HR| 53000|2023|   4.1|       HR Portal|        130|
|Naveen|Engineering| 70000|2023|   4.7|     ML Pipeline|        220|
|Fatima|  Marketing| 45000|2023|   3.9|Campaign Tracker|         90|
+------+-----------+------+----+------+----------------+-----------+



In [0]:
# 2. Total hours worked per department
from pyspark.sql.functions import *

df_joined = df_emp.join(df_perf, "Name").join(df_proj, "Name")
df_joined.show()


+------+-----------+------+----+------+----------------+-----------+
|  Name| Department|Salary|Year|Rating|         Project|HoursWorked|
+------+-----------+------+----+------+----------------+-----------+
|Ananya|         HR| 52000|2023|   4.5|       HR Portal|        120|
| Priya|Engineering| 60000|2023|   4.3|   Data Platform|        180|
| Rahul|Engineering| 65000|2023|   4.9|   Data Platform|        200|
|  Zoya|  Marketing| 48000|2023|   3.8|Campaign Tracker|        100|
| Karan|         HR| 53000|2023|   4.1|       HR Portal|        130|
|Naveen|Engineering| 70000|2023|   4.7|     ML Pipeline|        220|
|Fatima|  Marketing| 45000|2023|   3.9|Campaign Tracker|         90|
+------+-----------+------+----+------+----------------+-----------+



In [0]:
# 3. Average rating per project
from pyspark.sql.functions import avg

df_avg_rating = df_joined.groupBy("Project").agg(avg("Rating").alias("AverageRating"))
df_avg_rating.show()


+----------------+------------------+
|         Project|     AverageRating|
+----------------+------------------+
|       HR Portal|               4.3|
|   Data Platform|               4.6|
|Campaign Tracker|3.8499999999999996|
|     ML Pipeline|               4.7|
+----------------+------------------+



In [0]:
# 4. Add a row with None rating
from pyspark.sql import Row

new_row = Row(Name="Meena", Year=2023, Rating=None)
df_perf_null = df_perf.union(spark.createDataFrame([new_row], schema=performance_schema))
df_perf_null.show()


+------+----+------+
|  Name|Year|Rating|
+------+----+------+
|Ananya|2023|   4.5|
| Rahul|2023|   4.9|
| Priya|2023|   4.3|
|  Zoya|2023|   3.8|
| Karan|2023|   4.1|
|Naveen|2023|   4.7|
|Fatima|2023|   3.9|
| Meena|2023|  NULL|
+------+----+------+



In [0]:
#5 Filter rows with null values
df_perf_null.filter(col("Rating").isNull()).show()

+-----+----+------+
| Name|Year|Rating|
+-----+----+------+
|Meena|2023|  NULL|
+-----+----+------+



In [0]:
#6 Replace null ratings with the department average
from pyspark.sql.functions import col, avg, when, lit, coalesce

df_perf_dept = df_perf_null.join(df_emp, on="Name", how="left")
dept_avg = df_perf_dept.filter(col("Rating").isNotNull()) \
    .groupBy("Department").agg(avg("Rating").alias("DeptAvg"))
global_avg = df_perf_dept.filter(col("Rating").isNotNull()) \
    .agg(avg("Rating").alias("GlobalAvg")).first()["GlobalAvg"]
df_filled = df_perf_dept.join(dept_avg, on="Department", how="left") \
    .withColumn("Rating", when(col("Rating").isNull(), coalesce(col("DeptAvg"), lit(global_avg)))
                .otherwise(col("Rating"))) \
    .select("Name", "Department", "Year", "Rating")
df_filled.show()


+------+-----------+----+-----------------+
|  Name| Department|Year|           Rating|
+------+-----------+----+-----------------+
|Ananya|         HR|2023|              4.5|
| Rahul|Engineering|2023|              4.9|
| Priya|Engineering|2023|              4.3|
|  Zoya|  Marketing|2023|              3.8|
| Karan|         HR|2023|              4.1|
|Naveen|Engineering|2023|              4.7|
|Fatima|  Marketing|2023|              3.9|
| Meena|       NULL|2023|4.314285714285714|
+------+-----------+----+-----------------+



In [0]:
#7 Create a column PerformanceCategory
from pyspark.sql.functions import when

df_category = df_filled.withColumn(
    "PerformanceCategory",
    when(col("Rating") >= 4.7, "Excellent")
    .when(col("Rating") >= 4.0, "Good")
    .otherwise("Average")
)
df_category.show()


+------+-----------+----+-----------------+-------------------+
|  Name| Department|Year|           Rating|PerformanceCategory|
+------+-----------+----+-----------------+-------------------+
|Ananya|         HR|2023|              4.5|               Good|
| Rahul|Engineering|2023|              4.9|          Excellent|
| Priya|Engineering|2023|              4.3|               Good|
|  Zoya|  Marketing|2023|              3.8|            Average|
| Karan|         HR|2023|              4.1|               Good|
|Naveen|Engineering|2023|              4.7|          Excellent|
|Fatima|  Marketing|2023|              3.9|            Average|
| Meena|       NULL|2023|4.314285714285714|               Good|
+------+-----------+----+-----------------+-------------------+



In [0]:
#8 Create a UDF to assign bonus
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

def bonus_udf(hours):
    return 10000 if hours > 200 else 5000

bonus = udf(bonus_udf, IntegerType())

df_bonus = df_joined.withColumn("Bonus", bonus(col("HoursWorked")))
df_bonus.select("Name", "Project", "HoursWorked", "Bonus").show()

+------+----------------+-----------+-----+
|  Name|         Project|HoursWorked|Bonus|
+------+----------------+-----------+-----+
|Ananya|       HR Portal|        120| 5000|
| Priya|   Data Platform|        180| 5000|
| Rahul|   Data Platform|        200| 5000|
|  Zoya|Campaign Tracker|        100| 5000|
| Karan|       HR Portal|        130| 5000|
|Naveen|     ML Pipeline|        220|10000|
|Fatima|Campaign Tracker|         90| 5000|
+------+----------------+-----------+-----+



In [0]:
 #9. Add JoinDate and MonthsWorked
from pyspark.sql.functions import current_date, months_between, to_date, lit

df_join_date = df_emp.withColumn("JoinDate", lit("2021-06-01")) \
    .withColumn("JoinDate", to_date("JoinDate")) \
    .withColumn("MonthsWorked", months_between(current_date(), col("JoinDate")).cast("int"))

df_join_date.select("Name", "Department", "JoinDate", "MonthsWorked").show()

+------+-----------+----------+------------+
|  Name| Department|  JoinDate|MonthsWorked|
+------+-----------+----------+------------+
|Ananya|         HR|2021-06-01|          48|
| Rahul|Engineering|2021-06-01|          48|
| Priya|Engineering|2021-06-01|          48|
|  Zoya|  Marketing|2021-06-01|          48|
| Karan|         HR|2021-06-01|          48|
|Naveen|Engineering|2021-06-01|          48|
|Fatima|  Marketing|2021-06-01|          48|
+------+-----------+----------+------------+



In [0]:
#10. Calculate employees joined before 2022
from pyspark.sql.functions import to_date

df_before_2022 = df_join_date.filter(col("JoinDate") < to_date(lit("2022-01-01")))
df_before_2022.select("Name", "JoinDate").show()

+------+----------+
|  Name|  JoinDate|
+------+----------+
|Ananya|2021-06-01|
| Rahul|2021-06-01|
| Priya|2021-06-01|
|  Zoya|2021-06-01|
| Karan|2021-06-01|
|Naveen|2021-06-01|
|Fatima|2021-06-01|
+------+----------+



In [0]:
#11. Union with extra employees
extra_employees = [
    ("Meena", "HR", 48000),
    ("Raj", "Marketing", 51000)
]
df_extra = spark.createDataFrame(extra_employees, schema=employee_schema)
df_emp_union = df_emp.union(df_extra)
df_emp_union.show()

+------+-----------+------+
|  Name| Department|Salary|
+------+-----------+------+
|Ananya|         HR| 52000|
| Rahul|Engineering| 65000|
| Priya|Engineering| 60000|
|  Zoya|  Marketing| 48000|
| Karan|         HR| 53000|
|Naveen|Engineering| 70000|
|Fatima|  Marketing| 45000|
| Meena|         HR| 48000|
|   Raj|  Marketing| 51000|
+------+-----------+------+



In [0]:
#12. Save merged dataset as partitioned Parquet (by Department)
from pyspark.sql.functions import *

df_merged = df_emp.join(df_perf, "Name", "inner") \
                  .join(df_proj, "Name", "inner")

df_merged.write.mode("overwrite").partitionBy("Department").parquet("/tmp/merged_output_partitioned")
