In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import random
spark=SparkSession.builder\
      .appName("Combining-data")\
      .getOrCreate()
spark

<pyspark.sql.connect.session.SparkSession at 0x7f9476dd0590>

In [0]:
data = [("Ananya", "HR", 52000),
("Rahul", "Engineering", 65000),
("Priya", "Engineering", 60000),
("Zoya", "Marketing", 48000),
("Karan", "HR", 53000),
("Naveen", "Engineering", 70000),
("Fatima", "Marketing", 45000)]
columns = ["Name", "Department", "Salary"]
df = spark.createDataFrame(data, columns)
df.show()

performance = [
    ("Ananya", 2023, 4.5),
    ("Rahul", 2023, 4.9),
    ("Priya", 2023, 4.3),
    ("Zoya", 2023, 3.8),
    ("Karan", 2023, 4.1),
    ("Naveen", 2023, 4.7),
    ("Fatima", 2023, 3.9)
]
columns_perf = ["Name", "Year", "Rating"]
df_perf = spark.createDataFrame(performance, columns_perf)
df_perf.show()

project_data = [
    ("Ananya", "HR Portal", 129),
    ("Rahul", "Data Platform", 200),
    ("Priya", "Data Platform", 180),
    ("Zoya", "Campaign Tracker", 100),
    ("Karan", "HR Portal", 130),
    ("Naveen", "ML Pipeline", 220),
    ("Fatima", "Campaign Tracker", 90)
]
df_proj = spark.createDataFrame(project_data, ["Name", "Project", "HoursWorked"])
df_proj.show()

+------+-----------+------+
|  Name| Department|Salary|
+------+-----------+------+
|Ananya|         HR| 52000|
| Rahul|Engineering| 65000|
| Priya|Engineering| 60000|
|  Zoya|  Marketing| 48000|
| Karan|         HR| 53000|
|Naveen|Engineering| 70000|
|Fatima|  Marketing| 45000|
+------+-----------+------+

+------+----+------+
|  Name|Year|Rating|
+------+----+------+
|Ananya|2023|   4.5|
| Rahul|2023|   4.9|
| Priya|2023|   4.3|
|  Zoya|2023|   3.8|
| Karan|2023|   4.1|
|Naveen|2023|   4.7|
|Fatima|2023|   3.9|
+------+----+------+

+------+----------------+-----------+
|  Name|         Project|HoursWorked|
+------+----------------+-----------+
|Ananya|       HR Portal|        129|
| Rahul|   Data Platform|        200|
| Priya|   Data Platform|        180|
|  Zoya|Campaign Tracker|        100|
| Karan|       HR Portal|        130|
|Naveen|     ML Pipeline|        220|
|Fatima|Campaign Tracker|         90|
+------+----------------+-----------+



In [0]:
#1. Join employee_data, performance_data, and project_data
merged_df = df.join(df_perf, "Name").join(df_proj, "Name")
display(merged_df)

#2. Compute total hours worked per department
hours_by_dept = merged_df.groupBy("Department").agg(sum("HoursWorked").alias("Total_Hours"))
display(hours_by_dept)

#3. Compute average rating per project
avg_rating = merged_df.groupBy("Project").agg(avg("Rating").alias("Avg_Rating"))
display(avg_rating)

Name,Department,Salary,Year,Rating,Project,HoursWorked
Ananya,HR,52000,2023,4.5,HR Portal,129
Rahul,Engineering,65000,2023,4.9,Data Platform,200
Priya,Engineering,60000,2023,4.3,Data Platform,180
Zoya,Marketing,48000,2023,3.8,Campaign Tracker,100
Karan,HR,53000,2023,4.1,HR Portal,130
Naveen,Engineering,70000,2023,4.7,ML Pipeline,220
Fatima,Marketing,45000,2023,3.9,Campaign Tracker,90


Department,Total_Hours
HR,259
Engineering,600
Marketing,190


Project,Avg_Rating
HR Portal,4.3
Data Platform,4.6
Campaign Tracker,3.85
ML Pipeline,4.7


In [0]:
#4. Add a row to performance_data with a None rating
from pyspark.sql import Row
from pyspark.sql.functions import when, avg
new_perf_row = Row("Zahira", 2025, None)
df_null = df_perf.union(spark.createDataFrame([new_perf_row], df_perf.schema))
display(df_null)

#5. Filter rows with null values
null_ratings = df_null.filter(col("Rating").isNull())
display(null_ratings)

#6. Replace null ratings with the department average
from pyspark.sql.window import Window
perf_with_dept = df_null.join(df, "Name")
window_spec = Window.partitionBy("Department")
avg_rating_by_dept = avg("Rating").over(window_spec)
filled_ratings = perf_with_dept.withColumn("Rating", coalesce(col("Rating"), avg_rating_by_dept))
display(filled_ratings.select("Name", "Department", "Rating"))

Name,Year,Rating
Ananya,2023,4.5
Rahul,2023,4.9
Priya,2023,4.3
Zoya,2023,3.8
Karan,2023,4.1
Naveen,2023,4.7
Fatima,2023,3.9
Zahira,2025,


Name,Year,Rating
Zahira,2025,


Name,Department,Rating
Rahul,Engineering,4.900000095367432
Priya,Engineering,4.300000190734863
Naveen,Engineering,4.699999809265137
Ananya,HR,4.5
Karan,HR,4.099999904632568
Zoya,Marketing,3.799999952316284
Fatima,Marketing,3.900000095367432


In [0]:
#7. Create PerformanceCategory column
performance_categories = merged_df.withColumn("PerformanceCategory",when(col("Rating") >= 4.7, "Excellent")
    .when((col("Rating") >= 4.0) & (col("Rating") < 4.7), "Good").otherwise("Average"))
performance_categories.select("Name", "Rating", "PerformanceCategory").show()

#8. Create a UDF to assign bonus
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType
def calculate_bonus(hours):
    return 110000 if hours > 200 else 15000
bonus_udf = udf(calculate_bonus, IntegerType())
bonus = merged_df.withColumn("Bonus", bonus_udf(col("HoursWorked")))
bonus.select("Name", "HoursWorked", "Bonus").show()

+------+------+-------------------+
|  Name|Rating|PerformanceCategory|
+------+------+-------------------+
|Ananya|   4.5|               Good|
| Rahul|   4.9|          Excellent|
| Priya|   4.3|               Good|
|  Zoya|   3.8|            Average|
| Karan|   4.1|               Good|
|Naveen|   4.7|          Excellent|
|Fatima|   3.9|            Average|
+------+------+-------------------+

+------+-----------+------+
|  Name|HoursWorked| Bonus|
+------+-----------+------+
|Ananya|        129| 15000|
| Rahul|        200| 15000|
| Priya|        180| 15000|
|  Zoya|        100| 15000|
| Karan|        130| 15000|
|Naveen|        220|110000|
|Fatima|         90| 15000|
+------+-----------+------+



In [0]:
#9. Add JoinDate and Monthsworked columns
join_date = merged_df.withColumn("JoinDate", to_date(lit("2021-06-01")))
months_worked = join_date.withColumn("MonthsWorked", months_between(current_date(), col("JoinDate"))).select("Name", "JoinDate", "MonthsWorked")
display(months_worked)

#10. Calculate how many employees joined before 2022
dated = join_date.filter(year(col("JoinDate")) < 2022).count()
print(f"Number of employees who joined before 2022: {dated}")

Name,JoinDate,MonthsWorked
Ananya,2021-06-01,48.32258065
Rahul,2021-06-01,48.32258065
Priya,2021-06-01,48.32258065
Zoya,2021-06-01,48.32258065
Karan,2021-06-01,48.32258065
Naveen,2021-06-01,48.32258065
Fatima,2021-06-01,48.32258065


Number of employees who joined before 2022: 7


In [0]:
#11. Union with extra employees
extra_employees = [
    ("Meena", "HR", 48000),
    ("Raj", "Marketing", 51000)
]
df_extra = spark.createDataFrame(extra_employees, ["Name", "Department", "Salary"])
all_employees = df.union(df_extra)
display(all_employees)

Name,Department,Salary
Ananya,HR,52000
Rahul,Engineering,65000
Priya,Engineering,60000
Zoya,Marketing,48000
Karan,HR,53000
Naveen,Engineering,70000
Fatima,Marketing,45000
Meena,HR,48000
Raj,Marketing,51000


In [0]:
merged_df.write.partitionBy("Department").parquet("employee_partitioned.parquet")
