In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window
import random
from datetime import datetime, timedelta

from pyspark.sql import SparkSession
spark=SparkSession.builder.appName("azureToColab").getOrCreate()
spark

<pyspark.sql.connect.session.SparkSession at 0x7fb4662d7c50>

In [0]:
data = [("Ananya", "HR", 52000),
("Rahul", "Engineering", 65000),
("Priya", "Engineering", 60000),
("Zoya", "Marketing", 48000),
("Karan", "HR", 53000),
("Naveen", "Engineering", 70000),
("Fatima", "Marketing", 45000)]
columns = ["Name", "Department", "Salary"]
df = spark.createDataFrame(data, columns)
df.show()

performance = [
    ("Ananya", 2023, 4.5),
    ("Rahul", 2023, 4.9),
    ("Priya", 2023, 4.3),
    ("Zoya", 2023, 3.8),
    ("Karan", 2023, 4.1),
    ("Naveen", 2023, 4.7),
    ("Fatima", 2023, 3.9)
]
columns_perf = ["Name", "Year", "Rating"]
df_perf = spark.createDataFrame(performance, columns_perf)
df_perf.show()

+------+-----------+------+
|  Name| Department|Salary|
+------+-----------+------+
|Ananya|         HR| 52000|
| Rahul|Engineering| 65000|
| Priya|Engineering| 60000|
|  Zoya|  Marketing| 48000|
| Karan|         HR| 53000|
|Naveen|Engineering| 70000|
|Fatima|  Marketing| 45000|
+------+-----------+------+

+------+----+------+
|  Name|Year|Rating|
+------+----+------+
|Ananya|2023|   4.5|
| Rahul|2023|   4.9|
| Priya|2023|   4.3|
|  Zoya|2023|   3.8|
| Karan|2023|   4.1|
|Naveen|2023|   4.7|
|Fatima|2023|   3.9|
+------+----+------+



In [0]:
#1. Get the average salary by department
avg_sal= df.groupBy("Department").agg(avg("Salary").alias("AvgSalary"))
display(avg_sal)

#2. Count of employees per department
emp_count= df.groupBy("Department").count()
display(emp_count)

#3. Maximum and minimum salary in Engineering

max_min = df.filter(col("Department") == "Engineering") \
.agg(max("Salary").alias("MaxSalary"), min("Salary").alias("MinSalary"))
display(max_min)

Department,AvgSalary
HR,52500.0
Engineering,65000.0
Marketing,46500.0


Department,count
HR,2
Engineering,3
Marketing,2


MaxSalary,MinSalary
70000,60000


In [0]:
#4. Perform an inner join between employee_data and performance_data on Name
joined_df = df.join(df_perf, "name", "inner")
display(joined_df)

#5. Show each employee's salary and performance ratin
rating = joined_df.select("name", "salary", "rating")
display(rating)

#6. Filter employees with rating > 4.5 and salary > 60000
high_perf = joined_df.filter((col("Rating") > 4.5) & (col("salary") > 60000))
display(high_perf)

Name,Department,Salary,Year,Rating
Ananya,HR,52000,2023,4.5
Rahul,Engineering,65000,2023,4.9
Priya,Engineering,60000,2023,4.3
Zoya,Marketing,48000,2023,3.8
Karan,HR,53000,2023,4.1
Naveen,Engineering,70000,2023,4.7
Fatima,Marketing,45000,2023,3.9


name,salary,rating
Ananya,52000,4.5
Rahul,65000,4.9
Priya,60000,4.3
Zoya,48000,3.8
Karan,53000,4.1
Naveen,70000,4.7
Fatima,45000,3.9


Name,Department,Salary,Year,Rating
Rahul,Engineering,65000,2023,4.9
Naveen,Engineering,70000,2023,4.7


In [0]:
#7. Rank employees by salary department-wise
dept_wise = Window.partitionBy("Department").orderBy(col("Salary").desc())
ranked_employees = joined_df.withColumn("SalaryRank", rank().over(dept_wise))
display(ranked_employees)

#8. Calculate cumulative salary in each department
sal_cumulative = Window.partitionBy("Department").orderBy("Salary").rowsBetween(Window.unboundedPreceding, Window.currentRow)
cumulative_sal = joined_df.withColumn("CumulativeSalary", sum("Salary").over(sal_cumulative))
display(cumulative_sal)

Name,Department,Salary,Year,Rating,SalaryRank
Naveen,Engineering,70000,2023,4.7,1
Rahul,Engineering,65000,2023,4.9,2
Priya,Engineering,60000,2023,4.3,3
Karan,HR,53000,2023,4.1,1
Ananya,HR,52000,2023,4.5,2
Zoya,Marketing,48000,2023,3.8,1
Fatima,Marketing,45000,2023,3.9,2


Name,Department,Salary,Year,Rating,CumulativeSalary
Priya,Engineering,60000,2023,4.3,60000
Rahul,Engineering,65000,2023,4.9,125000
Naveen,Engineering,70000,2023,4.7,195000
Ananya,HR,52000,2023,4.5,52000
Karan,HR,53000,2023,4.1,105000
Fatima,Marketing,45000,2023,3.9,45000
Zoya,Marketing,48000,2023,3.8,93000


In [0]:
#9. Add a new column JoinDate with random dates between 2020 and 2023
def random_date(start_year, end_year):
    start = datetime(start_year, 1, 1)
    end = datetime(end_year, 12, 31)
    delta = end - start
    random_days = random.randint(0, delta.days)
    return (start + timedelta(days=random_days)).strftime('%Y-%m-%d')
random_date_udf = udf(random_date, StringType())

emp_str = df.withColumn("JoinDate", random_date_udf(lit(2020), lit(2023)))
emp_with_date = emp_str.withColumn("JoinDate", to_date(col("JoinDate"), "yyyy-MM-dd"))

#10. Add column YearsWithCompany using current_date() and datediff()
years = emp_with_date.withColumn("YearsWithCompany",round(datediff(current_date(), col("JoinDate")) / 365, 1))
display(years)

Name,Department,Salary,JoinDate,YearsWithCompany
Ananya,HR,52000,2023-02-24,2.3
Rahul,Engineering,65000,2021-03-03,4.3
Priya,Engineering,60000,2022-03-31,3.2
Zoya,Marketing,48000,2020-09-07,4.8
Karan,HR,53000,2022-02-03,3.4
Naveen,Engineering,70000,2021-09-29,3.7
Fatima,Marketing,45000,2020-05-02,5.1


In [0]:
#11. Write the full employee DataFrame to CSV with headers
years.write.csv("/FileStore/employee_data", header=True, mode="overwrite")
display(spark.read.csv("/FileStore/employee_data", header=True))


#12. Save the joined DataFrame to a Parquet file
joined_df.write.parquet("/FileStore/employee_perf", mode="overwrite")
display(spark.read.parquet("/FileStore/employee_perf"))

Name,Department,Salary,JoinDate,YearsWithCompany
Naveen,Engineering,70000,2022-01-13,3.4
Rahul,Engineering,65000,2023-11-18,1.6
Priya,Engineering,60000,2021-09-22,3.7
Fatima,Marketing,45000,2020-09-29,4.7
Zoya,Marketing,48000,2020-04-24,5.1
Ananya,HR,52000,2023-11-06,1.6
Karan,HR,53000,2021-08-20,3.8


Name,Department,Salary,Year,Rating
Naveen,Engineering,70000,2023,4.7
Rahul,Engineering,65000,2023,4.9
Priya,Engineering,60000,2023,4.3
Fatima,Marketing,45000,2023,3.9
Zoya,Marketing,48000,2023,3.8
Ananya,HR,52000,2023,4.5
Karan,HR,53000,2023,4.1
