In [11]:
from pyspark.sql import SparkSession
spark = (
    SparkSession.builder\
    .appName("EmployeeData")\
    .getOrCreate())

In [8]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [13]:
df= spark.read.csv("/content/drive/MyDrive/large_employee_dataset.csv", header=True, inferSchema=True)
df.show()

+----------+--------------------+---+----------+------+-----------+--------+-------------------+
|EmployeeID|                Name|Age|Department|Salary|JoiningDate|  Status|               City|
+----------+--------------------+---+----------+------+-----------+--------+-------------------+
|      4128|     Charles Johnson| 52|        HR| 64039| 2018-07-07|Resigned|          Allentown|
|      6094|       Dylan Camacho| 57| Marketing| 34686| 2015-08-25|  Active|        Anthonyfort|
|      5883| Mr. Ryan Bowman Jr.| 29|   Finance| 64541| 2025-03-11|On Leave|          Gilesstad|
|      9146|          Brian Ball| 24|     Sales| 87831| 2015-10-01|Resigned|       Jenniferfurt|
|      1918|       Angela Hooper| 26|   Finance|108773| 2019-08-14|On Leave|       Lake Amystad|
|      4600|Alexander Johnson...| 45|     Sales| 75671| 2016-04-21|On Leave|         Russohaven|
|      6253|         Steven Lane| 47|   Finance| 64982| 2021-07-25|  Active|        East Robert|
|      8278|       Riley Johns

In [14]:
#1. Show the top 10 rows of the dataset.
df.show(10)
#2. Count the total number of employees.
print(df.count())
#3. Display unique departments.
df.select("Department").distinct().show()

+----------+--------------------+---+----------+------+-----------+--------+------------+
|EmployeeID|                Name|Age|Department|Salary|JoiningDate|  Status|        City|
+----------+--------------------+---+----------+------+-----------+--------+------------+
|      4128|     Charles Johnson| 52|        HR| 64039| 2018-07-07|Resigned|   Allentown|
|      6094|       Dylan Camacho| 57| Marketing| 34686| 2015-08-25|  Active| Anthonyfort|
|      5883| Mr. Ryan Bowman Jr.| 29|   Finance| 64541| 2025-03-11|On Leave|   Gilesstad|
|      9146|          Brian Ball| 24|     Sales| 87831| 2015-10-01|Resigned|Jenniferfurt|
|      1918|       Angela Hooper| 26|   Finance|108773| 2019-08-14|On Leave|Lake Amystad|
|      4600|Alexander Johnson...| 45|     Sales| 75671| 2016-04-21|On Leave|  Russohaven|
|      6253|         Steven Lane| 47|   Finance| 64982| 2021-07-25|  Active| East Robert|
|      8278|       Riley Johnson| 49|        HR| 43449| 2015-08-03|Resigned|  New Thomas|
|      852

In [16]:
#Filter all employees in the "IT" department
df.filter(df.Department == "IT").show()

# 5. Show employees aged between 30 and 40
df.filter((df.Age >= 30) & (df.Age <= 40)).show()

# 6. Sort employees by Salary in descending order
df.orderBy(df.Salary.desc()).show()

+----------+-------------------+---+----------+------+-----------+--------+------------------+
|EmployeeID|               Name|Age|Department|Salary|JoiningDate|  Status|              City|
+----------+-------------------+---+----------+------+-----------+--------+------------------+
|      6598|        Mary Henson| 58|        IT| 63951| 2021-08-25|  Active|       Port Tricia|
|      8518|   Elizabeth Abbott| 22|        IT| 91732| 2022-11-05|  Active|       Douglasside|
|      9506|        Thomas Dunn| 45|        IT| 90340| 2020-07-12|On Leave|    Lindseychester|
|      9663|        Glenn Mason| 43|        IT|109189| 2020-03-27|On Leave|      Katelynburgh|
|      2106|     Richard Bailey| 45|        IT| 30950| 2021-06-29|Resigned|        North John|
|      8212|      Jacob Jackson| 35|        IT| 54812| 2020-09-18|On Leave|South Veronicastad|
|      6354|     Nicole Gilmore| 35|        IT|104202| 2018-05-04|  Active|       East Joseph|
|      5716|         David Wang| 49|        IT| 94

In [20]:
# Get the average salary by department
from pyspark.sql.functions import avg, max
df.groupBy("Department").agg(avg("Salary").alias("AvgSalary")).show()

# Count of employees by Status
df.groupBy("Status").count().show()

# 9. Highest salary in each city
df.groupBy("City").agg(max("Salary").alias("MaxSalary")).show()

+----------+-----------------+
|Department|        AvgSalary|
+----------+-----------------+
|     Sales|77488.54545454546|
|        HR|76091.27450980392|
|   Finance|72834.75630252101|
| Marketing| 71958.1888888889|
|        IT|73116.25555555556|
+----------+-----------------+

+--------+-----+
|  Status|count|
+--------+-----+
|Resigned|  159|
|  Active|  172|
|On Leave|  169|
+--------+-----+

+----------------+---------+
|            City|MaxSalary|
+----------------+---------+
|   Wilsonchester|    67025|
|     Bradshawton|   111116|
|       Steventon|    32009|
|     Lake Alyssa|    84903|
|      North Lisa|    57898|
|    North Marvin|    66252|
|     Jenniferton|    39907|
|     Buckleyview|    50109|
|     Burtonville|    98492|
|    Johnsonmouth|    48799|
|    South Joseph|    52456|
|  Lindseychester|    90340|
|   North Stephen|    91947|
|Port Nicoleshire|    57537|
|    Jerrychester|    53374|
|  North Jennifer|    82486|
|      Laurenstad|    44608|
|West Brendanbury|  

In [23]:
# Total number of employees who joined each year
from pyspark.sql.functions import year, to_date
df_with_year = df.withColumn("JoiningYear", year(to_date("JoiningDate")))
df_with_year.groupBy("JoiningYear").count().orderBy("JoiningYear").show()

# 11. Department-wise count of employees who are currently "Active"
df.filter(df.Status == "Active").groupBy("Department").count().show()

# 12. Average age of employees per department
df.groupBy("Department").agg(avg("Age").alias("AvgAge")).show()

+-----------+-----+
|JoiningYear|count|
+-----------+-----+
|       2015|   37|
|       2016|   49|
|       2017|   44|
|       2018|   52|
|       2019|   52|
|       2020|   56|
|       2021|   49|
|       2022|   49|
|       2023|   47|
|       2024|   38|
|       2025|   27|
+-----------+-----+

+----------+-----+
|Department|count|
+----------+-----+
|     Sales|   32|
|        HR|   37|
|   Finance|   45|
| Marketing|   32|
|        IT|   26|
+----------+-----+

+----------+------------------+
|Department|            AvgAge|
+----------+------------------+
|     Sales|40.535353535353536|
|        HR| 41.46078431372549|
|   Finance| 39.21008403361345|
| Marketing| 41.82222222222222|
|        IT| 38.68888888888889|
+----------+------------------+



In [24]:
#Create another dataset with City and Region, and join it
cities = df.select("City").distinct().collect()
regions = [("City_" + str(i), "Region_" + str((i % 5) + 1)) for i in range(len(cities))]
region_df = spark.createDataFrame(regions, ["City", "Region"])
joined_df = df.join(region_df, "City", "left")
joined_df.show()

# 14. Group salaries by Region after the join
joined_df.groupBy("Region").agg(avg("Salary").alias("AvgSalary")).show()

+-------------------+----------+--------------------+---+----------+------+-----------+--------+------+
|               City|EmployeeID|                Name|Age|Department|Salary|JoiningDate|  Status|Region|
+-------------------+----------+--------------------+---+----------+------+-----------+--------+------+
|          Allentown|      4128|     Charles Johnson| 52|        HR| 64039| 2018-07-07|Resigned|  null|
|        Anthonyfort|      6094|       Dylan Camacho| 57| Marketing| 34686| 2015-08-25|  Active|  null|
|        Bennettstad|      2758|       Brittany Kerr| 58|     Sales| 71105| 2019-03-24|  Active|  null|
|        Caseborough|      1298|     Valerie Fleming| 42| Marketing| 70137| 2019-12-08|Resigned|  null|
|        East Robert|      6253|         Steven Lane| 47|   Finance| 64982| 2021-07-25|  Active|  null|
|          Gilesstad|      5883| Mr. Ryan Bowman Jr.| 29|   Finance| 64541| 2025-03-11|On Leave|  null|
|       Jenniferfurt|      9146|          Brian Ball| 24|     Sa

In [31]:
#Calculate years of experience for each employee
from pyspark.sql.functions import current_date, datediff
exp = df.withColumn("Experience_Years", round(datediff(current_date(), df.JoiningDate)/365, 2))
exp.select("EmployeeID", "Name", "Experience_Years").show()

#List all employees with more than 5 years of experience
exp.filter(exp.Experience_Years > 5).show()

+----------+--------------------+----------------+
|EmployeeID|                Name|Experience_Years|
+----------+--------------------+----------------+
|      4128|     Charles Johnson|            6.91|
|      6094|       Dylan Camacho|            9.78|
|      5883| Mr. Ryan Bowman Jr.|            0.23|
|      9146|          Brian Ball|            9.68|
|      1918|       Angela Hooper|            5.81|
|      4600|Alexander Johnson...|            9.12|
|      6253|         Steven Lane|            3.86|
|      8278|       Riley Johnson|            9.84|
|      8520|    Emily Washington|            3.51|
|      1298|     Valerie Fleming|            5.49|
|      5157|     Tracy Hughes MD|            5.01|
|      7403|    Johnathan Harmon|            4.24|
|      3326|       Michael Brown|            1.62|
|      8989|       Scott Burnett|            9.11|
|      4676|  Christopher Fuller|             4.1|
|      6598|         Mary Henson|            3.78|
|      4136|       Jerome Torre