In [1]:
from pyspark.sql import SparkSession

spark=SparkSession.builder.appName("LargeEmployeeDatasetAnalysis").getOrCreate()

spark

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Load CSV
df = spark.read.csv('/content/drive/MyDrive/large_employee_dataset.csv', header=True, inferSchema=True)

# Show top 5 rows
df.show(5)

+----------+-------------------+---+----------+------+-----------+--------+------------+
|EmployeeID|               Name|Age|Department|Salary|JoiningDate|  Status|        City|
+----------+-------------------+---+----------+------+-----------+--------+------------+
|      4128|    Charles Johnson| 52|        HR| 64039| 2018-07-07|Resigned|   Allentown|
|      6094|      Dylan Camacho| 57| Marketing| 34686| 2015-08-25|  Active| Anthonyfort|
|      5883|Mr. Ryan Bowman Jr.| 29|   Finance| 64541| 2025-03-11|On Leave|   Gilesstad|
|      9146|         Brian Ball| 24|     Sales| 87831| 2015-10-01|Resigned|Jenniferfurt|
|      1918|      Angela Hooper| 26|   Finance|108773| 2019-08-14|On Leave|Lake Amystad|
+----------+-------------------+---+----------+------+-----------+--------+------------+
only showing top 5 rows



In [7]:
#Basic Exploration
#1.Top 10 rows
df.show(10)


+----------+--------------------+---+----------+------+-----------+--------+------------+
|EmployeeID|                Name|Age|Department|Salary|JoiningDate|  Status|        City|
+----------+--------------------+---+----------+------+-----------+--------+------------+
|      4128|     Charles Johnson| 52|        HR| 64039| 2018-07-07|Resigned|   Allentown|
|      6094|       Dylan Camacho| 57| Marketing| 34686| 2015-08-25|  Active| Anthonyfort|
|      5883| Mr. Ryan Bowman Jr.| 29|   Finance| 64541| 2025-03-11|On Leave|   Gilesstad|
|      9146|          Brian Ball| 24|     Sales| 87831| 2015-10-01|Resigned|Jenniferfurt|
|      1918|       Angela Hooper| 26|   Finance|108773| 2019-08-14|On Leave|Lake Amystad|
|      4600|Alexander Johnson...| 45|     Sales| 75671| 2016-04-21|On Leave|  Russohaven|
|      6253|         Steven Lane| 47|   Finance| 64982| 2021-07-25|  Active| East Robert|
|      8278|       Riley Johnson| 49|        HR| 43449| 2015-08-03|Resigned|  New Thomas|
|      852

In [8]:
#2.Count total employees
df.count()


500

In [9]:
#3.Unique departments
df.select("Department").distinct().show()

+----------+
|Department|
+----------+
|     Sales|
|        HR|
|   Finance|
| Marketing|
|        IT|
+----------+



In [13]:
#Filtering & Sorting
# 4. All employees in IT department
it_df = df.filter(df.Department == "IT")
it_df.show(it_df.count(), truncate=False)





+----------+-------------------+---+----------+------+-----------+--------+--------------------+
|EmployeeID|Name               |Age|Department|Salary|JoiningDate|Status  |City                |
+----------+-------------------+---+----------+------+-----------+--------+--------------------+
|6598      |Mary Henson        |58 |IT        |63951 |2021-08-25 |Active  |Port Tricia         |
|8518      |Elizabeth Abbott   |22 |IT        |91732 |2022-11-05 |Active  |Douglasside         |
|9506      |Thomas Dunn        |45 |IT        |90340 |2020-07-12 |On Leave|Lindseychester      |
|9663      |Glenn Mason        |43 |IT        |109189|2020-03-27 |On Leave|Katelynburgh        |
|2106      |Richard Bailey     |45 |IT        |30950 |2021-06-29 |Resigned|North John          |
|8212      |Jacob Jackson      |35 |IT        |54812 |2020-09-18 |On Leave|South Veronicastad  |
|6354      |Nicole Gilmore     |35 |IT        |104202|2018-05-04 |Active  |East Joseph         |
|5716      |David Wang        

In [14]:
# 5. Employees aged between 30 and 40
age_df = df.filter((df.Age >= 30) & (df.Age <= 40))
age_df.show(age_df.count(), truncate=False)






+----------+-------------------+---+----------+------+-----------+--------+-------------------+
|EmployeeID|Name               |Age|Department|Salary|JoiningDate|Status  |City               |
+----------+-------------------+---+----------+------+-----------+--------+-------------------+
|4676      |Christopher Fuller |30 |HR        |63042 |2021-04-30 |Resigned|South Donnaville   |
|4136      |Jerome Torres      |30 |Finance   |68213 |2024-11-30 |Active  |North Justinborough|
|1588      |Edwin Burns        |34 |Sales     |108208|2015-09-14 |Resigned|South David        |
|8074      |Fred Brewer        |30 |HR        |100736|2021-06-06 |On Leave|Port Wendyville    |
|3841      |April Allen        |36 |HR        |98845 |2020-05-20 |Active  |Rachelchester      |
|8212      |Jacob Jackson      |35 |IT        |54812 |2020-09-18 |On Leave|South Veronicastad |
|3325      |Brian Huynh        |40 |Finance   |59924 |2017-01-02 |On Leave|Johnside           |
|6180      |Robert Cortez      |35 |Mark

In [15]:
# 6. Sort employees by Salary descending
sorted_df = df.orderBy(df.Salary.desc())
sorted_df.show(sorted_df.count(), truncate=False)

+----------+---------------------+---+----------+------+-----------+--------+----------------------+
|EmployeeID|Name                 |Age|Department|Salary|JoiningDate|Status  |City                  |
+----------+---------------------+---+----------+------+-----------+--------+----------------------+
|8860      |Cody Williams        |30 |IT        |119978|2019-03-16 |Resigned|Susanville            |
|4585      |Sandra Webster       |30 |HR        |119940|2022-10-21 |On Leave|Thompsonport          |
|4667      |Ronald Hunt          |58 |Sales     |119677|2019-08-29 |Resigned|Griffithchester       |
|1602      |Deborah Williams     |25 |HR        |119397|2023-09-26 |On Leave|Port Terrimouth       |
|3374      |Amanda Green         |41 |HR        |119316|2021-04-08 |Resigned|West Shelleyton       |
|6329      |Heidi Shaffer        |36 |HR        |119165|2020-01-14 |Resigned|New Alexa             |
|2428      |Mary Stevens         |55 |Sales     |119137|2022-03-06 |On Leave|Travisport    

In [16]:
#AGGREGATION TASKS
# 7. Average salary by department
df.groupBy("Department").avg("Salary").show(truncate=False)



+----------+-----------------+
|Department|avg(Salary)      |
+----------+-----------------+
|Sales     |77488.54545454546|
|HR        |76091.27450980392|
|Finance   |72834.75630252101|
|Marketing |71958.1888888889 |
|IT        |73116.25555555556|
+----------+-----------------+



In [19]:
# 8. Count of employees by Status
df.groupBy("Status").count().show(truncate=False)



+--------+-----+
|Status  |count|
+--------+-----+
|Resigned|159  |
|Active  |172  |
|On Leave|169  |
+--------+-----+



In [20]:
# 9. Highest salary in each city
from pyspark.sql.functions import max
df.groupBy("City").agg(max("Salary").alias("MaxSalary")).show(df.select("City").distinct().count(), truncate=False)

+----------------------+---------+
|City                  |MaxSalary|
+----------------------+---------+
|Wilsonchester         |67025    |
|Bradshawton           |111116   |
|Steventon             |32009    |
|Lake Alyssa           |84903    |
|North Lisa            |57898    |
|North Marvin          |66252    |
|Jenniferton           |39907    |
|Buckleyview           |50109    |
|Burtonville           |98492    |
|Johnsonmouth          |48799    |
|South Joseph          |52456    |
|Lindseychester        |90340    |
|North Stephen         |91947    |
|Port Nicoleshire      |57537    |
|Jerrychester          |53374    |
|North Jennifer        |82486    |
|Laurenstad            |44608    |
|West Brendanbury      |90698    |
|Juliaberg             |50170    |
|New James             |54378    |
|East Robert           |64982    |
|Lisaburgh             |63953    |
|Mariebury             |71841    |
|Allentown             |64039    |
|Tonyamouth            |116423   |
|Aaronberg          

In [21]:
#GROUPBY AND ANALYSIS
from pyspark.sql.functions import year, to_date

# 10. Total employees joined each year
df = df.withColumn("JoinYear", year(to_date(df.JoiningDate, "yyyy-MM-dd")))
df.groupBy("JoinYear").count().orderBy("JoinYear").show(df.select("JoinYear").distinct().count(), truncate=False)



+--------+-----+
|JoinYear|count|
+--------+-----+
|2015    |37   |
|2016    |49   |
|2017    |44   |
|2018    |52   |
|2019    |52   |
|2020    |56   |
|2021    |49   |
|2022    |49   |
|2023    |47   |
|2024    |38   |
|2025    |27   |
+--------+-----+



In [22]:
# 11. Department-wise count of currently Active employees
df.filter(df.Status == "Active").groupBy("Department").count().show(truncate=False)



+----------+-----+
|Department|count|
+----------+-----+
|Sales     |32   |
|HR        |37   |
|Finance   |45   |
|Marketing |32   |
|IT        |26   |
+----------+-----+



In [23]:
# 12. Average age of employees per department
df.groupBy("Department").avg("Age").show(truncate=False)

+----------+------------------+
|Department|avg(Age)          |
+----------+------------------+
|Sales     |40.535353535353536|
|HR        |41.46078431372549 |
|Finance   |39.21008403361345 |
|Marketing |41.82222222222222 |
|IT        |38.68888888888889 |
+----------+------------------+



In [25]:
#JOINING WITH CITY-REGION
from pyspark.sql import Row

# 13. Create City-Region dataset and join
regions = [
    Row(City="Allentown", Region="East"),
    Row(City="Anthonyfort", Region="South"),
    Row(City="Gilesstad", Region="North"),
    Row(City="Jenniferfurt", Region="West"),
    Row(City="Lake Amystad", Region="East"),
    # Add more cities as needed...
]
region_df = spark.createDataFrame(regions)

# Join
joined_df = df.join(region_df, on="City", how="left")
joined_df.show(joined_df.count(), truncate=False)



+----------------------+----------+---------------------+---+----------+------+-----------+--------+--------+------+
|City                  |EmployeeID|Name                 |Age|Department|Salary|JoiningDate|Status  |JoinYear|Region|
+----------------------+----------+---------------------+---+----------+------+-----------+--------+--------+------+
|Wilsonchester         |5556      |Nicole Cook          |49 |Sales     |67025 |2020-02-20 |On Leave|2020    |NULL  |
|Bradshawton           |8747      |Jamie Pena           |38 |Sales     |111116|2019-04-03 |Active  |2019    |NULL  |
|Steventon             |6048      |Curtis Meza          |22 |Sales     |32009 |2022-02-07 |Resigned|2022    |NULL  |
|Lake Alyssa           |7023      |Ricky Gonzalez       |39 |Marketing |84903 |2022-09-05 |On Leave|2022    |NULL  |
|North Lisa            |8702      |Hannah Hebert        |25 |Marketing |57898 |2016-10-06 |Active  |2016    |NULL  |
|North Marvin          |1903      |Christopher Davis    |53 |HR 

In [26]:
# 14. Group salaries by Region
joined_df.groupBy("Region").sum("Salary").show(truncate=False)

+------+-----------+
|Region|sum(Salary)|
+------+-----------+
|NULL  |36796842   |
|South |34686      |
|East  |172812     |
|West  |87831      |
|North |64541      |
+------+-----------+



In [27]:
#DATE OPERATIONS

from pyspark.sql.functions import datediff, current_date

# 15. Calculate years of experience
df = df.withColumn("ExperienceYears", (datediff(current_date(), to_date(df.JoiningDate, "yyyy-MM-dd")) / 365).cast("int"))
df.select("Name", "JoiningDate", "ExperienceYears").show(df.count(), truncate=False)



+---------------------+-----------+---------------+
|Name                 |JoiningDate|ExperienceYears|
+---------------------+-----------+---------------+
|Charles Johnson      |2018-07-07 |6              |
|Dylan Camacho        |2015-08-25 |9              |
|Mr. Ryan Bowman Jr.  |2025-03-11 |0              |
|Brian Ball           |2015-10-01 |9              |
|Angela Hooper        |2019-08-14 |5              |
|Alexander Johnson PhD|2016-04-21 |9              |
|Steven Lane          |2021-07-25 |3              |
|Riley Johnson        |2015-08-03 |9              |
|Emily Washington     |2021-11-30 |3              |
|Valerie Fleming      |2019-12-08 |5              |
|Tracy Hughes MD      |2020-06-01 |5              |
|Johnathan Harmon     |2021-03-09 |4              |
|Michael Brown        |2023-10-21 |1              |
|Scott Burnett        |2016-04-25 |9              |
|Christopher Fuller   |2021-04-30 |4              |
|Mary Henson          |2021-08-25 |3              |
|Jerome Torr

In [28]:
# 16. List employees with > 5 years experience
experienced_df = df.filter(df.ExperienceYears > 5)
experienced_df.select("Name", "ExperienceYears").show(experienced_df.count(), truncate=False)

+---------------------+---------------+
|Name                 |ExperienceYears|
+---------------------+---------------+
|Charles Johnson      |6              |
|Dylan Camacho        |9              |
|Brian Ball           |9              |
|Alexander Johnson PhD|9              |
|Riley Johnson        |9              |
|Scott Burnett        |9              |
|Brittany Kerr        |6              |
|Edwin Burns          |9              |
|Mary Reynolds        |6              |
|Erin Berg            |7              |
|Jason Hines          |9              |
|Christopher Mcdaniel |10             |
|Victoria Kelly       |7              |
|Heather Nelson       |10             |
|Paul Porter          |7              |
|Brian Huynh          |8              |
|James West           |8              |
|Cameron Shelton      |9              |
|Nicole Gilmore       |7              |
|David Wang           |9              |
|Amanda Miranda       |9              |
|John Smith           |7              |
