In [46]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import os

In [47]:
# Point this Python process at the local Spark install and tell PySpark which
# Python executables to use. All four variables are set in one call.
# NOTE(review): SPARK_HOME is a machine-specific absolute Windows path; adjust
# it (or export it outside the notebook) when running on another machine.
os.environ.update({
    'SPARK_HOME': r'C:\spark\spark-3.5.4-bin-hadoop3',
    'PYSPARK_DRIVER_PYTHON': 'jupyter',
    'PYSPARK_DRIVER_PYTHON_OPTS': 'lab',
    'PYSPARK_PYTHON': 'python',
})

In [48]:
# Spark options applied to the session builder, in the order listed.
# NOTE(review): the master below is local[*]; the executor sizing,
# dynamic-allocation and shuffle-service options look cluster-oriented and may
# have no effect in local mode - confirm before relying on them.
_spark_options = {
    "spark.executor.memory": "16g",
    "spark.driver.memory": "16g",
    "spark.executor.cores": "4",
    "spark.sql.shuffle.partitions": "80",
    "spark.dynamicAllocation.enabled": "true",
    "spark.dynamicAllocation.minExecutors": "2",
    "spark.dynamicAllocation.initialExecutors": "24",
    "spark.dynamicAllocation.maxExecutors": "50",
    "spark.shuffle.service.enabled": "true",
    "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
}

_builder = SparkSession.builder.appName("PySpark Zero to Hero").master("local[*]")
for _option, _setting in _spark_options.items():
    _builder = _builder.config(_option, _setting)

# Reuses an already-running session if one exists, otherwise starts a new one.
spark = _builder.getOrCreate()

In [49]:
# First batch of employee records (10 rows). Every value is kept as a string.
# Column order: employee_id, department_id, name, age, gender, salary, hire_date
emp_data_1 = [
    ["001", "101", "John Doe", "30", "Male", "50000", "2015-01-01"],
    ["002", "101", "Jane Smith", "25", "Female", "45000", "2016-02-15"],
    ["003", "102", "Bob Brown", "35", "Male", "55000", "2014-05-01"],
    ["004", "102", "Alice Lee", "28", "Female", "48000", "2017-09-30"],
    ["005", "103", "Jack Chan", "40", "Male", "60000", "2013-04-01"],
    ["006", "103", "Jill Wong", "32", "Female", "52000", "2018-07-01"],
    ["007", "101", "James Johnson", "42", "Male", "70000", "2012-03-15"],
    ["008", "102", "Kate Kim", "29", "Female", "51000", "2019-10-01"],
    ["009", "103", "Tom Tan", "33", "Male", "58000", "2016-06-01"],
    ["010", "104", "Lisa Lee", "27", "Female", "47000", "2018-08-01"],
]

# Second batch of employee records (10 rows), same column order as the first:
# employee_id, department_id, name, age, gender, salary, hire_date
# Row 018 carries an empty string for gender.
emp_data_2 = [
    ["011", "104", "David Park", "38", "Male", "65000", "2015-11-01"],
    ["012", "105", "Susan Chen", "31", "Female", "54000", "2017-02-15"],
    ["013", "106", "Brian Kim", "45", "Male", "75000", "2011-07-01"],
    ["014", "107", "Emily Lee", "26", "Female", "46000", "2019-01-01"],
    ["015", "106", "Michael Lee", "37", "Male", "63000", "2014-09-30"],
    ["016", "107", "Kelly Zhang", "30", "Female", "49000", "2018-04-01"],
    ["017", "105", "George Wang", "34", "Male", "57000", "2016-03-15"],
    ["018", "104", "Nancy Liu", "29", "", "50000", "2017-06-01"],
    ["019", "103", "Steven Chen", "36", "Male", "62000", "2015-08-01"],
    ["020", "102", "Grace Kim", "32", "Female", "53000", "2018-11-01"],
]

In [50]:
# DDL-style schema string shared by both employee batches; every column is
# declared as a string, matching the all-string row data.
emp_schema = (
    "employee_id string, "
    "department_id string, "
    "name string, "
    "age string, "
    "gender string, "
    "salary string, "
    "hire_date string"
)

In [51]:
# Turn each batch of rows into a DataFrame using the shared schema.
# NOTE(review): this rebinds emp_data_1 / emp_data_2 from Python lists to
# DataFrames, so the cell presumably cannot be re-run on its own without first
# re-running the cell that defines the lists - confirm.
emp_data_1 = spark.createDataFrame(data=emp_data_1, schema=emp_schema)
emp_data_2 = spark.createDataFrame(data=emp_data_2, schema=emp_schema)

In [52]:
# Append the second batch to the first. union() lines columns up by position,
# which is safe here because both frames were built from the same schema.
emp = (
    emp_data_1
    .union(emp_data_2)
)

In [53]:
# Preview the first 5 rows of the combined frame.
emp.show(n=5)

+-----------+-------------+----------+---+------+------+----------+
|employee_id|department_id|      name|age|gender|salary| hire_date|
+-----------+-------------+----------+---+------+------+----------+
|        001|          101|  John Doe| 30|  Male| 50000|2015-01-01|
|        002|          101|Jane Smith| 25|Female| 45000|2016-02-15|
|        003|          102| Bob Brown| 35|  Male| 55000|2014-05-01|
|        004|          102| Alice Lee| 28|Female| 48000|2017-09-30|
|        005|          103| Jack Chan| 40|  Male| 60000|2013-04-01|
+-----------+-------------+----------+---+------+------+----------+
only showing top 5 rows



In [54]:
# Same combination via unionAll(), which PySpark documents as an alias of
# union(); this rebinds emp to an equivalent frame.
emp = (
    emp_data_1
    .unionAll(emp_data_2)
)

In [55]:
# Order employees by salary, lowest first (swap .asc() for .desc() to get the
# highest salaries first).
# `salary` is declared as a string column, so ordering on it directly compares
# text: "100000" would sort before "45000". Cast to a number inside the sort
# expression so the order is numeric; the cast affects only the ordering, the
# column in the result is still the original string.
emp_sorted = emp.orderBy(F.col('salary').cast('int').asc())

In [56]:
# Preview the 5 lowest-paid employees from the sorted frame.
emp_sorted.show(n=5)

+-----------+-------------+-----------+---+------+------+----------+
|employee_id|department_id|       name|age|gender|salary| hire_date|
+-----------+-------------+-----------+---+------+------+----------+
|        002|          101| Jane Smith| 25|Female| 45000|2016-02-15|
|        014|          107|  Emily Lee| 26|Female| 46000|2019-01-01|
|        010|          104|   Lisa Lee| 27|Female| 47000|2018-08-01|
|        004|          102|  Alice Lee| 28|Female| 48000|2017-09-30|
|        016|          107|Kelly Zhang| 30|Female| 49000|2018-04-01|
+-----------+-------------+-----------+---+------+------+----------+
only showing top 5 rows



In [57]:
# Number of employees per department (counts employee_id values in each group).
emp_count = (
    emp_sorted
    .groupBy("department_id")
    .agg(F.count("employee_id").alias("employee_count"))
)

In [58]:
# Preview up to 5 departments with their head counts.
emp_count.show(n=5)

+-------------+--------------+
|department_id|employee_count|
+-------------+--------------+
|          101|             3|
|          102|             4|
|          103|             4|
|          104|             3|
|          105|             2|
+-------------+--------------+
only showing top 5 rows



In [59]:
# Total salary per department.
# `salary` is a string column; the displayed totals come back as floating-point
# values, i.e. Spark converts the strings to numbers implicitly for the sum.
emp_sum = (
    emp_sorted
    .groupBy("department_id")
    .agg(F.sum("salary").alias("total_salary"))
)

In [60]:
# Preview up to 5 departments with their salary totals.
emp_sum.show(n=5)

+-------------+------------+
|department_id|total_salary|
+-------------+------------+
|          104|    162000.0|
|          102|    207000.0|
|          101|    165000.0|
|          107|     95000.0|
|          106|    138000.0|
+-------------+------------+
only showing top 5 rows



In [61]:
# Average salary per department, keeping only departments whose average is
# above 50,000. The filter runs on the aggregated column, after grouping.
emp_avg = (
    emp_sorted
    .groupBy("department_id")
    .agg(F.avg("salary").alias("avg_salary"))
    .filter(F.col("avg_salary") > 50_000)
)

In [None]:
# Display the departments whose average salary exceeds 50,000.
# NOTE(review): this cell has no execution count or saved output, so it appears
# not to have been run in the saved notebook - re-run to confirm the result.
emp_avg.show()

In [42]:
# Combine the two batches matching columns by name instead of by position.
# allowMissingColumns=False is the default: both frames must expose the same
# column names, which holds here because they share one schema.
emp_fixed = emp_data_1.unionByName(emp_data_2, allowMissingColumns=False)

In [43]:
# Preview the first 5 rows of the name-matched union.
emp_fixed.show(n=5)

+-----------+-------------+----------+---+------+------+----------+
|employee_id|department_id|      name|age|gender|salary| hire_date|
+-----------+-------------+----------+---+------+------+----------+
|        001|          101|  John Doe| 30|  Male| 50000|2015-01-01|
|        002|          101|Jane Smith| 25|Female| 45000|2016-02-15|
|        003|          102| Bob Brown| 35|  Male| 55000|2014-05-01|
|        004|          102| Alice Lee| 28|Female| 48000|2017-09-30|
|        005|          103| Jack Chan| 40|  Male| 60000|2013-04-01|
+-----------+-------------+----------+---+------+------+----------+
only showing top 5 rows



In [45]:
# Total number of rows in the combined employee frame; as the last expression
# in the cell, the value is rendered as the cell output (20 in the saved run).
emp.count()

20