# https://www.youtube.com/watch?v=_e0-QYbO8iI&list=PL2IsFZBGM_IHCl9zhRVC1EXTomkEp_1zm&index=11

In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import os

In [2]:
os.environ['SPARK_HOME'] = r'C:\spark\spark-3.5.4-bin-hadoop3'
os.environ['PYSPARK_DRIVER_PYTHON'] = 'jupyter'
os.environ['PYSPARK_DRIVER_PYTHON_OPTS'] = 'lab'
os.environ['PYSPARK_PYTHON'] = 'python'

In [3]:
spark = (
    SparkSession
    .builder
    .appName("PySpark Zero to Hero")
    .master("local[*]")
    .config("spark.executor.memory", "16g")
    .config("spark.driver.memory", "16g")
    .config("spark.executor.cores", "4")
    .config("spark.sql.shuffle.partitions", "80")
    .config("spark.dynamicAllocation.enabled", "true")
    .config("spark.dynamicAllocation.minExecutors", "2")
    .config("spark.dynamicAllocation.initialExecutors", "24")
    .config("spark.dynamicAllocation.maxExecutors", "50")
    .config("spark.shuffle.service.enabled", "true")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .getOrCreate()
)

In [4]:
emp_data = [
    ["001","101","John Doe","30","Male","50000","2015-01-01"],
    ["002","101","Jane Smith","25","Female","45000","2016-02-15"],
    ["003","102","Bob Brown","35","Male","55000","2014-05-01"],
    ["004","102","Alice Lee","28","Female","48000","2017-09-30"],
    ["005","103","Jack Chan","40","Male","60000","2013-04-01"],
    ["006","103","Jill Wong","32","Female","52000","2018-07-01"],
    ["007","101","James Johnson","42","Male","70000","2012-03-15"],
    ["008","102","Kate Kim","29","Female","51000","2019-10-01"],
    ["009","103","Tom Tan","33","Male","58000","2016-06-01"],
    ["010","104","Lisa Lee","27","Female","47000","2018-08-01"],
    ["011","104","David Park","38","Male","65000","2015-11-01"],
    ["012","105","Susan Chen","31","Female","54000","2017-02-15"],
    ["013","106","Brian Kim","45","Male","75000","2011-07-01"],
    ["014","107","Emily Lee","26","Female","46000","2019-01-01"],
    ["015","106","Michael Lee","37","Male","63000","2014-09-30"],
    ["016","107","Kelly Zhang","30","Female","49000","2018-04-01"],
    ["017","105","George Wang","34","Male","57000","2016-03-15"],
    ["018","104","Nancy Liu","29","","50000","2017-06-01"],
    ["019","103","Steven Chen","36","Male","62000","2015-08-01"],
    ["020","102","Grace Kim","32","Female","53000","2018-11-01"]
]

emp_schema = "employee_id string, department_id string, name string, age string, gender string, salary string, hire_date string"

dept_data = [
    ["101", "Sales", "NYC", "US", "1000000"],
    ["102", "Marketing", "LA", "US", "900000"],
    ["103", "Finance", "London", "UK", "1200000"],
    ["104", "Engineering", "Beijing", "China", "1500000"],
    ["105", "Human Resources", "Tokyo", "Japan", "800000"],
    ["106", "Research and Development", "Perth", "Australia", "1100000"],
    ["107", "Customer Service", "Sydney", "Australia", "950000"]
]

dept_schema = "department_id string, department_name string, city string, country string, budget string"

In [5]:
emp = spark.createDataFrame(data=emp_data, schema=emp_schema)
dept = spark.createDataFrame(data=dept_data, schema=dept_schema)

In [6]:
emp.printSchema()

root
 |-- employee_id: string (nullable = true)
 |-- department_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: string (nullable = true)
 |-- hire_date: string (nullable = true)



In [7]:
dept.printSchema()

root
 |-- department_id: string (nullable = true)
 |-- department_name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- country: string (nullable = true)
 |-- budget: string (nullable = true)



In [8]:
emp.rdd.getNumPartitions()

24

In [9]:
dept.rdd.getNumPartitions()

24

In [13]:
# emp_partitioned = emp.repartition(4)
emp_partitioned = emp.repartition(10)

In [14]:
emp_partitioned.rdd.getNumPartitions()

10

In [15]:
# emp_partitioned = emp.repartitionByRange("salary", 2, 4)
emp_partitioned = emp.coalesce(5)

In [16]:
emp_partitioned.rdd.getNumPartitions()

5

In [22]:
emp_partitioned = emp.repartition(7, 'department_id')

In [23]:
emp_partitioned.rdd.getNumPartitions()

7

In [42]:

emp_1 = emp.repartition(4, 'department_id').withColumn(
    'partition_num', F.spark_partition_id()
)

In [43]:
emp_1.show(5)

+-----------+-------------+-----------+---+------+------+----------+-------------+
|employee_id|department_id|       name|age|gender|salary| hire_date|partition_num|
+-----------+-------------+-----------+---+------+------+----------+-------------+
|        003|          102|  Bob Brown| 35|  Male| 55000|2014-05-01|            0|
|        004|          102|  Alice Lee| 28|Female| 48000|2017-09-30|            0|
|        008|          102|   Kate Kim| 29|Female| 51000|2019-10-01|            0|
|        014|          107|  Emily Lee| 26|Female| 46000|2019-01-01|            0|
|        016|          107|Kelly Zhang| 30|Female| 49000|2018-04-01|            0|
+-----------+-------------+-----------+---+------+------+----------+-------------+
only showing top 5 rows



In [49]:
df_joined = emp.alias('e').join(dept.alias('d'), how='inner', on=emp.department_id == dept.department_id)

In [51]:
df_joined.select('e.name', 'd.department_id', 'd.department_name', 'e.salary').show(5)

+-------------+-------------+---------------+------+
|         name|department_id|department_name|salary|
+-------------+-------------+---------------+------+
|     John Doe|          101|          Sales| 50000|
|   Jane Smith|          101|          Sales| 45000|
|James Johnson|          101|          Sales| 70000|
|    Bob Brown|          102|      Marketing| 55000|
|    Alice Lee|          102|      Marketing| 48000|
+-------------+-------------+---------------+------+
only showing top 5 rows



In [52]:
df_joined = emp.alias('e').join(dept.alias('d'), how='left_outer', on=emp.department_id == dept.department_id)

In [53]:
df_joined.select('e.name', 'd.department_id', 'd.department_name', 'e.salary').show(5)

+----------+-------------+---------------+------+
|      name|department_id|department_name|salary|
+----------+-------------+---------------+------+
|  John Doe|          101|          Sales| 50000|
|Jane Smith|          101|          Sales| 45000|
| Bob Brown|          102|      Marketing| 55000|
| Alice Lee|          102|      Marketing| 48000|
| Jack Chan|          103|        Finance| 60000|
+----------+-------------+---------------+------+
only showing top 5 rows



In [56]:
df_final = emp.join(dept,
                    how='left_outer',
                    on=((emp.department_id == dept.department_id)
                        & (emp.department_id == '101')
                        | (emp.department_id == '102')
                        & (emp.salary.isNotNull())
                        ))

In [57]:
df_final.show()

+-----------+-------------+-------------+---+------+------+----------+-------------+--------------------+-------+---------+-------+
|employee_id|department_id|         name|age|gender|salary| hire_date|department_id|     department_name|   city|  country| budget|
+-----------+-------------+-------------+---+------+------+----------+-------------+--------------------+-------+---------+-------+
|        001|          101|     John Doe| 30|  Male| 50000|2015-01-01|          101|               Sales|    NYC|       US|1000000|
|        002|          101|   Jane Smith| 25|Female| 45000|2016-02-15|          101|               Sales|    NYC|       US|1000000|
|        003|          102|    Bob Brown| 35|  Male| 55000|2014-05-01|          101|               Sales|    NYC|       US|1000000|
|        003|          102|    Bob Brown| 35|  Male| 55000|2014-05-01|          102|           Marketing|     LA|       US| 900000|
|        003|          102|    Bob Brown| 35|  Male| 55000|2014-05-01|      