# https://www.youtube.com/watch?v=PHVFDgk3lok&list=PL2IsFZBGM_IHCl9zhRVC1EXTomkEp_1zm&index=20

In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import os

In [2]:
# Environment variables that PySpark reads at start-up; all four are set in
# one call so the configuration can be seen at a glance.
os.environ.update({
    'SPARK_HOME': r'C:\spark\spark-3.5.4-bin-hadoop3',
    'PYSPARK_DRIVER_PYTHON': 'jupyter',
    'PYSPARK_DRIVER_PYTHON_OPTS': 'lab',
    'PYSPARK_PYTHON': 'python',
})

In [3]:
# Session-level Spark settings, applied to the builder in the order listed.
# NOTE(review): the master is local[*]; the executor and dynamic-allocation
# settings below may have no effect in local mode — confirm against the
# Spark configuration docs before relying on them.
_spark_settings = [
    ("spark.executor.memory", "16g"),
    ("spark.driver.memory", "16g"),
    ("spark.executor.cores", "4"),
    ("spark.sql.shuffle.partitions", "80"),
    ("spark.dynamicAllocation.enabled", "true"),
    ("spark.dynamicAllocation.minExecutors", "2"),
    ("spark.dynamicAllocation.initialExecutors", "24"),
    ("spark.dynamicAllocation.maxExecutors", "50"),
    ("spark.shuffle.service.enabled", "true"),
    ("spark.serializer", "org.apache.spark.serializer.KryoSerializer"),
]

_builder = SparkSession.builder.appName("PySpark Zero to Hero").master("local[*]")
for _key, _value in _spark_settings:
    _builder = _builder.config(_key, _value)

# Create the session, or reuse one that already exists.
spark = _builder.getOrCreate()

In [4]:
# Location of the employee records CSV used throughout the notebook.
# Adjacent raw literals are concatenated into a single path string.
df_path = (
    r'F:\DataSpell\-pyspark_training\YouTube'
    r'\PySpark - Zero to Hero\datasets\employee_records.csv'
)

In [11]:
# Explicit schema for the CSV, written one column per line.
# Adjacent literals concatenate into a single "name type, name type, ..." string.
_schema = (
    "first_name string, "
    "last_name string, "
    "job_title string, "
    "dob string, "
    "email string, "
    "phone string, "
    "salary double, "
    "department_id int"
)

# Read the CSV with the explicit schema; the first row is treated as a header.
emp = (
    spark.read
    .format('csv')
    .schema(_schema)
    .option('header', True)
    .load(df_path)
)

In [12]:
# Number of partitions backing the `emp` DataFrame.
# The original cell referenced `df`, which is never defined in this notebook,
# so it raised NameError on a fresh kernel; the DataFrame loaded above is `emp`.
emp.rdd.getNumPartitions()

24

In [13]:
# Default parallelism reported by the session's SparkContext.
spark.sparkContext.defaultParallelism

24

In [16]:
# Average salary per department, exposed as the `avg_salary` column.
emp_avg = (
    emp
    .groupBy('department_id')
    .agg(F.avg('salary').alias('avg_salary'))
)

# Display the aggregated rows.
emp_avg.show()

+-------------+------------------+
|department_id|        avg_salary|
+-------------+------------------+
|           10| 502682.2575766687|
|            1|504876.96401242825|
|            6|504428.12590014644|
|            9| 504945.3055672206|
|            7|504514.38453985273|
|            3| 504697.6808514883|
|            2| 503563.2174529479|
|            4| 505419.4963977089|
|            8| 505299.1226286386|
|            5| 504167.9429997006|
+-------------+------------------+



In [20]:
# Write `emp_avg` through the 'noop' format in overwrite mode.
# NOTE(review): presumably used to force the aggregation to run in full
# without persisting any output — confirm.
(
    emp_avg.write
    .format('noop')
    .mode('overwrite')
    .save()
)

In [26]:
# Read back the shuffle-partition setting from the running session's config.
spark.conf.get('spark.sql.shuffle.partitions')

'80'

In [29]:
# Tag each row with the id of the partition holding it, then preview
# five rows from partition 0.
(
    emp
    .withColumn('partition_id', F.spark_partition_id())
    .where('partition_id = 0')
    .show(5)
)

+----------+---------+--------------------+----------+--------------------+------------------+--------+-------------+------------+
|first_name|last_name|           job_title|       dob|               email|             phone|  salary|department_id|partition_id|
+----------+---------+--------------------+----------+--------------------+------------------+--------+-------------+------------+
|   Richard| Morrison|Public relations ...|1973-05-05|melissagarcia@exa...|     (699)525-4827|512653.0|            8|           0|
|     Bobby| Mccarthy|   Barrister's clerk|1974-04-25|   llara@example.net|(750)846-1602x7458|999836.0|            7|           0|
|    Dennis|   Norman|Land/geomatics su...|1990-06-24| jturner@example.net|  873.820.0518x825|131900.0|           10|           0|
|      John|   Monroe|        Retail buyer|1968-06-16|  erik33@example.net|  820-813-0557x624|485506.0|            1|           0|
|  Michelle|  Elliott|      Air cabin crew|1975-03-31|tiffanyjohnston@e...|     (70

In [30]:
# Load the same CSV path with the same schema into a second DataFrame.
emp_part = (
    spark.read
    .format('csv')
    .schema(_schema)
    .option('header', True)
    .load(df_path)
)

In [31]:
# Average salary per department, this time computed from `emp_part`.
# Rebinds the `emp_avg` name.
emp_avg = (
    emp_part
    .groupBy('department_id')
    .agg(F.avg('salary').alias('avg_salary'))
)

In [32]:
# Write the current `emp_avg` through the 'noop' format in overwrite mode.
# NOTE(review): presumably used to force full execution without producing
# any output — confirm.
(
    emp_avg.write
    .format('noop')
    .mode('overwrite')
    .save()
)

In [33]:
# Stop the Spark session now that the notebook is finished with it.
spark.stop()