# https://www.youtube.com/watch?v=JB98Loobc7k&list=PL2IsFZBGM_IHCl9zhRVC1EXTomkEp_1zm&index=21

In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import os

In [2]:
# Environment variables for the local Spark/PySpark setup; they are set here,
# before the SparkSession is created further down.
os.environ.update({
    'SPARK_HOME': r'C:\spark\spark-3.5.4-bin-hadoop3',
    'PYSPARK_DRIVER_PYTHON': 'jupyter',
    'PYSPARK_DRIVER_PYTHON_OPTS': 'lab',
    'PYSPARK_PYTHON': 'python',
})

In [3]:
# Session settings, applied to the builder in the order listed.
# NOTE(review): with master "local[*]" the executor and dynamic-allocation
# settings presumably have no effect — confirm against the Spark configuration docs.
spark_settings = {
    "spark.executor.memory": "16g",
    "spark.driver.memory": "16g",
    "spark.executor.cores": "4",
    "spark.sql.shuffle.partitions": "80",
    "spark.dynamicAllocation.enabled": "true",
    "spark.dynamicAllocation.minExecutors": "2",
    "spark.dynamicAllocation.initialExecutors": "24",
    "spark.dynamicAllocation.maxExecutors": "50",
    "spark.shuffle.service.enabled": "true",
    "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
}

session_builder = SparkSession.builder.appName("PySpark Zero to Hero").master("local[*]")
for option_name, option_value in spark_settings.items():
    session_builder = session_builder.config(option_name, option_value)

# Reuse an already-running session if there is one, otherwise start a new one.
spark = session_builder.getOrCreate()

In [5]:
# Location of the employee CSV on the local machine.
df_path = r'F:\DataSpell\-pyspark_training\YouTube\PySpark - Zero to Hero\datasets\employee_records.csv'

# Explicit DDL schema, one "name type" entry per column, so Spark does not infer types.
_schema = ", ".join((
    "first_name string",
    "last_name string",
    "job_title string",
    "dob string",
    "email string",
    "phone string",
    "salary double",
    "department_id int",
))

# Read the CSV with the schema above; the first line of the file is a header row.
emp = spark.read.csv(df_path, schema=_schema, header=True)

In [6]:
# Lookup table: department id (1..10) -> display name "Department <id>".
dept_names = {dept_id: f'Department {dept_id}' for dept_id in range(1, 11)}

In [7]:
# Wrap the dept_names dictionary in a Spark broadcast variable; the UDF below
# reads it back through `.value`.
broadcast_dept_names = spark.sparkContext.broadcast(dept_names)

In [8]:
# Inspect the type of the object returned by broadcast().
type(broadcast_dept_names)

pyspark.broadcast.Broadcast

In [11]:
@F.udf
def get_dept_names(dept_id):
    """Look up the display name for ``dept_id`` in the broadcast department map.

    Returns ``None`` when ``dept_id`` is not a key of the broadcast dictionary.
    """
    name_by_id = broadcast_dept_names.value
    return name_by_id.get(dept_id)

In [12]:
# Add a dept_name column by running the get_dept_names UDF over department_id.
emp_final = emp.withColumn('dept_name', get_dept_names(F.col('department_id')))

In [14]:
# Preview up to five rows to check the new dept_name column.
emp_final.show(5)

+----------+---------+--------------------+----------+--------------------+------------------+--------+-------------+-------------+
|first_name|last_name|           job_title|       dob|               email|             phone|  salary|department_id|    dept_name|
+----------+---------+--------------------+----------+--------------------+------------------+--------+-------------+-------------+
|   Richard| Morrison|Public relations ...|1973-05-05|melissagarcia@exa...|     (699)525-4827|512653.0|            8| Department 8|
|     Bobby| Mccarthy|   Barrister's clerk|1974-04-25|   llara@example.net|(750)846-1602x7458|999836.0|            7| Department 7|
|    Dennis|   Norman|Land/geomatics su...|1990-06-24| jturner@example.net|  873.820.0518x825|131900.0|           10|Department 10|
|      John|   Monroe|        Retail buyer|1968-06-16|  erik33@example.net|  820-813-0557x624|485506.0|            1| Department 1|
|  Michelle|  Elliott|      Air cabin crew|1975-03-31|tiffanyjohnston@e...| 

In [17]:
# Baseline: total salary of department 6 using built-in DataFrame aggregation.
(
    emp
    .filter(F.col('department_id') == 6)
    .groupBy('department_id')
    .agg(F.sum('salary'))
    .show()
)

+-------------+---------------+
|department_id|    sum(salary)|
+-------------+---------------+
|            6|5.0294510721E10|
+-------------+---------------+



In [16]:
# Same total, this time filtering on the UDF-derived dept_name column.
# cache() marks emp_final for caching before the aggregation runs.
# NOTE(review): emp_final is not used again in the cells visible here, so the
# cache may be unnecessary — confirm.
(
    emp_final
    .cache()
    .where(F.col('dept_name') == 'Department 6')
    .groupBy('dept_name')
    .agg(F.sum('salary'))
    .show()
)

+------------+---------------+
|   dept_name|    sum(salary)|
+------------+---------------+
|Department 6|5.0294510721E10|
+------------+---------------+



In [18]:
# Accumulator that collects a salary total across the rows visited by foreach.
# It starts at 0.0 (float) so its zero value has the same type as the double
# `salary` values that are added to it.
dept_sal = spark.sparkContext.accumulator(0.0)

In [19]:
def calculate_salary(department_id, salary):
    """Add ``salary`` to the ``dept_sal`` accumulator for department 6 rows.

    Args:
        department_id: Department id of the row being visited.
        salary: Salary of the row; ``None`` is skipped.

    Rows with a missing salary are ignored instead of being passed to
    ``dept_sal.add``, so a null value does not raise inside the job and the
    total stays in line with ``F.sum('salary')``, which also ignores nulls.
    """
    if department_id == 6 and salary is not None:
        dept_sal.add(salary)

# Visit every row of emp and pass its department_id and salary to
# calculate_salary, which adds matching salaries to the dept_sal accumulator.
# NOTE(review): re-running this cell without recreating dept_sal presumably
# adds the same salaries again — confirm before relying on the total.
emp.foreach(lambda row: calculate_salary(row.department_id, row.salary))

In [20]:
# Read the accumulated total; it can be compared with the sum(salary)
# results printed by the two aggregation cells above.
dept_sal.value

50294510721.0

In [21]:
# Shut down the Spark session at the end of the notebook.
spark.stop()