#

In [5]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import os

In [6]:
# Point PySpark at the local Spark distribution and pick the Python
# executables used for the driver and the workers.
os.environ.update(
    {
        'SPARK_HOME': r'C:\spark\spark-3.5.4-bin-hadoop3',
        'PYSPARK_DRIVER_PYTHON': 'jupyter',
        'PYSPARK_DRIVER_PYTHON_OPTS': 'lab',
        'PYSPARK_PYTHON': 'python',
    }
)

In [7]:
# Build (or reuse) the SparkSession for this notebook.
#
# NOTE(review): the master is local[*], i.e. a single local JVM. The
# executor-sizing, dynamic-allocation and external-shuffle-service options
# below are cluster-manager settings; they are kept so the notebook mirrors a
# cluster configuration, but they are not expected to change how this local
# run behaves -- confirm before relying on them.
spark = (
    SparkSession
    .builder
    .appName("PySpark Zero to Hero")
    .master("local[*]")
    # Memory / cores.
    .config("spark.executor.memory", "16g")
    .config("spark.driver.memory", "16g")
    .config("spark.executor.cores", "4")
    # Number of partitions used for shuffles in joins/aggregations.
    .config("spark.sql.shuffle.partitions", "80")
    # Dynamic allocation.
    .config("spark.dynamicAllocation.enabled", "true")
    .config("spark.dynamicAllocation.minExecutors", "2")
    .config("spark.dynamicAllocation.initialExecutors", "24")
    .config("spark.dynamicAllocation.maxExecutors", "50")
    # The documented key is "shuffleTracking.enabled"; the previous spelling
    # ("shuffleTrackingEnabled") is not a Spark property and was ignored.
    .config("spark.dynamicAllocation.shuffleTracking.enabled", "true")
    .config("spark.dynamicAllocation.executorIdleTimeout", "60")
    .config("spark.shuffle.service.enabled", "true")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .getOrCreate()
)

In [9]:
# Runtime SQL settings, applied in order:
#   - enable adaptive query execution and its partition coalescing
#   - set the auto-broadcast threshold to -1 (turns automatic broadcast joins off)
for _conf_key, _conf_value in (
    ('spark.sql.adaptive.enabled', 'true'),
    ('spark.sql.adaptive.coalescePartitions.enabled', 'true'),
    ('spark.sql.autoBroadcastJoinThreshold', '-1'),
):
    spark.conf.set(_conf_key, _conf_value)

In [10]:
# Employee records: CSV with a header row, read with an explicit schema
# (no schema inference).
df_path = r'F:\DataSpell\-pyspark_training\YouTube\PySpark - Zero to Hero\datasets\employee_records.csv'

_schema = (
    "first_name string, last_name string, job_title string, dob string, "
    "email string, phone string, salary double, department_id int"
)

emp = (
    spark.read
    .format("csv")
    .option("header", True)
    .schema(_schema)
    .load(df_path)
)

In [11]:
# Department reference data: CSV with a header row, read with an explicit schema.
dept_path = r'F:\DataSpell\-pyspark_training\YouTube\PySpark - Zero to Hero\datasets\department_data.csv'

_dept_schema = "department_id int, department_name string, description string, city string, state string, country string"

# Load from dept_path. The previous version passed df_path here, which read the
# employee CSV with the department schema instead of the department file.
dept = spark.read.format("csv").schema(_dept_schema).option("header", True).load(dept_path)

In [12]:
# Left outer join of employees to departments on department_id; employees
# without a matching department are kept.
df_joined = emp.join(
    dept,
    on=emp.department_id == dept.department_id,
    how='left_outer',
)

In [13]:
# Trigger execution of the join by writing to the 'noop' sink.
df_joined.write.mode('overwrite').format('noop').save()

In [14]:
# Print the physical plan of the join (simple mode, the default).
df_joined.explain(mode="simple")

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- SortMergeJoin [department_id#7], [department_id#16], LeftOuter
   :- Sort [department_id#7 ASC NULLS FIRST], false, 0
   :  +- Exchange hashpartitioning(department_id#7, 80), ENSURE_REQUIREMENTS, [plan_id=88]
   :     +- FileScan csv [first_name#0,last_name#1,job_title#2,dob#3,email#4,phone#5,salary#6,department_id#7] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/F:/DataSpell/-pyspark_training/YouTube/PySpark - Zero to Hero/da..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<first_name:string,last_name:string,job_title:string,dob:string,email:string,phone:string,s...
   +- Sort [department_id#16 ASC NULLS FIRST], false, 0
      +- Exchange hashpartitioning(department_id#16, 80), ENSURE_REQUIREMENTS, [plan_id=89]
         +- Filter isnotnull(department_id#16)
            +- FileScan csv [department_id#16,department_name#17,description#18,city#19,state#20,country#21] Batch

In [16]:
# Stop the SparkSession now that the notebook is done with it.
spark.stop()