#

In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import os

In [2]:
os.environ['SPARK_HOME'] = r'C:\spark\spark-3.5.4-bin-hadoop3'
os.environ['PYSPARK_DRIVER_PYTHON'] = 'jupyter'
os.environ['PYSPARK_DRIVER_PYTHON_OPTS'] = 'lab'
os.environ['PYSPARK_PYTHON'] = 'python'

In [3]:
spark = (
    SparkSession
    .builder
    .appName("PySpark Zero to Hero")
    .master("local[*]")
    .config("spark.executor.memory", "16g")
    .config("spark.driver.memory", "16g")
    .config("spark.executor.cores", "4")
    .config("spark.sql.shuffle.partitions", "80")
    .config("spark.dynamicAllocation.enabled", "true")
    .config("spark.dynamicAllocation.minExecutors", "2")
    .config("spark.dynamicAllocation.initialExecutors", "24")
    .config("spark.dynamicAllocation.maxExecutors", "50")
    .config('spark.dynamicAllocation.shuffleTrackingEnabled', 'true')
    .config('spark.dynamicAllocation.executorIdleTimeout', '60')
    .config("spark.shuffle.service.enabled", "true")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .enableHiveSupport()
    .getOrCreate()
)

In [4]:
spark.conf.set('spark.sql.adaptive.enabled', 'true')
spark.conf.set('spark.sql.adaptive.coalescePartitions.enabled', 'true')
spark.conf.set('spark.sql.autoBroadcastJoinThreshold', '-1')

In [5]:
df_path = r'F:\DataSpell\-pyspark_training\YouTube\PySpark - Zero to Hero\datasets\employee_records.csv'

_schema = "first_name string, last_name string, job_title string, dob string, email string, phone string, salary double, department_id int"

emp = spark.read.format("csv").schema(_schema).option("header", True).load(df_path)

In [6]:
dept_path = r'F:\DataSpell\-pyspark_training\YouTube\PySpark - Zero to Hero\datasets\department_data.csv'

_dept_schema = "department_id int, department_name string, description string, city string, state string, country string"

dept = spark.read.format("csv").schema(_dept_schema).option("header", True).load(df_path)

In [7]:
spark.conf.get('spark.sql.catalogImplementation')

'hive'

In [8]:
db = spark.sql('show databases')

In [9]:
db.show()

+---------+
|namespace|
+---------+
|  default|
+---------+



In [10]:
spark.sql('show tables in default').show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
+---------+---------+-----------+



In [11]:
emp.createOrReplaceTempView('emp_view')
dept.createOrReplaceTempView('dept_view')

In [12]:
spark.sql('show tables in default').show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|         |dept_view|       true|
|         | emp_view|       true|
+---------+---------+-----------+



In [13]:
emp_filtered = spark.sql("""
    select * from emp_view
    where department_id = 1
""")

In [14]:
emp_filtered.show()

+-----------+---------+--------------------+----------+--------------------+--------------------+--------+-------------+
| first_name|last_name|           job_title|       dob|               email|               phone|  salary|department_id|
+-----------+---------+--------------------+----------+--------------------+--------------------+--------+-------------+
|       John|   Monroe|        Retail buyer|1968-06-16|  erik33@example.net|    820-813-0557x624|485506.0|            1|
|    Rachael|Rodriguez|         Media buyer|1966-12-02|griffinmary@examp...| +1-791-344-7586x548|544732.0|            1|
|Christopher| Callahan| Exhibition designer|1966-10-23| qwalter@example.com|001-947-745-3939x...|251057.0|            1|
|    Lindsey|   Huerta|Embryologist, cli...|1964-10-20|  psmith@example.net|   527.934.6665x1378|878257.0|            1|
|      David|   Harris|   Company secretary|1990-04-13|     nli@example.com|001-959-766-1180x...|249553.0|            1|
|      Brian|Hernandez|     Thea

In [15]:
emp_temp = spark.sql("""
    select emp.*, date_format(dob, 'yyyy') as year_of_birth
    from emp_view emp
""")

In [16]:
emp_temp.createOrReplaceTempView('emp_temp_view')

In [17]:
spark.sql('select * from emp_temp_view').show()

+----------+----------+--------------------+----------+--------------------+--------------------+--------+-------------+-------------+
|first_name| last_name|           job_title|       dob|               email|               phone|  salary|department_id|year_of_birth|
+----------+----------+--------------------+----------+--------------------+--------------------+--------+-------------+-------------+
|   Richard|  Morrison|Public relations ...|1973-05-05|melissagarcia@exa...|       (699)525-4827|512653.0|            8|         1973|
|     Bobby|  Mccarthy|   Barrister's clerk|1974-04-25|   llara@example.net|  (750)846-1602x7458|999836.0|            7|         1974|
|    Dennis|    Norman|Land/geomatics su...|1990-06-24| jturner@example.net|    873.820.0518x825|131900.0|           10|         1990|
|      John|    Monroe|        Retail buyer|1968-06-16|  erik33@example.net|    820-813-0557x624|485506.0|            1|         1968|
|  Michelle|   Elliott|      Air cabin crew|1975-03-31|

In [18]:
emp_final = spark.sql("""
    select /*+ BROADCAST(dept) */
    emp.*, dept.department_name
    from emp_temp_view emp left outer join dept_view as dept
    on emp.department_id = dept.department_id
""")

In [19]:
emp_final.show()

+----------+----------+--------------------+----------+--------------------+--------------------+--------+-------------+-------------+---------------+
|first_name| last_name|           job_title|       dob|               email|               phone|  salary|department_id|year_of_birth|department_name|
+----------+----------+--------------------+----------+--------------------+--------------------+--------+-------------+-------------+---------------+
|   Richard|  Morrison|Public relations ...|1973-05-05|melissagarcia@exa...|       (699)525-4827|512653.0|            8|         1973|           NULL|
|     Bobby|  Mccarthy|   Barrister's clerk|1974-04-25|   llara@example.net|  (750)846-1602x7458|999836.0|            7|         1974|           NULL|
|    Dennis|    Norman|Land/geomatics su...|1990-06-24| jturner@example.net|    873.820.0518x825|131900.0|           10|         1990|           NULL|
|      John|    Monroe|        Retail buyer|1968-06-16|  erik33@example.net|    820-813-0557x6

In [21]:
emp_final.write.format('parquet').saveAsTable('emp_final_')

In [22]:
spark.sql('show tables in default').show()

+---------+-------------+-----------+
|namespace|    tableName|isTemporary|
+---------+-------------+-----------+
|  default|   emp_final_|      false|
|         |    dept_view|       true|
|         |emp_temp_view|       true|
|         |     emp_view|       true|
+---------+-------------+-----------+



In [24]:
emp_new = spark.read.table('emp_final_')

In [25]:
emp_new.show()

+----------+----------+--------------------+----------+--------------------+--------------------+--------+-------------+-------------+---------------+
|first_name| last_name|           job_title|       dob|               email|               phone|  salary|department_id|year_of_birth|department_name|
+----------+----------+--------------------+----------+--------------------+--------------------+--------+-------------+-------------+---------------+
|      Mark|   Collins|Runner, broadcast...|1966-10-23|wardkimberly@exam...|    001-602-603-7814|491108.0|            9|         1966|           NULL|
|    Rodney|     Davis|            Musician|1976-07-12|rodrigueztracy@ex...|        932.767.5342|578043.0|            3|         1976|           NULL|
|    Bianca|   Gilbert|Nature conservati...|1978-06-22| james59@example.net|        200-744-9780|347570.0|            4|         1978|           NULL|
|      Jose|   Wallace| Electrical engineer|1982-09-25|thomas69@example.com|        234.542.90

In [28]:
emp_new = spark.sql("select * from emp_final_")

In [29]:
emp_new.show()

+----------+----------+--------------------+----------+--------------------+--------------------+--------+-------------+-------------+---------------+
|first_name| last_name|           job_title|       dob|               email|               phone|  salary|department_id|year_of_birth|department_name|
+----------+----------+--------------------+----------+--------------------+--------------------+--------+-------------+-------------+---------------+
|      Mark|   Collins|Runner, broadcast...|1966-10-23|wardkimberly@exam...|    001-602-603-7814|491108.0|            9|         1966|           NULL|
|    Rodney|     Davis|            Musician|1976-07-12|rodrigueztracy@ex...|        932.767.5342|578043.0|            3|         1976|           NULL|
|    Bianca|   Gilbert|Nature conservati...|1978-06-22| james59@example.net|        200-744-9780|347570.0|            4|         1978|           NULL|
|      Jose|   Wallace| Electrical engineer|1982-09-25|thomas69@example.com|        234.542.90

In [30]:
spark.conf.get('spark.sql.catalogImplementation')

'hive'

In [31]:
emp_final = spark.sql("""
    select /*+ BROADCAST(dept) */
    emp.*, dept.department_name
    from emp_view emp left outer join dept_view as dept
    on emp.department_id = dept.department_id
""")

In [32]:
emp_final.show()

+----------+----------+--------------------+----------+--------------------+--------------------+--------+-------------+---------------+
|first_name| last_name|           job_title|       dob|               email|               phone|  salary|department_id|department_name|
+----------+----------+--------------------+----------+--------------------+--------------------+--------+-------------+---------------+
|   Richard|  Morrison|Public relations ...|1973-05-05|melissagarcia@exa...|       (699)525-4827|512653.0|            8|           NULL|
|     Bobby|  Mccarthy|   Barrister's clerk|1974-04-25|   llara@example.net|  (750)846-1602x7458|999836.0|            7|           NULL|
|    Dennis|    Norman|Land/geomatics su...|1990-06-24| jturner@example.net|    873.820.0518x825|131900.0|           10|           NULL|
|      John|    Monroe|        Retail buyer|1968-06-16|  erik33@example.net|    820-813-0557x624|485506.0|            1|           NULL|
|  Michelle|   Elliott|      Air cabin cr

In [34]:
emp_final.write.format('parquet').saveAsTable('emp_final_data')

In [35]:
spark.sql('show tables in default').show()

+---------+--------------+-----------+
|namespace|     tableName|isTemporary|
+---------+--------------+-----------+
|  default|    emp_final_|      false|
|  default|emp_final_data|      false|
|         |     dept_view|       true|
|         | emp_temp_view|       true|
|         |      emp_view|       true|
+---------+--------------+-----------+



In [36]:
spark.sql('show databases').show()

+---------+
|namespace|
+---------+
|  default|
+---------+



In [37]:
spark.sql("describe emp_final_data").show()

+---------------+---------+-------+
|       col_name|data_type|comment|
+---------------+---------+-------+
|     first_name|   string|   NULL|
|      last_name|   string|   NULL|
|      job_title|   string|   NULL|
|            dob|   string|   NULL|
|          email|   string|   NULL|
|          phone|   string|   NULL|
|         salary|   double|   NULL|
|  department_id|      int|   NULL|
|department_name|   string|   NULL|
+---------------+---------+-------+



In [40]:
spark.sql("describe extended emp_final_data").show()

+--------------------+--------------------+-------+
|            col_name|           data_type|comment|
+--------------------+--------------------+-------+
|          first_name|              string|   NULL|
|           last_name|              string|   NULL|
|           job_title|              string|   NULL|
|                 dob|              string|   NULL|
|               email|              string|   NULL|
|               phone|              string|   NULL|
|              salary|              double|   NULL|
|       department_id|                 int|   NULL|
|     department_name|              string|   NULL|
|                    |                    |       |
|# Detailed Table ...|                    |       |
|             Catalog|       spark_catalog|       |
|            Database|             default|       |
|               Table|      emp_final_data|       |
|               Owner|               Zygim|       |
|        Created Time|Tue Jan 28 21:01:...|       |
|         La

In [41]:
spark.stop()