# https://www.youtube.com/watch?v=2oaTQl1YzCw&list=PL2IsFZBGM_IHCl9zhRVC1EXTomkEp_1zm&index=24

In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import os

In [2]:
os.environ['SPARK_HOME'] = r'C:\spark\spark-3.5.4-bin-hadoop3'
os.environ['PYSPARK_DRIVER_PYTHON'] = 'jupyter'
os.environ['PYSPARK_DRIVER_PYTHON_OPTS'] = 'lab'
os.environ['PYSPARK_PYTHON'] = 'python'

In [3]:
spark = (
    SparkSession
    .builder
    .appName("PySpark Zero to Hero")
    .master("local[*]")
    .config("spark.executor.memory", "16g")
    .config("spark.driver.memory", "16g")
    .config("spark.executor.cores", "4")
    .config("spark.sql.shuffle.partitions", "80")
    .config("spark.dynamicAllocation.enabled", "true")
    .config("spark.dynamicAllocation.minExecutors", "2")
    .config("spark.dynamicAllocation.initialExecutors", "24")
    .config("spark.dynamicAllocation.maxExecutors", "50")
    .config('spark.dynamicAllocation.shuffleTrackingEnabled', 'true')
    .config('spark.dynamicAllocation.executorIdleTimeout', '60')
    .config("spark.shuffle.service.enabled", "true")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .getOrCreate()
)

In [4]:
sales_schema = "transacted_at string, trx_id string, retailer_id string, description string, amount double, city_id string"

sales_path = r'F:\DataSpell\-pyspark_training\YouTube\PySpark - Zero to Hero\datasets\new_sales.csv'

sales = spark.read.format("csv").schema(sales_schema).option("header", True).load(sales_path)

In [5]:
city_schema = "city_id string, city string, state string, state_abv string, country string"

cities_path = r'F:\DataSpell\-pyspark_training\YouTube\PySpark - Zero to Hero\datasets\cities.csv'

city = spark.read.format('csv').schema(city_schema).option('header', True).load(cities_path)

In [6]:
df_path = r'F:\DataSpell\-pyspark_training\YouTube\PySpark - Zero to Hero\datasets\employee_records.csv'

_schema = "first_name string, last_name string, job_title string, dob string, email string, phone string, salary double, department_id int"

emp = spark.read.format("csv").schema(_schema).option("header", True).load(df_path)

In [12]:
dept_path = r'F:\DataSpell\-pyspark_training\YouTube\PySpark - Zero to Hero\datasets\department_data.csv'

_dept_schema = "department_id int, department_name string, description string, city string, state string, country string"

dept = spark.read.format("csv").schema(_dept_schema).option("header", True).load(df_path)

In [13]:
df_joined = emp.join(dept, on=emp['department_id'] == dept['department_id'], how='left_outer')
df_joined.write.format('noop').mode('overwrite').save()

In [14]:
df_sales_joined = sales.join(city, on=sales['city_id'] == city['city_id'], how='left_outer')

In [15]:
part_df = df_joined.withColumn(
    'partition_num', F.spark_partition_id()
).groupBy(
    'partition_num'
).agg(
    F.count(F.lit(1)).alias('count')
)

In [16]:
part_df.show()

+-------------+-----+
|partition_num|count|
+-------------+-----+
|            0|43487|
|            1|43466|
|            2|43468|
|            3|43466|
|            4|43462|
|            5|43478|
|            6|43425|
|            7|43478|
|            8|43497|
|            9|43429|
|           10|43479|
|           11|43485|
|           12|43494|
|           13|43450|
|           14|43440|
|           15|43482|
|           16|43445|
|           17|43476|
|           18|43414|
|           19|43459|
+-------------+-----+
only showing top 20 rows



In [17]:
emp.groupBy('department_id').agg(F.count(F.lit(1)).alias('count')).show()

+-------------+------+
|department_id| count|
+-------------+------+
|           10| 99780|
|            1| 99451|
|            6| 99706|
|            9|100014|
|            7| 99805|
|            3|100248|
|            2|100155|
|            4|100214|
|            8|100417|
|            5|100210|
+-------------+------+



In [28]:
import pyspark.sql.functions as F

salt_df = spark.range(0, 16).withColumnRenamed("id", "salt_id")

salted_emp = emp.withColumn(
    'salted_dept_id',
    F.concat(F.col('department_id'), F.lit('-'), (F.rand() * 16).cast('int'))
)

salted_emp.show()

dept = dept.withColumnRenamed("department_id", "dept_id")
salted_dept = dept.crossJoin(salt_df).withColumn(
    'salted_dept_id',
    F.concat(F.col('dept_id'), F.lit('-'), F.col('salt_id'))
)

+----------+----------+--------------------+----------+--------------------+--------------------+--------+-------------+--------------+
|first_name| last_name|           job_title|       dob|               email|               phone|  salary|department_id|salted_dept_id|
+----------+----------+--------------------+----------+--------------------+--------------------+--------+-------------+--------------+
|   Richard|  Morrison|Public relations ...|1973-05-05|melissagarcia@exa...|       (699)525-4827|512653.0|            8|           8-1|
|     Bobby|  Mccarthy|   Barrister's clerk|1974-04-25|   llara@example.net|  (750)846-1602x7458|999836.0|            7|           7-7|
|    Dennis|    Norman|Land/geomatics su...|1990-06-24| jturner@example.net|    873.820.0518x825|131900.0|           10|         10-14|
|      John|    Monroe|        Retail buyer|1968-06-16|  erik33@example.net|    820-813-0557x624|485506.0|            1|           1-5|
|  Michelle|   Elliott|      Air cabin crew|1975

In [29]:
salted_dept.show()

+-------+---------------+--------------------+----------+--------------------+------------------+-------+--------------+
|dept_id|department_name|         description|      city|               state|           country|salt_id|salted_dept_id|
+-------+---------------+--------------------+----------+--------------------+------------------+-------+--------------+
|   NULL|       Morrison|Public relations ...|1973-05-05|melissagarcia@exa...|     (699)525-4827|      0|          NULL|
|   NULL|       Morrison|Public relations ...|1973-05-05|melissagarcia@exa...|     (699)525-4827|      1|          NULL|
|   NULL|       Morrison|Public relations ...|1973-05-05|melissagarcia@exa...|     (699)525-4827|      2|          NULL|
|   NULL|       Morrison|Public relations ...|1973-05-05|melissagarcia@exa...|     (699)525-4827|      3|          NULL|
|   NULL|       Morrison|Public relations ...|1973-05-05|melissagarcia@exa...|     (699)525-4827|      4|          NULL|
|   NULL|       Morrison|Public 

In [30]:
spark.stop()