# https://www.youtube.com/watch?v=FLrCwoy5Z6c&list=PL2IsFZBGM_IHCl9zhRVC1EXTomkEp_1zm&index=14

In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import os

In [2]:
# Export the Spark installation directory and the Python launcher settings as
# environment variables; this cell runs before the SparkSession is built.
# NOTE(review): PYSPARK_DRIVER_PYTHON / PYSPARK_DRIVER_PYTHON_OPTS look like
# settings for launching Jupyter through the pyspark script - confirm they are
# needed inside a notebook kernel that is already running.
os.environ.update({
    'SPARK_HOME': r'C:\spark\spark-3.5.4-bin-hadoop3',
    'PYSPARK_DRIVER_PYTHON': 'jupyter',
    'PYSPARK_DRIVER_PYTHON_OPTS': 'lab',
    'PYSPARK_PYTHON': 'python',
})

In [3]:
# Spark settings registered on the session builder, in this order.
# NOTE(review): the master below is local[*]; the executor, dynamic-allocation
# and shuffle-service entries are cluster-oriented settings - confirm they have
# any effect when running in local mode.
_spark_settings = {
    "spark.executor.memory": "16g",
    "spark.driver.memory": "16g",
    "spark.executor.cores": "4",
    "spark.sql.shuffle.partitions": "80",
    "spark.dynamicAllocation.enabled": "true",
    "spark.dynamicAllocation.minExecutors": "2",
    "spark.dynamicAllocation.initialExecutors": "24",
    "spark.dynamicAllocation.maxExecutors": "50",
    "spark.shuffle.service.enabled": "true",
    "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
}

_builder = SparkSession.builder.appName("PySpark Zero to Hero").master("local[*]")
for _key, _value in _spark_settings.items():
    _builder = _builder.config(_key, _value)

# Return the existing session if there is one, otherwise create a new one.
spark = _builder.getOrCreate()

In [4]:
# Path of the single-line JSON sample (the whole order record sits on one line).
df_single_path = (
    r'F:\DataSpell\-pyspark_training\YouTube\PySpark - Zero to Hero'
    r'\datasets\order_singleline.json'
)
# No schema is supplied here, so the JSON reader derives the column types itself.
df_single = spark.read.load(df_single_path, format='json')

In [6]:
# Display the JSON-read DataFrame without truncating long cell values.
df_single.show(truncate=False)

+------------------------+-----------+--------+------------------------------------+
|contact                 |customer_id|order_id|order_line_items                    |
+------------------------+-----------+--------+------------------------------------+
|[9000010000, 9000010001]|C001       |O101    |[{102.45, I001, 6}, {2.01, I003, 2}]|
+------------------------+-----------+--------+------------------------------------+



In [9]:
# Path of the multi-line JSON sample file.
df_multi_path = (
    r'F:\DataSpell\-pyspark_training\YouTube\PySpark - Zero to Hero'
    r'\datasets\order_multiline.json'
)
# The 'multiline' reader option is switched on for this file; the option is
# passed as a keyword argument to load() instead of a separate .option() call.
df_multi = spark.read.load(df_multi_path, format='json', multiline=True)

In [11]:
# Display the multi-line read; the output matches the single-line read above.
df_multi.show(truncate=False)

+------------------------+-----------+--------+------------------------------------+
|contact                 |customer_id|order_id|order_line_items                    |
+------------------------+-----------+--------+------------------------------------+
|[9000010000, 9000010001]|C001       |O101    |[{102.45, I001, 6}, {2.01, I003, 2}]|
+------------------------+-----------+--------+------------------------------------+



In [12]:
# Read the same single-line JSON file as plain text, so the raw JSON string
# lands in one `value` column and can be parsed manually later.
df = spark.read.load(df_single_path, format='text')

In [14]:
# Show the raw text rows in full; the JSON document appears as one string in `value`.
df.show(truncate=False)

+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|value                                                                                                                                                                              |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|{"order_id":"O101","customer_id":"C001","order_line_items":[{"item_id":"I001","qty":6,"amount":102.45},{"item_id":"I003","qty":2,"amount":2.01}],"contact":[9000010000,9000010001]}|
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+



In [15]:
# DDL schema naming only three of the JSON fields; `contact` is typed as an
# array of longs.
_schema = ', '.join([
    'customer_id string',
    'order_id string',
    'contact array<long>',
])

In [16]:
# Read the JSON file with the explicit schema held in `_schema` instead of
# letting the reader infer one.
df_schema = spark.read.load(df_single_path, format='json', schema=_schema)

In [18]:
# Display the result of the explicit-schema read; only the declared columns appear.
df_schema.show(truncate=False)

+-----------+--------+------------------------+
|customer_id|order_id|contact                 |
+-----------+--------+------------------------+
|C001       |O101    |[9000010000, 9000010001]|
+-----------+--------+------------------------+



In [19]:
# Full DDL schema covering all four fields, including the nested line-item structs.
# NOTE(review): `contact` is declared array<string> here, while the earlier
# three-column schema used array<long> - confirm the string type is intended.
_schema = (
    "contact array<string>, "
    "customer_id string, "
    "order_id string, "
    "order_line_items array<struct<amount double, item_id string, qty long>>"
)

In [21]:
# Re-read the JSON file using whatever schema `_schema` currently holds.
df_schema_new = spark.read.load(df_single_path, format='json', schema=_schema)

In [23]:
# Display the read that used the full explicit schema, without truncation.
df_schema_new.show(truncate=False)

+------------------------+-----------+--------+------------------------------------+
|contact                 |customer_id|order_id|order_line_items                    |
+------------------------+-----------+--------+------------------------------------+
|[9000010000, 9000010001]|C001       |O101    |[{102.45, I001, 6}, {2.01, I003, 2}]|
+------------------------+-----------+--------+------------------------------------+



In [24]:
# Schema string handed to from_json when the raw `value` text is parsed.
# It is the same DDL text as the full schema declared in the earlier cell.
_schema = ", ".join([
    "contact array<string>",
    "customer_id string",
    "order_id string",
    "order_line_items array<struct<amount double, item_id string, qty long>>",
])

In [25]:
# Parse the JSON text in `value` according to `_schema` and keep the result
# next to the raw text in a new `parsed` column.
_parsed_json = F.from_json(F.col('value'), _schema)
df_expanded = df.withColumn('parsed', _parsed_json)

In [27]:
# Display the raw `value` text alongside the `parsed` column, untruncated.
df_expanded.show(truncate=False)

+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------+
|value                                                                                                                                                                              |parsed                                                                      |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------+
|{"order_id":"O101","customer_id":"C001","order_line_items":[{"item_id":"I001","qty":6,"amount":102.45},{"item_id":"I003","qty":2,"amount":2.01}],"contact":[9000010000,9000010001]}|{[9000010000, 9000010001], C001, O101, [{1

In [28]:
# Serialise the `parsed` column back to a JSON string in a new `unparsed`
# column (the reverse of the from_json step).
_parsed_struct = df_expanded['parsed']
df_unparsed = df_expanded.withColumn('unparsed', F.to_json(_parsed_struct))

In [29]:
# Display `value`, `parsed` and the re-serialised `unparsed` column without truncation.
df_unparsed.show(truncate=False)

+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|value                                                                                                                                                                              |parsed                                                                      |unparsed                                                                                                                                                                               |
+-----------------------------------------------------------------------------------------------------------------

In [30]:
# Promote every field of the `parsed` struct to its own top-level column.
df_1 = df_expanded.select(F.col('parsed.*'))

In [31]:
# Display the flattened top-level columns taken from `parsed`.
df_1.show(truncate=False)

+------------------------+-----------+--------+------------------------------------+
|contact                 |customer_id|order_id|order_line_items                    |
+------------------------+-----------+--------+------------------------------------+
|[9000010000, 9000010001]|C001       |O101    |[{102.45, I001, 6}, {2.01, I003, 2}]|
+------------------------+-----------+--------+------------------------------------+



In [32]:
# Explode the `order_line_items` array so each line item gets its own row,
# stored in `expanded_line_items`; the original array column is kept.
_line_item = F.explode(df_1['order_line_items'])
df_2 = df_1.withColumn('expanded_line_items', _line_item)

In [34]:
# Display the result of exploding the line items: one row per order line item.
df_2.show(truncate=False)

+------------------------+-----------+--------+------------------------------------+-------------------+
|contact                 |customer_id|order_id|order_line_items                    |expanded_line_items|
+------------------------+-----------+--------+------------------------------------+-------------------+
|[9000010000, 9000010001]|C001       |O101    |[{102.45, I001, 6}, {2.01, I003, 2}]|{102.45, I001, 6}  |
|[9000010000, 9000010001]|C001       |O101    |[{102.45, I001, 6}, {2.01, I003, 2}]|{2.01, I003, 2}    |
+------------------------+-----------+--------+------------------------------------+-------------------+



In [35]:
# Keep the order-level columns and flatten the exploded line-item struct into
# its own columns; the original `order_line_items` array is dropped.
_order_columns = ['contact', 'customer_id', 'order_id']
df_3 = df_2.select(*_order_columns, 'expanded_line_items.*')

In [37]:
# Display the order columns together with the flattened line-item fields.
df_3.show(truncate=False)

+------------------------+-----------+--------+------+-------+---+
|contact                 |customer_id|order_id|amount|item_id|qty|
+------------------------+-----------+--------+------+-------+---+
|[9000010000, 9000010001]|C001       |O101    |102.45|I001   |6  |
|[9000010000, 9000010001]|C001       |O101    |2.01  |I003   |2  |
+------------------------+-----------+--------+------+-------+---+



In [38]:
# Explode the `contact` array as well, so every line-item row is repeated once
# per contact entry; the value goes into `contact_expanded`.
_single_contact = F.explode(df_3['contact'])
df_final = df_3.withColumn('contact_expanded', _single_contact)

In [39]:
# Display the fully flattened result: one row per (line item, contact) pair.
df_final.show(truncate=False)

+------------------------+-----------+--------+------+-------+---+----------------+
|contact                 |customer_id|order_id|amount|item_id|qty|contact_expanded|
+------------------------+-----------+--------+------+-------+---+----------------+
|[9000010000, 9000010001]|C001       |O101    |102.45|I001   |6  |9000010000      |
|[9000010000, 9000010001]|C001       |O101    |102.45|I001   |6  |9000010001      |
|[9000010000, 9000010001]|C001       |O101    |2.01  |I003   |2  |9000010000      |
|[9000010000, 9000010001]|C001       |O101    |2.01  |I003   |2  |9000010001      |
+------------------------+-----------+--------+------+-------+---+----------------+



In [40]:
# Shut down the Spark session created at the top of the notebook.
spark.stop()