In [1]:
from email.feedparser import headerRE

from numpy.ma.extras import unique
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import os

In [2]:
os.environ['SPARK_HOME'] = r'C:\spark\spark-3.5.4-bin-hadoop3'
os.environ['PYSPARK_DRIVER_PYTHON'] = 'jupyter'
os.environ['PYSPARK_DRIVER_PYTHON_OPTS'] = 'lab'
os.environ['PYSPARK_PYTHON'] = 'python'

In [3]:
spark = (
    SparkSession
    .builder
    .appName("PySpark Zero to Hero")
    .master("local[*]")
    .config("spark.executor.memory", "16g")
    .config("spark.driver.memory", "16g")
    .config("spark.executor.cores", "4")
    .config("spark.sql.shuffle.partitions", "80")
    .config("spark.dynamicAllocation.enabled", "true")
    .config("spark.dynamicAllocation.minExecutors", "2")
    .config("spark.dynamicAllocation.initialExecutors", "24")
    .config("spark.dynamicAllocation.maxExecutors", "50")
    .config('spark.dynamicAllocation.shuffleTrackingEnabled', 'true')
    .config('spark.dynamicAllocation.executorIdleTimeout', '60')
    .config("spark.shuffle.service.enabled", "true")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .enableHiveSupport()
    .getOrCreate()
)

In [4]:
spark.conf.set('spark.sql.adaptive.enabled', 'true')
spark.conf.set('spark.sql.adaptive.coalescePartitions.enabled', 'true')
spark.conf.set('spark.sql.autoBroadcastJoinThreshold', '-1')

In [5]:
df_path = r"F:\Datasets\CSV datasets\synthetic_beverage_sales_data.csv"

In [6]:
df = spark.read.format('csv').option('header', 'true').load(df_path)

In [7]:
df.show(truncate=False)

+--------+-----------+-------------+------------------+-------------------+----------+--------+--------+-----------+------------------+----------+
|Order_ID|Customer_ID|Customer_Type|Product           |Category           |Unit_Price|Quantity|Discount|Total_Price|Region            |Order_Date|
+--------+-----------+-------------+------------------+-------------------+----------+--------+--------+-----------+------------------+----------+
|ORD1    |CUS1496    |B2B          |Vio Wasser        |Water              |1.66      |53      |0.1     |79.18      |Baden-Württemberg |2023-08-23|
|ORD1    |CUS1496    |B2B          |Evian             |Water              |1.56      |90      |0.1     |126.36     |Baden-Württemberg |2023-08-23|
|ORD1    |CUS1496    |B2B          |Sprite            |Soft Drinks        |1.17      |73      |0.05    |81.14      |Baden-Württemberg |2023-08-23|
|ORD1    |CUS1496    |B2B          |Rauch Multivitamin|Juices             |3.22      |59      |0.1     |170.98     |Ba

In [8]:
df.printSchema()

root
 |-- Order_ID: string (nullable = true)
 |-- Customer_ID: string (nullable = true)
 |-- Customer_Type: string (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Unit_Price: string (nullable = true)
 |-- Quantity: string (nullable = true)
 |-- Discount: string (nullable = true)
 |-- Total_Price: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Order_Date: string (nullable = true)



In [9]:
df.count()

8999910

In [34]:
df = df.drop("Unit_Price", "Quantity", "Discount", "Total_Price")

In [10]:
df.select(
    F.col('Product'), F.col('Total_Price')
).show(truncate=False)

+------------------+-----------+
|Product           |Total_Price|
+------------------+-----------+
|Vio Wasser        |79.18      |
|Evian             |126.36     |
|Sprite            |81.14      |
|Rauch Multivitamin|170.98     |
|Gerolsteiner      |27.4       |
|Sauvignon Blanc   |18.18      |
|Tomato Juice      |84.74      |
|Vittel            |5.31       |
|San Pellegrino    |100.19     |
|Evian             |3.93       |
|Mountain Dew      |10.89      |
|Hohes C Orange    |11.55      |
|Red Bull          |32.38      |
|Chardonnay        |18.38      |
|Tanqueray         |1219.31    |
|Rotkäppchen Sekt  |246.2      |
|Mango Juice       |23.75      |
|Apollinaris       |10.13      |
|Mountain Dew      |141.47     |
|Riesling          |229.03     |
+------------------+-----------+
only showing top 20 rows



In [11]:
df.printSchema()

root
 |-- Order_ID: string (nullable = true)
 |-- Customer_ID: string (nullable = true)
 |-- Customer_Type: string (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Unit_Price: string (nullable = true)
 |-- Quantity: string (nullable = true)
 |-- Discount: string (nullable = true)
 |-- Total_Price: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Order_Date: string (nullable = true)



In [12]:
df.show(truncate=False, n=5)

+--------+-----------+-------------+------------------+-----------+----------+--------+--------+-----------+-----------------+----------+
|Order_ID|Customer_ID|Customer_Type|Product           |Category   |Unit_Price|Quantity|Discount|Total_Price|Region           |Order_Date|
+--------+-----------+-------------+------------------+-----------+----------+--------+--------+-----------+-----------------+----------+
|ORD1    |CUS1496    |B2B          |Vio Wasser        |Water      |1.66      |53      |0.1     |79.18      |Baden-Württemberg|2023-08-23|
|ORD1    |CUS1496    |B2B          |Evian             |Water      |1.56      |90      |0.1     |126.36     |Baden-Württemberg|2023-08-23|
|ORD1    |CUS1496    |B2B          |Sprite            |Soft Drinks|1.17      |73      |0.05    |81.14      |Baden-Württemberg|2023-08-23|
|ORD1    |CUS1496    |B2B          |Rauch Multivitamin|Juices     |3.22      |59      |0.1     |170.98     |Baden-Württemberg|2023-08-23|
|ORD1    |CUS1496    |B2B         

In [13]:
from pyspark.sql.types import IntegerType, DecimalType, StringType, DateType

In [14]:
df = df.select(
    '*',
    F.col('Unit_Price').cast(DecimalType(10, 2)),
    F.col('Quantity').cast(IntegerType()),
    F.col('Discount').cast(DecimalType(10, 2)),
    F.col('Total_Price').cast(DecimalType(10, 2))
)

In [15]:
df.filter(F.col('Customer_Type').isNull()).count()

0

In [16]:
df.filter(F.col('Product').isNull()).count()

0

In [17]:
df = df.withColumn(
    'Order_Date', F.to_date(F.col('Order_Date'), 'yyyy-MM-dd')
)

In [18]:
df = df.select(
    '*',
    F.year(F.col('Order_Date')).alias('Order_Year'),
    F.month(F.col('Order_Date')).alias('Order_Month'),
    F.day(F.col('Order_Date')).alias('Order_Day')
)

In [33]:
df.filter(
    F.col('Discount') > 0.1
).show()

+--------+-----------+-------------+-------------------+-------------------+--------------------+----------+----------+-----------+---------+
|Order_ID|Customer_ID|Customer_Type|            Product|           Category|              Region|Order_Date|Order_Year|Order_Month|Order_Day|
+--------+-----------+-------------+-------------------+-------------------+--------------------+----------+----------+-----------+---------+
|    ORD5|    CUS7919|          B2B|          Tanqueray|Alcoholic Beverages|              Bayern|2023-08-05|      2023|          8|        5|
|    ORD6|     CUS533|          B2B|       Mountain Dew|        Soft Drinks|            Saarland|2023-02-18|      2023|          2|       18|
|   ORD11|    CUS3565|          B2B|         Krombacher|Alcoholic Beverages|         Brandenburg|2021-01-05|      2021|          1|        5|
|   ORD17|    CUS9592|          B2B|    Cranberry Juice|             Juices|       Niedersachsen|2021-07-13|      2021|          7|       13|
|   OR

In [30]:
df.printSchema()

root
 |-- Order_ID: string (nullable = true)
 |-- Customer_ID: string (nullable = true)
 |-- Customer_Type: string (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Unit_Price: string (nullable = true)
 |-- Quantity: string (nullable = true)
 |-- Discount: string (nullable = true)
 |-- Total_Price: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Order_Date: date (nullable = true)
 |-- Unit_Price: decimal(10,2) (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Discount: decimal(10,2) (nullable = true)
 |-- Total_Price: decimal(10,2) (nullable = true)
 |-- Order_Year: integer (nullable = true)
 |-- Order_Month: integer (nullable = true)
 |-- Order_Day: integer (nullable = true)

