In [2]:
# Initialize Spark Session
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, isnull, count, regexp_extract, split, coalesce, lit, to_date
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, DateType, ArrayType
import pyspark.sql.functions as F

# Build the notebook's Spark session (getOrCreate reuses one if it already exists)
spark = (
    SparkSession.builder
    .appName("HighPerformancePySpark")
    .getOrCreate()
)

# Keep the driver log quiet: only WARN and above
spark.sparkContext.setLogLevel("WARN")

25/02/13 22:46:31 WARN Utils: Your hostname, codespaces-6baec6 resolves to a loopback address: 127.0.0.1; using 10.0.1.82 instead (on interface eth0)
25/02/13 22:46:31 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/13 22:46:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/02/13 22:46:33 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [None]:
# Define schema for the dataset.
# Every column is read as a nullable string; numeric and date columns are
# converted later, once the malformed values have been dealt with.
schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in (
        "order_id",
        "customer_details",
        "order_date",
        "product_category",
        "quantity",
        "price_per_unit",
        "tags",
        "items",
    )
])

# Load the dataset (absolute path; adjust it when running outside this workspace)
df = spark.read.csv(
    "/workspaces/high-performance-pyspark-advanced-strategies-for-optimal-data-processing-3919191/data/online_sales_data.csv",
    schema=schema,
    header=True,
)

# Display the first 10 rows without truncating long cell values
print("Raw Dataset:")
df.show(10, truncate=False)

Raw Dataset:
+--------+-----------------------------------------+----------+----------------+--------+------------------+------------------+----------------------------+
|order_id|customer_details                         |order_date|product_category|quantity|price_per_unit    |tags              |items                       |
+--------+-----------------------------------------+----------+----------------+--------+------------------+------------------+----------------------------+
|ORD001  |Alice Johnson                            |NULL      |Electronics     |4       |15.769160684603047|['urgent', 'gift']|['Phone', 'Charger', 'Case']|
|ORD002  |Bob Smith | 584 Street Name, City 16     |2022-12-30|NULL            |-3      |fifty             |['bulk_order']    |['Book1', 'Book2']          |
|ORD003  |Charlie Brown | 598 Street Name, City 17 |2023-05-22|Books           |ten     |79.63563178465238 |NULL              |['Book1', 'Book2']          |
|ORD004  |David Wilson | 290 Street Name, Cit

In [4]:
# Summary statistics for every column (the output lists count, mean, stddev, min and max)
df.describe().show()

25/02/13 22:46:41 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

+-------+--------+--------------------+----------+----------------+-----------------+------------------+--------------+--------------------+
|summary|order_id|    customer_details|order_date|product_category|         quantity|    price_per_unit|          tags|               items|
+-------+--------+--------------------+----------+----------------+-----------------+------------------+--------------+--------------------+
|  count|     100|                 100|        63|              70|               78|                62|            82|                  80|
|   mean|    NULL|                NULL|      NULL|            NULL|             1.68| 55.49113068483046|          NULL|                NULL|
| stddev|    NULL|                NULL|      NULL|            NULL|4.639845176868095|27.456621254927434|          NULL|                NULL|
|    min|  ORD001|       Alice Johnson|2022-12-30|           Books|               -1|15.769160684603047|['bulk_order']|  ['Book1', 'Book2']|
|    max|  OR

In [5]:
# Count the NULLs in every column of the dataframe.
# when() without otherwise() yields NULL for rows that fail the condition, and
# count() skips NULLs, so each aggregate tallies only the rows where the column
# is NULL. The value handed to when() is a constant marker string (the column's
# name), not a reference to the column itself.
null_values_count = df.select(
    *(
        count(when(col(name).isNull(), lit(name))).alias(name)
        for name in df.columns
    )
)

null_values_count.show()

+--------+----------------+----------+----------------+--------+--------------+----+-----+
|order_id|customer_details|order_date|product_category|quantity|price_per_unit|tags|items|
+--------+----------------+----------+----------------+--------+--------------+----+-----+
|       0|               0|        37|              30|      22|            38|  18|   20|
+--------+----------------+----------+----------------+--------+--------------+----+-----+



25/02/13 22:46:44 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [6]:
# Print each column's name, data type and nullability
df.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- customer_details: string (nullable = true)
 |-- order_date: string (nullable = true)
 |-- product_category: string (nullable = true)
 |-- quantity: string (nullable = true)
 |-- price_per_unit: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- items: string (nullable = true)



# Filtering rows

In [7]:
# Detect rows whose quantity or price is negative or not a valid number.
#
# A value is treated as invalid when it is present but cannot be cast to a
# double (e.g. "ten", "fifty"), or when it casts to a negative number.
# Checking the cast result validates the whole string, whereas a first-character
# pattern such as "^[^0-9]" would accept "5abc" and reject ".5".
# NULLs are not flagged here; they are counted separately above.
def _fails_numeric_check(column_name):
    """Return a boolean Column that is true for non-numeric or negative values."""
    raw_value = col(column_name)
    as_number = raw_value.cast("double")
    not_a_number = raw_value.isNotNull() & as_number.isNull()
    return not_a_number | (as_number < 0)


df_invalid = df.filter(
    _fails_numeric_check("quantity") | _fails_numeric_check("price_per_unit")
)
df_invalid.show()

+--------+--------------------+----------+----------------+--------+------------------+------------------+--------------------+
|order_id|    customer_details|order_date|product_category|quantity|    price_per_unit|              tags|               items|
+--------+--------------------+----------+----------------+--------+------------------+------------------+--------------------+
|  ORD002|Bob Smith | 584 S...|2022-12-30|            NULL|      -3|             fifty|    ['bulk_order']|  ['Book1', 'Book2']|
|  ORD003|Charlie Brown | 5...|2023-05-22|           Books|     ten| 79.63563178465238|              NULL|  ['Book1', 'Book2']|
|  ORD005|Eva Davis | 387 S...|      NULL|            NULL|      -4|             fifty|    ['bulk_order']|['Phone', 'Charge...|
|  ORD006|        Frank Miller|2023-05-22|            NULL|      10|             fifty|       urgent,gift|['Table', 'Chair'...|
|  ORD007|Grace Lee | 869 S...|2023-05-22|           Books|    NULL|             fifty|    ['bulk_order'

# Median imputation and correction

The `quantity` column is skewed, so we impute its NULLs with the median, which is less sensitive to extreme values than the mean.

In [None]:
# "quantity" is String type, first we will cast it to Numeric and then impute the Median
# Values that are not valid numbers (e.g. "ten") become NULL in this cast and
# are therefore imputed together with the values that were missing to begin with.
df = df.withColumn("quantity", col("quantity").cast("double"))

# Step 1: Calculate the median of the 'quantity' column and use it to fill the NULLs
# (probability 0.5 = 50th percentile; relative error 0.0 requests the exact quantile)
median_quantity = df.approxQuantile("quantity", [0.5], 0.0)[0]
df = df.withColumn("quantity", coalesce(col("quantity"), lit(median_quantity)))

# Step 2: Convert `price_per_unit` to numeric.
# The spelled-out value "fifty" is mapped to 50.0 and everything else is cast to
# double. Without the explicit cast the double literal would be coerced to the
# column's string type and the column would stay a string.
df = df.withColumn(
    "price_per_unit",
    when(col("price_per_unit") == "fifty", lit(50.0))
    .otherwise(col("price_per_unit").cast("double")),
)

# Step 3: Replace negative values in `quantity` with 0
df = df.withColumn(
    "quantity",
    when(col("quantity") < 0, lit(0.0)).otherwise(col("quantity")),
)
df.show(5)

+--------+--------------------+----------+----------------+--------+------------------+------------------+--------------------+
|order_id|    customer_details|order_date|product_category|quantity|    price_per_unit|              tags|               items|
+--------+--------------------+----------+----------------+--------+------------------+------------------+--------------------+
|  ORD001|       Alice Johnson|      NULL|     Electronics|     4.0|15.769160684603047|['urgent', 'gift']|['Phone', 'Charge...|
|  ORD002|Bob Smith | 584 S...|2022-12-30|            NULL|     0.0|              50.0|    ['bulk_order']|  ['Book1', 'Book2']|
|  ORD003|Charlie Brown | 5...|2023-05-22|           Books|     1.0| 79.63563178465238|              NULL|  ['Book1', 'Book2']|
|  ORD004|David Wilson | 29...|2022-12-30|           Books|     1.0|27.556430196566655|              NULL|['Laptop', 'Mouse...|
|  ORD005|Eva Davis | 387 S...|      NULL|            NULL|     0.0|              50.0|    ['bulk_order'

# Splitting combined columns
Use the `split` function.

Split the `customer_details` column at the `|` delimiter: the part before the delimiter becomes `Customer Name` and the part after it becomes `Customer Address`.

In [19]:
# Split customer_details once at the "|" delimiter (escaped because the
# pattern is a regular expression) and reuse the resulting array for both parts.
details_parts = split(col("customer_details"), "\\|")

# The raw values look like "Bob Smith | 584 Street Name, City 16", so each part
# carries the spaces that surrounded the delimiter; trim them off.
# Rows without a delimiter have no second element, so the address falls back
# to "Unknown".
df = (
    df
    .withColumn("Customer Name", F.trim(details_parts[0]))
    .withColumn(
        "Customer Address",
        coalesce(F.trim(details_parts[1]), lit("Unknown")),
    )
)
df.show()

+--------+--------------------+----------+----------------+--------+------------------+------------------+--------------------+---------------+--------------------+
|order_id|    customer_details|order_date|product_category|quantity|    price_per_unit|              tags|               items|  Customer Name|    Customer Address|
+--------+--------------------+----------+----------------+--------+------------------+------------------+--------------------+---------------+--------------------+
|  ORD001|       Alice Johnson|      NULL|     Electronics|     4.0|15.769160684603047|['urgent', 'gift']|['Phone', 'Charge...|  Alice Johnson|             Unknown|
|  ORD002|Bob Smith | 584 S...|2022-12-30|            NULL|    -3.0|             fifty|    ['bulk_order']|  ['Book1', 'Book2']|     Bob Smith | 584 Street Name,...|
|  ORD003|Charlie Brown | 5...|2023-05-22|           Books|     1.0| 79.63563178465238|              NULL|  ['Book1', 'Book2']| Charlie Brown | 598 Street Name,...|
|  ORD004|

## Regex

If the combined column is more complicated, you can use `regexp_extract` to pull out its individual parts.

In [20]:
from pyspark.sql.functions import regexp_extract

# Pull address components out of customer_details with regular expressions.
# Each call returns capture group 1 of its pattern: the street is the
# "<number> Street Name" fragment, the city is the number following "City ".
# Rows where a pattern does not match show an empty value in that column.
df_address_split = (
    df
    .withColumn('street', regexp_extract(col('customer_details'), r'(\d+ Street Name)', 1))
    .withColumn('city', regexp_extract(col('customer_details'), r'City (\d+)', 1))
)
df_address_split.show()


+--------+--------------------+----------+----------------+--------+------------------+------------------+--------------------+---------------+--------------------+---------------+----+
|order_id|    customer_details|order_date|product_category|quantity|    price_per_unit|              tags|               items|  Customer Name|    Customer Address|         street|city|
+--------+--------------------+----------+----------------+--------+------------------+------------------+--------------------+---------------+--------------------+---------------+----+
|  ORD001|       Alice Johnson|      NULL|     Electronics|     4.0|15.769160684603047|['urgent', 'gift']|['Phone', 'Charge...|  Alice Johnson|             Unknown|               |    |
|  ORD002|Bob Smith | 584 S...|2022-12-30|            NULL|    -3.0|             fifty|    ['bulk_order']|  ['Book1', 'Book2']|     Bob Smith | 584 Street Name,...|584 Street Name|  16|
|  ORD003|Charlie Brown | 5...|2023-05-22|           Books|     1.0| 7