In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
# Build the Spark session used by the rest of the notebook; getOrCreate()
# hands back an already-running session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("E Commerece orders")
    .getOrCreate()
)

In [2]:
# Sample orders as (order_id, user_id, items, amount).
# The `items` field is deliberately inconsistent across rows so the
# normalisation step has something to clean up.
raw_orders = [
    ("O001", "U001", "Laptop,Mobile,Tablet", 75000),  # comma-separated string
    ("O002", "U002", ["Mobile", "Tablet"], 32000),    # already a Python list
    ("O003", "U003", "Laptop", 72000),                # single item, no delimiter
    ("O004", "U004", None, 25000),                    # missing items
    ("O005", "U005", "Laptop|Mobile", 68000),         # pipe-separated string
]

In [3]:
from pyspark.sql.types import (StructType, StructField, StringType, IntegerType)

1. Define a schema with ArrayType

In [5]:
# ArrayType is a schema data type, so import it from pyspark.sql.types
# (its documented home) rather than from pyspark.sql.functions.
from pyspark.sql.types import ArrayType

In [8]:
# Explicit schema for the orders DataFrame: the two identifiers are required,
# while `items` (array of strings) and `amount` may be null.
orders_schema = (
    StructType()
    .add("order_id", StringType(), nullable=False)
    .add("user_id", StringType(), nullable=False)
    .add("items", ArrayType(StringType()), nullable=True)
    .add("amount", IntegerType(), nullable=True)
)

2. Normalize all item values into arrays
3. Handle multiple delimiters

In [9]:
def normalize_items(item, delimiters=("|", "/")):
    """Convert one raw ``items`` value into a flat list of item names.

    Args:
        item: The raw value. May be ``None``, a list of item names, or a
            single string holding one or more names separated by a comma
            or by any of ``delimiters``.
        delimiters: Extra separator strings that are treated exactly like a
            comma. Empty strings in this collection are ignored.

    Returns:
        A new list of item names with surrounding whitespace removed and
        empty entries dropped. ``None`` and unsupported input types yield
        an empty list.
    """
    if item is None:
        return []

    if isinstance(item, list):
        # Already a list: keep it flat (do not wrap it in another list).
        tokens = item
    elif isinstance(item, str):
        # Collapse every supported delimiter to a comma, then split once.
        for delimiter in delimiters:
            if delimiter:
                item = item.replace(delimiter, ",")
        tokens = item.split(",")
    else:
        return []

    # Keep only non-blank string entries so the result fits an array of strings.
    return [
        token.strip()
        for token in tokens
        if isinstance(token, str) and token.strip()
    ]

In [19]:
# Rebuild every order tuple with its `items` field passed through
# normalize_items(); the other three fields are carried over unchanged.
normalized_orders = [
    (order_id, user_id, normalize_items(raw_items), order_amount)
    for order_id, user_id, raw_items, order_amount in raw_orders
]

In [15]:
# Materialise the normalised tuples as a DataFrame using the explicit schema.
df_orders = spark.createDataFrame(
    data=normalized_orders,
    schema=orders_schema,
)
df_orders.show()

+--------+-------+--------------------+------+
|order_id|user_id|               items|amount|
+--------+-------+--------------------+------+
|    O001|   U001|[Laptop, Mobile, ...| 75000|
|    O002|   U002|    [Mobile, Tablet]| 32000|
|    O003|   U003|            [Laptop]| 72000|
|    O004|   U004|                  []| 25000|
|    O005|   U005|    [Laptop, Mobile]| 68000|
+--------+-------+--------------------+------+



4. Replace null items with empty arrays

In [16]:
from pyspark.sql.functions import when, array, col
# Swap a NULL `items` value for an empty array and leave every other row
# untouched. normalize_items() already returns [] for None, so for the sample
# data this acts as a safety net rather than changing any row.
items_or_empty = when(col("items").isNull(), array()).otherwise(col("items"))
df_orders_fixed = df_orders.withColumn("items", items_or_empty)
df_orders_fixed.show()

+--------+-------+--------------------+------+
|order_id|user_id|               items|amount|
+--------+-------+--------------------+------+
|    O001|   U001|[Laptop, Mobile, ...| 75000|
|    O002|   U002|    [Mobile, Tablet]| 32000|
|    O003|   U003|            [Laptop]| 72000|
|    O004|   U004|                  []| 25000|
|    O005|   U005|    [Laptop, Mobile]| 68000|
+--------+-------+--------------------+------+



5. Explode items into one row per item

In [17]:
from pyspark.sql.functions import explode
# Add an `item` column holding one array element per output row. explode()
# produces no row for an empty array, so orders without items are absent
# from the result.
df_exploded = df_orders_fixed.withColumn(
    "item",
    explode(df_orders_fixed["items"]),
)
df_exploded.show()

+--------+-------+--------------------+------+------+
|order_id|user_id|               items|amount|  item|
+--------+-------+--------------------+------+------+
|    O001|   U001|[Laptop, Mobile, ...| 75000|Laptop|
|    O001|   U001|[Laptop, Mobile, ...| 75000|Mobile|
|    O001|   U001|[Laptop, Mobile, ...| 75000|Tablet|
|    O002|   U002|    [Mobile, Tablet]| 32000|Mobile|
|    O002|   U002|    [Mobile, Tablet]| 32000|Tablet|
|    O003|   U003|            [Laptop]| 72000|Laptop|
|    O005|   U005|    [Laptop, Mobile]| 68000|Laptop|
|    O005|   U005|    [Laptop, Mobile]| 68000|Mobile|
+--------+-------+--------------------+------+------+



6. Count frequency of each item

In [20]:
# Count how many exploded rows mention each item, most frequent first.
# `item` is a secondary sort key so that items with equal counts always come
# out in the same (alphabetical) order instead of an unspecified one.
item_frequency = (
    df_exploded
    .groupBy("item")
    .count()
    .orderBy(col("count").desc(), col("item").asc())
)
item_frequency.show()

+------+-----+
|  item|count|
+------+-----+
|Laptop|    3|
|Mobile|    3|
|Tablet|    2|
+------+-----+



7. Identify orders with more than 2 items

In [21]:
from pyspark.sql.functions import size
# Keep only the orders whose `items` array holds more than two entries.
item_count = size(df_orders_fixed["items"])
orders_more_than_2 = df_orders_fixed.where(item_count > 2)
orders_more_than_2.show()

+--------+-------+--------------------+------+
|order_id|user_id|               items|amount|
+--------+-------+--------------------+------+
|    O001|   U001|[Laptop, Mobile, ...| 75000|
+--------+-------+--------------------+------+

