In [118]:
from pyspark.sql.types import StructType, StructField, StringType

schema = StructType([
    StructField("customer_id", StringType(), True),
    StructField("name", StringType(), True),
    StructField("age", StringType(), True),
    StructField("city", StringType(), True),
    StructField("interests", StringType(), True)
])

In [119]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("MyApp") \
    .getOrCreate()

raw_customers = [
    ("C001","Rahul","29","Bangalore","Electronics,Fashion"),
    ("C002","Sneha","Thirty Two","Delhi","Fashion"),
    ("C003","Aman",None,"Mumbai",["Home","Electronics"]),
    ("C004","Pallavi","27","Pune","Electronics|Beauty"),
    ("C005","", "35","Chennai",None)
]

df = spark.createDataFrame(raw_customers, schema)

# Clean Age Field

In [120]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

text_to_num = {
    "Thirty Two": 32,
    "Twenty Nine": 29,
}

def normalize_age(age):
    if age is None:
        return None
    try:
        return int(age)
    except:
        return text_to_num.get(age, None)

normalize_age_udf = udf(normalize_age, IntegerType())

df = df.withColumn("age_clean", normalize_age_udf(df["age"]))

# Clean Interests Field

In [121]:
from pyspark.sql.functions import col, when, regexp_replace

df = df.withColumn(
    "interests_clean",
    when(col("interests").isNull(), "")
    .when(col("interests").startswith("["),
          regexp_replace(col("interests"), "[\\[\\]']", ""))
    .otherwise(regexp_replace(col("interests"), "\\|", ","))
)

# Handle Missing Names

In [122]:
df = df.withColumn(
    "name_clean",
    when((col("name") == "") | col("name").isNull(), "Unknown")
    .otherwise(col("name"))
)

# Final Cleaned Dataset

In [96]:
cleaned_df = df.select(
    "customer_id",
    "name_clean",
    "age_clean",
    "city",
    "interests_clean"
)

In [97]:
df.show()

+-----------+-------+----------+---------+-------------------+---------+-------------------+----------+
|customer_id|   name|       age|     city|          interests|age_clean|    interests_clean|name_clean|
+-----------+-------+----------+---------+-------------------+---------+-------------------+----------+
|       C001|  Rahul|        29|Bangalore|Electronics,Fashion|       29|Electronics,Fashion|     Rahul|
|       C002|  Sneha|Thirty Two|    Delhi|            Fashion|       32|            Fashion|     Sneha|
|       C003|   Aman|      NULL|   Mumbai|[Home, Electronics]|     NULL|  Home, Electronics|      Aman|
|       C004|Pallavi|        27|     Pune| Electronics|Beauty|       27| Electronics,Beauty|   Pallavi|
|       C005|       |        35|  Chennai|               NULL|       35|                   |   Unknown|
+-----------+-------+----------+---------+-------------------+---------+-------------------+----------+



In [98]:
cleaned_df.show()

+-----------+----------+---------+---------+-------------------+
|customer_id|name_clean|age_clean|     city|    interests_clean|
+-----------+----------+---------+---------+-------------------+
|       C001|     Rahul|       29|Bangalore|Electronics,Fashion|
|       C002|     Sneha|       32|    Delhi|            Fashion|
|       C003|      Aman|     NULL|   Mumbai|  Home, Electronics|
|       C004|   Pallavi|       27|     Pune| Electronics,Beauty|
|       C005|   Unknown|       35|  Chennai|                   |
+-----------+----------+---------+---------+-------------------+



# Seller Dataset

In [127]:
from pyspark.sql.types import StructType, StructField, StringType

schema = StructType([
    StructField("seller_id", StringType(), True),
    StructField("seller_name", StringType(), True),
    StructField("category", StringType(), True),
    StructField("onboarding_date", StringType(), True)
])

# Load Raw data and nomalise

In [128]:
from datetime import datetime

raw_sellers = [
    ("S001","TechWorld","Electronics","2019-06-01"),
    ("S002","FashionHub","Fashion","01/07/2020"),
    ("S003","HomeEssentials","Home","2018/09/15"),
    ("S004","BeautyStore","Beauty","invalid_date")
]

def normalize_date(date_str):
    formats = ["%Y-%m-%d", "%d/%m/%Y", "%Y/%m/%d"]
    for fmt in formats:
        try:
            return datetime.strptime(date_str, fmt).strftime("%Y-%m-%d")
        except:
            continue
    return None

normalized_sellers = [
    (sid, name, cat, normalize_date(date))
    for sid, name, cat, date in raw_sellers
]

In [129]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

schema = ["seller_id","seller_name","category","onboarding_date"]
df = spark.createDataFrame(normalized_sellers, schema)

df.show(truncate=False)

+---------+--------------+-----------+---------------+
|seller_id|seller_name   |category   |onboarding_date|
+---------+--------------+-----------+---------------+
|S001     |TechWorld     |Electronics|2019-06-01     |
|S002     |FashionHub    |Fashion    |2020-07-01     |
|S003     |HomeEssentials|Home       |2018-09-15     |
|S004     |BeautyStore   |Beauty     |NULL           |
+---------+--------------+-----------+---------------+



In [130]:
df.printSchema()

root
 |-- seller_id: string (nullable = true)
 |-- seller_name: string (nullable = true)
 |-- category: string (nullable = true)
 |-- onboarding_date: string (nullable = true)



# DATASET 3 — PRODUCT CATALOG

In [103]:
from pyspark.sql.types import StructType, StructField, StringType

schema = StructType([
    StructField("product_id", StringType(), True),
    StructField("product_name", StringType(), True),
    StructField("category", StringType(), True),
    StructField("seller_id", StringType(), True),
    StructField("price", StringType(), True)
])

In [104]:
raw_products = [
    ("P001","Laptop","Electronics","S001","55000"),
    ("P002","Headphones","Electronics","S001","2500"),
    ("P003","T-Shirt","Fashion","S002","1200"),
    ("P004","Sofa","Home","S003","45000"),
    ("P005","Face Cream","Beauty","S004","800")
]

df = spark.createDataFrame(raw_products, schema)

In [105]:
from pyspark.sql.functions import col

df = df.withColumn("price_clean", col("price").cast("int"))

In [106]:
product_df=df

# Dataset 4

In [107]:
from pyspark.sql.types import StructType, StructField, StringType

schema = StructType([
    StructField("order_id", StringType(), True),
    StructField("customer_id", StringType(), True),
    StructField("product_id", StringType(), True),
    StructField("order_date", StringType(), True),
    StructField("status", StringType(), True),
    StructField("revenue", StringType(), True)
])

In [108]:
raw_orders = [
    ("O001","C001","P001","2024-01-05","Delivered","55000"),
    ("O002","C002","P003","05/01/2024","Cancelled","0"),
    ("O003","C003","P004","2024/01/06","Delivered","45000"),
    ("O004","C004","P005","invalid_date","Delivered","800"),
    ("O005","C001","P002","2024-01-10","Delivered","2500"),
    ("O006","C005","P003","2024-01-12","Delivered","1200")
]

df = spark.createDataFrame(raw_orders, schema)

In [109]:
from pyspark.sql.functions import to_date, date_format, coalesce, col
from datetime import datetime

raw_orders = [
    ("O001","C001","P001","2024-01-05","Delivered","55000"),
    ("O002","C002","P003","05/01/2024","Cancelled","0"),
    ("O003","C003","P004","2024/01/06","Delivered","45000"),
    ("O004","C004","P005","invalid_date","Delivered","800"),
    ("O005","C001","P002","2024-01-10","Delivered","2500"),
    ("O006","C005","P003","2024-01-12","Delivered","1200")
]

def normalize_date(date_str):
    formats = ["%Y-%m-%d", "%d/%m/%Y", "%Y/%m/%d"]
    for fmt in formats:
        try:
            return datetime.strptime(date_str, fmt).strftime("%Y-%m-%d")
        except:
            continue
    return None

normalized_orders = [
    (oid, cid, pid, normalize_date(date), status, revenue)
    for oid, cid, pid, date, status, revenue in raw_orders
]

In [110]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder.getOrCreate()

schema = ["order_id","customer_id","product_id","order_date","status","revenue"]
df = spark.createDataFrame(normalized_orders, schema)

In [111]:

df = df.withColumn("revenue_clean", col("revenue").cast("int"))


In [112]:
df.show()

+--------+-----------+----------+----------+---------+-------+-------------+
|order_id|customer_id|product_id|order_date|   status|revenue|revenue_clean|
+--------+-----------+----------+----------+---------+-------+-------------+
|    O001|       C001|      P001|2024-01-05|Delivered|  55000|        55000|
|    O002|       C002|      P003|2024-01-05|Cancelled|      0|            0|
|    O003|       C003|      P004|2024-01-06|Delivered|  45000|        45000|
|    O004|       C004|      P005|      NULL|Delivered|    800|          800|
|    O005|       C001|      P002|2024-01-10|Delivered|   2500|         2500|
|    O006|       C005|      P003|2024-01-12|Delivered|   1200|         1200|
+--------+-----------+----------+----------+---------+-------+-------------+



In [113]:
orderhistory=df

# Join Orders with Products


In [131]:
orders_products = orderhistory.join(
    product_df,
    on="product_id",
    how="inner"
)

In [132]:
orders_products.show()

+----------+--------+-----------+----------+---------+-------+-------------+------------+-----------+---------+-----+-----------+
|product_id|order_id|customer_id|order_date|   status|revenue|revenue_clean|product_name|   category|seller_id|price|price_clean|
+----------+--------+-----------+----------+---------+-------+-------------+------------+-----------+---------+-----+-----------+
|      P001|    O001|       C001|2024-01-05|Delivered|  55000|        55000|      Laptop|Electronics|     S001|55000|      55000|
|      P002|    O005|       C001|2024-01-10|Delivered|   2500|         2500|  Headphones|Electronics|     S001| 2500|       2500|
|      P003|    O002|       C002|2024-01-05|Cancelled|      0|            0|     T-Shirt|    Fashion|     S002| 1200|       1200|
|      P003|    O006|       C005|2024-01-12|Delivered|   1200|         1200|     T-Shirt|    Fashion|     S002| 1200|       1200|
|      P004|    O003|       C003|2024-01-06|Delivered|  45000|        45000|        Sofa| 

# Join Products with Sellers


In [133]:
orders_products_sellers = orders_products.join(
    df,
    on="seller_id",
    how="inner"
)

In [134]:
orders_products_sellers.show()

+---------+----------+--------+-----------+----------+---------+-------+-------------+------------+-----------+-----+-----------+--------------+-----------+---------------+
|seller_id|product_id|order_id|customer_id|order_date|   status|revenue|revenue_clean|product_name|   category|price|price_clean|   seller_name|   category|onboarding_date|
+---------+----------+--------+-----------+----------+---------+-------+-------------+------------+-----------+-----+-----------+--------------+-----------+---------------+
|     S001|      P002|    O005|       C001|2024-01-10|Delivered|   2500|         2500|  Headphones|Electronics| 2500|       2500|     TechWorld|Electronics|     2019-06-01|
|     S001|      P001|    O001|       C001|2024-01-05|Delivered|  55000|        55000|      Laptop|Electronics|55000|      55000|     TechWorld|Electronics|     2019-06-01|
|     S002|      P003|    O006|       C005|2024-01-12|Delivered|   1200|         1200|     T-Shirt|    Fashion| 1200|       1200|    Fa

# Join orders with customers

In [135]:
final_fact = orders_products_sellers.join(
    cleaned_df,
    on="customer_id",
    how="inner"
)

In [136]:
final_fact.show()

+-----------+---------+----------+--------+----------+---------+-------+-------------+------------+-----------+-----+-----------+--------------+-----------+---------------+----------+---------+---------+-------------------+
|customer_id|seller_id|product_id|order_id|order_date|   status|revenue|revenue_clean|product_name|   category|price|price_clean|   seller_name|   category|onboarding_date|name_clean|age_clean|     city|    interests_clean|
+-----------+---------+----------+--------+----------+---------+-------+-------------+------------+-----------+-----+-----------+--------------+-----------+---------------+----------+---------+---------+-------------------+
|       C001|     S001|      P002|    O005|2024-01-10|Delivered|   2500|         2500|  Headphones|Electronics| 2500|       2500|     TechWorld|Electronics|     2019-06-01|     Rahul|       29|Bangalore|Electronics,Fashion|
|       C001|     S001|      P001|    O001|2024-01-05|Delivered|  55000|        55000|      Laptop|Elect

# Broadcast

In [137]:
from pyspark.sql.functions import broadcast

final_fact = orderhistory \
    .join(broadcast(product_df), "product_id") \
    .join(broadcast(df), "seller_id") \
    .join(broadcast(cleaned_df), "customer_id")

In [138]:
final_fact.show()

+-----------+---------+----------+--------+----------+---------+-------+-------------+------------+-----------+-----+-----------+--------------+-----------+---------------+----------+---------+---------+-------------------+
|customer_id|seller_id|product_id|order_id|order_date|   status|revenue|revenue_clean|product_name|   category|price|price_clean|   seller_name|   category|onboarding_date|name_clean|age_clean|     city|    interests_clean|
+-----------+---------+----------+--------+----------+---------+-------+-------------+------------+-----------+-----+-----------+--------------+-----------+---------------+----------+---------+---------+-------------------+
|       C001|     S001|      P001|    O001|2024-01-05|Delivered|  55000|        55000|      Laptop|Electronics|55000|      55000|     TechWorld|Electronics|     2019-06-01|     Rahul|       29|Bangalore|Electronics,Fashion|
|       C002|     S002|      P003|    O002|2024-01-05|Cancelled|      0|            0|     T-Shirt|    F

In [139]:
final_fact.explain(True)

== Parsed Logical Plan ==
'Join UsingJoin(Inner, [customer_id])
:- Project [seller_id#613, product_id#624, order_id#622, customer_id#623, order_date#625, status#626, revenue#627, revenue_clean#628, product_name#611, category#612, price#614, price_clean#615, seller_name#742, category#743, onboarding_date#744]
:  +- Join Inner, (seller_id#613 = seller_id#741)
:     :- Project [product_id#624, order_id#622, customer_id#623, order_date#625, status#626, revenue#627, revenue_clean#628, product_name#611, category#612, seller_id#613, price#614, price_clean#615]
:     :  +- Join Inner, (product_id#624 = product_id#610)
:     :     :- Project [order_id#622, customer_id#623, product_id#624, order_date#625, status#626, revenue#627, cast(revenue#627 as int) AS revenue_clean#628]
:     :     :  +- LogicalRDD [order_id#622, customer_id#623, product_id#624, order_date#625, status#626, revenue#627], false
:     :     +- ResolvedHint (strategy=broadcast)
:     :        +- Project [product_id#610, produc

# Orphan Orders Elimination

In [142]:
orphan_orders = orderhistory.join(product_df, "product_id", "left_anti")
orphan_products = product_df.join(df, "seller_id", "left_anti")

orphan_customers = orderhistory.join(cleaned_df, "customer_id", "left_anti")

In [143]:
orphan_customers.show()

+-----------+--------+----------+----------+------+-------+-------------+
|customer_id|order_id|product_id|order_date|status|revenue|revenue_clean|
+-----------+--------+----------+----------+------+-------+-------------+
+-----------+--------+----------+----------+------+-------+-------------+



# Total Revenue per Category

In [145]:
from pyspark.sql.functions import sum

orders_products = orderhistory.join(
    product_df.withColumnRenamed("category", "product_category"),
    on="product_id",
    how="inner"
)

orders_products_sellers = orders_products.join(
    df.withColumnRenamed("category", "seller_category"),
    on="seller_id",
    how="inner"
)

In [147]:

final_fact = orders_products_sellers.join(
    cleaned_df,
    on="customer_id",
    how="inner"
)

In [148]:
from pyspark.sql.functions import sum, col

revenue_per_category = final_fact.groupBy("product_category") \
    .agg(sum("revenue_clean").alias("total_revenue")) \
    .orderBy(col("total_revenue").desc())

revenue_per_category.show()

+----------------+-------------+
|product_category|total_revenue|
+----------------+-------------+
|     Electronics|        57500|
|            Home|        45000|
|         Fashion|         1200|
|          Beauty|          800|
+----------------+-------------+



# Total revenue per seller


In [149]:
revenue_per_seller = final_fact.groupBy("seller_id","seller_name") \
    .agg(sum("revenue_clean").alias("total_revenue")) \
    .orderBy(col("total_revenue").desc())
revenue_per_seller.show()

+---------+--------------+-------------+
|seller_id|   seller_name|total_revenue|
+---------+--------------+-------------+
|     S001|     TechWorld|        57500|
|     S003|HomeEssentials|        45000|
|     S002|    FashionHub|         1200|
|     S004|   BeautyStore|          800|
+---------+--------------+-------------+



# Total orders per customer

In [150]:
from pyspark.sql.functions import sum, count, avg, col


orders_per_customer = final_fact.groupBy("customer_id","name_clean") \
    .agg(count("order_id").alias("total_orders")) \
    .orderBy(col("total_orders").desc())
orders_per_customer.show()

+-----------+----------+------------+
|customer_id|name_clean|total_orders|
+-----------+----------+------------+
|       C001|     Rahul|           2|
|       C002|     Sneha|           1|
|       C003|      Aman|           1|
|       C004|   Pallavi|           1|
|       C005|   Unknown|           1|
+-----------+----------+------------+



# Average order value per customer


In [151]:
avg_order_value_per_customer = final_fact.groupBy("customer_id","name_clean") \
    .agg(avg("revenue_clean").alias("avg_order_value")) \
    .orderBy(col("avg_order_value").desc())
avg_order_value_per_customer.show()

+-----------+----------+---------------+
|customer_id|name_clean|avg_order_value|
+-----------+----------+---------------+
|       C003|      Aman|        45000.0|
|       C001|     Rahul|        28750.0|
|       C005|   Unknown|         1200.0|
|       C004|   Pallavi|          800.0|
|       C002|     Sneha|            0.0|
+-----------+----------+---------------+



#Window Functions

# Rank Customers by Total Spend (Overall)

In [153]:
from pyspark.sql.functions import sum
from pyspark.sql.window import Window
from pyspark.sql.functions import rank

customer_spend = final_fact.groupBy("customer_id","name_clean") \
    .agg(sum("revenue_clean").alias("total_spend"))

window_spec = Window.orderBy(col("total_spend").desc())

ranked_customers = customer_spend.withColumn("rank", rank().over(window_spec))
ranked_customers.show()

+-----------+----------+-----------+----+
|customer_id|name_clean|total_spend|rank|
+-----------+----------+-----------+----+
|       C001|     Rahul|      57500|   1|
|       C003|      Aman|      45000|   2|
|       C005|   Unknown|       1200|   3|
|       C004|   Pallavi|        800|   4|
|       C002|     Sneha|          0|   5|
+-----------+----------+-----------+----+



# Rank Sellers by Revenue Within Each Category

In [154]:
seller_revenue = final_fact.groupBy("seller_id","seller_name","product_category") \
    .agg(sum("revenue_clean").alias("total_revenue"))

window_spec = Window.partitionBy("product_category").orderBy(col("total_revenue").desc())

ranked_sellers = seller_revenue.withColumn("rank_in_category", rank().over(window_spec))
ranked_sellers.show()

+---------+--------------+----------------+-------------+----------------+
|seller_id|   seller_name|product_category|total_revenue|rank_in_category|
+---------+--------------+----------------+-------------+----------------+
|     S004|   BeautyStore|          Beauty|          800|               1|
|     S001|     TechWorld|     Electronics|        57500|               1|
|     S002|    FashionHub|         Fashion|         1200|               1|
|     S003|HomeEssentials|            Home|        45000|               1|
+---------+--------------+----------------+-------------+----------------+



# Identify Top 2 Products per Category by Revenue

In [155]:
product_revenue = final_fact.groupBy("product_id","product_name","product_category") \
    .agg(sum("revenue_clean").alias("total_revenue"))

window_spec = Window.partitionBy("product_category").orderBy(col("total_revenue").desc())

top_products = product_revenue.withColumn("rank_in_category", rank().over(window_spec)) \
    .filter(col("rank_in_category") <= 2)

top_products.show()

+----------+------------+----------------+-------------+----------------+
|product_id|product_name|product_category|total_revenue|rank_in_category|
+----------+------------+----------------+-------------+----------------+
|      P005|  Face Cream|          Beauty|          800|               1|
|      P001|      Laptop|     Electronics|        55000|               1|
|      P002|  Headphones|     Electronics|         2500|               2|
|      P003|     T-Shirt|         Fashion|         1200|               1|
|      P004|        Sofa|            Home|        45000|               1|
+----------+------------+----------------+-------------+----------------+



# UDF

In [156]:
from pyspark.sql.functions import sum, when, col

customer_spend = final_fact.groupBy("customer_id","name_clean") \
    .agg(sum("revenue_clean").alias("total_spend"))

tiered_customers = customer_spend.withColumn(
    "spend_tier",
    when(col("total_spend") >= 50000, "High")
    .when(col("total_spend") >= 10000, "Medium")
    .otherwise("Low")
)
tiered_customers.show()

+-----------+----------+-----------+----------+
|customer_id|name_clean|total_spend|spend_tier|
+-----------+----------+-----------+----------+
|       C001|     Rahul|      57500|      High|
|       C002|     Sneha|          0|       Low|
|       C003|      Aman|      45000|    Medium|
|       C004|   Pallavi|        800|       Low|
|       C005|   Unknown|       1200|       Low|
+-----------+----------+-----------+----------+



# Sorting and ordering

In [157]:
from pyspark.sql.functions import sum, col

revenue_per_category = final_fact.groupBy("product_category") \
    .agg(sum("revenue_clean").alias("total_revenue")) \
    .orderBy(col("total_revenue").desc())

revenue_per_seller_cat = final_fact.groupBy("product_category","seller_id","seller_name") \
    .agg(sum("revenue_clean").alias("total_revenue")) \
    .orderBy(col("product_category"), col("total_revenue").desc())

# Product → Seller join


In [159]:
product_seller = product_df.join(df, "seller_id", "inner")
product_seller.explain(True)


== Parsed Logical Plan ==
'Join UsingJoin(Inner, [seller_id])
:- Project [product_id#610, product_name#611, category#612, seller_id#613, price#614, cast(price#614 as int) AS price_clean#615]
:  +- LogicalRDD [product_id#610, product_name#611, category#612, seller_id#613, price#614], false
+- LogicalRDD [seller_id#741, seller_name#742, category#743, onboarding_date#744], false

== Analyzed Logical Plan ==
seller_id: string, product_id: string, product_name: string, category: string, price: string, price_clean: int, seller_name: string, category: string, onboarding_date: string
Project [seller_id#613, product_id#610, product_name#611, category#612, price#614, price_clean#615, seller_name#742, category#743, onboarding_date#744]
+- Join Inner, (seller_id#613 = seller_id#741)
   :- Project [product_id#610, product_name#611, category#612, seller_id#613, price#614, cast(price#614 as int) AS price_clean#615]
   :  +- LogicalRDD [product_id#610, product_name#611, category#612, seller_id#613, pr

#Window ranking (e.g., sellers by category)

In [160]:
from pyspark.sql.window import Window
from pyspark.sql.functions import sum, rank

seller_rev = final_fact.groupBy("product_category","seller_id","seller_name") \
    .agg(sum("revenue_clean").alias("total_revenue"))
w = Window.partitionBy("product_category").orderBy(col("total_revenue").desc())
seller_rank = seller_rev.withColumn("rank_in_category", rank().over(w))
seller_rank.explain(True)


== Parsed Logical Plan ==
'Project [unresolvedstarwithcolumns(rank_in_category, 'rank() windowspecdefinition('product_category, 'total_revenue DESC NULLS LAST, unspecifiedframe$()), None)]
+- Aggregate [product_category#1024, seller_id#613, seller_name#742], [product_category#1024, seller_id#613, seller_name#742, sum(revenue_clean#628) AS total_revenue#1478L]
   +- Project [customer_id#623, seller_id#613, product_id#624, order_id#622, order_date#625, status#626, revenue#627, revenue_clean#628, product_name#611, product_category#1024, price#614, price_clean#615, seller_name#742, seller_category#1025, onboarding_date#744, name_clean#549, age_clean#547, city#544, interests_clean#548]
      +- Join Inner, (customer_id#623 = customer_id#541)
         :- Project [seller_id#613, product_id#624, order_id#622, customer_id#623, order_date#625, status#626, revenue#627, revenue_clean#628, product_name#611, product_category#1024, price#614, price_clean#615, seller_name#742, seller_category#1025, on

# Sorting


In [161]:
sorted_rev = revenue_per_category.orderBy(col("total_revenue").desc())
sorted_rev.explain(True)

== Parsed Logical Plan ==
'Sort ['total_revenue DESC NULLS LAST], true
+- Sort [total_revenue#1436L DESC NULLS LAST], true
   +- Aggregate [product_category#1024], [product_category#1024, sum(revenue_clean#628) AS total_revenue#1436L]
      +- Project [customer_id#623, seller_id#613, product_id#624, order_id#622, order_date#625, status#626, revenue#627, revenue_clean#628, product_name#611, product_category#1024, price#614, price_clean#615, seller_name#742, seller_category#1025, onboarding_date#744, name_clean#549, age_clean#547, city#544, interests_clean#548]
         +- Join Inner, (customer_id#623 = customer_id#541)
            :- Project [seller_id#613, product_id#624, order_id#622, customer_id#623, order_date#625, status#626, revenue#627, revenue_clean#628, product_name#611, product_category#1024, price#614, price_clean#615, seller_name#742, seller_category#1025, onboarding_date#744]
            :  +- Join Inner, (seller_id#613 = seller_id#741)
            :     :- Project [product

# Broadcast

In [162]:
final_fact_optimized = orderhistory \
    .join(broadcast(product_df), "product_id") \
    .join(broadcast(df), "seller_id") \
    .join(broadcast(cleaned_df), "customer_id")

In [163]:
final_fact_optimized.explain(True)

== Parsed Logical Plan ==
'Join UsingJoin(Inner, [customer_id])
:- Project [seller_id#613, product_id#624, order_id#622, customer_id#623, order_date#625, status#626, revenue#627, revenue_clean#628, product_name#611, category#612, price#614, price_clean#615, seller_name#742, category#743, onboarding_date#744]
:  +- Join Inner, (seller_id#613 = seller_id#741)
:     :- Project [product_id#624, order_id#622, customer_id#623, order_date#625, status#626, revenue#627, revenue_clean#628, product_name#611, category#612, seller_id#613, price#614, price_clean#615]
:     :  +- Join Inner, (product_id#624 = product_id#610)
:     :     :- Project [order_id#622, customer_id#623, product_id#624, order_date#625, status#626, revenue#627, cast(revenue#627 as int) AS revenue_clean#628]
:     :     :  +- LogicalRDD [order_id#622, customer_id#623, product_id#624, order_date#625, status#626, revenue#627], false
:     :     +- ResolvedHint (strategy=broadcast)
:     :        +- Project [product_id#610, produc