In [16]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

In [17]:
spark = SparkSession.builder.appName("CustomerAnalysis").getOrCreate()

#PHASE 1


In [18]:

# 1. Read orders.csv as all StringType
schema = StructType([
    StructField("order_id", StringType(), True),
    StructField("customer_id", StringType(), True),
    StructField("city", StringType(), True),
    StructField("category", StringType(), True),
    StructField("product", StringType(), True),
    StructField("amount", StringType(), True),
    StructField("order_date", StringType(), True),
    StructField("status", StringType(), True)
])

orders_df = spark.read.csv("orders.csv", header=True, schema=schema)

# 2. Trim text columns
orders_df = orders_df.select([trim(col(c)).alias(c) for c in orders_df.columns])

# 3. Normalize city, category, product
orders_df = orders_df.withColumn("city", lower(col("city"))) \
                     .withColumn("category", lower(col("category"))) \
                     .withColumn("product", lower(col("product")))

# 4. Clean amount: Remove commas, Convert to IntegerType, Handle invalid values safely
# First remove commas
orders_df = orders_df.withColumn("amount_cleaned", regexp_replace(col("amount"), ",", ""))

# Use when + rlike to check if string is numeric before casting
orders_df = orders_df.withColumn("amount",
    when(col("amount_cleaned").rlike("^-?[0-9]+$"), col("amount_cleaned").cast(IntegerType()))
    .otherwise(lit(None).cast(IntegerType()))
).drop("amount_cleaned")

# 5. Parse order_date into DateType â†’ order_date_clean (handle multiple formats safely)
# Replace / with - first to normalize
orders_df = orders_df.withColumn("date_normalized", regexp_replace(col("order_date"), "/", "-"))

# Try parsing with safe approach
orders_df = orders_df.withColumn("order_date_clean",
    when(col("date_normalized").rlike("^[0-9]{4}-[0-9]{2}-[0-9]{2}$"),
         to_date(col("date_normalized"), "yyyy-MM-dd"))
    .when(col("date_normalized").rlike("^[0-9]{2}-[0-9]{2}-[0-9]{4}$"),
         to_date(col("date_normalized"), "dd-MM-yyyy"))
    .otherwise(lit(None).cast(DateType()))
).drop("date_normalized")

# 6. Remove duplicate order_id
orders_df = orders_df.dropDuplicates(["order_id"])

# 7. Keep only Completed orders and filter out null amounts and dates
clean_orders_df = orders_df.filter(
    (lower(col("status")) == "completed") &
    (col("amount").isNotNull()) &
    (col("order_date_clean").isNotNull())
)

clean_orders_df.show(5)

+-----------+-----------+---------+-----------+-------+------+----------+---------+----------------+
|   order_id|customer_id|     city|   category|product|amount|order_date|   status|order_date_clean|
+-----------+-----------+---------+-----------+-------+------+----------+---------+----------------+
|ORD00000001|    C000001|     pune|    grocery|  sugar| 35430|2024-01-02|Completed|      2024-01-02|
|ORD00000007|    C000007|     pune|    grocery|   rice| 45362|2024-01-08|Completed|      2024-01-08|
|ORD00000008|    C000008|bangalore|    fashion|  jeans| 10563|2024-01-09|Completed|      2024-01-09|
|ORD00000010|    C000010|bangalore|    grocery|  sugar| 66576|2024-01-11|Completed|      2024-01-11|
|ORD00000011|    C000011|  kolkata|electronics| tablet| 50318|12/01/2024|Completed|      2024-01-12|
+-----------+-----------+---------+-----------+-------+------+----------+---------+----------------+
only showing top 5 rows


#PHASE 2

In [21]:
#1. Total number of orders.
#2. Total spending.
#3. Average order value.
#4. First purchase date.
#5. Last purchase date.
#6. Number of distinct cities ordered from.
#7. Number of distinct categories ordered from.

customer_metrics = clean_orders_df.groupBy("customer_id").agg(
    count("order_id").alias("total_orders"),
    sum("amount").alias("total_spending"),
    avg("amount").alias("avg_order_value"),
    min("order_date_clean").alias("first_purchase_date"),
    max("order_date_clean").alias("last_purchase_date"),
    count_distinct(col("city")).alias("distinct_cities"),
    count_distinct(col("category")).alias("distinct_categories")
)

customer_metrics.show(5)

+-----------+------------+--------------+------------------+-------------------+------------------+---------------+-------------------+
|customer_id|total_orders|total_spending|   avg_order_value|first_purchase_date|last_purchase_date|distinct_cities|distinct_categories|
+-----------+------------+--------------+------------------+-------------------+------------------+---------------+-------------------+
|    C018237|           4|        226546|           56636.5|         2024-01-18|        2024-02-27|              3|                  3|
|    C044374|           6|        224785|37464.166666666664|         2024-01-15|        2024-02-24|              3|                  3|
|    C001115|           5|        163614|           32722.8|         2024-01-16|        2024-02-25|              5|                  3|
|    C012569|           6|        270399|           45066.5|         2024-01-10|        2024-02-19|              5|                  3|
|    C010142|           5|        245547|       

#PHASE 3

In [22]:
customer_segments = customer_metrics.withColumn("customer_segment",
    when((col("total_spending") >= 200000) & (col("total_orders") >= 5), "VIP")
    .when(col("total_spending") >= 100000, "Premium")
    .otherwise("Regular")
)

customer_segments.show(5)

# Count customers in each segment
segment_counts = customer_segments.groupBy("customer_segment").count()
segment_counts.show()

+-----------+------------+--------------+------------------+-------------------+------------------+---------------+-------------------+----------------+
|customer_id|total_orders|total_spending|   avg_order_value|first_purchase_date|last_purchase_date|distinct_cities|distinct_categories|customer_segment|
+-----------+------------+--------------+------------------+-------------------+------------------+---------------+-------------------+----------------+
|    C018237|           4|        226546|           56636.5|         2024-01-18|        2024-02-27|              3|                  3|         Premium|
|    C044374|           6|        224785|37464.166666666664|         2024-01-15|        2024-02-24|              3|                  3|             VIP|
|    C001115|           5|        163614|           32722.8|         2024-01-16|        2024-02-25|              5|                  3|         Premium|
|    C012569|           6|        270399|           45066.5|         2024-01-10|  

#PHASE 4

In [23]:

# 1. Rank customers by total spending (overall)
overall_window = Window.orderBy(col("total_spending").desc())
ranked_customers = customer_segments.withColumn("overall_rank", rank().over(overall_window))

# 2. Rank customers inside each city by total spending
city_data = clean_orders_df.groupBy("customer_id", "city").agg(
    sum("amount").alias("city_spending")
)
city_window = Window.partitionBy("city").orderBy(col("city_spending").desc())
city_ranked = city_data.withColumn("city_rank", rank().over(city_window))

# 3. Identify top 3 customers per city
top3_per_city = city_ranked.filter(col("city_rank") <= 3)
top3_per_city.show(20)

# 4. Identify top 10 customers across all cities
top10_customers = ranked_customers.filter(col("overall_rank") <= 10)
top10_customers.select("customer_id", "total_spending", "overall_rank").show(10)

+-----------+---------+-------------+---------+
|customer_id|     city|city_spending|city_rank|
+-----------+---------+-------------+---------+
|    C011518|bangalore|       332527|        1|
|    C024935|bangalore|       315622|        2|
|    C025451|bangalore|       303208|        3|
|    C028121|  chennai|       340890|        1|
|    C027841|  chennai|       287392|        2|
|    C030712|  chennai|       284466|        3|
|    C016309|    delhi|       325001|        1|
|    C022599|    delhi|       314625|        2|
|    C018688|    delhi|       306692|        3|
|    C032833|hyderabad|       318097|        1|
|    C023269|hyderabad|       292791|        2|
|    C013263|hyderabad|       291679|        3|
|    C032246|  kolkata|       304480|        1|
|    C022131|  kolkata|       296888|        2|
|    C028450|  kolkata|       296653|        3|
|    C048696|   mumbai|       334732|        1|
|    C047887|   mumbai|       307401|        2|
|    C022721|   mumbai|       306800|   

#PHASE 5

In [26]:
# Define loyalty: 3+ different dates AND 2+ different categories
loyalty_metrics = clean_orders_df.groupBy("customer_id").agg(
    count_distinct("order_date_clean").alias("distinct_dates"),
    count_distinct("category").alias("distinct_categories_loyalty")
)

loyal_customers = loyalty_metrics.withColumn("is_loyal",
    when((col("distinct_dates") >= 3) & (col("distinct_categories_loyalty") >= 2), True)
    .otherwise(False)
)

loyal_customers.show(5)

# 1. Count loyal customers per city
customer_city = clean_orders_df.select("customer_id", "city").distinct()
loyal_with_city = loyal_customers.join(customer_city, "customer_id")
loyal_count_per_city = loyal_with_city.groupBy("city", "is_loyal").count()
loyal_count_per_city.show()

# 2. Compare loyal vs non-loyal revenue
loyal_revenue = loyal_customers.join(customer_metrics, "customer_id") \
    .groupBy("is_loyal").agg(sum("total_spending").alias("revenue_contribution"))
loyal_revenue.show()


+-----------+--------------+---------------------------+--------+
|customer_id|distinct_dates|distinct_categories_loyalty|is_loyal|
+-----------+--------------+---------------------------+--------+
|    C009896|             3|                          3|    true|
|    C041802|             3|                          4|    true|
|    C041216|             3|                          3|    true|
|    C030828|             3|                          3|    true|
|    C043689|             3|                          4|    true|
+-----------+--------------+---------------------------+--------+
only showing top 5 rows
+---------+--------+-----+
|     city|is_loyal|count|
+---------+--------+-----+
|hyderabad|    true|26748|
|     pune|   false|  384|
|    delhi|   false|  382|
|bangalore|    true|26393|
|hyderabad|   false|  424|
|    delhi|    true|26635|
|   mumbai|   false|  401|
|     pune|    true|26601|
|  kolkata|   false|  378|
|   mumbai|    true|26416|
|bangalore|   false|  414|
|  c

#PHASE 6

In [27]:
# 1. Compute monthly revenue per city
monthly_city_revenue = clean_orders_df.withColumn("month", date_format(col("order_date_clean"), "yyyy-MM")) \
    .groupBy("month", "city").agg(sum("amount").alias("monthly_revenue"))
monthly_city_revenue.orderBy("month", "city").show(20)

# 2. Compute monthly order count per category
monthly_category_orders = clean_orders_df.withColumn("month", date_format(col("order_date_clean"), "yyyy-MM")) \
    .groupBy("month", "category").agg(count("order_id").alias("order_count"))
monthly_category_orders.orderBy("month", "category").show(20)

+-------+---------+---------------+
|  month|     city|monthly_revenue|
+-------+---------+---------------+
|2024-01|bangalore|      822339117|
|2024-01|  chennai|      818567389|
|2024-01|    delhi|      817332633|
|2024-01|hyderabad|      833063605|
|2024-01|  kolkata|      824920456|
|2024-01|   mumbai|      816636150|
|2024-01|     pune|      833507124|
|2024-02|bangalore|      792163305|
|2024-02|  chennai|      796361427|
|2024-02|    delhi|      805877007|
|2024-02|hyderabad|      796252807|
|2024-02|  kolkata|      785096186|
|2024-02|   mumbai|      795736235|
|2024-02|     pune|      797779557|
+-------+---------+---------------+

+-------+-----------+-----------+
|  month|   category|order_count|
+-------+-----------+-----------+
|2024-01|electronics|      33063|
|2024-01|    fashion|      32509|
|2024-01|    grocery|      32986|
|2024-01|       home|      33136|
|2024-02|electronics|      31889|
|2024-02|    fashion|      31810|
|2024-02|    grocery|      31761|
|2024-02|  

#PHASE 7

In [28]:
# 1. Cache reused DataFrames
clean_orders_df.cache()
customer_metrics.cache()

# 2. Use explain(True)
print("\n=== Customer Aggregation Explain ===")
customer_metrics.explain(True)

print("\n=== Window Ranking Explain ===")
ranked_customers.explain(True)

# 3. Check for shuffle stages (look for Exchange in explain output above)

# 4. Repartitioning strategy (if needed)
# clean_orders_df = clean_orders_df.repartition("customer_id")


=== Customer Aggregation Explain ===
== Parsed Logical Plan ==
'Aggregate ['customer_id], ['customer_id, 'count('order_id) AS total_orders#998, 'sum('amount) AS total_spending#999, 'avg('amount) AS avg_order_value#1000, 'min('order_date_clean) AS first_purchase_date#1001, 'max('order_date_clean) AS last_purchase_date#1002, 'count(distinct 'city) AS distinct_cities#1003, 'count(distinct 'category) AS distinct_categories#1004]
+- Filter (((lower(status#873) = completed) AND isnotnull(amount#879)) AND isnotnull(order_date_clean#881))
   +- Deduplicate [order_id#866]
      +- Project [order_id#866, customer_id#867, city#875, category#876, product#877, amount#879, order_date#872, status#873, order_date_clean#881]
         +- Project [order_id#866, customer_id#867, city#875, category#876, product#877, amount#879, order_date#872, status#873, date_normalized#880, CASE WHEN RLIKE(date_normalized#880, ^[0-9]{4}-[0-9]{2}-[0-9]{2}$) THEN to_date(date_normalized#880, Some(yyyy-MM-dd), Some(Etc/UTC

#PHASE 8

In [29]:
# Create segment lookup
segment_lookup_data = [
    ("VIP", 1),
    ("Premium", 2),
    ("Regular", 3)
]
segment_lookup_df = spark.createDataFrame(segment_lookup_data, ["segment_label", "segment_code"])

# Join with broadcast
from pyspark.sql.functions import broadcast
customer_with_code = customer_segments.join(
    broadcast(segment_lookup_df),
    customer_segments.customer_segment == segment_lookup_df.segment_label
)

customer_with_code.show(5)

# Verify BroadcastHashJoin in plan
print("\n=== Broadcast Join Explain ===")
customer_with_code.explain(True)

+-----------+------------+--------------+---------------+-------------------+------------------+---------------+-------------------+----------------+-------------+------------+
|customer_id|total_orders|total_spending|avg_order_value|first_purchase_date|last_purchase_date|distinct_cities|distinct_categories|customer_segment|segment_label|segment_code|
+-----------+------------+--------------+---------------+-------------------+------------------+---------------+-------------------+----------------+-------------+------------+
|    C016502|           6|        318813|        53135.5|         2024-01-03|        2024-02-12|              3|                  3|             VIP|          VIP|           1|
|    C036542|           5|        232053|        46410.6|         2024-01-03|        2024-02-12|              3|                  4|             VIP|          VIP|           1|
|    C041216|           5|        268589|        53717.8|         2024-01-17|        2024-02-26|              4|   

#PHASE 9

In [30]:
# 1. Sort customers
sorted_customers = customer_segments.orderBy(
    col("total_spending").desc(),
    col("total_orders").desc()
)
sorted_customers.show(10)

# 2. Create two sets
electronics_customers = clean_orders_df.filter(col("category") == "electronics") \
    .select("customer_id").distinct()

grocery_customers = clean_orders_df.filter(col("category") == "grocery") \
    .select("customer_id").distinct()

# 3. Find customers in both sets (intersection)
both_sets = electronics_customers.intersect(grocery_customers)
print(f"Customers in both Electronics and Grocery: {both_sets.count()}")

# Customers in only one set
only_electronics = electronics_customers.subtract(grocery_customers)
only_grocery = grocery_customers.subtract(electronics_customers)
print(f"Only Electronics: {only_electronics.count()}")
print(f"Only Grocery: {only_grocery.count()}")

+-----------+------------+--------------+-----------------+-------------------+------------------+---------------+-------------------+----------------+
|customer_id|total_orders|total_spending|  avg_order_value|first_purchase_date|last_purchase_date|distinct_cities|distinct_categories|customer_segment|
+-----------+------------+--------------+-----------------+-------------------+------------------+---------------+-------------------+----------------+
|    C043076|           6|        493949|82324.83333333333|         2024-01-17|        2024-02-26|              5|                  4|             VIP|
|    C034689|           6|        486879|          81146.5|         2024-01-10|        2024-02-19|              4|                  3|             VIP|
|    C039985|           6|        484057|80676.16666666667|         2024-01-06|        2024-02-15|              3|                  4|             VIP|
|    C026691|           6|        477147|          79524.5|         2024-01-12|        2

#PHASE 10

In [31]:
# 1. Write customer master dataset to Parquet, partitioned by customer_segment
customer_segments.write.mode("overwrite") \
    .partitionBy("customer_segment") \
    .parquet("customer_master.parquet")

# 2. Write monthly analytics to ORC
monthly_city_revenue.write.mode("overwrite").orc("monthly_analytics.orc")

# 3. Read back and validate
customer_master_read = spark.read.parquet("customer_master.parquet")
customer_master_read.show(5)

monthly_analytics_read = spark.read.orc("monthly_analytics.orc")
monthly_analytics_read.show(5)

+-----------+------------+--------------+------------------+-------------------+------------------+---------------+-------------------+----------------+
|customer_id|total_orders|total_spending|   avg_order_value|first_purchase_date|last_purchase_date|distinct_cities|distinct_categories|customer_segment|
+-----------+------------+--------------+------------------+-------------------+------------------+---------------+-------------------+----------------+
|    C014073|           5|        252076|           50415.2|         2024-01-14|        2024-02-23|              5|                  3|             VIP|
|    C002702|           6|        298831|49805.166666666664|         2024-01-03|        2024-02-12|              4|                  3|             VIP|
|    C002365|           6|        360985|60164.166666666664|         2024-01-06|        2024-02-15|              4|                  3|             VIP|
|    C030068|           6|        221767|36961.166666666664|         2024-01-09|  

#PHASE 11

DANGEROUS CODE:
df = df.groupBy("customer_id").sum("amount").show()

EXPLANATION:
1. What df becomes:
   - The groupBy().sum() returns a DataFrame with aggregated results
   - But .show() returns None (it doesn't return a DataFrame)
   - So df is reassigned to None

2. Why pipeline breaks:
   - After this line, df is None, not a DataFrame
   - Any subsequent operation on df will fail with AttributeError
   - The original df is lost

CORRECT APPROACH:
result_df = df.groupBy("customer_id").sum("amount")
result_df.show()

Or use agg() for clarity:
result_df = df.groupBy("customer_id").agg(sum("amount").alias("total_amount"))
result_df.show()

In [32]:
spark.stop()