# ML: Customer Segmentation (RFM + K-means)

Implements customer segmentation using RFM (Recency, Frequency, Monetary) analysis with K-means clustering.

## Business Value
- Target marketing campaigns to high-value customers
- Identify at-risk customers for retention efforts
- Personalize promotions by segment
- Optimize marketing spend allocation

## Technical Approach
- **Features:** RFM metrics (Recency, Frequency, Monetary value)
- **Algorithm:** K-means clustering with elbow method
- **Platform:** Synapse Data Science with PySpark MLlib
- **Segments:** Champions, Loyal Customers, Potential Loyalists, New Customers, At Risk, Hibernating, Lost

## Data Flow
```
Silver (fact_receipts, dim_customers) --> Gold (gold_customer_segments)
```

## Usage
Schedule this notebook to run **weekly** via Fabric pipeline to refresh customer segments.

In [None]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from datetime import datetime, timezone
import os

In [None]:
# =============================================================================
# PARAMETERS
# =============================================================================

def get_env(var_name, default=None):
    """Look up ``var_name`` in the process environment.

    Returns the variable's value, or ``default`` (``None`` unless given)
    when the variable is not set.
    """
    return os.getenv(var_name, default)

# Source and target databases; each can be overridden through an environment
# variable of the same name.
SILVER_DB = get_env("SILVER_DB", "ag")
GOLD_DB = get_env("GOLD_DB", "au")

# Range of cluster counts tried by the elbow analysis (inclusive on both ends)
# and the seed handed to every K-means run.
MIN_K, MAX_K = 4, 10
RANDOM_SEED = 42

# Echo the effective configuration
print(f"Configuration: SILVER_DB={SILVER_DB}, GOLD_DB={GOLD_DB}")
print(f"K-means range: {MIN_K}-{MAX_K} clusters")

In [None]:
# =============================================================================
# HELPER FUNCTIONS
# =============================================================================

def ensure_database(name):
    """Create the database ``name`` if it does not exist yet."""
    ddl = f"CREATE DATABASE IF NOT EXISTS {name}"
    spark.sql(ddl)

def read_silver(table_name):
    """Return ``table_name`` from the ``SILVER_DB`` database as a DataFrame."""
    qualified_name = "{}.{}".format(SILVER_DB, table_name)
    return spark.table(qualified_name)

def save_gold(df, table_name):
    """Overwrite the Delta table ``GOLD_DB.<table_name>`` with ``df`` and log its size.

    Args:
        df: DataFrame to persist.
        table_name: Unqualified table name; ``GOLD_DB`` is prepended.
    """
    full_name = f"{GOLD_DB}.{table_name}"
    df.write.format("delta").mode("overwrite").saveAsTable(full_name)
    # Count the rows of the table that was just written rather than calling
    # df.count(): the latter re-executes df's entire lineage a second time
    # only to produce this log line.
    written_rows = spark.table(full_name).count()
    print(f"  {full_name}: {written_rows} rows")

# Make sure the Gold database exists before any table is saved into it.
ensure_database(GOLD_DB)

---
## Step 1: Calculate RFM Metrics

Calculate RFM (Recency, Frequency, Monetary) metrics for each customer:
- **Recency**: Days since last purchase
- **Frequency**: Total number of transactions
- **Monetary**: Total spending amount

In [None]:
print("="*60)
print("CALCULATING RFM METRICS")
print("="*60)

# Reference instant for the recency calculation; captured once so that every
# later cell works against the same analysis date.
analysis_date = datetime.now(timezone.utc)
print(f"\nAnalysis date: {analysis_date}")

# Read receipt data
df_receipts = read_silver("fact_receipts")

# Row count and both date bounds come from ONE aggregation, so the receipts
# table is scanned once instead of three times (count + min + max).
receipt_stats = df_receipts.agg(
    F.count(F.lit(1)).alias("receipt_count"),
    F.min("event_ts").alias("first_event_ts"),
    F.max("event_ts").alias("last_event_ts"),
).first()

print(f"\nTotal receipts: {receipt_stats['receipt_count']:,}")
print(f"Date range: {receipt_stats['first_event_ts']} to {receipt_stats['last_event_ts']}")

In [None]:
# Calculate RFM metrics per customer
df_rfm = (
    df_receipts
    # Drop receipts that carry no customer id. groupBy would otherwise fold
    # all of them into a single NULL-keyed "customer" whose frequency and
    # monetary totals do not describe a real customer and would distort the
    # scaling and clustering steps that consume this DataFrame.
    .filter(F.col("customer_id").isNotNull())
    .groupBy("customer_id")
    .agg(
        # Recency: days between the analysis date and the last purchase date
        F.datediff(
            F.lit(analysis_date.date()),
            F.max(F.to_date("event_ts"))
        ).alias("recency_days"),

        # Frequency: number of receipts with a non-null receipt_id_ext
        F.count("receipt_id_ext").alias("frequency"),

        # Monetary: total spending; prefer total_amount and fall back to
        # total_cents / 100 when total_amount is NULL
        F.sum(
            F.when(F.col("total_amount").isNotNull(), F.col("total_amount"))
            .otherwise(F.col("total_cents") / 100.0)
        ).alias("monetary_value"),

        # Additional metrics for profiling
        F.min(F.to_date("event_ts")).alias("first_purchase_date"),
        F.max(F.to_date("event_ts")).alias("last_purchase_date")
    )
    # Keep only customers with usable metrics: a known last purchase date,
    # at least one counted receipt and strictly positive total spend
    .filter(
        (F.col("recency_days").isNotNull()) &
        (F.col("frequency") > 0) &
        (F.col("monetary_value") > 0)
    )
    # Average order value; frequency > 0 is guaranteed by the filter above
    .withColumn("avg_order_value", F.col("monetary_value") / F.col("frequency"))
)

print(f"\nCustomers with RFM metrics: {df_rfm.count():,}")

# Show a random sample of five customers
print("\nSample RFM data:")
df_rfm.orderBy(F.rand()).limit(5).show()

# Population-wide averages of each metric
print("\nRFM Statistics:")
df_rfm.select(
    F.mean("recency_days").alias("avg_recency"),
    F.mean("frequency").alias("avg_frequency"),
    F.mean("monetary_value").alias("avg_monetary"),
    F.mean("avg_order_value").alias("avg_order_value")
).show()

---
## Step 2: Feature Engineering & Scaling

Standardize RFM features for K-means clustering.

In [None]:
print("="*60)
print("FEATURE ENGINEERING & SCALING")
print("="*60)

# Cast the numeric metrics to double for the ML pipeline; the id and the
# purchase dates are carried along unchanged for the final output.
df_features = df_rfm.select(
    "customer_id",
    F.col("recency_days").cast("double").alias("recency_days"),
    F.col("frequency").cast("double").alias("frequency"),
    F.col("monetary_value").cast("double").alias("monetary_value"),
    F.col("avg_order_value").cast("double").alias("avg_order_value"),
    "first_purchase_date",
    "last_purchase_date"
)

# Assemble the three RFM metrics into one vector column
# (avg_order_value is deliberately not part of the clustering features)
assembler = VectorAssembler(
    inputCols=["recency_days", "frequency", "monetary_value"],
    outputCol="features_raw"
)
df_assembled = assembler.transform(df_features)

# Standardize features (mean=0, std=1) so no single metric dominates the
# distance computation because of its scale
scaler = StandardScaler(
    inputCol="features_raw",
    outputCol="features",
    withMean=True,
    withStd=True
)
scaler_model = scaler.fit(df_assembled)

# Cache the scaled data: it is the input of every K-means fit, transform and
# evaluation that follows. Without caching, each of those actions recomputes
# the whole receipts aggregation from scratch.
df_scaled = scaler_model.transform(df_assembled).cache()

print("\nFeature scaling complete.")
# This count is the first action on df_scaled and therefore fills the cache.
print(f"Scaled features ready: {df_scaled.count():,} customers")

---
## Step 3: Elbow Method for Optimal K

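Determine the optimal number of clusters by comparing the within-cluster sum of squares (WCSS, the elbow method) and the silhouette score across candidate values of K.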

In [None]:
print("="*60)
print("ELBOW METHOD FOR OPTIMAL K")
print("="*60)

# Results per candidate K, stored as (k, score) tuples
wcss_scores = []
silhouette_scores = []

# Scores a clustering by its silhouette on the scaled feature vectors
evaluator = ClusteringEvaluator(
    metricName="silhouette",
    featuresCol="features",
    predictionCol="prediction",
)

# Settings shared by every trial; only k changes between runs
kmeans_settings = dict(
    featuresCol="features",
    predictionCol="prediction",
    seed=RANDOM_SEED,
    maxIter=20,
)

print(f"\nTesting K from {MIN_K} to {MAX_K}...\n")

for k in range(MIN_K, MAX_K + 1):
    kmeans = KMeans(k=k, **kmeans_settings)

    # Fit on the scaled features, then assign every customer to a cluster
    model = kmeans.fit(df_scaled)
    predictions = model.transform(df_scaled)

    # Training cost reported by the model summary, used here as the WCSS
    wcss = model.summary.trainingCost
    wcss_scores.append((k, wcss))

    # Silhouette of the resulting assignment
    silhouette = evaluator.evaluate(predictions)
    silhouette_scores.append((k, silhouette))

    print(f"K={k}: WCSS={wcss:,.2f}, Silhouette={silhouette:.4f}")

print("\nElbow analysis complete.")
print("\nRecommendation: Review WCSS and Silhouette scores to select optimal K.")
print("Typically K=5-7 works well for customer segmentation.")

---
## Step 4: Train K-means with Optimal K

Based on the elbow analysis, train the final K-means model with the chosen number of clusters (`OPTIMAL_K`, set to 6 by default).

In [None]:
print("="*60)
print("TRAINING K-MEANS MODEL")
print("="*60)

# Cluster count of the final model. It is fixed at 6 here and is NOT derived
# from the elbow output; edit it by hand after reviewing the scores above.
OPTIMAL_K = 6

print(f"\nTraining K-means with K={OPTIMAL_K}...")

# Final estimator: writes its assignment to "cluster_id" and is allowed
# up to 50 iterations
kmeans = KMeans(
    k=OPTIMAL_K,
    maxIter=50,
    seed=RANDOM_SEED,
    featuresCol="features",
    predictionCol="cluster_id",
)

model = kmeans.fit(df_scaled)
df_clustered = model.transform(df_scaled)

print("\nModel training complete.")
print(f"Final WCSS: {model.summary.trainingCost:,.2f}")

# Number of customers per cluster, ordered by cluster id
print("\nCluster distribution:")
cluster_sizes = df_clustered.groupBy("cluster_id").count()
cluster_sizes.orderBy("cluster_id").show()

---
## Step 5: Segment Profiling & Labeling

Analyze cluster characteristics and assign meaningful business labels.

In [None]:
print("="*60)
print("SEGMENT PROFILING")
print("="*60)

# Per-cluster size plus mean and approximate median of each RFM metric
df_cluster_profiles = (
    df_clustered
    .groupBy("cluster_id")
    .agg(
        F.count("*").alias("customer_count"),
        F.mean("recency_days").alias("avg_recency"),
        F.mean("frequency").alias("avg_frequency"),
        F.mean("monetary_value").alias("avg_monetary"),
        F.mean("avg_order_value").alias("avg_order_value"),
        F.percentile_approx("recency_days", 0.5).alias("median_recency"),
        F.percentile_approx("frequency", 0.5).alias("median_frequency"),
        F.percentile_approx("monetary_value", 0.5).alias("median_monetary")
    )
    .orderBy("cluster_id")
)

print("\nCluster profiles:")
df_cluster_profiles.show(truncate=False)

# Population-wide 33rd/67th percentile cut-offs used for segment assignment
# (relative error 0.05). All three columns are requested in a single
# approxQuantile call, which returns one [p33, p67] list per column in the
# order given, so df_clustered is evaluated once instead of three times.
recency_percentiles, frequency_percentiles, monetary_percentiles = (
    df_clustered.approxQuantile(
        ["recency_days", "frequency", "monetary_value"],
        [0.33, 0.67],
        0.05,
    )
)

print(f"\nRecency percentiles (33%, 67%): {recency_percentiles}")
print(f"Frequency percentiles (33%, 67%): {frequency_percentiles}")
print(f"Monetary percentiles (33%, 67%): {monetary_percentiles}")

In [None]:
# Assign segment labels based on RFM characteristics
# Logic: Low Recency (recent) = Good, High Frequency = Good, High Monetary = Good

def assign_segment_label(avg_recency, avg_frequency, avg_monetary,
                        recency_p33, recency_p67,
                        frequency_p33, frequency_p67,
                        monetary_p33, monetary_p67,
                        lost_after_days=180):
    """
    Assign a business segment label to a cluster from its average RFM values.

    Each average is scored 1-3 against the 33rd/67th percentile cut-offs,
    3 being best (R: low recency, F: high frequency, M: high monetary).
    Rules are evaluated in order and the first match wins:

    - Champions: R=3, F=3, M=3
    - Loyal Customers: R=3, F>=2, M>=2
    - New Customers: R=3, F<=2, M<=2
    - Potential Loyalists: any other R=3 cluster (one of F/M high, the other
      low), or R=2 with F>=2 and M>=2
    - At Risk: any other R=2 cluster
    - Lost: R=1 and avg_recency greater than ``lost_after_days``
    - Hibernating: any other R=1 cluster

    Args:
        avg_recency: Cluster mean of days since last purchase.
        avg_frequency: Cluster mean of transaction count.
        avg_monetary: Cluster mean of total spend.
        recency_p33, recency_p67: Recency cut-offs.
        frequency_p33, frequency_p67: Frequency cut-offs.
        monetary_p33, monetary_p67: Monetary cut-offs.
        lost_after_days: Average recency (in days) above which an R=1
            cluster is labeled "Lost" instead of "Hibernating".

    Returns:
        The segment label as a string.
    """
    # Score each metric 1-3. A value exactly on a cut-off gets the lower score.
    r_score = 3 if avg_recency < recency_p33 else (2 if avg_recency < recency_p67 else 1)
    f_score = 3 if avg_frequency > frequency_p67 else (2 if avg_frequency > frequency_p33 else 1)
    m_score = 3 if avg_monetary > monetary_p67 else (2 if avg_monetary > monetary_p33 else 1)

    if r_score == 3:
        if f_score == 3 and m_score == 3:
            return "Champions"
        if f_score >= 2 and m_score >= 2:
            return "Loyal Customers"
        if f_score <= 2 and m_score <= 2:
            return "New Customers"
        # Recent cluster with mixed signals (F=1/M=3 or F=3/M=1). Without
        # this branch it fell through every rule and was labeled
        # "Hibernating", which contradicts its recent activity.
        return "Potential Loyalists"

    if r_score == 2:
        if f_score >= 2 and m_score >= 2:
            return "Potential Loyalists"
        return "At Risk"

    # r_score == 1: the least recently active third
    if avg_recency > lost_after_days:
        return "Lost"
    return "Hibernating"

# Bring the per-cluster profile rows to the driver
cluster_profiles_list = df_cluster_profiles.collect()

# Label every cluster from its averages and the population-wide cut-offs,
# collecting (cluster_id, label) pairs along the way
segment_mapping = []
for row in cluster_profiles_list:
    cluster_id = row["cluster_id"]
    label = assign_segment_label(
        avg_recency=row["avg_recency"],
        avg_frequency=row["avg_frequency"],
        avg_monetary=row["avg_monetary"],
        recency_p33=recency_percentiles[0],
        recency_p67=recency_percentiles[1],
        frequency_p33=frequency_percentiles[0],
        frequency_p67=frequency_percentiles[1],
        monetary_p33=monetary_percentiles[0],
        monetary_p67=monetary_percentiles[1],
    )
    segment_mapping += [(cluster_id, label)]
    print(f"Cluster {cluster_id}: {label}")

# Turn the pairs into a two-column lookup DataFrame for the join that follows
df_segment_mapping = spark.createDataFrame(
    segment_mapping,
    ["cluster_id", "segment_label"],
)

---
## Step 6: Create Output Table

Join segment labels and save to gold_customer_segments.

In [None]:
print("="*60)
print("CREATING OUTPUT TABLE")
print("="*60)

# Columns of the Gold table, in output order; segmented_at stamps every row
# with the analysis date captured at the start of the run
output_columns = [
    "customer_id",
    "cluster_id",
    "segment_label",
    "recency_days",
    "frequency",
    "monetary_value",
    "avg_order_value",
    "first_purchase_date",
    "last_purchase_date",
    F.lit(analysis_date).cast("timestamp").alias("segmented_at"),
]

# Attach the business label of each customer's cluster
df_labeled = df_clustered.join(df_segment_mapping, on="cluster_id", how="left")
df_output = df_labeled.select(*output_columns)

# Persist to the Gold layer
save_gold(df_output, "gold_customer_segments")

# Per-segment size and rounded averages, highest average spend first
print("\nSegment summary:")
segment_stats = df_output.groupBy("segment_label").agg(
    F.count("*").alias("customers"),
    F.round(F.avg("monetary_value"), 2).alias("avg_ltv"),
    F.round(F.avg("frequency"), 1).alias("avg_frequency"),
    F.round(F.avg("recency_days"), 1).alias("avg_recency_days"),
)
segment_stats.orderBy(F.desc("avg_ltv")).show(truncate=False)

---
## Step 7: Segment Insights & Recommendations

Provide actionable insights for each segment.

In [None]:
print("="*60)
print("SEGMENT INSIGHTS & RECOMMENDATIONS")
print("="*60)

# Suggested marketing action per segment label
recommendations = {
    "Champions": "Reward with VIP benefits, early access, exclusive offers. Encourage referrals.",
    "Loyal Customers": "Upsell higher-value products, loyalty programs, personalized recommendations.",
    "Potential Loyalists": "Nurture with engagement campaigns, increase purchase frequency incentives.",
    "New Customers": "Onboarding programs, welcome offers, build engagement early.",
    "At Risk": "Re-engagement campaigns, special discounts, win-back offers.",
    "Hibernating": "Aggressive win-back campaigns, limited-time offers, surveys for feedback.",
    "Lost": "Final retention attempt with deep discounts or surveys to understand churn.",
}

# Customer count per segment, collected to the driver for printing
segment_summary = df_output.groupBy("segment_label").count().collect()

print("\n")
for row in segment_summary:
    segment, count = row["segment_label"], row["count"]
    # Labels without an entry above get a generic placeholder
    recommendation = recommendations.get(segment, "No recommendation available")

    print(f"**{segment}** ({count:,} customers)")
    print(f"  - {recommendation}")
    print()

print("="*60)
print("CUSTOMER SEGMENTATION COMPLETE")
print("="*60)