In [None]:
# Setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import sys

# Put the parent directory at the front of the module search path
# (presumably so the `ml.*` imports in later cells resolve — confirm).
sys.path.insert(0, "..")

# Shared colour palette reused by the charts in this notebook.
COLORS = [
    "#FF6B6B",
    "#4ECDC4",
    "#45B7D1",
    "#96CEB4",
    "#FFEAA7",
    "#DDA0DD",
]

# Suppress all warnings for the session, then apply the plotting theme.
warnings.filterwarnings(action="ignore")
plt.style.use("seaborn-v0_8-whitegrid")

print("Setup complete!")

In [None]:
# Load data
from ml.utils.data_utils import load_transactions, load_customers

# Pull both input tables through the project loaders.
transactions = load_transactions()
customers = load_customers()

# Report how many rows each loader returned.
for label, frame, unit in (
    ("Transactions", transactions, "records"),
    ("Customers", customers, "customers"),
):
    print(f"{label}: {len(frame):,} {unit}")

## 1. RFM Segmentation

RFM (Recency, Frequency, Monetary) analysis segments customers based on:

- **Recency**: How recently they made a purchase
- **Frequency**: How often they purchase
- **Monetary**: How much they spend


In [None]:
from ml.pipelines.customer_analytics.rfm_segmentation import RFMAnalyzer

# Create the analyzer and fit it on the transactions; the fitted result is kept
# as `rfm_df` for the charts and the combined view later in the notebook.
rfm_analyzer = RFMAnalyzer()
rfm_df = rfm_analyzer.fit(transactions)

n_customers_analyzed = len(rfm_df)
print("RFM Analysis Complete!", f"\nCustomers analyzed: {n_customers_analyzed:,}", sep="\n")

# Preview the first ten rows as the cell output.
rfm_df.head(10)

In [None]:
# Ask the fitted analyzer for its segment summary and show it under a banner.
segment_summary = rfm_analyzer.get_segments()

print("\nCustomer Segments:", "=" * 60, sep="\n")

# Bare last expression so the notebook renders it with its rich repr.
segment_summary

In [None]:
# Four-panel RFM overview: segment sizes, mean spend, frequency-vs-spend scatter,
# and total spend per segment.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
(ax_counts, ax_avg_spend), (ax_scatter, ax_revenue) = axes

# Top-left: customers per segment, cycling through the shared palette.
segment_counts = rfm_df["segment"].value_counts()
n_segments = len(segment_counts)
colors = (COLORS * (n_segments // len(COLORS) + 1))[:n_segments]
ax_counts.barh(segment_counts.index, segment_counts.values, color=colors)
ax_counts.set_title("Customer Distribution by Segment", fontweight="bold")
ax_counts.set_xlabel("Number of Customers")

# Both spend panels aggregate the same column grouped by segment.
monetary_by_segment = rfm_df.groupby("segment")["monetary"]

# Top-right: mean `monetary` per segment, smallest first.
segment_monetary = monetary_by_segment.mean().sort_values()
ax_avg_spend.barh(segment_monetary.index, segment_monetary.values, color=COLORS[1])
ax_avg_spend.set_title("Average Monetary Value by Segment", fontweight="bold")
ax_avg_spend.set_xlabel("Average Spend ($)")

# Bottom-left: one point per row of rfm_df, coloured by its `r_score`.
ax_scatter.scatter(
    rfm_df["frequency"],
    rfm_df["monetary"],
    c=rfm_df["r_score"],
    cmap="RdYlGn",
    alpha=0.5,
)
ax_scatter.set_title("Frequency vs Monetary (colored by Recency)", fontweight="bold")
ax_scatter.set_xlabel("Frequency")
ax_scatter.set_ylabel("Monetary ($)")

# Bottom-right: summed `monetary` per segment, smallest first.
segment_revenue = monetary_by_segment.sum().sort_values()
ax_revenue.barh(segment_revenue.index, segment_revenue.values, color=COLORS[2])
ax_revenue.set_title("Total Revenue by Segment", fontweight="bold")
ax_revenue.set_xlabel("Total Revenue ($)")

plt.tight_layout()
plt.show()

In [None]:
# Fetch the per-segment recommendations from the analyzer.
recommendations = rfm_analyzer.get_segment_recommendations()

print("\nMarketing Recommendations by Segment:", "=" * 60, sep="\n")

# For each segment print its strategy and only the first two actions,
# to keep the listing short.
for segment_name, advice in recommendations.items():
    print(f"\n{segment_name}:")
    print(f"  Strategy: {advice['strategy']}")
    leading_actions = advice["actions"][:2]
    for step in leading_actions:
        print(f"  - {step}")

## 2. Churn Prediction

Predict which customers are likely to churn (stop visiting) using machine learning.


In [None]:
from ml.pipelines.customer_analytics.churn_prediction import ChurnPredictor

# Build the churn model with a 60-day churn threshold and train it on both tables.
churn_predictor = ChurnPredictor(churn_threshold_days=60)
metrics = churn_predictor.train(transactions, customers)

print("\nChurn Prediction Model Metrics:", "=" * 40, sep="\n")

# One line per returned metric, formatted to four decimals
# (assumes every metric value is numeric — TODO confirm).
for metric_name, score in metrics.items():
    print(f"  {metric_name}: {score:.4f}")

In [None]:
# Bar chart of the first 15 rows of the churn model's feature-importance table.
importance = churn_predictor.feature_importance_
top_features = importance.head(15)

plt.figure(figsize=(10, 8))
chart = plt.gca()
chart.barh(top_features["feature"], top_features["importance"], color=COLORS[0])
chart.set_title("Top 15 Features for Churn Prediction", fontweight="bold")
chart.set_xlabel("Importance")
# Flip the y-axis so the first row of the table is drawn at the top.
chart.invert_yaxis()
plt.tight_layout()
plt.show()

In [None]:
# Build the model's feature table for the loaded data, then ask the predictor
# for customers at or above the 0.5 cut-off.
# NOTE(review): `_create_features` is a private method of ChurnPredictor; confirm
# whether a public equivalent exists before relying on it here.
features = churn_predictor._create_features(transactions, customers)
at_risk = churn_predictor.get_at_risk_customers(
    features,
    threshold=0.5,
)

n_at_risk = len(at_risk)
print(f"\nAt-Risk Customers (Churn Probability > 50%): {n_at_risk}")
print("\nTop 10 At-Risk Customers:")

# Show the first ten rows as the cell output.
at_risk.head(10)

In [None]:
# Churn probability distribution
# Score the feature table built in the previous cell; the result is reused by the
# combined-insights section at the end of the notebook.
all_predictions = churn_predictor.predict(features)

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: histogram of predicted churn probabilities with the 0.5 cut-off marked.
axes[0].hist(all_predictions["churn_probability"], bins=50, color=COLORS[0], edgecolor="white")
axes[0].axvline(0.5, color="red", linestyle="--", label="Threshold (0.5)")
axes[0].set_title("Churn Probability Distribution", fontweight="bold")
axes[0].set_xlabel("Churn Probability")
axes[0].set_ylabel("Number of Customers")
axes[0].legend()

# Right panel: share of customers in each risk category.
risk_counts = all_predictions["churn_risk"].value_counts()

# value_counts() orders categories by frequency, not severity, so a positional
# colour list can paint a low-risk slice red. Key the colours by label instead.
# "High" and "Critical" are the labels the priority-customer filter uses;
# "Low" and "Medium" are assumed — TODO confirm against ChurnPredictor.
risk_palette = {"Low": "green", "Medium": "yellow", "High": "orange", "Critical": "red"}
# Unknown labels keep the previous positional colour so nothing renders unstyled.
positional_colors = ["green", "yellow", "orange", "red"]
colors = [
    risk_palette.get(str(label), positional_colors[pos % len(positional_colors)])
    for pos, label in enumerate(risk_counts.index)
]
axes[1].pie(risk_counts, labels=risk_counts.index, autopct="%1.1f%%", colors=colors)
axes[1].set_title("Customer Churn Risk Distribution", fontweight="bold")

plt.tight_layout()
plt.show()

## 3. K-Means Customer Segmentation

Use unsupervised learning to discover natural customer groups.


In [None]:
from ml.pipelines.customer_analytics.customer_segmentation import CustomerSegmenter

# Fit the segmenter with a cluster-count search range of 3 to 8; the chosen
# count is read back from `n_clusters` after fitting.
# NOTE(review): no random seed is passed here — confirm CustomerSegmenter is
# deterministic across runs.
cluster_bounds = {"min_clusters": 3, "max_clusters": 8}
segmenter = CustomerSegmenter(**cluster_bounds)
segmenter.fit(transactions, customers)

print(f"\nOptimal number of clusters: {segmenter.n_clusters}")

In [None]:
# Retrieve the per-cluster profile table from the fitted segmenter.
profiles = segmenter.get_segment_profiles()

print("\nCustomer Segment Profiles:", "=" * 60, sep="\n")

# Display only the identifying and headline columns of the profile table.
profile_columns = [
    "cluster",
    "segment_name",
    "customer_count",
    "pct_of_customers",
    "avg_order_value",
    "avg_orders_per_month",
]
profiles[profile_columns]

In [None]:
# Print a short text card for every entry under the summary's "segments" key.
summary = segmenter.get_segment_summary()

for seg in summary["segments"]:
    heading = f"\n{seg['name']} (Cluster {seg['cluster']})"
    print(heading)
    size_line = f"  Customers: {seg['customer_count']} ({seg['pct_of_customers']}%)"
    print(size_line)
    print(f"  Avg Order Value: ${seg['avg_order_value']:.2f}")
    print(f"  Description: {seg['description']}")
    print(f"  Strategy: {seg['strategy']}")

In [None]:
# Visualize cluster centers
centers = segmenter.get_cluster_centers()

# Radar chart for cluster profiles
from math import pi

# Features drawn as spokes of the radar chart, in plotting order.
categories = [
    "recency_days",
    "total_transactions",
    "total_spent",
    "avg_order_value",
    "orders_per_month",
    "spend_per_month",
]

# Min-max normalise each feature across clusters so every spoke shares a 0-1 scale.
centers_norm = centers[categories].copy()
for col in categories:
    col_min = centers_norm[col].min()
    col_range = centers_norm[col].max() - col_min
    if col_range == 0:
        # Every cluster has the same value: 0/0 would yield NaN and break the
        # polygons, so place the feature at mid-scale instead.
        centers_norm[col] = 0.5
    else:
        centers_norm[col] = (centers_norm[col] - col_min) / col_range

# Spoke angles are the same for every cluster: compute them once, and repeat the
# first angle at the end so each polygon closes. Defining them outside the loop
# also keeps the tick setup valid when there are no cluster rows.
angles = [n / float(len(categories)) * 2 * pi for n in range(len(categories))]
angles += angles[:1]

# Create the polar axes directly. Creating a cartesian axes first and adding a
# polar one on top leaves the rectangular frame visible on newer Matplotlib.
fig, ax = plt.subplots(figsize=(10, 8), subplot_kw={"projection": "polar"})

# Pair each normalised row with its cluster id by position, which is correct
# whatever index `centers` carries (the index label is not a row position).
for (_, row), cluster_id in zip(centers_norm.iterrows(), centers["cluster"]):
    values = row.tolist()
    values += values[:1]  # Complete the loop
    ax.plot(angles, values, "o-", linewidth=2, label=f"Cluster {int(cluster_id)}")
    ax.fill(angles, values, alpha=0.1)

ax.set_xticks(angles[:-1])
ax.set_xticklabels(categories)
ax.set_title("Cluster Profiles (Normalized)", fontweight="bold", size=14, y=1.1)
ax.legend(loc="upper right", bbox_to_anchor=(1.3, 1.0))
plt.tight_layout()
plt.show()

## 4. Customer Lifetime Value (LTV) Prediction

Predict the future value of each customer.


In [None]:
from ml.pipelines.customer_analytics.customer_ltv import CustomerLTV

# Build the LTV model with a 365-day prediction horizon and train it on both tables.
ltv_model = CustomerLTV(prediction_horizon_days=365)
metrics = ltv_model.train(transactions, customers)

print("\nLTV Prediction Model Metrics:", "=" * 40, sep="\n")

# One line per returned metric, formatted to four decimals
# (assumes every metric value is numeric — TODO confirm).
for metric_name, score in metrics.items():
    print(f"  {metric_name}: {score:.4f}")

In [None]:
# Bar chart of the first 15 rows of the LTV model's feature-importance table.
importance = ltv_model.feature_importance_
top_features = importance.head(15)

plt.figure(figsize=(10, 8))
chart = plt.gca()
chart.barh(top_features["feature"], top_features["importance"], color=COLORS[3])
chart.set_title("Top 15 Features for LTV Prediction", fontweight="bold")
chart.set_xlabel("Importance")
# Flip the y-axis so the first row of the table is drawn at the top.
chart.invert_yaxis()
plt.tight_layout()
plt.show()

In [None]:
# Summarise the predicted-LTV distribution returned by the model as a text report.
ltv_dist = ltv_model.get_ltv_distribution(transactions, customers)

print("\nLTV Distribution Statistics:", "=" * 40, sep="\n")
print(f"  Total Customers: {ltv_dist['count']:,}")

# The three location statistics share one dollar format.
for caption, key in (("Mean LTV", "mean"), ("Median LTV", "median"), ("Max LTV", "max")):
    print(f"  {caption}: ${ltv_dist[key]:.2f}")

print(f"  Total Predicted LTV: ${ltv_dist['total_predicted_ltv']:,.2f}")

print("\nPercentiles:")
for pct_label, amount in ltv_dist["percentiles"].items():
    print(f"    {pct_label}th: ${amount:.2f}")

In [None]:
# Ask the LTV model for its top 20 customers and display the result.
high_value = ltv_model.get_high_value_customers(
    transactions,
    customers,
    top_n=20,
)

print("\nTop 20 High-Value Customers:")

# Bare last expression so the notebook renders it with its rich repr.
high_value

In [None]:
# Score the loaded data with the LTV model; `predictions` is reused by the
# combined-insights section below.
predictions = ltv_model.predict_for_transactions(transactions, customers)
ltv_values = predictions["predicted_ltv"]
mean_ltv = ltv_values.mean()

fig, axes = plt.subplots(1, 2, figsize=(12, 5))
ax_hist, ax_pie = axes

# Left panel: histogram of predicted LTV with the mean marked by a dashed line.
ax_hist.hist(ltv_values, bins=50, color=COLORS[4], edgecolor="white")
ax_hist.axvline(
    mean_ltv,
    color="red",
    linestyle="--",
    label=f"Mean: ${mean_ltv:.2f}",
)
ax_hist.set_title("Predicted LTV Distribution", fontweight="bold")
ax_hist.set_xlabel("Predicted LTV ($)")
ax_hist.set_ylabel("Number of Customers")
ax_hist.legend()

# Right panel: share of customers per LTV segment, cycling through the palette.
segment_counts = predictions["ltv_segment"].value_counts()
n_slices = len(segment_counts)
colors = (COLORS * (n_slices // len(COLORS) + 1))[:n_slices]
ax_pie.pie(segment_counts, labels=segment_counts.index, autopct="%1.1f%%", colors=colors)
ax_pie.set_title("Customer LTV Segments", fontweight="bold")

plt.tight_layout()
plt.show()

## 5. Combining Insights

Let's combine all customer analytics for a comprehensive view.


In [None]:
# Join the RFM, churn and LTV outputs on `customer_id`. Both merges use the
# default (inner) join, so only customers present in all three tables remain.
churn_columns = ["customer_id", "churn_probability", "churn_risk"]
ltv_columns = ["customer_id", "predicted_ltv", "ltv_segment"]

customer_insights = (
    rfm_df[["customer_id", "segment", "rfm_score"]]
    .copy()
    .merge(all_predictions[churn_columns], on="customer_id")
    .merge(predictions[ltv_columns], on="customer_id")
)

print("\nCombined Customer Insights:")
print(f"Total customers with complete insights: {len(customer_insights):,}")

# Preview the first ten combined rows as the cell output.
customer_insights.head(10)

In [None]:
# Prioritize customers: High LTV + High Churn Risk
# Keep rows whose LTV segment is "High" or "Above Average" AND whose churn risk
# is "High" or "Critical".
is_high_value = customer_insights["ltv_segment"].isin(["High", "Above Average"])
is_high_risk = customer_insights["churn_risk"].isin(["High", "Critical"])
priority_customers = customer_insights[is_high_value & is_high_risk]

# The leading emoji had been stored as mojibake (its UTF-8 bytes decoded as a
# single-byte code page); it is restored to the intended character here.
print(f"\n🚨 Priority Customers (High LTV + High Churn Risk): {len(priority_customers)}")
print("These customers need immediate retention efforts!")

# Show the first ten priority rows as the cell output.
priority_customers.head(10)

In [None]:
# Save combined insights
from ml.utils.data_utils import DATA_DIR

# Write the combined table under the project data directory, without the index.
insights_path = DATA_DIR / "customer_insights.csv"
customer_insights.to_csv(insights_path, index=False)

# The check-mark emoji had been stored as mojibake (its UTF-8 bytes decoded as a
# single-byte code page); it is restored to the intended character here.
print(f"\n✅ Customer insights saved to {insights_path}")