# 06. Storytelling & Deployment (CRISP-DM Phase 6)

## Executive Summary

This notebook presents the key findings and actionable recommendations from the **Customer Segmentation** project for RetailRocket.

---

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Notebook-wide display setup.
# NOTE(review): the blanket 'ignore' filter also hides pandas/matplotlib
# deprecation and runtime warnings for every later cell.
warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8-whitegrid')

# Inputs: the segmented customer table and the raw event log.
df = pd.read_csv('../dataset/customers_segmented_remediated.csv')
events = pd.read_csv('../dataset/events.csv')
# 'timestamp' is parsed as epoch milliseconds into a timezone-aware (UTC) column.
events = events.assign(
    datetime=pd.to_datetime(events['timestamp'], unit='ms', utc=True)
)

---

## 📊 Key Business Metrics

In [None]:
# Headline numbers derived from the segmented customer table.
# A "transactor" is any row with at least one recorded transaction.
is_buyer = df['Num_Transactions'] > 0

total_visitors = len(df)
total_transactors = is_buyer.sum()
total_revenue = df['Monetary_Raw'].sum()
conversion_rate = total_transactors / total_visitors * 100
revenue_per_buyer = total_revenue / total_transactors

banner = "=" * 60
metric_lines = [
    banner,
    "KEY BUSINESS METRICS",
    banner,
    f"\n👥 Total Visitors:     {total_visitors:,}",
    f"💳 Paying Customers:   {total_transactors:,}",
    f"📈 Conversion Rate:    {conversion_rate:.2f}%",
    f"💰 Total Revenue:      {total_revenue:,.0f}",
    f"📊 Avg Revenue/Buyer:  {revenue_per_buyer:,.0f}",
]
for metric_line in metric_lines:
    print(metric_line)

---

## 🔄 Conversion Funnel

In [None]:
# Funnel stage sizes: number of distinct visitors with at least one event of
# each type. Stages are counted independently of each other, so a purchaser is
# not required to also appear in the cart stage.
visitors_per_event = events.groupby('event')['visitorid'].nunique()
viewers = int(visitors_per_event.get('view', 0))
cart_adders = int(visitors_per_event.get('addtocart', 0))
purchasers = int(visitors_per_event.get('transaction', 0))


def _rate(part, whole):
    """Return part/whole as a percentage, or NaN when the denominator is zero."""
    return part / whole * 100 if whole else float('nan')


# Funnel visualization
fig, ax = plt.subplots(figsize=(8, 6))

stages = ['View', 'Add to Cart', 'Purchase']
values = [viewers, cart_adders, purchasers]
colors = ['#3498db', '#f39c12', '#27ae60']

# Horizontal bars, first stage on top (funnel style)
y_pos = np.arange(len(stages))
bars = ax.barh(y_pos, values, color=colors, height=0.6)

ax.set_yticks(y_pos)
ax.set_yticklabels(stages)
ax.invert_yaxis()
ax.set_xlabel('Number of Visitors')
ax.set_title('Customer Conversion Funnel', fontsize=14, fontweight='bold')

# Labels sit just right of each bar in the default (dark) text colour, with an
# offset proportional to the axis range. Drawing the step rate in white at a
# fixed fraction of the previous bar made it invisible whenever the current
# bar was shorter than that position.
x_max = max(values) or 1
label_pad = 0.01 * x_max
for i, (bar, val) in enumerate(zip(bars, values)):
    label = f'{val:,}'
    if i > 0:
        label += f'  ({_rate(val, values[i-1]):.1f}% of {stages[i-1]})'
    ax.text(val + label_pad, bar.get_y() + bar.get_height()/2,
            label, va='center', fontweight='bold')
ax.set_xlim(0, x_max * 1.2)  # headroom so the longest bar's label is not clipped

plt.tight_layout()
plt.show()

print(f"\n📊 Funnel Conversion Rates:")
print(f"   View → Cart: {_rate(cart_adders, viewers):.2f}%")
print(f"   Cart → Purchase: {_rate(purchasers, cart_adders):.2f}%")
print(f"   View → Purchase: {_rate(purchasers, viewers):.2f}%")

---

## 👥 Customer Segments

In [None]:
# Segment distribution across all rows of df (value_counts sorts largest first)
segment_counts = df['Segment_Name'].value_counts()

# One fixed colour per segment NAME. The two pies have different segment
# orderings, so slicing the palette by position would give the same segment
# different colours in each chart.
colors = plt.cm.Set2(np.linspace(0, 1, len(segment_counts)))
segment_colors = dict(zip(segment_counts.index, colors))
# Pull the 'Window Shoppers' slice slightly out of the first pie
explode = [0.05 if s == 'Window Shoppers' else 0 for s in segment_counts.index]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# All segments
axes[0].pie(segment_counts.values, labels=segment_counts.index, autopct='%1.1f%%',
            colors=colors, explode=explode, startangle=90)
axes[0].set_title('All Customer Segments', fontsize=14, fontweight='bold')

# Transactors only (rows with at least one transaction)
transactor_segments = df.loc[df['Num_Transactions'] > 0, 'Segment_Name'].value_counts()
axes[1].pie(transactor_segments.values, labels=transactor_segments.index, autopct='%1.1f%%',
            colors=[segment_colors[name] for name in transactor_segments.index],
            startangle=90)
axes[1].set_title('Paying Customer Segments', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

---

## 💰 Revenue Concentration (Pareto)

In [None]:
# Revenue by segment (rows with at least one transaction), largest contributor first
transactors = df[df['Num_Transactions'] > 0]
segment_revenue = transactors.groupby('Segment_Name')['Monetary_Raw'].sum().sort_values(ascending=False)

# Running share of total revenue, in percent
cum_pct = segment_revenue.cumsum() / segment_revenue.sum() * 100

# Pareto chart: revenue bars on the left axis, cumulative share on the right axis
fig, ax1 = plt.subplots(figsize=(10, 5))
positions = range(len(segment_revenue))

bars = ax1.bar(positions, segment_revenue.values, color='steelblue', alpha=0.7)
ax1.set_xticks(positions)
ax1.set_xticklabels(segment_revenue.index, rotation=45, ha='right')
ax1.set_ylabel('Total Revenue', color='steelblue')

ax2 = ax1.twinx()
ax2.plot(positions, cum_pct.values, 'o-', color='red', linewidth=2, label='Cumulative %')
ax2.axhline(y=80, color='green', linestyle='--', alpha=0.7, label='80% Line')
ax2.set_ylabel('Cumulative %', color='red')
ax2.set_ylim(0, 105)
# The 80% line carried a label but no legend was ever drawn, so it never showed.
ax2.legend(loc='lower right')

ax1.set_title('Revenue Pareto Analysis', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

# Smallest group of top segments whose cumulative share reaches 80%.
# Nothing is printed when no prefix reaches 80% (e.g. total revenue is zero -> NaN).
reached = np.flatnonzero(cum_pct.values >= 80)
if reached.size:
    n_top = int(reached[0]) + 1
    top_segments = segment_revenue.index[:n_top].tolist()
    # Report the actual cumulative share; it is usually above, not exactly, 80%.
    print(f"\n💰 Pareto Insight: Top {n_top} segment(s) {top_segments} "
          f"contribute {cum_pct.iloc[n_top - 1]:.1f}% of revenue (≥ 80%)")

---

## 📈 Customer Lifetime Value (CLV) - Leading Indicator

In [None]:
# Load CLV data (optional input: the section is skipped when the file is absent)
try:
    clv_df = pd.read_csv('../dataset/clv_customer.csv')
    # Left join keeps every row of df; rows without a CLV record get NaN.
    df_clv = df.merge(clv_df[['visitorid', 'CLV_1_month', 'probability_alive']], on='visitorid', how='left')

    # Mean 1-month CLV per segment, restricted to rows with at least one transaction
    clv_by_segment = (
        df_clv[df_clv['Num_Transactions'] > 0]
        .groupby('Segment_Name')['CLV_1_month']
        .mean()
        .sort_values(ascending=True)
    )
    # Computed once: the median is constant across the bar loop and the labels below.
    clv_median = clv_by_segment.median()

    fig, ax = plt.subplots(figsize=(10, 6))
    bars = ax.barh(clv_by_segment.index, clv_by_segment.values, color='steelblue')

    # Colour code: green above the cross-segment median, red otherwise
    for bar, val in zip(bars, clv_by_segment.values):
        bar.set_color('#27ae60' if val > clv_median else '#e74c3c')

    ax.set_xlabel('Average CLV (1 Month)')
    ax.set_ylabel('Segment')
    ax.set_title('Customer Lifetime Value by Segment', fontsize=14, fontweight='bold')
    ax.axvline(x=clv_median, color='orange', linestyle='--', label=f'Median: {clv_median:.0f}')
    ax.legend()

    plt.tight_layout()
    plt.show()

    print(f"\n📊 CLV Insight:")
    # idxmax() cannot pick a winner from an empty or all-NaN series, so guard it.
    if clv_by_segment.notna().any():
        top_clv = clv_by_segment.idxmax()
        print(f"   Highest CLV Segment: {top_clv} (${clv_by_segment[top_clv]:,.0f})")
        print(f"   → Focus retention efforts here for maximum ROI")
    else:
        print("   No CLV values available for paying customers.")
except FileNotFoundError:
    print("clv_customer.csv not found. Skipping CLV analysis.")


---

## 🎯 Propensity to Buy - Predictive Indicator

In [None]:
# Propensity scores (optional input): join onto the segment table and chart them.
try:
    propensity_df = pd.read_csv('../dataset/customer_propensity_scores.csv')
    score_cols = propensity_df[['visitorid', 'propensity_score']]
    # Left join: rows of df without a score keep NaN in 'propensity_score'.
    df_prop = df.merge(score_cols, on='visitorid', how='left')

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    hist_ax, seg_ax = axes

    # Left panel: distribution of individual scores
    df_prop['propensity_score'].hist(bins=50, ax=hist_ax, color='steelblue', edgecolor='white')
    hist_ax.axvline(x=0.5, color='red', linestyle='--', label='50% threshold')
    hist_ax.set_xlabel('Propensity Score')
    hist_ax.set_ylabel('Count')
    hist_ax.set_title('Distribution of Propensity Scores', fontweight='bold')
    hist_ax.legend()

    # Right panel: mean score per segment, lowest at the bottom
    prop_by_segment = (
        df_prop.groupby('Segment_Name')['propensity_score']
        .mean()
        .sort_values(ascending=True)
    )
    bars = seg_ax.barh(prop_by_segment.index, prop_by_segment.values, color='steelblue')

    # Traffic-light colouring: green above 0.5, orange above 0.2, red otherwise
    for bar, val in zip(bars, prop_by_segment.values):
        shade = '#27ae60' if val > 0.5 else ('#f39c12' if val > 0.2 else '#e74c3c')
        bar.set_color(shade)

    seg_ax.axvline(x=0.5, color='red', linestyle='--', label='50% threshold')
    seg_ax.set_xlabel('Average Propensity Score')
    seg_ax.set_ylabel('Segment')
    seg_ax.set_title('Purchase Propensity by Segment (7-day)', fontweight='bold')
    seg_ax.legend()

    plt.tight_layout()
    plt.show()

    # Number of rows whose score is strictly above the 50% threshold
    high_prop = len(df_prop.loc[df_prop['propensity_score'] > 0.5])
    print(f"\n📊 Propensity Insight:")
    print(f"   Customers with >50% propensity: {high_prop:,}")
    print(f"   → Target these for immediate conversion campaigns")
except FileNotFoundError:
    print("customer_propensity_scores.csv not found. Skipping propensity analysis.")


---

## 📋 Segment Action Plan

In [None]:
# Static action plan: one record per segment, written row-wise so each
# segment's priority, strategy, actions and KPIs read together.
plan_columns = ['Segment', 'Priority', 'Strategy', 'Actions', 'KPIs']
plan_rows = [
    ('VIP Champions', '🔴 High', 'Retain & Reward',
     'VIP access, exclusive offers, personal manager', 'Retention Rate, CLTV'),
    ('Big Spenders', '🔴 High', 'Upsell',
     'Premium bundles, tier upgrades', 'AOV, Cross-sell Rate'),
    ('Loyal Regulars', '🟡 Medium', 'Cross-sell',
     'Personalized recommendations, rewards', 'Purchase Frequency'),
    ('At Risk', '🟠 Medium-High', 'Win-Back',
     '"We miss you" offers, limited-time deals', 'Reactivation Rate'),
    ('Window Shoppers', '🟢 Low', 'Convert',
     'First-purchase discount, retargeting', 'Conversion Rate'),
]
action_plan = pd.DataFrame(plan_rows, columns=plan_columns)

print("\n📋 SEGMENT ACTION PLAN")
print("=" * 80)
print(action_plan.to_string(index=False))

---

## 📊 Lagging vs Leading Indicators Summary

In [None]:
# Recap of the metrics computed in earlier cells, grouped into lagging vs leading.
print("\n" + "="*70)
print("METRICS CLASSIFICATION (60/40 Rule)")
print("="*70)

print("\n🔴 LAGGING INDICATORS (60%) - What Happened:")
print(f"   • Total Revenue: {total_revenue:,.0f}")
print(f"   • Total Transactions: {(df['Num_Transactions'].sum()):,}")
print(f"   • Customer Count: {total_visitors:,}")

print("\n🟢 LEADING INDICATORS (40%) - What Will Happen:")
print(f"   • Conversion Rate: {conversion_rate:.2f}%")

# df_clv and df_prop only exist when their optional input files were found by
# the earlier cells. Catch just that situation and say so in the output, rather
# than swallowing every possible error with a bare `except: pass`.
try:
    avg_clv = df_clv['CLV_1_month'].mean()
    avg_prob = df_clv['probability_alive'].mean()
except (NameError, KeyError):
    print("   • Avg CLV / Probability Alive: n/a (CLV data not loaded)")
else:
    print(f"   • Avg CLV (1 Month): {avg_clv:.2f}")
    print(f"   • Avg Probability Alive: {avg_prob:.2%}")

try:
    avg_propensity = df_prop['propensity_score'].mean()
except (NameError, KeyError):
    print("   • Avg Propensity Score: n/a (propensity data not loaded)")
else:
    print(f"   • Avg Propensity Score: {avg_propensity:.2%}")

print("\n📌 Dashboard Recommendation:")
print("   Monitor Leading Indicators weekly to predict future performance")
print("   Review Lagging Indicators monthly to assess results")


---

## 🎯 Recommendations

In [None]:
# Static recommendations, kept as (heading, lines) pairs and printed in order.
rule = "=" * 70
print(f"\n{rule}")
print("EXECUTIVE RECOMMENDATIONS")
print(rule)

recommendation_sections = [
    ("\n🎯 IMMEDIATE ACTIONS (0-30 days):", [
        "   1. Deploy segment labels to CRM",
        "   2. Launch VIP Champions retention program",
        "   3. Trigger At Risk win-back campaign",
    ]),
    ("\n📈 SHORT-TERM (1-3 months):", [
        "   1. A/B test conversion strategies for Window Shoppers",
        "   2. Build personalized recommendation engine",
        "   3. Implement segment-based email sequences",
    ]),
    ("\n🔄 ONGOING:", [
        "   1. Monitor segment migration monthly",
        "   2. Re-cluster quarterly to capture behavior changes",
        "   3. Track segment-level KPIs in dashboard",
    ]),
    ("\n✅ PROJECT DELIVERABLES:", [
        "   • Segmented customer data: customers_segmented_remediated.csv",
        "   • Interactive dashboard: streamlit run dashboard/app.py",
        "   • CRISP-DM documentation: CRISPDM/*.ipynb",
    ]),
]

for heading, entries in recommendation_sections:
    print(heading)
    for entry in entries:
        print(entry)

---

## 📝 Data Quality Notes (Strict Audit Compliance)

In [None]:
# Methodology notes printed as a single block; only the revenue figure is
# computed (total_revenue comes from the key-metrics cell).
# NOTE(review): the column name `transactionid` suggests an identifier rather
# than an amount — confirm how Monetary_Raw was actually derived before
# presenting the "actual transaction amounts" statement.
# NOTE(review): the 99.2% figure is hard-coded, not computed from df here —
# confirm it still matches the current segmentation output.
rule = "=" * 70
quality_notes = [
    "\n" + rule,
    "DATA QUALITY & METHODOLOGY",
    rule,
    "\n✅ Revenue Integrity:",
    "   • Source: transactionid column (actual transaction amounts)",
    f"   • Total verified revenue: {total_revenue:,.0f}",
    "   • No price imputation used",
    "\n✅ Feature Engineering:",
    "   • Log-transformed: Recency, Frequency, Monetary",
    "   • Conversion Rate: Transactions / Sessions",
    "   • No redundant PCA",
    "\n✅ Segmentation:",
    "   • Model: Gaussian Mixture Model (GMM)",
    "   • Window Shoppers separated (99.2%)",
    "   • BIC-optimized cluster count",
]
print("\n".join(quality_notes))