## Importing important Libraries

In [2]:
import os
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from datetime import timedelta
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

## Taking inputs and paths from local folders

In [5]:
# Please set this to the CSV (the Online_Retail file)
INPUT_CSV = r"C:\Users\venne\Documents\Mentormind\Snapdeal_project\Online Retail.csv"
# Output folder (will be created)
OUT_DIR = r"C:\Users\venne\Documents\Mentormind\Snapdeal_project\segmentation_outputs"
os.makedirs(OUT_DIR, exist_ok=True)

# Output file names (inside OUT_DIR)
CLEANED_CSV = os.path.join(OUT_DIR, "Cleaned_Online_Retail_for_KMeans.csv")
CUSTOMER_SEGMENTS_CSV = os.path.join(OUT_DIR, "Customer_Segments_RFM.csv")
CLUSTER_SUMMARY_CSV = os.path.join(OUT_DIR, "Cluster_Summary.csv")
ELBOW_PNG = os.path.join(OUT_DIR, "elbow_wcss.png")
PCA_PNG = os.path.join(OUT_DIR, "pca_clusters.png")
BOXPLOT_PNG = os.path.join(OUT_DIR, "boxplots_per_cluster.png")
PROFILE_TXT = os.path.join(OUT_DIR, "Cluster_Profiles_Summary.txt")
PPTX_PATH = os.path.join(OUT_DIR, "Snapdeal_Customer_Segmentation_Report.pptx")
BUSINESS_REPORT_MD = os.path.join(OUT_DIR, "Business_Report.md")

## Load Data

In [6]:
# 1. Load the data (with safe encoding)
# ---------------------------
print("Loading dataset:", INPUT_CSV)
df = pd.read_csv(INPUT_CSV, encoding="latin1")  # latin1 commonly works for retail csvs
print("Initial rows:", df.shape[0], "columns:", df.shape[1])

Loading dataset: C:\Users\venne\Documents\Mentormind\Snapdeal_project\Online Retail.csv
Initial rows: 541909 columns: 8


## Cleaning and Feature Creation

In [7]:
# 2. Basic cleaning and feature creation
# ---------------------------
# Drop fully empty rows and exact duplicates
df.dropna(how="all", inplace=True)
df.drop_duplicates(inplace=True)
# Parse dates and create revenue if columns exist
if "InvoiceDate" in df.columns:
    df["InvoiceDate"] = pd.to_datetime(df["InvoiceDate"], errors="coerce")
if "Quantity" in df.columns and "UnitPrice" in df.columns:
    df["TotalPrice"] = df["Quantity"] * df["UnitPrice"]

# Remove canceled invoices (InvoiceNo starts with 'C') and negative values
if "InvoiceNo" in df.columns:
    df = df[~df["InvoiceNo"].astype(str).str.startswith("C", na=False)]
if "Quantity" in df.columns and "UnitPrice" in df.columns:
    df = df[(df["Quantity"] > 0) & (df["UnitPrice"] > 0)]

# Save cleaned intermediate file
df.to_csv(CLEANED_CSV, index=False)
print("Saved cleaned intermediate CSV ->", CLEANED_CSV)

Saved cleaned intermediate CSV -> C:\Users\venne\Documents\Mentormind\Snapdeal_project\segmentation_outputs\Cleaned_Online_Retail_for_KMeans.csv


## Building RFM

In [8]:
# ---------------------------
# 3. Build RFM-like features at customer level
# ---------------------------
# Make CustomerID string and remove bad values
df["CustomerID"] = df["CustomerID"].astype(str).str.strip()
df = df[df["CustomerID"].notna() & (df["CustomerID"] != "") & (df["CustomerID"].str.lower() != "nan")]

snapshot_date = df["InvoiceDate"].max() + timedelta(days=1)

aggregation = {
    "InvoiceDate": ["min", "max", "nunique"],
    "TotalPrice": ["sum", "mean"],
    "InvoiceNo": "nunique"
}
cust = df.groupby("CustomerID").agg(aggregation)
cust.columns = ["FirstPurchaseDate", "LastPurchaseDate", "UniquePurchaseDays", "TotalRevenue", "AvgOrderValue", "NumInvoices"]
cust = cust.reset_index()

cust["Recency"] = (snapshot_date - cust["LastPurchaseDate"]).dt.days
cust["Frequency"] = cust["NumInvoices"]
cust["Monetary"] = cust["TotalRevenue"]

rfm = cust[["CustomerID", "Recency", "Frequency", "Monetary", "AvgOrderValue"]].copy()

# Cap very large outliers at 99th percentile (stabilizes clustering)
for col in ["Monetary", "Frequency"]:
    upper = rfm[col].quantile(0.99)
    rfm[col] = np.where(rfm[col] > upper, upper, rfm[col])

print("Prepared RFM for customers:", rfm.shape[0])

Prepared RFM for customers: 4338


## Standardize numeric features

In [11]:
# ---------------------------
# 4. Standardize numeric features (after handling NaNs)
# ---------------------------
features = ["Recency", "Frequency", "Monetary", "AvgOrderValue"]

# Handle missing values before clustering
rfm[features] = rfm[features].fillna({
    "Recency": rfm["Recency"].median(),
    "Frequency": 0,
    "Monetary": 0,
    "AvgOrderValue": 0
})

# Alternatively: drop rows with any remaining NaNs
rfm = rfm.dropna(subset=features)

X = rfm[features].values
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

## K using Elbow Method

In [12]:
# ====================================================
# STEP 5: Determine Optimal k using Elbow Method
# ====================================================

from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

K_MAX = 6  # Maximum number of clusters to check
sample_size = min(1200, len(rfm))  # sample if dataset is large

# Select rows for elbow computation
if sample_size < len(rfm):
    sample_idx = rfm.sample(n=sample_size, random_state=42).index
    X_for_elbow = X_scaled[sample_idx]
else:
    X_for_elbow = X_scaled

# Compute WCSS for k=1 to K_MAX
wcss = []
for k in range(1, K_MAX+1):
    km = KMeans(n_clusters=k, random_state=42, n_init=10)
    km.fit(X_for_elbow)
    wcss.append(km.inertia_)
    print(f"Computed WCSS for k={k}: {km.inertia_:.2f}")

# -----------------------------
# Elbow plot
# -----------------------------
k_values = list(range(1, len(wcss)+1))  # x-axis matches WCSS length
plt.figure(figsize=(6, 4))
plt.plot(k_values, wcss, marker='o')
plt.xlabel("Number of clusters (k)")
plt.ylabel("WCSS")
plt.title("Elbow Method for Optimal k")
plt.grid(True)
plt.tight_layout()
plt.savefig(ELBOW_PNG, dpi=150)
plt.close()
print(f"💾 Saved Elbow plot -> {ELBOW_PNG}")

# Optional: manually choose k_opt from the plot
# k_opt = 3  # Example

Computed WCSS for k=1: 4121.50
Computed WCSS for k=2: 2371.35
Computed WCSS for k=3: 1582.09
Computed WCSS for k=4: 1176.56
Computed WCSS for k=5: 953.42
Computed WCSS for k=6: 762.64
💾 Saved Elbow plot -> C:\Users\venne\Documents\Mentormind\Snapdeal_project\segmentation_outputs\elbow_wcss.png


## Clustering and saving Results

In [13]:
# ====================================================
# STEP 6: Run K-Means Clustering and Save Results
# ====================================================

from sklearn.cluster import KMeans

# Set the optimal number of clusters (you can choose based on Elbow from Step 5)
k_opt = 3  # Example, replace with your chosen k

# 1️⃣ Run K-Means clustering
kmeans = KMeans(n_clusters=k_opt, random_state=42, n_init=10)
clusters = kmeans.fit_predict(X_scaled)

# 2️⃣ Add cluster labels to the RFM dataframe
rfm["Cluster"] = clusters

print(f"\n✅ K-Means clustering done with {k_opt} clusters.")
print(rfm.head())

# 3️⃣ Save the clustered RFM data
rfm.to_csv(CUSTOMER_SEGMENTS_CSV, index=False)
print(f"💾 Saved clustered RFM dataset -> {CUSTOMER_SEGMENTS_CSV}")

# 4️⃣ Generate a summary of clusters (mean/median per cluster)
cluster_summary = rfm.groupby("Cluster")[features].agg(["mean", "median", "count"]).reset_index()
cluster_summary.to_csv(CLUSTER_SUMMARY_CSV, index=False)
print(f"💾 Saved cluster summary -> {CLUSTER_SUMMARY_CSV}")


✅ K-Means clustering done with 3 clusters.
  CustomerID  Recency  Frequency    Monetary  AvgOrderValue  Cluster
0    12346.0    125.0        1.0  19780.4878   77183.600000        2
1    12347.0     96.0        7.0   4310.0000      23.681319        0
2    12348.0    221.0        4.0   1797.2400      57.975484        0
3    12349.0    125.0        1.0   1757.5500      24.076027        0
4    12350.0    312.0        1.0    334.4000      19.670588        0
💾 Saved clustered RFM dataset -> C:\Users\venne\Documents\Mentormind\Snapdeal_project\segmentation_outputs\Customer_Segments_RFM.csv
💾 Saved cluster summary -> C:\Users\venne\Documents\Mentormind\Snapdeal_project\segmentation_outputs\Cluster_Summary.csv


## Visualizing clusters and plots

In [14]:
# ====================================================
# STEP 7: Visualize Clusters and Save Plots
# ====================================================

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA

# -----------------------------
# 1️⃣ PCA 2D cluster visualization
# -----------------------------
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_scaled)

plt.figure(figsize=(7, 5))
sns.scatterplot(
    x=X_pca[:, 0],
    y=X_pca[:, 1],
    hue=rfm["Cluster"],
    palette="Set2",
    s=50,
    alpha=0.7
)
plt.title("Customer Clusters (PCA 2D)")
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
plt.legend(title="Cluster", loc="best")
plt.tight_layout()
plt.savefig(PCA_PNG, dpi=150)
plt.close()
print(f"💾 Saved PCA cluster plot -> {PCA_PNG}")

# -----------------------------
# 2️⃣ Boxplots per cluster for RFM features
# -----------------------------
# Melt dataframe to long format for seaborn
rfm_melted = rfm.melt(
    id_vars="Cluster",
    value_vars=features,  # ["Recency", "Frequency", "Monetary", "AvgOrderValue"]
    var_name="Feature",
    value_name="Value"
)

plt.figure(figsize=(12, 6))
sns.boxplot(
    x="Feature",
    y="Value",
    hue="Cluster",
    data=rfm_melted,
    palette="Set2"
)
plt.title("Boxplots of RFM Features per Cluster")
plt.tight_layout()
plt.savefig(BOXPLOT_PNG, dpi=150)
plt.close()
print(f"💾 Saved boxplots per cluster -> {BOXPLOT_PNG}")

💾 Saved PCA cluster plot -> C:\Users\venne\Documents\Mentormind\Snapdeal_project\segmentation_outputs\pca_clusters.png
💾 Saved boxplots per cluster -> C:\Users\venne\Documents\Mentormind\Snapdeal_project\segmentation_outputs\boxplots_per_cluster.png


## Cluster Summary

In [15]:
# ====================================================
# STEP 8: Cluster Summary Stats and Boxplots
# ====================================================

import matplotlib.pyplot as plt

# -----------------------------
# 1️⃣ Cluster summary stats (count, mean, median, std)
# -----------------------------
cluster_summary = rfm.groupby("Cluster")[features].agg(["count", "mean", "median", "std"]).round(2)

# Flatten multi-level columns
cluster_summary_flat = cluster_summary.copy()
cluster_summary_flat.columns = ['_'.join(col).strip() for col in cluster_summary_flat.columns.values]
cluster_summary_flat = cluster_summary_flat.reset_index()

# Save to CSV
cluster_summary_flat.to_csv(CLUSTER_SUMMARY_CSV, index=False)
print(f"💾 Saved cluster summary CSV -> {CLUSTER_SUMMARY_CSV}")

# -----------------------------
# 2️⃣ Boxplots comparing features across clusters
# -----------------------------
fig, axes = plt.subplots(2, 2, figsize=(12, 9))
axes = axes.flatten()

for i, feat in enumerate(features):
    # Collect data per cluster
    data_to_plot = [rfm[rfm["Cluster"] == c][feat] for c in sorted(rfm["Cluster"].unique())]
    axes[i].boxplot(data_to_plot, labels=[f"C{c}" for c in sorted(rfm["Cluster"].unique())])
    axes[i].set_title(feat)
    axes[i].set_ylabel(feat)

plt.suptitle("Boxplots of RFM Features per Cluster", fontsize=16)
plt.tight_layout(rect=[0, 0, 1, 0.96])  # Adjust for suptitle

# Save boxplot figure
plt.savefig(BOXPLOT_PNG, bbox_inches='tight', dpi=150)
plt.close()
print(f"💾 Saved boxplots -> {BOXPLOT_PNG}")

💾 Saved cluster summary CSV -> C:\Users\venne\Documents\Mentormind\Snapdeal_project\segmentation_outputs\Cluster_Summary.csv
💾 Saved boxplots -> C:\Users\venne\Documents\Mentormind\Snapdeal_project\segmentation_outputs\boxplots_per_cluster.png


## Cluster profile and Business Report

In [16]:
# ====================================================
# STEP 9: Auto-generate Cluster Profile and Business Report
# ====================================================

# -----------------------------
# 1️⃣ Cluster profile (text file)
# -----------------------------
with open(PROFILE_TXT, "w", encoding="utf-8") as f:
    f.write("Cluster Profiles Summary (auto-generated)\n\n")
    f.write(cluster_summary_flat.to_string(index=False))

print(f"💾 Saved cluster profile text -> {PROFILE_TXT}")

# -----------------------------
# 2️⃣ Short professional business report (Markdown)
# -----------------------------
with open(BUSINESS_REPORT_MD, "w", encoding="utf-8") as f:
    f.write("# Customer Segmentation — Business Report\n\n")
    f.write("**Objective:** Segment customers using RFM features to design targeted actions.\n\n")
    
    f.write("## Key Outputs\n")
    f.write(f"- Clusters produced: {cluster_summary_flat.shape[0]}\n")
    f.write(f"- Customer segments file: `{os.path.basename(CUSTOMER_SEGMENTS_CSV)}`\n")
    f.write(f"- Cluster summary: `{os.path.basename(CLUSTER_SUMMARY_CSV)}`\n\n")
    
    f.write("## Cluster Highlights (auto-extracted)\n\n")
    f.write(cluster_summary_flat.to_markdown(index=False))
    f.write("\n\n")
    
    f.write("## Actionable Recommendations (example)\n")
    f.write("- Reward high-value & frequent customers with loyalty perks to increase LTV.\n")
    f.write("- Re-engage inactive customers (high Recency) with personalized offers.\n")
    f.write("- Cross-sell for customers with low monetary but decent frequency.\n\n")
    
    f.write("## Visuals\n")
    f.write(f"- PCA cluster plot: `{os.path.basename(PCA_PNG)}`\n")
    f.write(f"- Elbow plot: `{os.path.basename(ELBOW_PNG)}`\n")
    f.write(f"- Boxplots per cluster: `{os.path.basename(BOXPLOT_PNG)}`\n\n")

print(f"💾 Saved business report (Markdown) -> {BUSINESS_REPORT_MD}")

💾 Saved cluster profile text -> C:\Users\venne\Documents\Mentormind\Snapdeal_project\segmentation_outputs\Cluster_Profiles_Summary.txt
💾 Saved business report (Markdown) -> C:\Users\venne\Documents\Mentormind\Snapdeal_project\segmentation_outputs\Business_Report.md


## Output Saving

In [17]:
# -----------------------------
## Saving all the outputs
print("\n✅ All outputs saved in folder:", OUT_DIR)
print("Files created:")
for fn in sorted(os.listdir(OUT_DIR)):
    print(" -", fn)
print("\n✅ Output is saved")


✅ All outputs saved in folder: C:\Users\venne\Documents\Mentormind\Snapdeal_project\segmentation_outputs
Files created:
 - Business_Report.md
 - Cleaned_Online_Retail_for_KMeans.csv
 - Cluster_Profiles_Summary.txt
 - Cluster_Summary.csv
 - Customer_Segments_RFM.csv
 - boxplots_per_cluster.png
 - elbow_wcss.png
 - pca_clusters.png

✅ Output is saved
