In [None]:
# customer_clustering_with_recommendations.py
# pip install pandas numpy scikit-learn matplotlib seaborn

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA

# ======================
# 1. Load Dataset
# ======================
# Single source of truth for the input location; change this to your CSV path.
INPUT_CSV = "/content/Customers.csv"
# Directory that receives the plots and CSV exports; created if missing.
OUTDIR = "clustering_outputs"
os.makedirs(OUTDIR, exist_ok=True)

# Read from INPUT_CSV instead of a second hard-coded path, so editing the
# constant above is all that is needed to run on a different file.
df = pd.read_csv(INPUT_CSV)

# Quick sanity report: dataset dimensions and per-column missing-value counts.
print("Shape:", df.shape)
print("Missing values:\n", df.isna().sum())

# ======================
# 2. Data Preprocessing
# ======================
# Identifier columns are numeric but carry no behavioural information; feeding
# them to KMeans would group customers by their position in the file.
ID_COLUMNS = ["CustomerID"]

# Keep numeric columns only, minus identifiers (ignored when not present).
num_df = (
    df.select_dtypes(include=[np.number])
    .drop(columns=ID_COLUMNS, errors="ignore")
    .copy()
)

# Drop constant columns: they add nothing to distance computations. Because
# NaNs are not counted, an all-NaN column has 0 unique values and is removed
# here as well.
constant_cols = [c for c in num_df.columns if num_df[c].nunique(dropna=True) <= 1]
if constant_cols:
    num_df = num_df.drop(columns=constant_cols)

# Fail early with a clear message instead of an opaque scikit-learn error.
if num_df.shape[1] == 0:
    raise ValueError("No usable numeric feature columns found for clustering.")

# Impute missing numeric values with the per-column median.
imputer = SimpleImputer(strategy="median")
X_imputed = imputer.fit_transform(num_df)

# Standardize so every feature contributes on a comparable scale to the
# Euclidean distances used by KMeans.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# Column order of X_imputed / X_scaled, used later to label centroids.
feature_names = num_df.columns.tolist()

# ======================
# 3. Determine Optimal k
# ======================
K_MAX = 10

# The silhouette score is only defined for 2 <= k <= n_samples - 1, so cap the
# sweep for small datasets instead of letting scikit-learn raise mid-loop.
k_upper = min(K_MAX, X_scaled.shape[0] - 1)
if k_upper < 2:
    raise ValueError(
        "Need at least 3 samples to evaluate cluster counts with the silhouette score."
    )
k_values = range(1, k_upper + 1)

wcss = []        # within-cluster sum of squares (inertia) for each k in k_values
sil_scores = {}  # k -> silhouette score; k=1 is skipped (score undefined)
for k in k_values:
    km = KMeans(n_clusters=k, n_init=10, random_state=42)
    km.fit(X_scaled)
    wcss.append(km.inertia_)
    if k >= 2:
        sil_scores[k] = silhouette_score(X_scaled, km.labels_)

# Pick the k with the highest silhouette score (smallest k wins on ties,
# because max() returns the first maximal key in insertion order).
optimal_k = max(sil_scores, key=sil_scores.get)
print(f"Optimal k (by silhouette): {optimal_k}")

# Plot Elbow Method: WCSS against k, saved to OUTDIR for visual inspection.
elbow_fig, elbow_ax = plt.subplots()
elbow_ax.plot(k_values, wcss, marker="o")
elbow_ax.set_title("Elbow Method: WCSS vs k")
elbow_ax.set_xlabel("k")
elbow_ax.set_ylabel("WCSS")
elbow_ax.grid(True)
elbow_fig.tight_layout()
elbow_fig.savefig(os.path.join(OUTDIR, "elbow_wcss.png"), dpi=150)
plt.close(elbow_fig)

# ======================
# 4. Final Clustering
# ======================
# Refit KMeans with the selected number of clusters and keep the fitted model
# (its centroids are needed by the visualizations below).
final_km = KMeans(n_clusters=optimal_k, n_init=10, random_state=42).fit(X_scaled)
cluster_labels = final_km.labels_

# Attach each row's cluster assignment to the original dataframe.
df["cluster"] = cluster_labels

# Save clustered dataset (original columns plus the new "cluster" column).
clustered_csv = os.path.join(OUTDIR, "customers_clustered.csv")
df.to_csv(clustered_csv, index=False)
print(f"Saved clustered dataset -> {clustered_csv}")

# ======================
# 5. Visualizations
# ======================

# --- PCA scatter plot: project the scaled features and centroids onto 2 PCs.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_scaled)
centers_pca = pca.transform(final_km.cluster_centers_)

pca_fig, pca_ax = plt.subplots()
for label in np.unique(cluster_labels):
    members = cluster_labels == label  # boolean mask selecting this cluster's rows
    pca_ax.scatter(
        X_pca[members, 0],
        X_pca[members, 1],
        label=f"Cluster {label}",
        alpha=0.8,
    )
pca_ax.scatter(
    centers_pca[:, 0],
    centers_pca[:, 1],
    marker="X",
    s=200,
    color="black",
    label="Centroids",
)
pca_ax.set_title(f"PCA Scatter Plot (k={optimal_k})")
pca_ax.set_xlabel("PC1")
pca_ax.set_ylabel("PC2")
pca_ax.legend()
pca_ax.grid(True)
pca_fig.tight_layout()
pca_fig.savefig(os.path.join(OUTDIR, "pca_scatter.png"), dpi=150)
plt.close(pca_fig)

# --- Heatmap of cluster centroids, mapped back to the original feature scale.
centers_original = scaler.inverse_transform(final_km.cluster_centers_)
centroids_df = pd.DataFrame(
    centers_original,
    columns=feature_names,
    index=[f"Cluster {i}" for i in range(optimal_k)],
)

heat_fig, heat_ax = plt.subplots(figsize=(8, 5))
sns.heatmap(centroids_df, annot=True, fmt=".1f", cmap="coolwarm", ax=heat_ax)
heat_ax.set_title("Cluster Centroids (Original Scale)")
heat_fig.tight_layout()
heat_fig.savefig(os.path.join(OUTDIR, "centroids_heatmap.png"), dpi=150)
plt.close(heat_fig)

# --- Optional pair plots over the 3 features with the largest variance
# (variance is measured on the imputed, unscaled values).
variances = X_imputed.var(axis=0)
top_features = pd.Series(variances, index=feature_names).nlargest(3).index.tolist()
pair_df = pd.DataFrame(X_imputed, columns=feature_names)[top_features]
pair_df = pair_df.assign(cluster=cluster_labels)
sns.pairplot(pair_df, hue="cluster", diag_kind="kde", palette="tab10")
# pairplot creates its own figure, which is the current one at this point.
plt.savefig(os.path.join(OUTDIR, "pair_plots.png"), dpi=150)
plt.close()

# --- Save centroids to CSV (index holds the "Cluster N" labels).
centroids_csv = os.path.join(OUTDIR, "cluster_centroids_original_scale.csv")
centroids_df.to_csv(centroids_csv)

# ======================
# 6. Recommendations
# ======================
def _describe_centroid(centroid):
    """Return the list of segment descriptors for one centroid row.

    Looks up the spending, income and age columns by name; a column that is
    absent from the centroid contributes no descriptor. Descriptors are
    appended in the order spending, income, age.
    """
    labels = []

    spend = centroid.get("Spending Score (1-100)")
    if spend is not None:
        if spend > 70:
            labels.append("High spender")
        elif spend < 40:
            labels.append("Low spender")
        else:
            labels.append("Moderate spender")

    # Income and age only get a label at the extremes; mid-range is unlabeled.
    income = centroid.get("Annual Income ($)")
    if income is not None:
        if income > 80_000:
            labels.append("High income")
        elif income < 40_000:
            labels.append("Low income")

    age = centroid.get("Age")
    if age is not None:
        if age < 30:
            labels.append("Young segment")
        elif age > 50:
            labels.append("Older segment")

    return labels


print("\n=== Cluster Recommendations ===")
for cluster_name, centroid_row in centroids_df.iterrows():
    summary = ", ".join(_describe_centroid(centroid_row))
    print(f"{cluster_name}: {summary}")

# Static playbook printed after the per-cluster summaries.
print("\nSuggested actions:")
for suggestion in (
    "- High spenders → Target with loyalty programs & premium product offers.",
    "- Low spenders but high income → Target with exclusive promotions to increase spending.",
    "- Young high spenders → Push trending/new products.",
    "- Older moderate spenders → Focus on value deals and long-term memberships.",
):
    print(suggestion)

print(f"\nAll visuals and CSVs saved to folder: {OUTDIR}")


Shape: (2000, 8)
Missing values:
 CustomerID                 0
Gender                     0
Age                        0
Annual Income ($)          0
Spending Score (1-100)     0
Profession                35
Work Experience            0
Family Size                0
dtype: int64
Optimal k (by silhouette): 2
Saved clustered dataset -> clustering_outputs/customers_clustered.csv

=== Cluster Recommendations ===
Cluster 0: Moderate spender, High income, Older segment
Cluster 1: Moderate spender

Suggested actions:
- High spenders → Target with loyalty programs & premium product offers.
- Low spenders but high income → Target with exclusive promotions to increase spending.
- Young high spenders → Push trending/new products.
- Older moderate spenders → Focus on value deals and long-term memberships.

All visuals and CSVs saved to folder: clustering_outputs
