# 02_clustering_kmeans_dbscan_gmm.ipynb

📌 **Customer Segmentation - Clustering Techniques: KMeans, DBSCAN, GMM**

In [None]:

# 1. Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, DBSCAN
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
import joblib
import warnings

# Notebook-wide configuration.
# Suppress every warning category for the rest of the session.
warnings.simplefilter("ignore")
# Apply seaborn's "whitegrid" style, then set the default figure size to 10x6.
sns.set(style="whitegrid")
plt.rcParams.update({"figure.figsize": (10, 6)})

# 2. Load Dataset
# Read the raw CSV, drop rows missing a customer ID, age, gender or total
# amount, convert the Date column to datetimes, then remove duplicate rows.
df = (
    pd.read_csv("C:/Users/hh/Desktop/Target_Retail_Sales_Forecasting.csv")
    .dropna(subset=['Customer ID', 'Age', 'Gender', 'Total Amount'])
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .drop_duplicates()
)

# 3. Feature Engineering
# Collapse the transaction rows into one row per customer:
#   - Age / Gender: first non-null value seen for the customer
#   - Total_Spent / Avg_Spent: sum and mean of 'Total Amount'
#   - Num_Transactions: count of non-null 'Transaction ID' values
# Named aggregation binds each output column to its source column and
# function, so the result does not depend on positionally renaming the
# MultiIndex columns that a dict-style agg would produce.
customer_df = (
    df.groupby('Customer ID')
    .agg(
        Age=('Age', 'first'),
        Gender=('Gender', 'first'),
        Total_Spent=('Total Amount', 'sum'),
        Avg_Spent=('Total Amount', 'mean'),
        Num_Transactions=('Transaction ID', 'count'),
    )
    .reset_index()
    .rename(columns={'Customer ID': 'Customer_ID'})
)

# Encode gender numerically (Male -> 0, Female -> 1). Labels are compared
# case-insensitively with surrounding whitespace removed, so spellings such
# as "male" or " Female" are encoded instead of silently becoming NaN.
gender_codes = {'male': 0, 'female': 1}
gender_labels = customer_df['Gender'].astype(str).str.strip().str.lower()
unknown_genders = sorted(set(gender_labels) - set(gender_codes))
if unknown_genders:
    # An unmapped label would otherwise turn into NaN in the Gender feature
    # and only surface later, far from its cause; fail here with the values.
    raise ValueError(f"Unrecognised Gender values: {unknown_genders}")
customer_df['Gender'] = gender_labels.map(gender_codes)

# 4. Scaling Features
# Select the five customer features and standardise them. The fitted scaler
# object is kept in `scaler`; the standardised matrix is `X_scaled`.
features = ['Age', 'Gender', 'Total_Spent', 'Avg_Spent', 'Num_Transactions']
X = customer_df.loc[:, features]
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# 5. Elbow & Silhouette Method for KMeans
# For every candidate k, fit KMeans on the scaled features and record:
#   - wcss: the model's inertia_ (within-cluster sum of squares)
#   - sil_scores: the silhouette score of the resulting labels
# The candidate range is defined once so the loop and both plots stay in sync.
k_values = range(2, 11)
wcss, sil_scores = [], []
for k in k_values:
    # n_init is given explicitly: scikit-learn changed its default from 10 to
    # 'auto' in version 1.4, so relying on the default makes these curves
    # depend on the installed version.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    labels = kmeans.fit_predict(X_scaled)
    wcss.append(kmeans.inertia_)
    sil_scores.append(silhouette_score(X_scaled, labels))

# Left panel: inertia versus k (elbow plot).
plt.subplot(1, 2, 1)
plt.plot(k_values, wcss, marker='o')
plt.title("Elbow Method")
plt.xlabel("k")
plt.ylabel("WCSS")

# Right panel: silhouette score versus k.
plt.subplot(1, 2, 2)
plt.plot(k_values, sil_scores, marker='o', color='green')
plt.title("Silhouette Scores")
plt.xlabel("k")
plt.ylabel("Silhouette Score")
plt.tight_layout()
plt.show()

# 6. KMeans Clustering (k = 4)
# Fit the final 4-cluster model and store each customer's cluster label.
# n_init is given explicitly: scikit-learn changed its default from 10 to
# 'auto' in version 1.4, so relying on the default would make the fitted
# (and later saved) model depend on the installed version.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
customer_df['Cluster_KMeans'] = kmeans.fit_predict(X_scaled)

# 7. PCA for Visualization
# Reduce the scaled features to two principal components and draw a scatter
# plot of them, coloured by each customer's KMeans cluster label.
pca = PCA(n_components=2)
components = pca.fit_transform(X_scaled)

ax = sns.scatterplot(
    x=components[:, 0],
    y=components[:, 1],
    hue=customer_df['Cluster_KMeans'],
    palette="Set1",
)
ax.set_title("KMeans Clusters (PCA Reduced)")
ax.set_xlabel("PCA1")
ax.set_ylabel("PCA2")
ax.legend(title='Cluster')
plt.show()

# 8. DBSCAN Clustering
# Fit DBSCAN (eps=1.0, min_samples=5) on the scaled features, store its
# labels, and plot them on the two PCA components computed earlier.
dbscan = DBSCAN(eps=1.0, min_samples=5).fit(X_scaled)
customer_df['Cluster_DBSCAN'] = dbscan.labels_

ax = sns.scatterplot(
    x=components[:, 0],
    y=components[:, 1],
    hue=customer_df['Cluster_DBSCAN'],
    palette="Set2",
)
ax.set_title("DBSCAN Clustering")
ax.set_xlabel("PCA1")
ax.set_ylabel("PCA2")
plt.show()

# 9. GMM Clustering
# Fit a 4-component Gaussian mixture on the scaled features, store the
# predicted component for each customer, and plot on the two PCA components.
gmm = GaussianMixture(n_components=4, random_state=42)
customer_df['Cluster_GMM'] = gmm.fit_predict(X_scaled)

ax = sns.scatterplot(
    x=components[:, 0],
    y=components[:, 1],
    hue=customer_df['Cluster_GMM'],
    palette="Set3",
)
ax.set_title("GMM Clustering")
ax.set_xlabel("PCA1")
ax.set_ylabel("PCA2")
plt.show()

# 10. Save Models and Clustered Data
# Write the per-customer table (with all cluster label columns) to CSV,
# then persist the fitted scaler and the KMeans model with joblib.
customer_df.to_csv("clustered_customers.csv", index=False)
for artifact, filename in ((scaler, 'scaler.pkl'), (kmeans, 'kmeans_model.pkl')):
    joblib.dump(artifact, filename)

# Optional: Show segment profiles
# Mean of every feature within each KMeans cluster; left as the final bare
# expression so a notebook renders it as the cell output.
segment_profiles = customer_df.groupby('Cluster_KMeans')[features].mean()
segment_profiles
