In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')
print("Starting...")



Starting...


In [6]:
# 1. Load dataset
df = pd.read_csv("Mall_Customers.csv")
df.columns = df.columns.str.strip()  # remove leading/trailing spaces from headers
print(" Data loaded – Columns:", df.columns.tolist())

# 2. Normalise column names
# Map known header variants onto the names the rest of the notebook uses.
# Keys are written WITHOUT surrounding whitespace because the headers were
# stripped above; a key with a trailing space could never match.
# Headers that are already canonical are left untouched by rename().
df = df.rename(columns={
    'Annual Income (₹)': 'Annual_Income_(k$)',
    'Annual Income (k$)': 'Annual_Income_(k$)',
    'Spending Score (1-100)': 'Spending_Score',
})

# 3. Select features and scale
feature_cols = ['Age', 'Annual_Income_(k$)', 'Spending_Score']

# Fail early with a readable message if the CSV does not have the expected
# columns (same exception type pandas would raise on the selection below).
missing_cols = [col for col in feature_cols if col not in df.columns]
if missing_cols:
    raise KeyError(
        f"Missing required column(s) {missing_cols}; "
        f"available columns: {df.columns.tolist()}"
    )

X = df[feature_cols]
# Standardise each feature so that no single one dominates the distance
# computations used by k-means.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)



 Data loaded – Columns: ['CustomerID', 'Genre', 'Age', 'Annual_Income_(k$)', 'Spending_Score']


In [8]:
# 4-5. Model selection: elbow (inertia) and silhouette curves
def _save_curve(k_values, values, fmt, title, ylabel, path):
    """Plot `values` against `k_values` and write the figure to `path`.

    The figure is closed after saving so nothing is rendered inline and no
    figure objects accumulate in memory.
    """
    fig, ax = plt.subplots()
    ax.plot(k_values, values, fmt)
    ax.set_title(title)
    ax.set_xlabel("k")
    ax.set_ylabel(ylabel)
    fig.savefig(path)
    plt.close(fig)


# Candidate cluster counts; silhouette is undefined for k=1, so start at 2.
k_values = range(2, 11)

# Fit each candidate model ONCE and record both metrics from the same fit.
# With a fixed random_state a second fit would reproduce the same model, so
# this gives the same numbers as two separate loops at half the cost.
# NOTE(review): n_init is left at the library default, which differs between
# scikit-learn releases — consider pinning it for cross-version reproducibility.
inertia = []
silhouette_scores = []
for k in k_values:
    model = KMeans(n_clusters=k, random_state=42)
    labels = model.fit_predict(X_scaled)
    inertia.append(model.inertia_)
    score = silhouette_score(X_scaled, labels)
    silhouette_scores.append(score)

# 4. Elbow method
_save_curve(k_values, inertia, 'bo-', "Elbow Method", "Inertia", "elbow_curve.png")
print(" elbow_curve.png saved")

# 5. Silhouette Score
_save_curve(k_values, silhouette_scores, 'ro-', "Silhouette Scores", "Score", "silhouette_curve.png")
print(" silhouette_curve.png saved")



 elbow_curve.png saved
 silhouette_curve.png saved


In [10]:
# 6. Final Clustering
# NOTE(review): k=5 is hard-coded; presumably chosen from the elbow and
# silhouette curves — confirm against those plots.
kmeans = KMeans(n_clusters=5, random_state=42)
df['Cluster'] = kmeans.fit_predict(X_scaled)

# Persist the original rows together with their integer cluster label.
df.to_csv("mall_customers_with_clusters.csv", index=False)
print(" mall_customers_with_clusters.csv saved")

# 7. 3D Plot
# Plotly Express treats a numeric colour column as continuous (colour bar).
# Cluster ids are categories, so plot from a copy whose labels are strings;
# df['Cluster'] itself stays integer for the CSV and any later grouping.
plot_df = df.assign(Cluster=df['Cluster'].astype(str))
# Keep the legend in numeric cluster order rather than order of appearance.
cluster_order = [str(c) for c in sorted(df['Cluster'].unique())]

fig = px.scatter_3d(
    plot_df,
    x='Age',
    y='Annual_Income_(k$)',
    z='Spending_Score',
    color='Cluster',
    category_orders={'Cluster': cluster_order},
    title="Customer Segmentation 3D"
)

fig.write_html("3d_cluster_plot.html")
print(" 3d_cluster_plot.html saved")

print(" All files generated successfully!")

 mall_customers_with_clusters.csv saved
 3d_cluster_plot.html saved
 All files generated successfully!


In [12]:
# Per-cluster summary: mean of each feature (rounded to one decimal place)
# plus the number of customers assigned to each cluster.
profile_cols = ['Age', 'Annual_Income_(k$)', 'Spending_Score']
by_cluster = df.groupby('Cluster')
cluster_profile = by_cluster[profile_cols].mean().round(1)
# size() is indexed by cluster label, so the assignment aligns on the index.
cluster_profile['Customer Count'] = by_cluster.size()

# Customers per (cluster, gender) pair, pivoted so each 'Genre' value becomes
# a column; combinations that never occur are reported as 0.
gender_counts = df.groupby(['Cluster', 'Genre']).size()
gender_dist = gender_counts.unstack(fill_value=0).astype(int)

print("Cluster Profile:")
print(cluster_profile)

print("\nGender Distribution:")
print(gender_dist)

Cluster Profile:
          Age  Annual_Income_(k$)  Spending_Score  Customer Count
Cluster                                                          
0        55.3                47.6            41.7              58
1        32.9                86.1            81.5              40
2        25.8                26.1            74.8              26
3        26.7                54.3            40.9              45
4        44.4                89.8            18.5              31

Gender Distribution:
Genre    Female  Male
Cluster              
0            33    25
1            22    18
2            15    11
3            27    18
4            15    16
