In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
print('Success')

In [None]:
# Load the dataset from the CSV file (path is relative to the notebook's working directory)
df = pd.read_csv('../CSV/Mall_Customers.csv')

# Dataset inspection: each entry pairs a printed heading with the value shown after it
inspection_report = [
    ("Dataset Shape:", df.shape),
    ("\nMissing Values:\n", df.isna().sum()),        # per-column count of missing cells
    ("\nDuplicate Rows:", df.duplicated().sum()),    # number of fully duplicated rows
    ("\nData Types:\n", df.dtypes),
    ("\nSummary Statistics:\n", df.describe()),
]
for heading, value in inspection_report:
    print(heading, value)

In [5]:
# Step 2: Data Preprocessing
# Keep only the three columns that feed the clustering step
feature_columns = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
X = df[feature_columns]

# Standardize the data: learn per-column mean/std from X, then rescale X with them.
# The fitted scaler is kept so later cells can map values back to the original units.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [None]:
# Step 3: Clustering - Determine optimal clusters using Elbow Method
# n_init is passed explicitly: scikit-learn changed its default (10 restarts -> 'auto',
# i.e. a single run for k-means++), and intermediate releases emit a FutureWarning when
# it is omitted. Pinning it keeps the WCSS curve and the final labels consistent across
# scikit-learn versions.
n_init_runs = 10
k_values = range(1, 11)  # candidate cluster counts scanned for the elbow plot

wcss = []
for k in k_values:
    # Throwaway model per k; a separate name avoids confusion with the final `kmeans`.
    elbow_model = KMeans(n_clusters=k, init='k-means++', n_init=n_init_runs, random_state=42)
    elbow_model.fit(X_scaled)
    wcss.append(elbow_model.inertia_)  # inertia_ is the within-cluster sum of squares

# Plot Elbow Method
plt.figure(figsize=(10,6))
plt.plot(k_values, wcss, marker='o', linestyle='--')
plt.title('Elbow Method for Optimal K')
plt.xlabel('Number of Clusters')
plt.ylabel('WCSS')
plt.xticks(k_values)
plt.grid()
plt.savefig('elbow_method.png', dpi=800, bbox_inches='tight')
plt.show()

# Based on the elbow plot, select optimal K (example: K=5)
# NOTE(review): this value is hard-coded, not derived from `wcss`; re-check it against
# the plot whenever the data or the features change.
optimal_k = 5

# Apply K-Means clustering with the same settings used for the elbow scan
kmeans = KMeans(n_clusters=optimal_k, init='k-means++', n_init=n_init_runs, random_state=42)
clusters = kmeans.fit_predict(X_scaled)
# Attach the labels to the original frame so later cells can group/colour by cluster
df['Cluster'] = clusters

In [None]:
# Step 4: Visualization
# Project the standardized features onto their first two principal components
# so the clusters can be drawn on a flat scatter plot.
pca = PCA(n_components=2)
principal_components = pca.fit_transform(X_scaled)
df_pca = pd.DataFrame(principal_components, columns=['PC1', 'PC2']).assign(Cluster=clusters)

# Explicit figure/axes handles instead of the implicit pyplot "current figure"
fig, ax = plt.subplots(figsize=(10,6))
sns.scatterplot(data=df_pca, x='PC1', y='PC2', hue='Cluster', palette='viridis', s=100, ax=ax)
ax.set_title('Customer Segmentation Clusters (PCA)')
fig.savefig('pca_clusters.png', dpi=800, bbox_inches='tight')
plt.show()

# Pair plots of the data, coloured by cluster.
# sns.pairplot is a figure-level function: it builds its own figure and returns a
# PairGrid, so no plt.figure() call precedes it (one would only leave an empty extra
# figure behind). The title and the saved file are taken from the returned grid so
# they always refer to the pair plot itself.
# NOTE(review): every numeric column of `df` gets a panel; pass `vars=[...]` if only
# the clustered features should be shown.
pair_grid = sns.pairplot(df, hue='Cluster', palette='tab10', corner=False)
pair_grid.figure.suptitle('Pair Plots by Cluster', y=1.02)  # y>1 lifts the title above the grid
pair_grid.savefig('pair_plots.png', dpi=800, bbox_inches='tight')
plt.show()

# Cluster Centroids Visualization (Original Features)
# The fitted centres are passed through the scaler's inverse transform to undo the scaling
cluster_centers = scaler.inverse_transform(kmeans.cluster_centers_)

# One readable label per centroid, in the same order as the rows of `cluster_centers`
centroid_labels = [f'Centroid {i}' for i in range(optimal_k)]
centroid_df = pd.DataFrame(
    cluster_centers,
    columns=['Age', 'Annual Income', 'Spending Score'],
).assign(Cluster=centroid_labels)

print("\nCluster Centroids (Original Scale):\n", centroid_df)
