In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from google.colab import files

# Prompt for the dataset file through Colab's upload widget (this only works
# inside Google Colab). The result is kept in `uploaded`; the next cell reads
# its keys as the uploaded file names.
uploaded = files.upload()

In [None]:
# Fail with a clear message if the upload was cancelled or nothing was
# selected, instead of an opaque IndexError on the first-key lookup.
if not uploaded:
    raise ValueError("No file was uploaded; re-run the upload cell and select a CSV file.")

# Only the first uploaded file is used; any additional files are ignored.
# The file is read back by name, which relies on Colab having written the
# upload to the current working directory.
file_name = next(iter(uploaded))
data = pd.read_csv(file_name)


In [None]:
# Encode 'Gender' numerically: 0 for 'Male' and 1 for 'Female'.
# The mapping is applied only while the column is still non-numeric, so
# re-running this cell does not turn the already-encoded 0/1 values into NaN.
gender_codes = {'Male': 0, 'Female': 1}
if not pd.api.types.is_numeric_dtype(data['Gender']):
    data['Gender'] = data['Gender'].map(gender_codes)

# Select the clustering features; .copy() keeps X independent of `data`.
feature_columns = ['Age', 'Gender', 'Annual Income (k$)', 'Spending Score (1-100)']
X = data[feature_columns].copy()

# Report missing values per feature (labels outside `gender_codes` show up
# here as NaN in the 'Gender' column).
missing_counts = X.isnull().sum()
print("Checking for missing values:")
print(missing_counts)

# Report infinite values per feature.
infinite_counts = np.isinf(X).sum()
print("\nChecking for infinite values:")
print(infinite_counts)

# Stop here with an explicit error rather than letting bad values surface
# later as a less obvious failure during clustering.
if missing_counts.any() or infinite_counts.any():
    raise ValueError(
        "Clustering features contain missing or infinite values; "
        "clean the data (see the counts printed above) before scaling."
    )

# Standardize every feature to zero mean and unit variance so that no single
# feature dominates the Euclidean distances used by KMeans.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
# Elbow method: fit KMeans for each candidate k and record the inertia
# (within-cluster sum of squared distances) to see where the gains level off.
k_values = range(1, 11)
sse = []
for k in k_values:
    # n_init is set explicitly because scikit-learn's default changed across
    # releases (10 -> 'auto'); pinning it keeps the curve reproducible and
    # avoids the FutureWarning emitted by intermediate versions.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=0)
    kmeans.fit(X_scaled)
    sse.append(kmeans.inertia_)

# Plot the Elbow graph
plt.figure(figsize=(10, 6))
plt.plot(k_values, sse, marker='o')
plt.xticks(list(k_values))  # one tick per candidate k (integers only)
plt.xlabel('Number of clusters (k)')
plt.ylabel('Sum of squared distances (Inertia)')
plt.title('Elbow Method for Optimal k')
plt.show()

In [None]:
# Number of clusters chosen by reading the elbow graph from the previous cell
# (the curve flattens around k = 5).
k_optimal = 5

# Fit the final model and label every row in one step. n_init is pinned
# because scikit-learn's default changed across releases (10 -> 'auto'), which
# would otherwise make the segmentation depend on the installed version.
kmeans = KMeans(n_clusters=k_optimal, n_init=10, random_state=0)
data['Cluster'] = kmeans.fit_predict(X_scaled)

# Visualize the segments on the income / spending plane. The clustering itself
# used all four scaled features, so clusters may overlap in this 2-D view.
plt.figure(figsize=(12, 8))
sns.scatterplot(
    x='Annual Income (k$)',
    y='Spending Score (1-100)',
    hue='Cluster',
    data=data,
    palette='Set1',
)
plt.title('Customer Segments based on Age, Gender, Annual Income, and Spending Score')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.show()

# Summarize each cluster by the mean of its numeric columns. numeric_only=True
# keeps this working on pandas >= 2.0, where a non-numeric column in the CSV
# would otherwise make .mean() raise a TypeError.
cluster_summary = data.groupby('Cluster').mean(numeric_only=True)
print(cluster_summary)