In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Load the raw customer table from a CSV file (relative path).
df = pd.read_csv("smartcart_customers.csv")

In [None]:
# Preview the first rows of the loaded table.
df.head()

## 1. Handle Missing Values

In [None]:
# Handle missing data: fill gaps in Income with the column median.
income_median = df["Income"].median()
df["Income"] = df["Income"].fillna(income_median)

## Feature engineering

In [None]:
# Age: convert the birth year into the customer's age, measured against 2026.
reference_year = 2026
df["Age"] = reference_year - df["Year_Birth"]


In [None]:
# Customer joining date: parse it as day-first, then measure tenure as the
# number of days between each joining date and the latest joining date present.
joining_dates = pd.to_datetime(df["Dt_Customer"], dayfirst=True)
df["Dt_Customer"] = joining_dates

# Latest joining date in the data; used as the "today" of the tenure calculation.
refrence_data = joining_dates.max()

tenure = refrence_data - joining_dates
df["Cutomer_Tenure_Days"] = tenure.dt.days

In [None]:
# List the column labels currently present in the frame.
df.columns

In [None]:
# Spending: add up the amounts spent across all six product categories.
amount_columns = [
    "MntWines",
    "MntFruits",
    "MntMeatProducts",
    "MntFishProducts",
    "MntSweetProducts",
    "MntGoldProds",
]

# Accumulate left to right so the result matches a chained "+" exactly
# (a missing value in any category still yields a missing total).
running_total = df[amount_columns[0]]
for amount_column in amount_columns[1:]:
    running_total = running_total + df[amount_column]

df["Total_Spending"] = running_total

In [None]:
# Children: number of kids plus number of teenagers in the household.
df["Total_Children"] = df["Kidhome"].add(df["Teenhome"])

In [None]:
# Education: collapse the detailed education levels into three broader tiers.

# NOTE: by default a notebook only renders a cell's last expression, so this
# count is computed but not shown.
df["Education"].value_counts()

education_tiers = {
    "Undergraduate": ["Basic", "2n Cycle"],
    "Graduate": ["Graduation"],
    "Postgraduate": ["Master", "PhD"],
}

# Invert the tier -> levels table into the level -> tier lookup used by replace().
education_lookup = {
    level: tier
    for tier, levels in education_tiers.items()
    for level in levels
}

df["Education"] = df["Education"].replace(education_lookup)


In [None]:
# Marital status: reduce it to whether the customer lives with a partner or alone.
partner_statuses = ["Married", "Together"]
alone_statuses = ["Single", "Divorced", "Widow", "Absurd", "YOLO"]

living_with_lookup = {
    **dict.fromkeys(partner_statuses, "Partner"),
    **dict.fromkeys(alone_statuses, "Alone"),
}

# Statuses that are not listed above are carried over unchanged by replace().
df["Living_With"] = df["Marital_Status"].replace(living_with_lookup)

## Drop Columns


In [None]:
# Identifier and raw columns that the engineered features above have replaced.
cols = [
    "ID",
    "Year_Birth",
    "Marital_Status",
    "Kidhome",
    "Teenhome",
    "Dt_Customer",
]

# Per-category amounts, now summarised by Total_Spending.
spending_cols = [
    "MntWines",
    "MntFruits",
    "MntMeatProducts",
    "MntFishProducts",
    "MntSweetProducts",
    "MntGoldProds",
]

cols_to_drop = [*cols, *spending_cols]

# drop() returns a new frame; df itself keeps all of its columns.
df_cleaned = df.drop(cols_to_drop, axis="columns")

In [None]:
# (rows, columns) remaining after the drop.
df_cleaned.shape

## Outliers

In [None]:
# Pair plot: pairwise relationships between a handful of features.
cols = [
    "Income",
    "Recency",
    "Response",
    "Age",
    "Total_Spending",
    "Total_Children",
]
sns.pairplot(df_cleaned.loc[:, cols])

In [None]:
# Remove outliers: keep only rows with Age below 90 and Income below 600,000.

print("data size with outliers :", len(df_cleaned))

age_in_range = df_cleaned["Age"].lt(90)
income_in_range = df_cleaned["Income"].lt(600_000)
df_cleaned = df_cleaned[age_in_range & income_in_range]

print("data size without outliers :", len(df_cleaned))


## Heatmap

In [None]:
# Pairwise correlations between the numeric columns only.
corr = df_cleaned.corr(numeric_only=True)


In [None]:
# Draw the correlation matrix as an annotated heatmap on a new 8x6 figure.
plt.figure(figsize=(8, 6))

heatmap_options = {
    "annot": True,
    "annot_kws": {"size": 6},  # small font for the per-cell annotations
    "cmap": "coolwarm",
}
sns.heatmap(corr, **heatmap_options)

## Encoding

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# One-hot encode the two categorical columns.
cat_cols = ["Education", "Living_With"]

ohe = OneHotEncoder()
ohe.fit(df_cleaned[cat_cols])

# Encoded matrix; it is converted with .toarray() when the frame is built.
enc_cols = ohe.transform(df_cleaned[cat_cols])

In [None]:
# Wrap the encoded matrix in a DataFrame that shares df_cleaned's index,
# so the two frames can be joined row by row.
encoded_names = ohe.get_feature_names_out(cat_cols)
enc_df = pd.DataFrame(
    enc_cols.toarray(),
    index=df_cleaned.index,
    columns=encoded_names,
)

In [None]:
# Swap the original categorical columns for their one-hot encoded counterparts.
non_categorical = df_cleaned.drop(columns=cat_cols)
df_encoded = pd.concat([non_categorical, enc_df], axis="columns")

In [None]:
# (rows, columns) after encoding.
df_encoded.shape

# Scaling

In [None]:
from sklearn.preprocessing import  StandardScaler

In [None]:
# Feature matrix for scaling, PCA and clustering.
# Take a copy rather than an alias: a "cluster" column is written into X later,
# and with a plain alias that write would also land in df_encoded. Re-running
# this cell and the scaling cell would then treat the cluster labels as a feature.
X = df_encoded.copy()

In [None]:
# Standardise every feature column, learning the scaling parameters from X itself.
scaler = StandardScaler().fit(X)

X_scaled = scaler.transform(X)

## Visualize

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Project the scaled features onto three principal components.
n_pca_components = 3
pca = PCA(n_components=n_pca_components)

X_pca = pca.fit_transform(X_scaled)

In [None]:
# Share of the total variance captured by each retained component.
pca.explained_variance_ratio_

In [None]:
# 3D scatter of the customers in the space of the three principal components.
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(1, 1, 1, projection="3d")

# One coordinate array per principal component.
pc1, pc2, pc3 = X_pca[:, 0], X_pca[:, 1], X_pca[:, 2]
ax.scatter(pc1, pc2, pc3)

ax.set(xlabel="PCA1", ylabel="PCA2", zlabel="PCA3")
ax.set_title("3d projection")

# Analyze K value
## 1. Elbow Method

In [None]:
from sklearn.cluster import KMeans
from kneed import KneeLocator

# Within-cluster sum of squares (inertia) for K = 1..10 on the PCA projection.
wcss = []
for k in range(1, 11):
    # Only the fitted inertia is needed here; the labels are not used.
    kmeans = KMeans(n_clusters=k, random_state=42).fit(X_pca)
    wcss.append(kmeans.inertia_)
    

In [None]:
# Locate the elbow of the decreasing, convex WCSS curve.
knee = KneeLocator(
    x=range(1, 11),
    y=wcss,
    curve="convex",
    direction="decreasing",
)
optimal_k = knee.elbow

In [None]:
# Report the K found by the elbow locator.
print(f"best k = {optimal_k}")

In [None]:
# Elbow curve: WCSS against the number of clusters.

elbow_ax = plt.gca()
elbow_ax.plot(range(1, 11), wcss, marker="o")
elbow_ax.set_xlabel("K")
elbow_ax.set_ylabel("WCSS")

## 2. Silhouette Score

In [None]:
from sklearn.metrics import silhouette_score

# Silhouette score for K = 2..10 (the loop starts at two clusters).
scores = []

for k in range(2, 11):
    kmeans = KMeans(n_clusters=k, random_state=42)
    labels = kmeans.fit_predict(X_pca)
    scores.append(silhouette_score(X_pca, labels))

# Silhouette score against the number of clusters.
silhouette_ax = plt.gca()
silhouette_ax.plot(range(2, 11), scores, marker="o")
silhouette_ax.set_xlabel("K")
silhouette_ax.set_ylabel("Silhouette score")

In [None]:
# Combined plot: WCSS (left axis) and silhouette score (right axis) against K.

k_range = range(2, 11)

# wcss was collected for K = 1..10, whereas the silhouette scores start at K = 2.
# Skip the K = 1 entry so every WCSS value is drawn at the K it was computed for
# (slicing from index 0 would shift the whole curve one step to the right).
wcss_for_k_range = wcss[1:len(k_range) + 1]

fig, ax1 = plt.subplots(figsize=(8, 6))

ax1.plot(k_range, wcss_for_k_range, marker="o", color="blue")
ax1.set_xlabel("K")
ax1.set_ylabel("WCSS")

# Second y-axis sharing the same K axis, because the two metrics differ in scale.
ax2 = ax1.twinx()
ax2.plot(k_range, scores[:len(k_range)], marker="x", color="red", linestyle="--")
ax2.set_ylabel("SS")

# Clustering

In [None]:
# K-means with four clusters on the PCA projection.

kmeans = KMeans(n_clusters=4, random_state=42)
kmeans.fit(X_pca)

# Cluster index assigned to each row of X_pca.
labels_kmeans = kmeans.labels_

In [None]:
# 3D scatter of the PCA projection, coloured by K-means cluster.
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(1, 1, 1, projection="3d")

pc1, pc2, pc3 = X_pca[:, 0], X_pca[:, 1], X_pca[:, 2]
ax.scatter(pc1, pc2, pc3, c=labels_kmeans)

In [None]:
# Agglomerative Clustering
from sklearn.cluster import AgglomerativeClustering

In [None]:
# Ward-linkage agglomerative clustering with four clusters on the PCA projection.
agg_clf = AgglomerativeClustering(linkage="ward", n_clusters=4)
agg_clf.fit(X_pca)

# Cluster index assigned to each row of X_pca.
labels_agg = agg_clf.labels_

In [None]:
# 3D scatter of the PCA projection, coloured by agglomerative cluster.
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(1, 1, 1, projection="3d")

pc1, pc2, pc3 = X_pca[:, 0], X_pca[:, 1], X_pca[:, 2]
ax.scatter(pc1, pc2, pc3, c=labels_agg)

# Characterization of Clusters

In [None]:

# Attach each row's agglomerative cluster label as a new "cluster" column.
# NOTE(review): this writes into X in place, so from here on X holds a label
# column alongside the features.
X["cluster"] = labels_agg

In [None]:
# One colour per cluster; the same palette is used by the scatter plot that follows.
pal = ["red", "blue", "yellow", "green"]

# Number of customers in each cluster.
sns.countplot(data=X, x="cluster", hue="cluster", palette=pal)

In [None]:
# Income and spending patterns: income against total spending, coloured by cluster.

sns.scatterplot(
    data=X,
    x="Total_Spending",
    y="Income",
    hue="cluster",
    palette=pal,
)

In [None]:
# Cluster summary: per-cluster mean of every column.

rows_by_cluster = X.groupby("cluster")
cluster_summary = rows_by_cluster.agg("mean")
print(cluster_summary)