In [None]:
import io

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import (
    MinMaxScaler,
)

from .utils._logger import logger
from .utils._validation import config_args

In [None]:
# Load dataset
from pandas import DataFrame

# Read the CSV located at `config_args.data_path`. Any failure is logged and then
# re-raised so that execution stops instead of continuing without `df`.
try:
    df: DataFrame = pd.read_csv(config_args.data_path)
except FileNotFoundError:
    # A missing file gets a short, specific message (no traceback in the log).
    logger.error("Error: Customer segmentation CSV file not found.")
    raise
except Exception:
    # Anything else is logged together with its traceback.
    logger.exception("An unexpected error occurred while loading the dataset:")
    raise
else:
    logger.info("Dataset loaded successfully.")

In [None]:
# Data Exploration
# DataFrame.info() writes its report to a stream (stdout by default) and returns
# None, so logging its return value would only ever log "None". Capture the
# report in an in-memory buffer and log the captured text instead.
info_buffer = io.StringIO()
df.info(buf=info_buffer)
df_info = info_buffer.getvalue()
logger.info(f"DataFrame Info:\n{df_info}")
print(df.head())

In [None]:
# Work on an independent (deep) copy; the transformations below are applied to
# `df_copy`, not to the frame that was loaded.
df_copy: DataFrame = df.copy(deep=True)
logger.info("DataFrame copy created.")

In [None]:
# Split the column names into two lists in a single pass: columns whose dtype
# pandas reports as numeric, and every remaining column.
num_cols: list[str] = []
cat_cols: list[str] = []
for col in df_copy.columns:
    if pd.api.types.is_numeric_dtype(df_copy[col]):
        num_cols.append(col)
    else:
        cat_cols.append(col)

In [None]:
# Min-max scale the numeric columns. `scaler` is kept at module level because
# its fitted state is what allows the scaling to be inverted afterwards.
scaler = MinMaxScaler()
numeric_values = df_copy[num_cols]
scaled_values = scaler.fit(numeric_values).transform(numeric_values)
df_copy[num_cols] = scaled_values

In [None]:
# Encode Gender as 1 (Male) / 0 (Female).
# The values are read from the untouched `df` rather than from `df_copy`, so
# re-running this cell does not feed the already-encoded 1/0 values back through
# the mapping (which would turn the whole column into NaN).
gender_codes = {"Male": 1, "Female": 0}
df_copy["Gender"] = df["Gender"].map(gender_codes)

# Series.map produces NaN for every value that is not a key of the mapping
# (including values that were already missing). Report that here rather than
# letting NaNs flow silently into later steps.
unmapped_count = int(df_copy["Gender"].isna().sum())
if unmapped_count:
    logger.warning(
        f"Gender: {unmapped_count} value(s) could not be mapped to 0/1 and are NaN."
    )

In [None]:
# Elbow method: fit K-Means for each candidate number of clusters and record the
# inertia of each fit.
scores = []
range_values = range(1, 11)

for i in range_values:
    kmeans = KMeans(n_clusters=i, random_state=42, n_init="auto")
    kmeans.fit(df_copy)
    scores.append(kmeans.inertia_)
    logger.info(f"K-Means: {kmeans.inertia_}")
logger.info("K-Means clustering complete.")

# Pass the k values explicitly as x. With plt.plot(scores, ...) alone the x axis
# would be the list index 0..9, so every point would appear one cluster too low.
plt.plot(range_values, scores, "bx-")
plt.xlabel("Number of clusters (k)")
plt.ylabel("Inertia")
plt.title("K-Means elbow curve")

In [None]:
# Plot the histogram of various clusters
#
# `kmeans` is the model left bound by the preceding elbow loop (its last
# iteration). It is refit here and its labels are stored in a "cluster" column.
df_copy["cluster"] = kmeans.fit_predict(df_copy)

# Undo the min-max scaling so the histograms show the columns in their original
# value range.
# NOTE: this cell mutates `df_copy` in place and is not safe to re-run on its
# own: a second run would cluster on the unscaled values plus the "cluster"
# column and would apply inverse_transform twice.
df_copy[num_cols] = scaler.inverse_transform(df_copy[num_cols])

# Take the number of panels from the fitted model instead of hard-coding it, so
# the plots stay correct if the candidate range of the elbow loop changes.
n_clusters = kmeans.n_clusters

for col in df_copy.columns:
    if col == "cluster":
        continue
    # One figure per feature, one panel per cluster (2 inches of width each).
    plt.figure(figsize=(2 * n_clusters, 4))
    for i in range(n_clusters):
        plt.subplot(1, n_clusters, i + 1)
        cluster: DataFrame = df_copy[df_copy["cluster"] == i]
        cluster[col].hist(bins=10)
        plt.title(f"{col}\nCluster{i}")
    # Lay out once, after every panel and title exists, instead of once per panel.
    plt.tight_layout()