In [None]:
import io

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

In [None]:
import yaml
import logging
from datetime import datetime
from pathlib import Path

# YAML config
# The file is looked up relative to the current working directory. A plain
# relative Path is used instead of a backslash literal so the lookup works on
# every platform. Any failure (missing file, malformed YAML) propagates
# unchanged to the caller.
with open(Path("config.yaml"), "r") as f:
    config = yaml.safe_load(f)

# Logger
# One log file per run, named after the start time. The timestamp is built
# outside the f-string because reusing the same quote character inside an
# f-string expression is a SyntaxError before Python 3.12.
log_dir = Path(config["log_dir"])
# FileHandler does not create missing parent directories, so do it here.
log_dir.mkdir(parents=True, exist_ok=True)
run_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
# Joining with `/` gives the same path whether or not `log_dir` ends with a
# separator.
log_file = log_dir / f"{run_timestamp}.log"

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(name)s - %(funcName)s - %(message)s",
    filename=log_file,
    filemode="w"
)
logger = logging.getLogger(__name__)

logger.info("Config file and logger setup completed.")

In [None]:
# Read the CSV named by config["data_path"] into `df`.
# Failures are logged and then re-raised so the notebook stops at this cell:
# a missing file gets a dedicated error message, anything else is logged with
# its traceback.
try:
    df = pd.read_csv(config["data_path"])
except FileNotFoundError:
    logger.error("Error: Customer segmentation CSV file not found.")
    raise
except Exception:
    logger.exception("An unexpected error occurred while loading the dataset:")
    raise
else:
    logger.info("Dataset loaded successfully.")

In [None]:
# Data Exploration
# DataFrame.info() writes its report to a stream and returns None, so the
# text is captured in an in-memory buffer. The same report can then be both
# written to the log and shown in the notebook output.
info_buffer = io.StringIO()
df.info(buf=info_buffer)
df_info = info_buffer.getvalue()
logger.info(f"DataFrame Info:\n{df_info}")
print(df_info)
print(df.head())

In [None]:
# All later preprocessing is applied to this deep copy, so the loaded frame
# `df` keeps its original values.
df_copy = df.copy(deep=True)
logger.info("DataFrame copy created.")

In [None]:
# Split the column names into numeric and non-numeric groups in a single pass,
# preserving the frame's column order within each group.
num_cols, cat_cols = [], []
for column_name in df_copy.columns:
    if pd.api.types.is_numeric_dtype(df_copy[column_name]):
        num_cols.append(column_name)
    else:
        cat_cols.append(column_name)

In [None]:
# Min-max scale the numeric columns of `df_copy` in place. The fitted scaler
# is kept in `scaler` so the transform can be inverted later.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df_copy[num_cols])
df_copy[num_cols] = scaled_values

In [None]:
# Encode the Gender labels as integers so the column can be used as a feature.
# NOTE(review): any label other than these two would map to NaN - confirm the
# column only ever holds "Male" / "Female".
gender_codes = {"Male": 1, "Female": 0}
df_copy["Gender"] = df_copy["Gender"].map(gender_codes)

In [None]:
# Elbow method: fit K-Means once per candidate cluster count and record the
# inertia of each fit in `scores` (same order as `range_values`).
scores = []
range_values = range(1, 11)

for k in range_values:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init="auto")
    kmeans.fit(df_copy)
    scores.append(kmeans.inertia_)
    logger.info(f"K-Means: {kmeans.inertia_}")
logger.info("K-Means clustering complete.")
# After the loop, `kmeans` holds the model fitted with the largest k.

# Plot inertia against the actual cluster counts. Passing only `scores` would
# label the x axis 0-9, shifting the elbow one cluster to the left.
cluster_counts = list(range_values)
plt.plot(cluster_counts, scores, "bx-")
plt.xticks(cluster_counts)
plt.xlabel("Number of clusters (k)")
plt.ylabel("Inertia")
plt.title("K-Means elbow curve");

In [None]:
# Plot the histogram of various clusters
# Label every row with the model left in `kmeans`, then restore the numeric
# columns to their original units so the histograms are readable.
# NOTE: this cell mutates `df_copy` (adds "cluster", undoes the scaling), so it
# is not safe to re-run on its own; re-run from the scaling cell instead.
df_copy["cluster"] = kmeans.fit_predict(df_copy)
df_copy[num_cols] = scaler.inverse_transform(df_copy[num_cols])

# Take the panel count from the fitted model instead of a hard-coded 10, so
# the figure always matches the number of clusters that were produced.
n_clusters = kmeans.n_clusters

for col in df_copy.columns:
    if col == "cluster":
        continue
    # One figure per feature, one panel per cluster. squeeze=False keeps
    # `axes` two-dimensional even when there is a single cluster.
    fig, axes = plt.subplots(1, n_clusters, figsize=(20, 4), squeeze=False)
    for i, ax in enumerate(axes[0]):
        cluster = df_copy[df_copy["cluster"] == i]
        cluster[col].hist(bins=10, ax=ax)
        ax.set_title(f"{col}\nCluster{i}")
    # Layout only needs to be computed once per figure, after all panels exist.
    fig.tight_layout()