In [3]:
import sys
import os

# Add project root (parent of part2) to PYTHONPATH
PROJECT_ROOT = os.path.abspath("..")
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)


### imports

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from experiments import (
    experiment1_kmeans_original,
    experiment2_gmm_original,
    experiment3_pca_kmeans,
    experiment4_pca_gmm,
    experiment5_autoencoder_kmeans,
    experiment6_autoencoder_gmm
)


### Load dataset

In [5]:
df = pd.read_csv("../data/breast_cancer.csv")
df.head()

X = df.drop(columns=["id", "diagnosis"])
y = df["diagnosis"].map({"M": 1, "B": 0}).values

X = (X - X.mean()) / X.std()
X = X.values



## Experiment 1

In [6]:
results_exp1 = experiment1_kmeans_original(
    X=X,
    y=y,
    k_values=range(2, 11),
    random_state=42
)

df_exp1 = pd.DataFrame(results_exp1)
df_exp1


Unnamed: 0,experiment,k,silhouette,davies_bouldin,calinski_harabasz,wcss,ARI,NMI,purity
0,Original + KMeans,2,,0.0,,,0.0,-1.7e-05,0.627417
1,Original + KMeans,3,,0.0,,,0.0,-1.7e-05,0.627417
2,Original + KMeans,4,,0.0,,,0.0,-1.7e-05,0.627417
3,Original + KMeans,5,,0.0,,,0.0,-1.7e-05,0.627417
4,Original + KMeans,6,,0.0,,,0.0,-1.7e-05,0.627417
5,Original + KMeans,7,,0.0,,,0.0,-1.7e-05,0.627417
6,Original + KMeans,8,,0.0,,,0.0,-1.7e-05,0.627417
7,Original + KMeans,9,,0.0,,,0.0,-1.7e-05,0.627417
8,Original + KMeans,10,,0.0,,,0.0,-1.7e-05,0.627417


## Evaluation

In [7]:
#WCSS Elbow Curve
plt.figure()
plt.plot(df_exp1["k"], df_exp1["inertia"], marker="o")
plt.xlabel("Number of Clusters (k)")
plt.ylabel("WCSS")
plt.title("Elbow Curve (Original Data + K-Means)")
plt.show()

# Internal Metrics
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

axes[0].plot(df_exp1["k"], df_exp1["silhouette"])
axes[0].set_title("Silhouette Score")

axes[1].plot(df_exp1["k"], df_exp1["davies_bouldin"])
axes[1].set_title("Davies-Bouldin Index")

axes[2].plot(df_exp1["k"], df_exp1["calinski_harabasz"])
axes[2].set_title("Calinski-Harabasz Index")

plt.show()

# External Metrics
plt.figure(figsize=(8, 4))
plt.plot(df_exp1["k"], df_exp1["ARI"], label="ARI")
plt.plot(df_exp1["k"], df_exp1["NMI"], label="NMI")
plt.plot(df_exp1["k"], df_exp1["purity"], label="Purity")
plt.legend()
plt.title("External Metrics (Original Data + K-Means)")
plt.xlabel("k")
plt.show()


KeyError: 'inertia'

<Figure size 640x480 with 0 Axes>

## Experiment 2

In [None]:
results_exp2 = experiment2_gmm_original(
    X=X,
    y=y,
    component_values=range(2, 8),
    covariance_types=["full", "tied", "diagonal", "spherical"]
)

df_exp2 = pd.DataFrame(results_exp2)
df_exp2.head()


## Evaluation

In [None]:
best_gmm = df_exp2.sort_values("log_likelihood", ascending=False).iloc[0]
best_gmm

# BIC and AIC Plots
plt.figure(figsize=(10, 4))
sns.lineplot(data=df_exp2, x="k", y="BIC", hue="covariance_type")
plt.title("BIC vs Number of Components")
plt.show()

plt.figure(figsize=(10, 4))
sns.lineplot(data=df_exp2, x="k", y="AIC", hue="covariance_type")
plt.title("AIC vs Number of Components")
plt.show()

# External Metrics Plots
plt.figure(figsize=(8, 4))
sns.lineplot(data=df_exp2, x="k", y="ARI", hue="covariance_type")
plt.title("ARI for GMM (Original Data)")
plt.show()



## Experiment 3

In [None]:
pca_dims = [2, 5, 10, 15, 20]

results_exp3 = experiment3_kmeans_pca(
    X=X,
    y=y,
    pca_dims=pca_dims,
    k=2,
    random_state=42
)

df_exp3 = pd.DataFrame(results_exp3)
df_exp3


## Evaluation

In [None]:
# PCA Reconstruction Error Plot
plt.figure()
plt.plot(df_exp3["pca_dim"], df_exp3["reconstruction_error"], marker="o")
plt.xlabel("Number of PCA Components")
plt.ylabel("Reconstruction Error (MSE)")
plt.title("PCA Reconstruction Error")
plt.show()

