In [0]:
%sh
# Move to the parent directory; requirements.txt is read relative to it
# (presumably the project root - confirm).
cd ../
# Install or upgrade everything listed in requirements.txt.
pip install --upgrade -r requirements.txt
# umap-learn and hdbscan are installed separately here; the import cell
# treats both of them as optional packages.
pip install umap-learn
pip install hdbscan

In [0]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture
from sklearn.cluster import DBSCAN
from scipy.cluster.hierarchy import dendrogram, linkage

import matplotlib.pyplot as plt
import seaborn as sns

# Optional dependency: UMAP (only used by the UMAP projection cell).
# `except Exception` is used instead of a bare `except:` so KeyboardInterrupt
# and SystemExit still propagate, while a missing *or* broken install (for
# example a binary-incompatible build) is still tolerated as before.
try:
    import umap
    HAS_UMAP = True
except Exception as exc:
    HAS_UMAP = False
    print("UMAP not installed. Install with: pip install umap-learn")
    # Surface the real cause so a broken install is not mistaken for a missing one.
    print(f"  (import failed with: {exc!r})")

# Optional dependency: HDBSCAN (used by the two HDBSCAN clustering cells).
# Same error-handling policy as for UMAP above.
try:
    import hdbscan
    HAS_HDBSCAN = True
except Exception as exc:
    HAS_HDBSCAN = False
    print("HDBSCAN not installed. Install with: pip install hdbscan")
    print(f"  (import failed with: {exc!r})")

import torch
from torch import nn


In [0]:
# Columns fed to every clustering / embedding step below.
# 'Curb Weight' is currently left out of the list.
features = [
    'Overall Length',
    'Overall Width',
    'Overall Height',
    'Front End Length',
    'Rear End Length',
    'Side Glass Height',
    'Body Side Height',
    'Wheelbase',
    'Front Overhang',
    'Rear Overhang',
    'Roof Width',
    'Track Width Front',
    'Track Width Rear',
]

# Load the cleaned dataset and keep only rows where every selected feature
# is present.
df = pd.read_csv("../data/cleaned_data.csv")
feature_frame = df[features]
X = feature_frame.dropna().copy()

# Index labels of the rows that survived dropna; later cells use them to
# write cluster labels back into the matching rows of df.
valid_idx = X.index

# Standardise each feature column with StandardScaler's default settings.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [0]:
# Project the standardised features onto the first 5 principal components.
#
# random_state is pinned because PCA's default svd_solver='auto' may select
# the randomized SVD solver depending on the data shape and scikit-learn
# version. Without a seed, X_pca (and the GMM / HDBSCAN / dendrogram results
# built on it) could differ between runs. 42 matches the seed already used by
# the GMM, t-SNE and KMeans cells; it is ignored by the deterministic solvers.
pca = PCA(n_components=5, random_state=42)
X_pca = pca.fit_transform(X_scaled)

# Total fraction of variance retained by the 5 components.
print("PCA variance explained:", pca.explained_variance_ratio_.sum())


In [0]:
# Candidate numbers of mixture components: 1 through 11.
K = range(1, 12)

# Fit one full-covariance Gaussian mixture per candidate (fit() returns the
# estimator itself), then score each fitted model with BIC on the same data.
models = [
    GaussianMixture(n_components=n_comp, covariance_type='full', random_state=42).fit(X_pca)
    for n_comp in K
]
bic_scores = [fitted.bic(X_pca) for fitted in models]

# Lowest BIC wins; the same position indexes both K and models.
best_pos = np.argmin(bic_scores)
optimal_k = K[best_pos]
print("Optimal cluster count (GMM+BIC):", optimal_k)

# Label every valid row with the selected model; rows that were dropped by
# dropna keep NaN in the new column.
gmm_labels = models[best_pos].predict(X_pca)
df["GMM_Cluster"] = np.nan
df.loc[valid_idx, "GMM_Cluster"] = gmm_labels



In [0]:
# BIC versus number of mixture components, one marker per fitted model.
fig, ax = plt.subplots()
ax.plot(K, bic_scores, marker="o")
ax.set_title("GMM BIC Score")
ax.set_xlabel("Clusters")
ax.set_ylabel("BIC")
plt.show()


In [0]:
# HDBSCAN clustering in PCA space; skipped when the optional hdbscan package
# could not be imported.
if not HAS_HDBSCAN:
    print("HDBSCAN not available.")
else:
    clusterer = hdbscan.HDBSCAN(min_cluster_size=200)
    hdb_labels = clusterer.fit_predict(X_pca)

    # Start from an all-NaN column so rows dropped by dropna stay unlabeled,
    # then fill in the labels for the rows that were actually clustered.
    df["HDBSCAN_Cluster"] = np.nan
    df.loc[valid_idx, "HDBSCAN_Cluster"] = hdb_labels


In [0]:
# Ward linkage on the first 400 PCA rows only, to keep the cell fast.
# NOTE(review): this is a head slice, not a random sample of the data.
dendro_subset = X_pca[:400]
linked = linkage(dendro_subset, method='ward')

fig = plt.figure(figsize=(12, 6))
dendrogram(linked)
plt.title("Hierarchical Clustering Dendrogram")
plt.show()


In [0]:
class Autoencoder(nn.Module):
    """Fully connected autoencoder with a mirrored encoder/decoder.

    The encoder maps ``input_dim -> 32 -> 16 -> latent_dim`` and the decoder
    maps ``latent_dim -> 16 -> 32 -> input_dim``. ReLU follows each hidden
    layer; the final layer of each half has no activation.

    Args:
        input_dim: Width of the input vectors. The default, ``len(features)``,
            is evaluated once, when the class is defined.
        latent_dim: Width of the bottleneck representation (default 3).
    """

    def __init__(self, input_dim=len(features), latent_dim=3):
        super().__init__()
        # Layers are created encoder-first, then decoder, in network order.
        encoder_layers = [
            nn.Linear(input_dim, 32),
            nn.ReLU(),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Linear(16, latent_dim),
        ]
        decoder_layers = [
            nn.Linear(latent_dim, 16),
            nn.ReLU(),
            nn.Linear(16, 32),
            nn.ReLU(),
            nn.Linear(32, input_dim),
        ]
        self.encoder = nn.Sequential(*encoder_layers)
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Return ``(reconstruction, latent_code)`` for the input batch ``x``."""
        latent_code = self.encoder(x)
        reconstruction = self.decoder(latent_code)
        return reconstruction, latent_code

# Seed torch's global RNG before the model is built so the weight
# initialisation - and therefore the learned latent space and the clusters
# derived from it - is reproducible across runs. 42 matches the seed used by
# the GMM, t-SNE and KMeans cells.
torch.manual_seed(42)

# Standardised feature matrix as a float32 tensor (model input and target).
X_t = torch.tensor(X_scaled, dtype=torch.float32)

# Model, optimiser and reconstruction loss used by the training cell.
ae = Autoencoder()
opt = torch.optim.Adam(ae.parameters(), lr=1e-3)
loss_fn = nn.MSELoss()


In [0]:
# Full-batch training: each epoch is a single optimiser step over all rows.
n_epochs, log_every = 200, 50

for epoch in range(n_epochs):
    opt.zero_grad()
    recon, z = ae(X_t)
    loss = loss_fn(recon, X_t)

    # Report the loss of the current forward pass every `log_every` epochs.
    if not epoch % log_every:
        print(f"Epoch {epoch}: {loss.item():.5f}")

    loss.backward()
    opt.step()

# Latent codes of the trained model as a NumPy array; no gradients are needed
# here, so the encoder is run under no_grad.
with torch.no_grad():
    latent = ae.encoder(X_t).numpy()


In [0]:
# HDBSCAN clustering of the autoencoder latent space (skipped when the
# optional hdbscan package is unavailable).
if HAS_HDBSCAN:
    ae_cluster = hdbscan.HDBSCAN(min_cluster_size=200).fit_predict(latent)

    # `latent` has one row per entry of valid_idx (the rows that survived
    # dropna), not one per row of df. Assigning the raw array to a df column
    # raises a length-mismatch ValueError as soon as any row was dropped.
    # Wrapping the labels in a Series indexed by valid_idx lets pandas align
    # them to the right rows; rows that were dropped get NaN. When no rows
    # were dropped the result is identical to the plain assignment.
    df["AE_HDBSCAN_Type"] = pd.Series(ae_cluster, index=valid_idx)


In [0]:
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans


In [0]:
# 2-D t-SNE embedding of the standardised features.
tsne = TSNE(n_components=2, perplexity=40, random_state=42)
X_tsne = tsne.fit_transform(X_scaled)

# KMeans with 5 clusters, run on the 2-D embedding itself.
kmeans = KMeans(n_clusters=5, random_state=42)
labels = kmeans.fit_predict(X_tsne)

# Rows dropped by dropna keep NaN; the remaining rows receive their label.
df["TSNE_KMeans_Cluster"] = np.nan
df.loc[valid_idx, "TSNE_KMeans_Cluster"] = labels

# The scatter is coloured by the GMM labels, not by the KMeans labels above.
tsne_x, tsne_y = X_tsne[:, 0], X_tsne[:, 1]
plt.scatter(tsne_x, tsne_y, c=gmm_labels, s=8, cmap="tab10")
plt.title("t-SNE by GMM Vehicle Clusters")
plt.show()


In [0]:
# 2-D UMAP embedding of the standardised features, coloured by GMM cluster.
# Runs only when the optional umap package could be imported.
if HAS_UMAP:
    reducer = umap.UMAP(n_neighbors=25, min_dist=0.1)
    X_umap = reducer.fit_transform(X_scaled)

    umap_x, umap_y = X_umap[:, 0], X_umap[:, 1]
    plt.scatter(umap_x, umap_y, c=gmm_labels, s=8, cmap="tab10")
    plt.title("UMAP Vehicle Clusters")
    plt.show()
# Note: the UMAP coordinates are only plotted; they are not written to df.

In [0]:
# Write df, including the cluster columns added by the cells above, to CSV
# without the index column.
output_path = "../data/vehicles_with_all_clusters.csv"
df.to_csv(output_path, index=False)
