In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns   
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler
import preprocessing

In [3]:
# Load and scale data with RobustScaler
df_unscaled = pd.read_csv(r"Preprocessed_Data\unscaled_filtered.csv", index_col=0)
scaler = RobustScaler()
df_scaled_robust = pd.DataFrame(
    scaler.fit_transform(df_unscaled),
    columns=df_unscaled.columns,
    index=df_unscaled.index
)

# Dimensionality reduction
X_reduced = preprocessing.pca_umap_reduction(
    df_scaled_robust,
    pca_components=0.95,  # Keeps 95% of variance
    umap_components=10,   # Reduces to 10 dimensions
    use_umap=True
)


# Create DataFrame from reduced data
df_reduced = pd.DataFrame(X_reduced, index=df_unscaled.index).dropna()

# Ensure matching index for interpretation
df_unscaled = df_unscaled.loc[df_reduced.index]

print("Original dimensions:", df_scaled_robust.shape)
print("Reduced dimensions:", df_reduced.shape)

  warn(


Original dimensions: (487, 126)
Reduced dimensions: (487, 10)


In [14]:
from sklearn.cluster import Birch
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline



# 1) Build a pipeline to scale & cluster
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('birch', Birch())
])

# 2) Define hyperparameter grid
param_grid = {
    'birch__threshold': [round(2 + i*0.1, 1) for i in range(31)],
    'birch__branching_factor': [15, 30, 50],
    'birch__n_clusters': [3, 5, 8]
}

# 3) Use silhouette_score as our guide (higher=better)
scorer = make_scorer(silhouette_score)

# 4) Set up GridSearchCV (3‐fold stratified isn’t really a thing for clustering,
#    but we can use plain KFold or just fit the whole dataset each time)
search = GridSearchCV(
    pipe,
    param_grid=param_grid,
    scoring=scorer,
    cv=3,
    n_jobs=-1,
    verbose=1,
    error_score=-1
)

# 5) Fit & inspect best params
search.fit(df_scaled_robust)  
print("Best score:", search.best_score_)
print("Best params:", search.best_params_)


Fitting 3 folds for each of 279 candidates, totalling 837 fits
Best score: -1.0
Best params: {'birch__branching_factor': 15, 'birch__n_clusters': 3, 'birch__threshold': 2.0}


In [12]:
model = Birch(threshold=0.2, branching_factor=15, n_clusters=3)
model.fit(df_scaled_robust)
leaf_counts = model.subcluster_centers_.shape[0]  

In [13]:
leaf_counts

487

In [11]:
search.subcluster_centers_.shape[0]  

AttributeError: 'GridSearchCV' object has no attribute 'subcluster_centers_'