## Hyperparameter tuning of Birch

In [17]:
import pandas as pd

# Load the preprocessed dataset; the path is relative to the notebook's
# working directory.
data = pd.read_csv(
    filepath_or_buffer='../data/processed/processed.csv',
)

In [18]:
import numpy as np
from sklearn.cluster import Birch
from sklearn.metrics import silhouette_score

# Exhaustive grid search over Birch hyperparameters. Every combination of
# (n_clusters, threshold, branching_factor) is fitted on `data` and scored
# with the mean silhouette coefficient; the best configuration overall and
# the best configuration for each n_clusters are reported at the end.
threshold_range = np.arange(0.1, 0.9, 0.1)
branching_factor_range = np.arange(10, 100, 10)
n_clusters_range = np.arange(2, 21, 1)

# Best configuration over the whole grid. The score starts at -inf rather
# than 0 because silhouette values lie in [-1, 1]: with a 0 baseline a
# search whose scores are all negative would never record a winner.
best_score = {
    'silhouette_coefficient': float('-inf'),
    'threshold': 0,
    'branching_factor': 0,
    'n_clusters': 0
}

# best_per_cluster[k] is the best-scoring entry found for n_clusters == k.
# Indices that are never searched (0 and 1 with the range above) stay None.
best_per_cluster = [None] * (int(n_clusters_range.max()) + 1)

for n_clusters in n_clusters_range:
    # Scores collected for this n_clusters only. The list is reset on every
    # outer iteration because the entries do not record n_clusters.
    per_cluster = []

    for threshold in threshold_range:
        for branching_factor in branching_factor_range:

            birch = Birch(threshold=threshold, branching_factor=branching_factor, n_clusters=n_clusters)
            prediction = birch.fit_predict(X=data)

            # silhouette_score is only defined for 2 <= n_labels <= n_samples - 1
            # and raises ValueError otherwise. Skip such degenerate fits so a
            # single bad combination does not abort the whole search.
            n_labels = len(np.unique(prediction))
            if not 2 <= n_labels <= len(data) - 1:
                print(
                    f"n_clusters: {n_clusters:>2}, threshold: {threshold:.1f}, "
                    f"branching_factor: {branching_factor}, "
                    f"skipped: {n_labels} distinct label(s)"
                )
                continue

            silhouette_score_average = silhouette_score(data, prediction)
            per_cluster.append({
                'silhouette_score_average': silhouette_score_average,
                'threshold': threshold, 
                'branching_factor': branching_factor
            })

            if silhouette_score_average > best_score['silhouette_coefficient']:
                best_score['silhouette_coefficient'] = silhouette_score_average
                best_score['threshold'] = threshold
                best_score['branching_factor'] = branching_factor
                best_score['n_clusters'] = n_clusters

            print(f"n_clusters: {n_clusters:>2}, threshold: {threshold:.1f}, branching_factor: {branching_factor}, silhouette score average: {silhouette_score_average:.4f}")

    # max() returns the first maximal entry, so ties resolve to the earliest
    # combination tried, matching the strict `>` used for best_score.
    if per_cluster:
        best_per_cluster[n_clusters] = max(
            per_cluster,
            key=lambda entry: entry['silhouette_score_average'],
        )

print("----------------#####-----#####---------------------")

print(best_per_cluster)

print("The best score from tuning is: ")
print(f"n_clusters: {best_score['n_clusters']}, threshold: {best_score['threshold']}, branching_factor: {best_score['branching_factor']}, silhouette coefficient: {best_score['silhouette_coefficient']:.4f}")

----------------#####-----#####---------------------
[None, None, {'silhouette_score_average': 0.47045655202957437, 'threshold': 0.6, 'branching_factor': 10}, {'silhouette_score_average': 0.3684619962399025, 'threshold': 0.6, 'branching_factor': 10}, {'silhouette_score_average': 0.29786783569359865, 'threshold': 0.8, 'branching_factor': 10}, {'silhouette_score_average': 0.29786783569359865, 'threshold': 0.8, 'branching_factor': 10}, {'silhouette_score_average': 0.29786783569359865, 'threshold': 0.8, 'branching_factor': 10}, {'silhouette_score_average': 0.29786783569359865, 'threshold': 0.8, 'branching_factor': 10}, {'silhouette_score_average': 0.29786783569359865, 'threshold': 0.8, 'branching_factor': 10}, {'silhouette_score_average': 0.29786783569359865, 'threshold': 0.8, 'branching_factor': 10}, {'silhouette_score_average': 0.29786783569359865, 'threshold': 0.8, 'branching_factor': 10}, {'silhouette_score_average': 0.29786783569359865, 'threshold': 0.8, 'branching_factor': 10}, {'sil