In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to the project code take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Diagnostic only: shell out to `nvidia-smi` to show the GPU status report.
# Nothing below reads this output; the cell needs `nvidia-smi` on PATH.
!nvidia-smi

In [3]:
import logging
import pandas as pd

from tqdm.auto import tqdm
from utilities.clustering.cluster_utils import cluster_kmeans, cluster_custom
from utilities.clustering.cluster_model import ClusterModel, rand_index, adjusted_rand_index
from utilities.utils import shared_dir, figures_dir, env_bool, read_json, write_json, get_cuda_availability, \
    kmeans_tuning_dir, custom_tuning_dir

In [4]:
# Register tqdm's progress-bar integration with pandas.
tqdm.pandas()

# Never truncate cell contents when rendering DataFrames.
pd.options.display.max_colwidth = None

# Only CRITICAL log records reach the notebook output.
logging.basicConfig(level=logging.CRITICAL)

# Device handed to the clustering helpers below.
# NOTE(review): presumably a CUDA device when one is available — confirm in utilities.utils.
DEVICE = get_cuda_availability()

### Load dataset for clustering

In [None]:
# Read the pickled input that every later cell works from.
# NOTE(review): unpickling can execute arbitrary code — only load a
# `clusters.pickle` produced by this project's own pipeline.
clusters_path = f'{shared_dir}/clusters.pickle'
cluster_df = pd.read_pickle(clusters_path)

# Preview the first rows.
cluster_df.head()

### Build cluster model and print sample data

In [None]:
# Hyperparameters used when fitting and when locating the saved labelled clusters.
# NOTE(review): the units of the two limits are not visible here — confirm
# against ClusterModel.fit before changing them.
time_limit, space_limit, n_clusters = 9, 10, 7000

# Algorithm used by the fitting and saving cells: 'kmeans' or 'custom'.
cluster_type = 'kmeans'

# Wrap the loaded frame in the project's clustering model and show a sample.
cluster_model = ClusterModel(device=DEVICE, df=cluster_df)
cluster_model.print_sample_data()

### Hyperparameter tuning

This step is optional because of its significant runtime; it only runs when the `TUNE_CLUSTER` flag is enabled.

In [None]:
# Opt-in tuning run, gated on the TUNE_CLUSTER flag.
if env_bool('TUNE_CLUSTER'):
    # `timezones` is threaded through both helpers: each receives the current
    # mapping and hands back a (possibly updated) one.
    timezones = {}
    ds_custom, timezones = cluster_custom(cluster_df, timezones, DEVICE)
    ds_kmeans, timezones = cluster_kmeans(cluster_df, timezones, DEVICE)

    # Persist both results before plotting (custom first, then k-means).
    for tuning_dir, distances in ((custom_tuning_dir, ds_custom), (kmeans_tuning_dir, ds_kmeans)):
        write_json(distances, f'{tuning_dir}/clustering_distances.json')

    # Plot each algorithm's distances in turn; the model plots whatever is
    # currently assigned to `cluster_model.distances`.
    for tuned_type, distances in (('custom', ds_custom), ('kmeans', ds_kmeans)):
        cluster_model.distances = distances
        cluster_model.plot(cluster_type=tuned_type)

### Fit cluster model

In [None]:
match cluster_type:
    case 'custom': cluster_model.fit(cluster_type, time_limit=time_limit, space_limit=space_limit)
    case 'kmeans': cluster_model.fit(cluster_type, n_clusters)

### Format and save clusters

In [None]:
# Build the formatted representation, then show its first entry as a sanity check.
cluster_model.format_data()
formatted_clusters = list(cluster_model.formatted_data.values())
formatted_clusters[0]

In [None]:
# The output location is derived from the selected algorithm, so each
# algorithm writes to its own folder and file name.
clusters_html_path = f'{figures_dir}/clustering/{cluster_type}_clusters/{cluster_type}_clusters.html'
cluster_model.save(filename=clusters_html_path)
cluster_model.display()

### Calculate the Rand Index and Adjusted Rand Index (ARI) to compare the two algorithms

In [38]:
# Load the labelled clusters saved for each algorithm; the file names are
# keyed by the hyperparameters set in the model-building cell.
data_kmeans = read_json(f'{shared_dir}/kmeans_clusters/labelled/{n_clusters}.json')
data_custom = read_json(f'{shared_dir}/custom_clusters/labelled/{time_limit}_{space_limit}.json')

# One list per cluster, holding the keys of that cluster's 'location' mapping.
# NOTE(review): presumably these keys are the ids of the clustered items — confirm.
ids_kmeans = [list(cluster['location'].keys()) for cluster in data_kmeans.values()]
ids_custom = [list(cluster['location'].keys()) for cluster in data_custom.values()]

# Capture both scores: Jupyter renders only a cell's final expression, so a
# bare `rand_index(...)` call in the middle of the cell is never displayed.
rand_score = rand_index([ids_kmeans, ids_custom])
adjusted_rand_score = adjusted_rand_index([ids_kmeans, ids_custom])

{'rand_index': rand_score, 'adjusted_rand_index': adjusted_rand_score}