In [2]:
# Optional: uncomment to install the project's dependencies into this kernel's environment.
# %pip install -r ../../requirements.txt

In [None]:
import os
import numpy as np
import pandas as pd

from pic import PIC
from evaluation import evaluate_clustering_algorithms, plot_clustering_algorithms, plot_silhouette
from datasets import synthetic_dataset_1, synthetic_dataset_2, synthetic_dataset_3, prepare_datasets, load_dataset

In [2]:
# Dataset locations, relative to the notebook's working directory.
# The raw-data folder is the `raw` subfolder of the data root ('../data/raw').
data_path = '../data'
raw_data_path = f'{data_path}/raw'

# Project helper from `datasets`; presumably builds the processed datasets
# under `data_path` from the files in `raw_data_path` — TODO confirm.
prepare_datasets(data_path, raw_data_path)

In [None]:
# Benchmark PIC and the baseline algorithms on the image datasets.
# `results` maps dataset name -> table returned by evaluate_clustering_algorithms;
# later cells add more datasets to it and read the metric columns from it.
results = {}
for dataset in ('MNIST', 'USPS', 'Caltech-256'):
    X, Y_true = load_dataset(data_path, dataset)
    print(f'Loaded {dataset} dataset with shape {X.shape}')

    # Number of clusters is taken as (largest label + 1);
    # assumes labels are 0-based integers — TODO confirm against load_dataset.
    n_clusters = int(np.max(Y_true) + 1)

    # Fit PIC and get its cluster assignment for every sample.
    Y_pred_PIC = PIC(n_clusters=n_clusters, k=20, a=0.95, z=0.01).fit_predict(X)

    # Score PIC's labelling (and the baselines run inside the helper) against the ground truth.
    results[dataset] = evaluate_clustering_algorithms(X, Y_true, n_clusters, Y_pred_PIC)


Loaded MNIST dataset with shape (5139, 784)
Time taken for PIC fit_predict: 256.05274534225464 seconds
k-med: NMI = 0.3180, CE = 0.5338
A-link: NMI = 0.4085, CE = 0.5733
S-link: NMI = 0.0016, CE = 0.7786
C-link: NMI = 0.5389, CE = 0.2804
AP: NMI = 0.4258, CE = 0.9597
NCuts: NMI = 0.8069, CE = 0.1152
NJW: NMI = 0.8977, CE = 0.0333
CT: NMI = 0.6342, CE = 0.4927
Zell: NMI = 0.9128, CE = 0.0272
C-kernel: NMI = 0.7801, CE = 0.1288
D-kernel: NMI = 0.9029, CE = 0.0286
Loaded USPS dataset with shape (9298, 256)
Time taken for PIC fit_predict: 854.7342283725739 seconds
k-med: NMI = 0.5527, CE = 0.3729
A-link: NMI = 0.1393, CE = 0.7777
S-link: NMI = 0.0022, CE = 0.8328
C-link: NMI = 0.3738, CE = 0.6006
AP: NMI = 0.5246, CE = 0.9342
NCuts: NMI = 0.7715, CE = 0.3564
NJW: NMI = 0.7842, CE = 0.2688
CT: NMI = 0.4385, CE = 0.6146
Zell: NMI = 0.8455, CE = 0.1971
C-kernel: NMI = 0.7684, CE = 0.2686
D-kernel: NMI = 0.8461, CE = 0.1320
Loaded Caltech-256 dataset with shape (600, 4200)
Time taken for PIC f

In [None]:
# Same benchmark on the small tabular datasets, plus a silhouette plot of the
# PIC labelling. Adds to the `results` dict created by the earlier benchmark cell.
for dataset in ('Iris', 'Breast-Cancer'):
    X, Y_true = load_dataset(data_path, dataset)
    print(f'Loaded {dataset} dataset with shape {X.shape}')

    # Number of clusters is taken as (largest label + 1);
    # assumes labels are 0-based integers — TODO confirm against load_dataset.
    n_clusters = int(np.max(Y_true) + 1)

    # Fit PIC and get its cluster assignment for every sample.
    Y_pred_PIC = PIC(n_clusters=n_clusters, k=20, a=0.95, z=0.01).fit_predict(X)

    results[dataset] = evaluate_clustering_algorithms(X, Y_true, n_clusters, Y_pred_PIC)

    # Per-dataset plot location handed to the plotting helper.
    plot_silhouette(X, Y_pred_PIC, f'../results/plots/{dataset}')


In [None]:
# Build one comparison table per metric (rows: algorithms, columns: datasets),
# print each table and save it as a CSV file under `results_path`.
#
# Reads `results`, the dict of per-dataset evaluation tables filled in by the
# benchmark cells above. Each table must provide an 'Algorithm' column and one
# column per metric listed below; a missing column raises KeyError before
# anything is printed or written.
score_tables = {}
for metric in ('NMI', 'CE', 'Silhouette'):
    table = pd.DataFrame()
    for dataset, df in results.items():
        # One column per dataset, indexed by algorithm name.
        table[dataset] = df.set_index('Algorithm')[metric]
    score_tables[metric] = table

# Keep the individual tables available under their original names.
nmi_scores = score_tables['NMI']
ce_scores = score_tables['CE']
silhouette_scores = score_tables['Silhouette']

results_path = '../results'
# to_csv does not create missing directories, so make sure the target exists
# (otherwise a fresh checkout fails at the first save).
os.makedirs(results_path, exist_ok=True)

# (header to print, table, output file name) — headers after the first start
# with a newline so the tables are separated by a blank line.
reports = [
    ("NMI Scores:", nmi_scores, 'nmi_scores.csv'),
    ("\nCE Scores:", ce_scores, 'ce_scores.csv'),
    ("\nSilhouette Scores:", silhouette_scores, 'silhouette_scores.csv'),
]
for header, table, filename in reports:
    print(header)
    print(table)
    table.to_csv(os.path.join(results_path, filename))

NMI Scores:
              MNIST      USPS  Caltech-256
Algorithm                                 
PIC        0.940414  0.835289     0.652795
k-med      0.318010  0.552715     0.314882
A-link     0.408475  0.139269     0.312531
S-link     0.001565  0.002166     0.019359
C-link     0.538921  0.373770     0.395362
AP         0.425761  0.524590     0.492139
NCuts      0.806937  0.771538     0.589497
NJW        0.897719  0.784214     0.528869
CT         0.634169  0.438512     0.181210
Zell       0.912796  0.845529     0.343009
C-kernel   0.780087  0.768363     0.521350
D-kernel   0.902892  0.846122     0.508032

CE Scores:
              MNIST      USPS  Caltech-256
Algorithm                                 
PIC        0.015956  0.268875     0.306667
k-med      0.533761  0.372876     0.606667
A-link     0.573263  0.777694     0.665000
S-link     0.778556  0.832760     0.828333
C-link     0.280405  0.600559     0.506667
AP         0.959720  0.934179     0.705000
NCuts      0.115198  0.356421 

In [3]:
# Run PIC on the three synthetic datasets and plot it next to the other
# clustering algorithms.
synthetic_generators = (synthetic_dataset_1, synthetic_dataset_2, synthetic_dataset_3)
for i, make_dataset in enumerate(synthetic_generators):
    # NOTE: `i` is 0-based, so synthetic_dataset_1 is plotted under
    # 'dataset_0', synthetic_dataset_2 under 'dataset_1', and so on.
    plots_path = f'../results/plots/dataset_{i}'

    # Each generator returns the samples and their ground-truth labels.
    X, Y_true = make_dataset(cluster_density=0.75)

    # Number of clusters is taken as (largest label + 1);
    # assumes labels are 0-based integers — TODO confirm against the generators.
    n_clusters = int(np.max(Y_true) + 1)

    Y_pred_PIC = PIC(n_clusters=n_clusters, k=20, a=0.95, z=0.01).fit_predict(X)

    plot_clustering_algorithms(X, Y_true, n_clusters, Y_pred_PIC, plots_path)

Time taken for PIC fit_predict: 30.657036781311035 seconds




Time taken for PIC fit_predict: 341.81405544281006 seconds




Time taken for PIC fit_predict: 31.962682247161865 seconds


