In [1]:
# %pip install -r ../../requirements.txt

In [1]:
import os
import numpy as np
import pandas as pd

from pic import PIC
from evaluation import evaluate_clustering_algorithms, plot_clustering_algorithms, plot_silhouette
from datasets import synthetic_dataset_1, synthetic_dataset_2, synthetic_dataset_3, prepare_datasets, load_dataset

In [2]:
# Dataset locations, relative to the notebook's working directory.
# `data_path` is reused by every later cell that calls load_dataset.
data_path = '../data'
raw_data_path = '../data/raw'
# NOTE(review): presumably converts the raw files under raw_data_path into the
# form load_dataset reads from data_path — confirm against datasets.py.
prepare_datasets(data_path, raw_data_path)

In [4]:
# Benchmark PIC against the baseline algorithms on the image datasets.
# `results` maps dataset name -> evaluation frame; the Iris/Breast-Cancer cell
# adds more entries to it and the summary cell pivots it into score tables.
# Re-running this cell resets `results` to an empty dict first.
# Slow cell: in the recorded run PIC.fit_predict alone took ~188 s on MNIST
# and ~645 s on USPS.
results = {}
for dataset in ['MNIST', 'USPS', 'Caltech-256']:
    X, Y_true = load_dataset(data_path, dataset)
    print(f'Loaded {dataset} dataset with shape {X.shape}')

    # max label + 1 equals the number of classes only when labels are 0-based
    # consecutive integers — NOTE(review): confirm load_dataset guarantees this.
    n_clusters = int(np.max(Y_true)+1)
    # The same k / a / z values are used for every dataset in this notebook.
    # NOTE(review): their meaning is defined in pic.py, not visible here.
    pic = PIC(n_clusters=n_clusters, k=20, a=0.95, z=0.01)
    Y_pred_PIC = pic.fit_predict(X)

    # Prints one "NMI / CE / Silhouette" line per algorithm (see cell output).
    # The summary cell reads the 'Algorithm', 'NMI', 'CE' and 'Silhouette'
    # columns of the returned frame.
    results_df = evaluate_clustering_algorithms(X, Y_true, n_clusters, Y_pred_PIC)
    results[dataset] = results_df


Loaded MNIST dataset with shape (5139, 784)
Time taken for PIC fit_predict: 188.37566566467285 seconds
PIC: NMI = 0.9404, CE = 0.0160, Silhouette = 0.1058
k-med: NMI = 0.4905, CE = 0.3563, Silhouette = 0.1013
A-link: NMI = 0.4085, CE = 0.5733, Silhouette = 0.1034
S-link: NMI = 0.0016, CE = 0.7786, Silhouette = -0.0099
C-link: NMI = 0.5389, CE = 0.2804, Silhouette = 0.0933
AP: NMI = 0.4258, CE = 0.9597, Silhouette = 0.0599
NCuts: NMI = 0.8069, CE = 0.1152, Silhouette = 0.0840
NJW: NMI = 0.8977, CE = 0.0333, Silhouette = 0.1114
CT: NMI = 0.6342, CE = 0.4927, Silhouette = 0.0088
Zell: NMI = 0.9128, CE = 0.0272, Silhouette = 0.1089
C-kernel: NMI = 0.7801, CE = 0.1288, Silhouette = 0.0784
D-kernel: NMI = 0.9029, CE = 0.0286, Silhouette = 0.1068


Loaded USPS dataset with shape (9298, 256)
Time taken for PIC fit_predict: 644.951354265213 seconds
PIC: NMI = 0.8353, CE = 0.2689, Silhouette = 0.0244
k-med: NMI = 0.4690, CE = 0.5505, Silhouette = 0.0868
A-link: NMI = 0.1393, CE = 0.7777, Silhoue

In [None]:
# Same benchmark as the image-dataset cell, run on the two small tabular
# datasets, plus a silhouette plot of the PIC labelling for each.
# Appends to the `results` dict created by the image-dataset cell, so that cell
# must have been executed first.
# In the recorded run NJW failed on Iris ("Input X contains NaN"), which is why
# the summary table shows NaN for NJW / Iris.
for dataset in ['Iris', 'Breast-Cancer']:
    X, Y_true = load_dataset(data_path, dataset)
    print(f'Loaded {dataset} dataset with shape {X.shape}')

    # max label + 1 equals the number of classes only when labels are 0-based
    # consecutive integers — NOTE(review): confirm load_dataset guarantees this.
    n_clusters = int(np.max(Y_true)+1)
    pic = PIC(n_clusters=n_clusters, k=20, a=0.95, z=0.01)
    Y_pred_PIC = pic.fit_predict(X)

    results_df = evaluate_clustering_algorithms(X, Y_true, n_clusters, Y_pred_PIC)
    results[dataset] = results_df

    # NOTE(review): presumably plot_silhouette saves its figure under
    # plots_path and may expect that location to exist — confirm in evaluation.py.
    plots_path = f'../results/plots/{dataset}'
    plot_silhouette(X, Y_pred_PIC, n_clusters, plots_path, dataset)


Loaded Iris dataset with shape (150, 4)
Time taken for PIC fit_predict: 0.27173900604248047 seconds
PIC: NMI = 0.8057, CE = 0.0933, Silhouette = 0.5542
k-med: NMI = 0.7582, CE = 0.1067, Silhouette = 0.5528
A-link: NMI = 0.8057, CE = 0.0933, Silhouette = 0.5542
S-link: NMI = 0.7175, CE = 0.3200, Silhouette = 0.5121
C-link: NMI = 0.7221, CE = 0.1600, Silhouette = 0.5136
AP: NMI = 0.6694, CE = 0.4867, Silhouette = 0.3474
NCuts: NMI = 0.7857, CE = 0.0933, Silhouette = 0.5518
Error with NJW: Input X contains NaN.
KMeans does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle

  Y = eigenvects / rows_norm


Zell: NMI = 0.7857, CE = 0.0933, Silhouette = 0.5507
C-kernel: NMI = 0.7772, CE = 0.1200, Silhouette = 0.5499
D-kernel: NMI = 0.7629, CE = 0.1133, Silhouette = 0.5528


Loaded Breast-Cancer dataset with shape (569, 30)
Time taken for PIC fit_predict: 4.196013927459717 seconds
PIC: NMI = 0.4085, CE = 0.1810, Silhouette = 0.4007
k-med: NMI = 0.4980, CE = 0.1318, Silhouette = 0.6921
A-link: NMI = 0.0881, CE = 0.3374, Silhouette = 0.6909
S-link: NMI = 0.0052, CE = 0.3708, Silhouette = 0.7990
C-link: NMI = 0.0881, CE = 0.3374, Silhouette = 0.6909
AP: NMI = 0.2720, CE = 0.8014, Silhouette = 0.3909
NCuts: NMI = 0.4200, CE = 0.1740, Silhouette = 0.4083
NJW: NMI = 0.0465, CE = 0.3743, Silhouette = 0.2472
CT: NMI = 0.4142, CE = 0.1775, Silhouette = 0.4047
Zell: NMI = 0.4142, CE = 0.1775, Silhouette = 0.4047
C-kernel: NMI = 0.2988, CE = 0.2654, Silhouette = 0.3042
D-kernel: NMI = 0.4200, CE = 0.1740, Silhouette = 0.4081




In [7]:
# Pivot the per-dataset evaluation frames in `results` into one table per
# metric (rows: algorithms, columns: datasets), print each table and save it
# as a CSV file under `results_path`.


def _collect_metric(frames_by_dataset, metric):
    """Build an algorithms x datasets table for a single metric.

    Parameters
    ----------
    frames_by_dataset : dict
        Maps dataset name -> evaluation DataFrame that has an 'Algorithm'
        column and a column named `metric`.
    metric : str
        Name of the score column to extract ('NMI', 'CE' or 'Silhouette').

    Returns
    -------
    pandas.DataFrame
        Indexed by algorithm, one column per dataset in insertion order.
        An algorithm missing from some dataset gets NaN in that column.
        Empty if `frames_by_dataset` is empty.
    """
    columns = {
        dataset: frame.set_index('Algorithm')[metric]
        for dataset, frame in frames_by_dataset.items()
    }
    if not columns:
        # pd.concat raises on an empty mapping; fall back to an empty table.
        return pd.DataFrame()
    # Outer join: assigning column-by-column into an empty DataFrame would align
    # every later dataset to the first dataset's index and silently drop any
    # algorithm that the first dataset lacks.
    return pd.concat(columns, axis=1, sort=False)


nmi_scores = _collect_metric(results, 'NMI')
ce_scores = _collect_metric(results, 'CE')
silhouette_scores = _collect_metric(results, 'Silhouette')

results_path = '../results'
# DataFrame.to_csv does not create missing directories, so make sure the
# output directory exists before writing (e.g. on a fresh checkout).
os.makedirs(results_path, exist_ok=True)

# Print each table and persist it next to the others.
for header, table, filename in (
    ("NMI Scores:", nmi_scores, 'nmi_scores.csv'),
    ("\nCE Scores:", ce_scores, 'ce_scores.csv'),
    ("\nSilhouette Scores:", silhouette_scores, 'silhouette_scores.csv'),
):
    print(header)
    print(table)
    table.to_csv(os.path.join(results_path, filename))

NMI Scores:
              MNIST      USPS  Caltech-256      Iris  Breast-Cancer
Algorithm                                                          
PIC        0.940414  0.835289     0.652795  0.805694       0.408539
k-med      0.490522  0.468998     0.466325  0.758176       0.498036
A-link     0.408475  0.139269     0.312531  0.805694       0.088097
S-link     0.001565  0.002166     0.019359  0.717464       0.005162
C-link     0.538921  0.373770     0.395362  0.722066       0.088097
AP         0.425761  0.524590     0.492139  0.669446       0.271973
NCuts      0.806937  0.769843     0.592862  0.785665       0.420049
NJW        0.897719  0.784214     0.528869       NaN       0.046462
CT         0.634169  0.438512     0.181210  0.396983       0.414250
Zell       0.912796  0.845529     0.343009  0.785665       0.414250
C-kernel   0.780087  0.768363     0.521350  0.777154       0.298794
D-kernel   0.902892  0.846122     0.508032  0.762899       0.420049

CE Scores:
              MNIST     

In [None]:
# Run PIC on the three synthetic datasets and plot its labelling against the
# other clustering algorithms.
# In the recorded run PIC.fit_predict took ~31 s, ~342 s and ~32 s respectively.
# NOTE(review): enumerate starts at 0, so the plots for synthetic_dataset_1 go
# to .../dataset_0, synthetic_dataset_2 to .../dataset_1, and so on — confirm
# this numbering is intended before relying on the folder names.
for i, dataset in enumerate([synthetic_dataset_1, synthetic_dataset_2, synthetic_dataset_3]):
    plots_path = f'../results/plots/dataset_{i}'
    # Each generator is called with the same cluster_density and returns the
    # points together with their ground-truth labels.
    X, Y_true = dataset(cluster_density=0.75)
    # max label + 1 equals the number of clusters only when labels are 0-based
    # consecutive integers — NOTE(review): confirm the generators guarantee this.
    n_clusters = int(np.max(Y_true)+1)
    pic = PIC(n_clusters=n_clusters, k=20, a=0.95, z=0.01)
    Y_pred_PIC = pic.fit_predict(X)

    # NOTE(review): presumably writes the comparison figures under plots_path —
    # confirm in evaluation.py.
    plot_clustering_algorithms(X, Y_true, n_clusters, Y_pred_PIC, plots_path)

Time taken for PIC fit_predict: 30.657036781311035 seconds




Time taken for PIC fit_predict: 341.81405544281006 seconds




Time taken for PIC fit_predict: 31.962682247161865 seconds


