In [None]:
import numpy as np
import requests
import pandas as pd
import matplotlib.pyplot as plt
import genieclust
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.mixture import GaussianMixture
import hdbscan 
from sklearn_som.som import SOM
from sklearn.metrics.cluster import adjusted_mutual_info_score, adjusted_rand_score, v_measure_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import time

# **<span style="color:#3c1518">2D Dataset</span>**

In [None]:
# Download one benchmark dataset plus its reference labels, cache both on
# disk, and load them into X (features) and y (zero-based labels).
github = "https://raw.githubusercontent.com/gagolews/clustering-data-v1/master/"

dataset = "fcps/engytime" # Change dataset here 
data_url = github + dataset + ".data.gz"
labels_url = github + dataset + ".labels0.gz"

# Fail loudly on HTTP errors instead of writing an error page to disk, and
# close each file handle deterministically.
response = requests.get(data_url, timeout=30)
response.raise_for_status()
with open("dataset.gz", "wb") as fh:
    fh.write(response.content)

response = requests.get(labels_url, timeout=30)
response.raise_for_status()
with open("labels.gz", "wb") as fh:
    fh.write(response.content)

X = np.loadtxt("dataset.gz", ndmin=2)
y = np.loadtxt("labels.gz", dtype=np.intc) - 1

# Count only non-negative labels. NOTE(review): this assumes label 0 in the
# raw file marks noise (-1 after the shift above) -- confirm for the chosen
# dataset. For a noise-free dataset the result equals len(np.unique(y)).
n_clusters = len(np.unique(y[y >= 0]))

# Result tables filled in by the per-algorithm cells.
metrics = pd.DataFrame(columns=['Clustering Algorithm', 'ARI', 'AMI', 'V-measure'])
confMatrix = pd.DataFrame(columns=['Clustering Algorithm', 'Confusion Matrix Settings'])

## <span style="color:#69140e">K-Means</span>

In [None]:
# K-Means on the 2D dataset. The cluster count comes from the reference
# labels (n_clusters) rather than a hard-coded 2, so changing the dataset
# does not silently keep k=2.
clStr = 'K-Means'
cl = KMeans(n_clusters=n_clusters, random_state=0)
prediction = cl.fit_predict(X)

# External validity indices against the reference labelling.
ari = adjusted_rand_score(y, prediction)
ami = adjusted_mutual_info_score(y, prediction)
v_measure = v_measure_score(y, prediction)

# Upsert: overwrite this algorithm's row on re-run, otherwise append.
known = metrics[metrics['Clustering Algorithm'] == clStr].index
row = known[0] if len(known) else len(metrics)
metrics.loc[row] = [clStr, ari, ami, v_measure]

# Use every label present in either labelling so no point is dropped.
cm_labels = np.unique(np.concatenate([y, prediction]))
cm = confusion_matrix(y, prediction, labels=cm_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=cm_labels)

# Upsert the display. A new row is placed after confMatrix's own last index
# (not at len(metrics)), so the row position never depends on another table.
known = confMatrix[confMatrix['Clustering Algorithm'] == clStr].index
if len(known):
    row = known[0]
elif confMatrix.empty:
    row = 0
else:
    row = int(confMatrix.index.max()) + 1
confMatrix.loc[row] = [clStr, disp]

genieclust.plots.plot_scatter(X, labels=y)
plt.title("True Clusters")
plt.axis("equal")
plt.show()

genieclust.plots.plot_scatter(X, labels=prediction)
plt.title("Predicted Clusters")
plt.axis("equal")
plt.show()

## <span style="color:#69140e">HAC</span>

In [None]:
# Hierarchical agglomerative clustering on the 2D dataset, cut at the number
# of reference clusters (n_clusters) instead of a hard-coded 2.
clStr = 'HAC'
cl = AgglomerativeClustering(n_clusters=n_clusters)
prediction = cl.fit_predict(X)

# External validity indices against the reference labelling.
ari = adjusted_rand_score(y, prediction)
ami = adjusted_mutual_info_score(y, prediction)
v_measure = v_measure_score(y, prediction)

# Upsert: overwrite this algorithm's row on re-run, otherwise append.
known = metrics[metrics['Clustering Algorithm'] == clStr].index
row = known[0] if len(known) else len(metrics)
metrics.loc[row] = [clStr, ari, ami, v_measure]

# Use every label present in either labelling so no point is dropped.
cm_labels = np.unique(np.concatenate([y, prediction]))
cm = confusion_matrix(y, prediction, labels=cm_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=cm_labels)

# Upsert the display. A new row is placed after confMatrix's own last index
# (not at len(metrics)), so the row position never depends on another table.
known = confMatrix[confMatrix['Clustering Algorithm'] == clStr].index
if len(known):
    row = known[0]
elif confMatrix.empty:
    row = 0
else:
    row = int(confMatrix.index.max()) + 1
confMatrix.loc[row] = [clStr, disp]

genieclust.plots.plot_scatter(X, labels=y)
plt.title("True Clusters")
plt.axis("equal")
plt.show()

genieclust.plots.plot_scatter(X, labels=prediction)
plt.title("Predicted Clusters")
plt.axis("equal")
plt.show()

## <span style="color:#69140e">GMM</span>

In [None]:
# Gaussian mixture on the 2D dataset with one component per reference
# cluster (n_clusters) instead of a hard-coded 2.
clStr = 'GMM'
cl = GaussianMixture(n_components=n_clusters, random_state=0)
cl.fit(X)
prediction = cl.predict(X)

# External validity indices against the reference labelling.
ari = adjusted_rand_score(y, prediction)
ami = adjusted_mutual_info_score(y, prediction)
v_measure = v_measure_score(y, prediction)

# Upsert: overwrite this algorithm's row on re-run, otherwise append.
known = metrics[metrics['Clustering Algorithm'] == clStr].index
row = known[0] if len(known) else len(metrics)
metrics.loc[row] = [clStr, ari, ami, v_measure]

# Use every label present in either labelling so no point is dropped.
cm_labels = np.unique(np.concatenate([y, prediction]))
cm = confusion_matrix(y, prediction, labels=cm_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=cm_labels)

# Upsert the display. A new row is placed after confMatrix's own last index
# (not at len(metrics)), so the row position never depends on another table.
known = confMatrix[confMatrix['Clustering Algorithm'] == clStr].index
if len(known):
    row = known[0]
elif confMatrix.empty:
    row = 0
else:
    row = int(confMatrix.index.max()) + 1
confMatrix.loc[row] = [clStr, disp]

genieclust.plots.plot_scatter(X, labels=y)
plt.title("True Clusters")
plt.axis("equal")
plt.show()

genieclust.plots.plot_scatter(X, labels=prediction)
plt.title("Predicted Clusters")
plt.axis("equal")
plt.show()

## <span style="color:#69140e">DBSCAN</span>

In [None]:
# DBSCAN on the 2D dataset with scikit-learn's default parameters.
clStr = 'DBSCAN'
cl = DBSCAN()
prediction = cl.fit_predict(X)

# External validity indices against the reference labelling.
ari = adjusted_rand_score(y, prediction)
ami = adjusted_mutual_info_score(y, prediction)
v_measure = v_measure_score(y, prediction)

# Upsert: overwrite this algorithm's row on re-run, otherwise append.
known = metrics[metrics['Clustering Algorithm'] == clStr].index
row = known[0] if len(known) else len(metrics)
metrics.loc[row] = [clStr, ari, ami, v_measure]

# DBSCAN can emit -1 (noise) and any number of cluster ids. Build the label
# set from the data so those points appear in the matrix instead of being
# silently discarded by a fixed [0, 1] label list.
cm_labels = np.unique(np.concatenate([y, prediction]))
cm = confusion_matrix(y, prediction, labels=cm_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=cm_labels)

# Upsert the display. A new row is placed after confMatrix's own last index
# (not at len(metrics)), so the row position never depends on another table.
known = confMatrix[confMatrix['Clustering Algorithm'] == clStr].index
if len(known):
    row = known[0]
elif confMatrix.empty:
    row = 0
else:
    row = int(confMatrix.index.max()) + 1
confMatrix.loc[row] = [clStr, disp]

genieclust.plots.plot_scatter(X, labels=y)
plt.title("True Clusters")
plt.axis("equal")
plt.show()

genieclust.plots.plot_scatter(X, labels=prediction)
plt.title("Predicted Clusters")
plt.axis("equal")
plt.show()

## <span style="color:#69140e">HDBSCAN</span>

In [None]:
# HDBSCAN on the 2D dataset with the hand-picked density parameters below.
clStr = 'HDBSCAN'
cl = hdbscan.HDBSCAN(min_cluster_size=2, min_samples=350)
prediction = cl.fit_predict(X)

# External validity indices against the reference labelling.
ari = adjusted_rand_score(y, prediction)
ami = adjusted_mutual_info_score(y, prediction)
v_measure = v_measure_score(y, prediction)

# Upsert: overwrite this algorithm's row on re-run, otherwise append.
known = metrics[metrics['Clustering Algorithm'] == clStr].index
row = known[0] if len(known) else len(metrics)
metrics.loc[row] = [clStr, ari, ami, v_measure]

# HDBSCAN can emit -1 (noise) and any number of cluster ids. Build the label
# set from the data so those points appear in the matrix instead of being
# silently discarded by a fixed [0, 1] label list.
cm_labels = np.unique(np.concatenate([y, prediction]))
cm = confusion_matrix(y, prediction, labels=cm_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=cm_labels)

# Upsert the display. A new row is placed after confMatrix's own last index
# (not at len(metrics)), so the row position never depends on another table.
known = confMatrix[confMatrix['Clustering Algorithm'] == clStr].index
if len(known):
    row = known[0]
elif confMatrix.empty:
    row = 0
else:
    row = int(confMatrix.index.max()) + 1
confMatrix.loc[row] = [clStr, disp]

genieclust.plots.plot_scatter(X, labels=y)
plt.title("True Clusters")
plt.axis("equal")
plt.show()

genieclust.plots.plot_scatter(X, labels=prediction)
plt.title("Predicted Clusters")
plt.axis("equal")
plt.show()

## <span style="color:#69140e">SOM</span>

In [None]:
# Self-organising map on the 2D dataset: an (n_clusters x 1) grid, one unit
# per reference cluster. The input dimension is read from X instead of being
# hard-coded.
# NOTE(review): no seed is passed to SOM here, so results may vary between runs.
clStr = 'SOM'
cl = SOM(m=n_clusters, n=1, dim=X.shape[1])
cl.fit(X)
prediction = cl.predict(X)

# External validity indices against the reference labelling.
ari = adjusted_rand_score(y, prediction)
ami = adjusted_mutual_info_score(y, prediction)
v_measure = v_measure_score(y, prediction)

# Upsert: overwrite this algorithm's row on re-run, otherwise append.
known = metrics[metrics['Clustering Algorithm'] == clStr].index
row = known[0] if len(known) else len(metrics)
metrics.loc[row] = [clStr, ari, ami, v_measure]

# Use every label present in either labelling so no point is dropped.
cm_labels = np.unique(np.concatenate([y, prediction]))
cm = confusion_matrix(y, prediction, labels=cm_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=cm_labels)

# Upsert the display. A new row is placed after confMatrix's own last index
# (not at len(metrics)), so the row position never depends on another table.
known = confMatrix[confMatrix['Clustering Algorithm'] == clStr].index
if len(known):
    row = known[0]
elif confMatrix.empty:
    row = 0
else:
    row = int(confMatrix.index.max()) + 1
confMatrix.loc[row] = [clStr, disp]

genieclust.plots.plot_scatter(X, labels=y)
plt.title("True Clusters")
plt.axis("equal")
plt.show()

genieclust.plots.plot_scatter(X, labels=prediction)
plt.title("Predicted Clusters")
plt.axis("equal")
plt.show()

In [None]:
# Renumber confMatrix rows 0..n-1 (in place) so its row labels line up
# positionally with the metrics table.
confMatrix.index = pd.RangeIndex(len(confMatrix))

# **<span style="color:#3c1518">Results</span>**

In [None]:
# Display the per-algorithm score table (columns: Clustering Algorithm, ARI, AMI, V-measure).
metrics

# **<span style="color:#3c1518">Confusion Matrices</span>**

## <span style="color:#69140e">Best Clustering Algorithm</span>

In [None]:
# Rank algorithms by the mean of ARI, AMI and V-measure, then plot the
# confusion matrix of the top-scoring one.
mean_score = ((metrics['ARI'] + metrics['AMI'] + metrics['V-measure']) / 3).astype(float)
alg = metrics.loc[mean_score.idxmax(), 'Clustering Algorithm']

# Look the display up by algorithm name rather than by row label: the two
# tables are only label-aligned after confMatrix's index has been reset, and a
# label lookup would otherwise pick a different algorithm or raise KeyError.
disp = confMatrix.loc[confMatrix['Clustering Algorithm'] == alg, 'Confusion Matrix Settings'].iloc[0]

disp.plot()
plt.gcf().set_size_inches(13,8)
plt.title(alg)
plt.show()

## <span style="color:#69140e">Worst Clustering Algorithm</span>

In [None]:
# Rank algorithms by the mean of ARI, AMI and V-measure, then plot the
# confusion matrix of the lowest-scoring one.
mean_score = ((metrics['ARI'] + metrics['AMI'] + metrics['V-measure']) / 3).astype(float)
alg = metrics.loc[mean_score.idxmin(), 'Clustering Algorithm']

# Look the display up by algorithm name rather than by row label: the two
# tables are only label-aligned after confMatrix's index has been reset, and a
# label lookup would otherwise pick a different algorithm or raise KeyError.
disp = confMatrix.loc[confMatrix['Clustering Algorithm'] == alg, 'Confusion Matrix Settings'].iloc[0]

disp.plot()
plt.gcf().set_size_inches(13,8)
plt.title(alg)
plt.show()

# **<span style="color:#3c1518">Proposed Clustering Algorithm</span>**

### <span style="color:#69140e">The proposed Clustering Algorithm for this problem is GMM</span>

# **<span style="color:#3c1518">Big Dataset</span>**

In [None]:
# Download the high-dimensional benchmark dataset plus its reference labels,
# cache both on disk, and load them. This rebinds X, y and n_clusters, so the
# cells below operate on this dataset from here on.
github = "https://raw.githubusercontent.com/gagolews/clustering-data-v1/master/"

dataset = "g2mg/g2mg_128_80" # Change dataset here 
data_url = github + dataset + ".data.gz"
labels_url = github + dataset + ".labels0.gz"

# Fail loudly on HTTP errors instead of writing an error page to disk, and
# close each file handle deterministically.
response = requests.get(data_url, timeout=30)
response.raise_for_status()
with open("datasetBig.gz", "wb") as fh:
    fh.write(response.content)

response = requests.get(labels_url, timeout=30)
response.raise_for_status()
with open("labelsBig.gz", "wb") as fh:
    fh.write(response.content)

X = np.loadtxt("datasetBig.gz", ndmin=2)
y = np.loadtxt("labelsBig.gz", dtype=np.intc) - 1

# Count only non-negative labels. NOTE(review): this assumes label 0 in the
# raw file marks noise (-1 after the shift above) -- confirm for the chosen
# dataset. For a noise-free dataset the result equals len(np.unique(y)).
n_clusters = len(np.unique(y[y >= 0]))

# Result tables filled in by the per-algorithm cells; this one also records
# the wall-clock time of fit_predict.
metricsBigDataset = pd.DataFrame(columns=['Clustering Algorithm', 'ARI', 'AMI', 'V-measure','Fit Predict Time'])
confMatrixBigDataset = pd.DataFrame(columns=['Clustering Algorithm', 'Confusion Matrix Settings'])

## <span style="color:#69140e">K-Means</span>

In [None]:
# K-Means on the big dataset, timed. The cluster count comes from the
# reference labels (n_clusters) rather than a hard-coded 2.
clStr = 'K-Means'
cl = KMeans(n_clusters=n_clusters, random_state=0)

# perf_counter is the clock intended for measuring elapsed intervals;
# time.time() can jump if the system clock is adjusted.
startTime = time.perf_counter()
prediction = cl.fit_predict(X)
stopTime = time.perf_counter()
fitPredictTime = stopTime - startTime

# External validity indices against the reference labelling.
ari = adjusted_rand_score(y, prediction)
ami = adjusted_mutual_info_score(y, prediction)
v_measure = v_measure_score(y, prediction)

# Upsert: overwrite this algorithm's row on re-run, otherwise append.
known = metricsBigDataset[metricsBigDataset['Clustering Algorithm'] == clStr].index
row = known[0] if len(known) else len(metricsBigDataset)
metricsBigDataset.loc[row] = [clStr, ari, ami, v_measure, fitPredictTime]

# Use every label present in either labelling so no point is dropped.
cm_labels = np.unique(np.concatenate([y, prediction]))
cm = confusion_matrix(y, prediction, labels=cm_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=cm_labels)

# Upsert the display. A new row is placed after the table's own last index
# (not at len(metricsBigDataset)), so it never depends on another table.
known = confMatrixBigDataset[confMatrixBigDataset['Clustering Algorithm'] == clStr].index
if len(known):
    row = known[0]
elif confMatrixBigDataset.empty:
    row = 0
else:
    row = int(confMatrixBigDataset.index.max()) + 1
confMatrixBigDataset.loc[row] = [clStr, disp]


## <span style="color:#69140e">HAC</span>

In [None]:
# Hierarchical agglomerative clustering on the big dataset, timed, cut at the
# number of reference clusters (n_clusters) instead of a hard-coded 2.
clStr = 'HAC'
cl = AgglomerativeClustering(n_clusters=n_clusters)

# perf_counter is the clock intended for measuring elapsed intervals;
# time.time() can jump if the system clock is adjusted.
startTime = time.perf_counter()
prediction = cl.fit_predict(X)
stopTime = time.perf_counter()
fitPredictTime = stopTime - startTime

# External validity indices against the reference labelling.
ari = adjusted_rand_score(y, prediction)
ami = adjusted_mutual_info_score(y, prediction)
v_measure = v_measure_score(y, prediction)

# Upsert: overwrite this algorithm's row on re-run, otherwise append.
known = metricsBigDataset[metricsBigDataset['Clustering Algorithm'] == clStr].index
row = known[0] if len(known) else len(metricsBigDataset)
metricsBigDataset.loc[row] = [clStr, ari, ami, v_measure, fitPredictTime]

# Use every label present in either labelling so no point is dropped.
cm_labels = np.unique(np.concatenate([y, prediction]))
cm = confusion_matrix(y, prediction, labels=cm_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=cm_labels)

# Upsert the display. A new row is placed after the table's own last index
# (not at len(metricsBigDataset)), so it never depends on another table.
known = confMatrixBigDataset[confMatrixBigDataset['Clustering Algorithm'] == clStr].index
if len(known):
    row = known[0]
elif confMatrixBigDataset.empty:
    row = 0
else:
    row = int(confMatrixBigDataset.index.max()) + 1
confMatrixBigDataset.loc[row] = [clStr, disp]
    

## <span style="color:#69140e">GMM</span>

In [None]:
# Gaussian mixture on the big dataset, timed, with one component per
# reference cluster (n_clusters) instead of a hard-coded 2.
clStr = 'GMM'
cl = GaussianMixture(n_components=n_clusters, random_state=0)

# perf_counter is the clock intended for measuring elapsed intervals;
# time.time() can jump if the system clock is adjusted.
startTime = time.perf_counter()
prediction = cl.fit_predict(X)
stopTime = time.perf_counter()
fitPredictTime = stopTime - startTime

# External validity indices against the reference labelling.
ari = adjusted_rand_score(y, prediction)
ami = adjusted_mutual_info_score(y, prediction)
v_measure = v_measure_score(y, prediction)

# Upsert: overwrite this algorithm's row on re-run, otherwise append.
known = metricsBigDataset[metricsBigDataset['Clustering Algorithm'] == clStr].index
row = known[0] if len(known) else len(metricsBigDataset)
metricsBigDataset.loc[row] = [clStr, ari, ami, v_measure, fitPredictTime]

# Use every label present in either labelling so no point is dropped.
cm_labels = np.unique(np.concatenate([y, prediction]))
cm = confusion_matrix(y, prediction, labels=cm_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=cm_labels)

# Upsert the display. A new row is placed after the table's own last index
# (not at len(metricsBigDataset)), so it never depends on another table.
known = confMatrixBigDataset[confMatrixBigDataset['Clustering Algorithm'] == clStr].index
if len(known):
    row = known[0]
elif confMatrixBigDataset.empty:
    row = 0
else:
    row = int(confMatrixBigDataset.index.max()) + 1
confMatrixBigDataset.loc[row] = [clStr, disp]


## <span style="color:#69140e">DBSCAN</span>

In [None]:
# DBSCAN on the big dataset with scikit-learn's default parameters, timed.
clStr = 'DBSCAN'
cl = DBSCAN()

# perf_counter is the clock intended for measuring elapsed intervals;
# time.time() can jump if the system clock is adjusted.
startTime = time.perf_counter()
prediction = cl.fit_predict(X)
stopTime = time.perf_counter()
fitPredictTime = stopTime - startTime

# External validity indices against the reference labelling.
ari = adjusted_rand_score(y, prediction)
ami = adjusted_mutual_info_score(y, prediction)
v_measure = v_measure_score(y, prediction)

# Upsert: overwrite this algorithm's row on re-run, otherwise append.
known = metricsBigDataset[metricsBigDataset['Clustering Algorithm'] == clStr].index
row = known[0] if len(known) else len(metricsBigDataset)
metricsBigDataset.loc[row] = [clStr, ari, ami, v_measure, fitPredictTime]

# DBSCAN can emit -1 (noise) and any number of cluster ids. Build the label
# set from the data so those points appear in the matrix instead of being
# silently discarded by a fixed [0, 1] label list.
cm_labels = np.unique(np.concatenate([y, prediction]))
cm = confusion_matrix(y, prediction, labels=cm_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=cm_labels)

# Upsert the display. A new row is placed after the table's own last index
# (not at len(metricsBigDataset)), so it never depends on another table.
known = confMatrixBigDataset[confMatrixBigDataset['Clustering Algorithm'] == clStr].index
if len(known):
    row = known[0]
elif confMatrixBigDataset.empty:
    row = 0
else:
    row = int(confMatrixBigDataset.index.max()) + 1
confMatrixBigDataset.loc[row] = [clStr, disp]


## <span style="color:#69140e">HDBSCAN</span>

In [None]:
# HDBSCAN on the big dataset with the hand-picked density parameters below, timed.
clStr = 'HDBSCAN'
cl = hdbscan.HDBSCAN(min_cluster_size=2, min_samples=350)

# perf_counter is the clock intended for measuring elapsed intervals;
# time.time() can jump if the system clock is adjusted.
startTime = time.perf_counter()
prediction = cl.fit_predict(X)
stopTime = time.perf_counter()
fitPredictTime = stopTime - startTime

# External validity indices against the reference labelling.
ari = adjusted_rand_score(y, prediction)
ami = adjusted_mutual_info_score(y, prediction)
v_measure = v_measure_score(y, prediction)

# Upsert: overwrite this algorithm's row on re-run, otherwise append.
known = metricsBigDataset[metricsBigDataset['Clustering Algorithm'] == clStr].index
row = known[0] if len(known) else len(metricsBigDataset)
metricsBigDataset.loc[row] = [clStr, ari, ami, v_measure, fitPredictTime]

# HDBSCAN can emit -1 (noise) and any number of cluster ids. Build the label
# set from the data so those points appear in the matrix instead of being
# silently discarded by a fixed [0, 1] label list.
cm_labels = np.unique(np.concatenate([y, prediction]))
cm = confusion_matrix(y, prediction, labels=cm_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=cm_labels)

# Upsert the display. A new row is placed after the table's own last index
# (not at len(metricsBigDataset)), so it never depends on another table.
known = confMatrixBigDataset[confMatrixBigDataset['Clustering Algorithm'] == clStr].index
if len(known):
    row = known[0]
elif confMatrixBigDataset.empty:
    row = 0
else:
    row = int(confMatrixBigDataset.index.max()) + 1
confMatrixBigDataset.loc[row] = [clStr, disp]


## <span style="color:#69140e">SOM</span>

In [None]:
# Self-organising map (3x3 grid, i.e. up to 9 output clusters) on the big
# dataset, timed. The input dimension is read from X instead of a hard-coded
# 128, so the cell keeps working when the dataset is changed.
# NOTE(review): no seed is passed to SOM here, so results may vary between runs.
clStr = 'SOM'
cl = SOM(m=3, n=3, dim=X.shape[1])

# perf_counter is the clock intended for measuring elapsed intervals;
# time.time() can jump if the system clock is adjusted.
startTime = time.perf_counter()
prediction = cl.fit_predict(X)
stopTime = time.perf_counter()
fitPredictTime = stopTime - startTime

# External validity indices against the reference labelling.
ari = adjusted_rand_score(y, prediction)
ami = adjusted_mutual_info_score(y, prediction)
v_measure = v_measure_score(y, prediction)

# Upsert: overwrite this algorithm's row on re-run, otherwise append.
known = metricsBigDataset[metricsBigDataset['Clustering Algorithm'] == clStr].index
row = known[0] if len(known) else len(metricsBigDataset)
metricsBigDataset.loc[row] = [clStr, ari, ami, v_measure, fitPredictTime]

# The grid can assign unit ids beyond 0 and 1. Build the label set from the
# data so those points appear in the matrix instead of being silently
# discarded by a fixed [0, 1] label list.
cm_labels = np.unique(np.concatenate([y, prediction]))
cm = confusion_matrix(y, prediction, labels=cm_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=cm_labels)

# Upsert the display. A new row is placed after the table's own last index
# (not at len(metricsBigDataset)), so it never depends on another table.
known = confMatrixBigDataset[confMatrixBigDataset['Clustering Algorithm'] == clStr].index
if len(known):
    row = known[0]
elif confMatrixBigDataset.empty:
    row = 0
else:
    row = int(confMatrixBigDataset.index.max()) + 1
confMatrixBigDataset.loc[row] = [clStr, disp]


In [None]:
# Renumber confMatrixBigDataset rows 0..n-1 (in place) so its row labels line
# up positionally with metricsBigDataset.
row_count = len(confMatrixBigDataset)
confMatrixBigDataset.index = pd.RangeIndex(row_count)

# **<span style="color:#3c1518">Results</span>**

In [None]:
# Display the per-algorithm score table (columns: Clustering Algorithm, ARI, AMI, V-measure, Fit Predict Time).
metricsBigDataset

# **<span style="color:#3c1518">Confusion Matrices</span>**

## <span style="color:#69140e">Best Clustering Algorithm</span>

In [None]:
# Rank algorithms by the mean of ARI, AMI and V-measure, then plot the
# confusion matrix of the top-scoring one.
mean_score = ((metricsBigDataset['ARI'] + metricsBigDataset['AMI'] + metricsBigDataset['V-measure']) / 3).astype(float)
alg = metricsBigDataset.loc[mean_score.idxmax(), 'Clustering Algorithm']

# Look the display up by algorithm name rather than by row label: the two
# tables are only label-aligned after confMatrixBigDataset's index has been
# reset, and a label lookup would otherwise pick a different algorithm or
# raise KeyError.
disp = confMatrixBigDataset.loc[
    confMatrixBigDataset['Clustering Algorithm'] == alg, 'Confusion Matrix Settings'
].iloc[0]

disp.plot()
plt.gcf().set_size_inches(13,8)
plt.title(alg)
plt.show()

## <span style="color:#69140e">Worst Clustering Algorithm</span>

In [None]:
# Rank algorithms by the mean of ARI, AMI and V-measure, then plot the
# confusion matrix of the lowest-scoring one.
mean_score = ((metricsBigDataset['ARI'] + metricsBigDataset['AMI'] + metricsBigDataset['V-measure']) / 3).astype(float)
alg = metricsBigDataset.loc[mean_score.idxmin(), 'Clustering Algorithm']

# Look the display up by algorithm name rather than by row label: the two
# tables are only label-aligned after confMatrixBigDataset's index has been
# reset, and a label lookup would otherwise pick a different algorithm or
# raise KeyError.
disp = confMatrixBigDataset.loc[
    confMatrixBigDataset['Clustering Algorithm'] == alg, 'Confusion Matrix Settings'
].iloc[0]

disp.plot()
plt.gcf().set_size_inches(13,8)
plt.title(alg)
plt.show()

# **<span style="color:#3c1518">Proposed Clustering Algorithm</span>**

### <span style="color:#69140e">The proposed Clustering Algorithm for this problem is K-Means</span>