In [1]:
# Updated on 21.02.2022
from ipywidgets import IntProgress
from IPython.utils import io
from IPython.display import display
from IPython.core.display import display, HTML
display(HTML("<style>div.output_scroll { height: 40em; }</style>"))
from sty import fg, rs
import progressbar

import numpy as np
from numpy.random import RandomState
import pandas as pd
from sklearn.cluster import KMeans, DBSCAN, MeanShift, SpectralClustering, OPTICS, Birch, AffinityPropagation, estimate_bandwidth
from fcmeans import FCM
from sklearn.mixture import GaussianMixture
from sklearn.mixture_mod5 import GaussianMixtureMod5
from sklearn import metrics
from sklearn.metrics import accuracy_score
from pytictoc import TicToc
from statistics import mean, stdev
from tqdm import tqdm, trange, tqdm_notebook
from sklearn.metrics import accuracy_score, adjusted_rand_score
# from statmodels import robust
import warnings
warnings.filterwarnings('ignore')
import time
import timeit
import datetime
from pathlib import Path
import os

def purity_score(y_true, y_pred):
    """Return the purity of a clustering with respect to reference labels.

    The contingency matrix of ``y_true`` versus ``y_pred`` is built with
    ``metrics.cluster.contingency_matrix``. The largest entry along axis 0
    (one maximum per column) is summed and divided by the sum of all entries.

    Parameters
    ----------
    y_true : array-like
        Reference labels.
    y_pred : array-like
        Cluster labels to evaluate.

    Returns
    -------
    Ratio of the summed per-column maxima to the total count.
    """
    table = metrics.cluster.contingency_matrix(y_true, y_pred)
    # One maximum per column of the contingency table.
    dominant_counts = np.amax(table, axis=0)
    total_count = np.sum(table)
    return np.sum(dominant_counts) / total_count

#Get the Clustering Algorithm Performance
def cluster_eval(y_true, y_pred, X_train):
    """Score one clustering run against the reference labels.

    Parameters
    ----------
    y_true : array-like
        Reference labels the clustering is compared with.
    y_pred : array-like
        Cluster labels produced by the clustering method.
    X_train : object
        Not used by the scoring; kept so existing calls keep working.

    Returns
    -------
    list
        ``[purity, ari_score, elapsed_time, iter_num]``.

    Notes
    -----
    ``elapsed_time`` and ``iter_num`` are not parameters. They are read from
    module scope, so the caller must assign both before calling this function.
    """
    # Purity must be computed from the labels handed to this function, not from
    # a module-level variable, otherwise the y_true argument is silently ignored.
    purity = purity_score(y_true, y_pred)
    ari_score = adjusted_rand_score(y_true, y_pred)

    return [purity, ari_score, elapsed_time, iter_num]

def animated_marker():
    """Start a ``progressbar`` bar with an animated marker and push 100 updates to it."""
    marker_widgets = ['Processing : ', progressbar.AnimatedMarker()]
    spinner = progressbar.ProgressBar(widgets=marker_widgets).start()
    step = 0
    while step < 100:
        spinner.update(step)
        step += 1
        
#Main Program start here!---------------------------------------------------------------------------------------------------------------------------------------#
# dataset_labels = ["transfusion", "breastCancer", "heartDisease", "australian", "japanese", "vertebral", "haberman", "iris", "new_thyroid", "dermatology"] # Group1 - Done 22.01.2022
# dataset_labels = ["bank", "seismicBumps", "german", "chess", "abalone", "wallRobot","wilt", "adultIncome"] # Group2 - Done 22.01.2022
# dataset_labels = ["seismicBumps", "german", "chess", "abalone", "wallRobot","wilt"] # Group2 - Done 31.01.2022 --remove bank & adultIncome
# dataset_labels = ["seismicBumps", "german", "chess", "abalone"] # Group2 - Done 01.02.2022
# dataset_labels = ["wallRobot","wilt"] #Group3 - Done 01.02.2022
# dataset_labels = ["iris", "heartDisease", "australian", "japanese", "abalone", "chess"] #Balanced - 26.05.2022 
# dataset_labels = ["vertebral", "new_thyroid", "haberman", "dermatology", "breastCancer", "transfusion", "german", "seismicBumps", "wilt", "wallRobot"] #Imbalanced 1 - 26.05.2022 
# dataset_labels = ["bank","adultIncome"] #Imbalanced 1 - 26.05.2022 

# Datasets to process in this run.
dataset_labels = ["eegEyeState"]

# method_labels = ["KMeans", 'GMM-random', 'GMM-kMeans', 'GMM-init_hybrid', 'OBGMM']
# method_labels = ['DBSCAN','MeanShift', 'Birch', 'OPTICS', 'FCM', 'AffinityPropagation', 'SpectralClustering']
# Display names of the methods, in the same order as the clust* expressions they are paired with.
# The last label previously carried a stray leading space (' GMM-init_hybrid'),
# which ended up in the "Algorithm" column and in the progress bar text.
method_labels = ["KMeans", 'DBSCAN', 'MeanShift', 'FCM', 'OBGMM', 'GMM-init_hybrid'] # [clust1, clust6, clust7, clust10, clust5, clust4]
# method_labels = ['SpectralClustering']
num_methods = len(method_labels)
# Column names of the final result table.
result_labels = ["Dataset","Algorithm", "purity", "purity_std", "ari_score", "ari_score_std", "Elapsed Time", "Iter_num"]

# Run settings.
num_repeat = 100   # number of times each method is fitted per dataset
cov_type = 'full'
t = TicToc()       # stopwatch used to time every fit

# Clustering methods, declared as constructor expressions in string form.
# The strings mention n_classes and bandwidth, which are only known once a
# dataset has been loaded, so they are evaluated later rather than here.
clust1 = "KMeans(n_clusters = n_classes)"
clust2 = (
    "GaussianMixture(n_components = n_classes, init_params='random', "
    "covariance_type='full', max_iter=100)"
)
clust3 = (
    "GaussianMixture(n_components = n_classes, init_params='kmeans', "
    "covariance_type='full', max_iter=100)"
)
clust4 = (
    "GaussianMixture(n_components = n_classes, init_params='jaha_init_hybrid', "
    "covariance_type='full', max_iter=100)"
)
clust5 = (
    "GaussianMixtureMod5(n_components=n_classes, init_params='jaha_init_hybrid', "
    "covariance_type='full', max_iter=100, resp_type='remove')"
)
clust6 = (
    "DBSCAN(eps=0.5, min_samples=5, metric='euclidean', metric_params=None, "
    "algorithm='auto', leaf_size=30, p=None, n_jobs=None)"
)
clust7 = (
    "MeanShift(bandwidth=bandwidth, seeds=None, bin_seeding=True, min_bin_freq=1, "
    "cluster_all=True, n_jobs=None, max_iter=100)"
)
clust8 = "Birch(n_clusters=n_classes)"
clust9 = (
    "OPTICS(min_samples=50, xi=.05, min_cluster_size=.05, cluster_method='xi', "
    "metric='minkowski', algorithm = 'auto')"
)
clust10 = "FCM(n_clusters=n_classes)"
# clust11 = "AffinityPropagation()"
# clust12 = "SpectralClustering(n_clusters = 3, random_state=0)"

# for i in tqdm_notebook(range(4), desc='1st loop'):
#     for j in tqdm_notebook(range(100), desc='2nd loop', leave=False):

# One summary row per (dataset, method) pair, laid out in result_labels order.
# Rows are collected in a plain list and turned into a DataFrame once at the
# end: DataFrame.append was removed in pandas 2.0, and growing a frame inside
# a loop is quadratic anyway.
summary_rows = []
# Names of the four values cluster_eval returns for a single run.
run_columns = ["purity", "ari_score", "elapsed_time", "iter_num"]

for m in tqdm_notebook(dataset_labels, desc = 'Overall Progress :'):
    # Read dataset
    dataset = pd.read_csv("dataset/"+m+".csv")
    # Features: every column except 'Target'
    X_train = dataset.drop('Target',axis=1)
    # Reference labels
    y_train =  dataset['Target']
    n_classes = len(np.unique(y_train))

    # Referenced by name inside the MeanShift expression (clust7).
    bandwidth = estimate_bandwidth(X_train, quantile=0.2, n_samples=len(X_train))

    # Each constructor expression is paired with its display label; the order
    # must match method_labels.
    method_specs = zip([clust1, clust6, clust7, clust10, clust5, clust4], method_labels)
    for clust, met_label in tqdm_notebook(method_specs, total=num_methods, desc = m.capitalize()+' dataset :'):
        # clust is one of the constant expression strings declared earlier in
        # this notebook (not external input). It is evaluated here so that it
        # sees this dataset's n_classes and bandwidth.
        with io.capture_output():
            cluster_method = eval(clust)

        run_scores = []
        for repeat in tqdm_notebook(range(num_repeat), desc = ">>> "+met_label, bar_format="{percentage:3.0f}%|{bar}{r_bar}"):
            t.tic() # Start timer
            # elapsed_time and iter_num are deliberately module-level names:
            # cluster_eval reads them from module scope, so both must be set
            # before it is called.
            with io.capture_output():
                if met_label in ('DBSCAN', 'SpectralClustering'):
                    # These are fitted and labelled in a single call.
                    y_train_pred = cluster_method.fit_predict(X_train)
                    elapsed_time = t.tocvalue() # End timer & save elapsed time
                    iter_num = 0
                elif met_label == 'FCM':
                    cluster_method.fit(X_train)
                    elapsed_time = t.tocvalue() # End timer & save elapsed time
                    # Hard label = index of the largest entry in each row of u.
                    y_train_pred = cluster_method.u.argmax(axis=1)
                    iter_num = 0
                else:
                    cluster_method.fit(X_train)
                    # Only the fit is timed; predict runs after the timer is read.
                    elapsed_time = t.tocvalue() # End timer & save elapsed time
                    iter_num = cluster_method.n_iter_
                    y_train_pred = cluster_method.predict(X_train)

            run_scores.append(cluster_eval(y_train, y_train_pred, X_train))

        # Aggregate the repeats: mean of every score, std of purity and ARI only.
        run_scores = pd.DataFrame(run_scores, columns=run_columns)
        mean_scores = run_scores.mean()
        std_scores = run_scores[["purity", "ari_score"]].std()
        summary_rows.append([
            m,
            met_label,
            mean_scores["purity"],
            std_scores["purity"],
            mean_scores["ari_score"],
            std_scores["ari_score"],
            mean_scores["elapsed_time"],
            mean_scores["iter_num"],
        ])

# Build the result table once, with its column names attached.
all_result = pd.DataFrame(summary_rows, columns=result_labels)
# print("\nAll_results: \n",all_result) 

#Save results to Excel
# Timestamp used both in the file name and in the final message.
ts = time.time()
st = datetime.datetime.fromtimestamp(ts).strftime('%H-%M-%S---%d-%m-%Y')

# Build the output path once so the file that is written and the file that is
# reported/opened cannot drift apart. Path() takes no `index` keyword; passing
# one was ignored and is deprecated since Python 3.12.
filePath = Path("result/new_loop/All_result_"+st+"_"+str(num_repeat)+"_loops_eegEyeState.xlsx")
# to_excel does not create missing folders.
filePath.parent.mkdir(parents=True, exist_ok=True)
all_result.to_excel(filePath, index=False)

filePath = filePath.resolve()
print("filePath :", filePath)
print("\nFinish at ",st)

# `start excel.exe` only exists on Windows; skip the launch elsewhere instead
# of running a shell command that is bound to fail.
if os.name == 'nt':
    os.system(f'start excel.exe "{filePath}"')

HBox(children=(IntProgress(value=0, description='Overall Progress :', max=1, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Eegeyestate dataset :', max=6, style=ProgressStyle(descriptio…

HBox(children=(IntProgress(value=0, description='>>> KMeans', style=ProgressStyle(description_width='initial')…

HBox(children=(IntProgress(value=0, description='>>> DBSCAN', style=ProgressStyle(description_width='initial')…

HBox(children=(IntProgress(value=0, description='>>> MeanShift', style=ProgressStyle(description_width='initia…

HBox(children=(IntProgress(value=0, description='>>> FCM', style=ProgressStyle(description_width='initial')), …

HBox(children=(IntProgress(value=0, description='>>> OBGMM', style=ProgressStyle(description_width='initial'))…

HBox(children=(IntProgress(value=0, description='>>>  GMM-init_hybrid', style=ProgressStyle(description_width=…


filePath : E:\MyJupyter\1.Exp_03\GMM_outlier\result\new_loop\All_result_11-46-54---02-06-2022_100_loops_eegEyeState.xlsx

Finish at  11-46-54---02-06-2022


0

In [2]:
# One reference class split across two predicted clusters: the ARI drops below 1.
adjusted_rand_score(labels_true=[0, 0, 1, 1], labels_pred=[0, 0, 1, 2])

0.5714285714285714

In [7]:
# Same partition with the label names swapped: the ARI is still 1.0.
adjusted_rand_score(labels_true=[0, 0, 1, 1], labels_pred=[1, 1, 0, 0])

1.0

In [20]:
# rand_score is imported here because, reading the notebook top to bottom,
# this cell comes before any other import of it; without this line a fresh
# "Restart & Run All" stops here with a NameError.
from sklearn.metrics import rand_score

# Every sample predicted as its own cluster, compared with several references.
# Only the value of the last expression is rendered as the cell output.
adjusted_rand_score([0, 0, 0, 0], [0, 1, 2, 3])
adjusted_rand_score([1, 2, 0, 0], [0, 1, 2, 3])
rand_score([1, 2, 0, 0], [0, 1, 2, 3])
adjusted_rand_score([1, 0, 0, 0], [0, 1, 2, 3])

0.0

In [5]:
from sklearn.metrics import rand_score, adjusted_rand_score
# Same partition with the label names swapped: the Rand index is 1.0.
rand_score(labels_true=[0, 0, 1, 1], labels_pred=[1, 1, 0, 0])

1.0

In [6]:
# Two reference classes merged into one predicted cluster: the Rand index drops below 1.
rand_score(labels_true=[0, 0, 1, 2], labels_pred=[0, 0, 1, 1])

0.8333333333333334

In [None]:
# Alternative method line-up; running this cell replaces method_labels and num_methods.
method_labels = [
    "KMeans",
    "GMM-init_hybrid",
    "OBGMM",
    "DBSCAN",
    "MeanShift",
    "FCM",
    "AffinityPropagation",
]
num_methods = len(method_labels)
print(num_methods)

In [None]:
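# Problem: AffinityPropagation could not be run on a dataset with 45,211 samples.
# It fails with:
#   MemoryError: Unable to allocate 15.2 GiB for an array with shape (45211, 45211) and data type float64
# The array it tries to allocate is a dense n_samples x n_samples float64 matrix
# (45211 * 45211 * 8 bytes is about 15.2 GiB).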