In [3]:
# #Display full output
# from IPython.core.interactiveshell import InteractiveShell
# InteractiveShell.ast_node_interactivity = "all"
from tqdm.auto import tqdm, trange
from ipywidgets import IntProgress
from IPython.display import display
from IPython.core.display import display, HTML
display(HTML("<style>div.output_scroll { height: 40em; }</style>"))

import warnings
warnings.filterwarnings('ignore')
import datetime
import time
from pytictoc import TicToc
import sys
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, MeanShift
from fcmeans import FCM
from sklearn.mixture import GaussianMixture
from sklearn.mixture_mod import GaussianMixtureMod
from sklearn.mixture_mod2 import GaussianMixtureMod2
from sklearn.mixture_mod3 import GaussianMixtureMod3
from sklearn.mixture_mod4 import GaussianMixtureMod4
from sklearn.mixture_mod5 import GaussianMixtureMod5
from IPython.utils import io
import numpy as np
from sklearn import metrics
from scipy.optimize import linear_sum_assignment
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, adjusted_rand_score, normalized_mutual_info_score, homogeneity_completeness_v_measure,fowlkes_mallows_score,silhouette_score
from sklearn.metrics import davies_bouldin_score, v_measure_score

#Function declaration
def cluster_accuracy(y_true, y_pred):
    """Return the clustering accuracy under the best one-to-one label mapping.

    A contingency table of true labels (rows) against predicted cluster ids
    (columns) is built, then a linear assignment picks the row/column pairing
    that maximises the number of matched samples.

    Args:
        y_true: ground-truth class labels.
        y_pred: predicted cluster ids, same length as ``y_true``.

    Returns:
        Matched sample count divided by the total sample count.

    Side effects:
        Prints the assignment indices and the two counts used in the ratio.
    """
    table = metrics.cluster.contingency_matrix(y_true, y_pred)
#     print("\ncontingency_matrix : \n", table)

    # linear_sum_assignment minimises cost, so negate the counts to maximise them.
    row_ind, col_ind = linear_sum_assignment(-table)
    print("row_ind : ", row_ind)
    print("col_ind : ", col_ind)

    matched = table[row_ind, col_ind].sum()
    print("contingency_matrix[row_ind, col_ind].sum() : ", matched)

    total = np.sum(table)
    print("np.sum(contingency_matrix) : ", total)

    return matched / total

def purity_score(y_true, y_pred):
    """Return the purity of a clustering.

    For every predicted cluster (a column of the contingency table) the count
    of its most frequent true label is taken; purity is the sum of those
    counts divided by the total number of samples.

    Args:
        y_true: ground-truth class labels.
        y_pred: predicted cluster ids, same length as ``y_true``.

    Returns:
        Purity as a fraction of the total sample count.
    """
    table = metrics.cluster.contingency_matrix(y_true, y_pred)
#     print("\ncontingency_matrix : \n", table)

    # Largest true-label count inside each predicted cluster.
    dominant_counts = table.max(axis=0)
    return dominant_counts.sum() / table.sum()

#Get the Clustering Algorithm Performance
def cluster_eval(y_true, y_pred, X_train, cluster_label, m, elapsed_time,iter_num):
    """Collect the evaluation scores of one clustering run as a flat row.

    Args:
        y_true: ground-truth class labels.
        y_pred: predicted cluster ids, same length as ``y_true``.
        X_train: feature data. Currently unused; only the commented-out
            silhouette / Davies-Bouldin scores would need it.
        cluster_label: name of the clustering algorithm (e.g. "kMeans").
        m: name of the dataset being evaluated.
        elapsed_time: fitting time measured by the caller.
        iter_num: number of iterations reported by the fitted model.

    Returns:
        list: ``[m, cluster_label, elapsed_time, iter_num, accuracy, purity,
        micro_f1, macro_f1, weighted_f1, ari, nmi, v_measure, fm]``. The order
        has to line up with the ``result_labels`` column names used by the
        main script.

    NOTE(review): accuracy and the F1 scores compare cluster ids with class
    labels directly, without remapping; since cluster ids are arbitrary these
    values depend on the numbering the algorithm happened to choose.
    ``cluster_accuracy`` in this file performs an aligned comparison - confirm
    which one is intended.
    """
    # Convert once so that Series, lists and arrays are all accepted.
    true_arr = np.asarray(y_true).ravel()
    pred_arr = np.asarray(y_pred).ravel()

    accuracy = np.mean(pred_arr == true_arr) * 100
    # Use the y_true argument here; the previous code read the global y_train,
    # which made the result depend on whatever the caller had in scope.
    purity = purity_score(y_true, y_pred)
    micro_f1_score = f1_score(y_true, y_pred, average='micro')
    macro_f1_score = f1_score(y_true, y_pred, average='macro')
    weighted_f1_score = f1_score(y_true, y_pred, average='weighted')
    ari_score = adjusted_rand_score(y_true, y_pred)
    nmi_score = normalized_mutual_info_score(y_true, y_pred)
#     hc_v_measure = homogeneity_completeness_v_measure(y_true, y_pred)
    v_measure = v_measure_score(y_true, y_pred)
    fm_score = fowlkes_mallows_score(y_true, y_pred)
#     s_score = silhouette_score(X_train, y_pred, metric='euclidean')
#     db_score = davies_bouldin_score(X_train, y_pred)
    return [m, cluster_label, elapsed_time, iter_num, accuracy, purity,
            micro_f1_score, macro_f1_score, weighted_f1_score, ari_score,
            nmi_score, v_measure, fm_score]
    
"""------------------------------------------------------------------------------------------------------------------------
#Main Program - Start here!
------------------------------------------------------------------------------------------------------------------------"""
# np.set_printoptions(threshold=sys.maxsize)
# pd.set_option('display.max_rows', dataset.shape[0]+1)

# Datasets to evaluate; each name is turned into "dataset/<name>.csv" below.
dataset_labels = [
    "heartDisease",
    "transfusion",
    "wilt",
]
# Alternative selections kept for reference:
# dataset_labels = ["balanceScale"]
# dataset_labels = ["transfusion", "wilt", "breastCancer", "heartDisease", "eegEyeState", "adultIncome",
#                   "australian", "japanese", "bank", "seismicBumps", "german", "chess",
#                   "iris", "abalone", "wallRobot", "dermatology"]
# dataset_labels = ["heartDisease", "transfusion", "wilt", "breastCancer", "chess",
#                   "parkinson","australian", "german", "japanese", "iris", "abalone", "wallRobot",
#                   "dermatology","creditApproval", "letters", "eegEyeState",
#                   "seismicBumps","bank", "adultIncome", "sensorlessDrive"]

skip_Mahal = [
    "seismicBumps",
    "bank",
    "adultIncome",
]

# Threshold of GMM-EM Response values
th_resp_val = [0.1, 0.2, 0.3, 0.4, 0.5]
# th_resp_val = [1]

# resp_type_val = ['donate']
resp_type_val = ['remove', 'donate']

# Column names of the result table, in the order cluster_eval returns its scores.
result_labels = [
    "Dataset",
    "Algorithm",
    "Elapsed Time",
    "Iter_num",
    "accuracy",
    "purity",
    "micro_f1_score",
    "macro_f1_score",
    "weighted_f1_score",
    "ari_score",
    "nmi_score",
    "hc_v_measure",
    "fm_score",
]

# Accumulators and the shared timer used by the evaluation loop.
all_scores = []
final_scores = []
all_result = pd.DataFrame()
t = TicToc()  # create TicToc instance
# Evaluate every dataset with kMeans and GMM and gather the scores in all_result.
# Per-dataset frames are collected in a list and concatenated once after the
# loop: DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call.
dataset_results = []
# for m in dataset_labels:
for m in tqdm(dataset_labels):
#     enablePrint()
    print("\n\nStart Dataset : ",m)
    #Read dataset
    dataset = pd.read_csv("dataset/"+m+".csv")

    #Drop Target Column in data using Index
    X_train = dataset.drop('Target',axis=1)

    # #How to get Target data
    y_train =  dataset['Target']

    print(np.array(y_train))

    # Iterate over the labels actually present instead of assuming they are
    # exactly 0..n_classes-1; for 0-based labels the result is unchanged.
    class_labels = np.unique(y_train)
    n_classes = len(class_labels)
#     cov_type = 'full'
    cov_type = 'tied'
#     cov_type = 'diag'
    #Since we have class labels for the training data, we can initialize the GMM parameters in a supervised manner.
    es_means_init = np.array([X_train[y_train == label].mean(axis=0) for label in class_labels])
#     print("es_means_init :\n",es_means_init)

    #Run the kMeans on current
    kMeans_cluster = KMeans(n_clusters = n_classes, max_iter=20, random_state=0)
    t.tic() # Start timer
    kMeans_cluster.fit(X_train)
    elapsed_time = t.tocvalue() #Save elapsed time
    iter_num = kMeans_cluster.n_iter_
#     print("kMeans time : ", elapsed_time )
    y_train_pred = kMeans_cluster.predict(X_train)
    print("kMeans y_train_pred : ", y_train_pred )
    eval_scores = cluster_eval(y_train, y_train_pred, X_train, "kMeans", m, elapsed_time, iter_num )
#     eval_scores = np.append([m],eval_scores,axis=0)
    # Start a fresh score list for this dataset.
    all_scores = [eval_scores]

#     #Run the Meanshift on current
#     ms_cluster = MeanShift(bandwidth=n_classes)
#     t.tic() # Start timer
#     ms_cluster = ms_cluster.fit(X_train)
#     elapsed_time = t.tocvalue() #Save elapsed time
#     iter_num = "NA" #ms_cluster.n_iter_
#     y_train_pred = ms_cluster.predict(X_train)
#     eval_scores = cluster_eval(y_train, y_train_pred, X_train, "Meanshift", m, elapsed_time, iter_num )
#     all_scores += [eval_scores]

#     #Run the FCM on current
#     fcm_cluster = FCM(n_clusters=n_classes)
#     t.tic() # Start timer
#     fcm_cluster.fit(X_train)
#     elapsed_time = t.tocvalue() #Save elapsed time
#     iter_num = "NA" #fcm_cluster.n_iter_
#     y_train_pred = fcm_cluster.predict(X_train)
#     eval_scores = cluster_eval(y_train, y_train_pred, X_train, "FCM", m, elapsed_time, iter_num )
#     all_scores += [eval_scores]

    #Run the GMM on current
    gmm_cluster = GaussianMixture(n_components = n_classes, init_params='random', means_init=es_means_init, covariance_type=cov_type, max_iter=20, random_state=0)
    t.tic() # Start timer
    gmm_cluster.fit(X_train)
    elapsed_time = t.tocvalue() #Save elapsed time
    iter_num = gmm_cluster.n_iter_
#     print("GMM time : ", elapsed_time )
    y_train_pred = gmm_cluster.predict(X_train)
    eval_scores = cluster_eval(y_train, y_train_pred, X_train, "GMM", m, elapsed_time, iter_num )
    all_scores += [eval_scores]


    #Change result to Pandas DataFrame
    result = pd.DataFrame(all_scores)
    dataset_results.append(result)
#     print("Finish GaussianMixtureMod : ", m, " : ",resp_type )

# Build the combined table in one step, with a fresh 0..N-1 index.
all_result = pd.concat(dataset_results, ignore_index=True)
all_result.columns=result_labels
# Keep the trailing all-NaN row so the workbook layout matches earlier runs.
all_result = all_result.reindex(range(len(all_result) + 1))


# Write the collected scores to an Excel workbook whose name carries the
# covariance type and the current wall-clock time.
ts = time.time()
st = datetime.datetime.fromtimestamp(ts).strftime('%H-%M-%S---%d-%m-%Y')
# all_result.to_excel("result/All_result_"+st+".xlsx", index=False)
all_result.to_excel(f"result/All_result_{cov_type}_{st}.xlsx", index=False)
print("\nFinish at ",st)


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))



Start Dataset :  heartDisease
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0]
kMeans y_train_pred :  [0 0 0 0 1 0 1 1 0 0 0 1 1 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 1
 0 1 1 1 0 0 1 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 1
 0 0 0 0 0 0 0 1 1 1 1 1 1 0 0 0 0 0 0 1 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 1
 0 1 0 1 0 0 0 0 0 1 1 1 1 0 0 0 1 0 1 0 1 1 0 1 1 0 0 0 1 1 1 0 0 0 0 0 0
 0 0 0 0 0 1 0 0 0 0 0 0 0 1 