26.02.2021 - Modified "gaussian_mixture_mod5" 
- Add new method to get th_resp
  - using median of resp
  - using the center value of the min and max of resp ((max - min) / 2)

In [None]:
# #Display full output
# from IPython.core.interactiveshell import InteractiveShell
# InteractiveShell.ast_node_interactivity = "all"
from tqdm.auto import tqdm, trange
from ipywidgets import IntProgress
from IPython.display import display
from IPython.core.display import display, HTML
display(HTML("<style>div.output_scroll { height: 40em; }</style>"))

import warnings
warnings.filterwarnings('ignore')
import datetime
import time
from pytictoc import TicToc
import sys
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, MeanShift
from fcmeans import FCM
from sklearn.mixture import GaussianMixture
from sklearn.mixture_mod import GaussianMixtureMod
from sklearn.mixture_mod2 import GaussianMixtureMod2
from sklearn.mixture_mod3 import GaussianMixtureMod3
from sklearn.mixture_mod4 import GaussianMixtureMod4
from sklearn.mixture_mod5 import GaussianMixtureMod5
from sklearn.mixture_mod6 import GaussianMixtureMod6
from IPython.utils import io
import numpy as np
from sklearn import metrics
from scipy.optimize import linear_sum_assignment
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, adjusted_rand_score, normalized_mutual_info_score, homogeneity_completeness_v_measure,fowlkes_mallows_score,silhouette_score
from sklearn.metrics import davies_bouldin_score, v_measure_score
from scipy.stats import chi2
# opening EXCEL through Code local path in dir
import os
from pathlib import Path
# Function declarations
def cluster_accuracy(y_true, y_pred):
    """Return clustering accuracy under the best one-to-one label matching.

    A contingency table of ``y_true`` against ``y_pred`` is built, then
    ``linear_sum_assignment`` is run on its negation so that the selected
    (row, column) pairs maximise the number of matched samples.  The
    accuracy is the matched count divided by the total number of samples.

    The assignment indices and both counts are printed for debugging.
    """
    table = metrics.cluster.contingency_matrix(y_true, y_pred)

    # linear_sum_assignment minimises cost, hence the sign flip.
    row_ind, col_ind = linear_sum_assignment(-table)
    print("row_ind : ", row_ind)
    print("col_ind : ", col_ind)

    matched = table[row_ind, col_ind].sum()
    print("contingency_matrix[row_ind, col_ind].sum() : ", matched)

    total = np.sum(table)
    print("np.sum(contingency_matrix) : ", total)

    return matched / total

def purity_score(y_true, y_pred):
    """Return the purity of a clustering.

    For every column of the contingency table of ``y_true`` against
    ``y_pred`` the largest entry is taken; purity is the sum of those
    maxima divided by the total number of samples.
    """
    table = metrics.cluster.contingency_matrix(y_true, y_pred)
    best_per_column = table.max(axis=0)
    return best_per_column.sum() / table.sum()

#Get the Clustering Algorithm Performance
def cluster_eval(y_true, y_pred, X_train, cluster_label, m, elapsed_time,iter_num):
    """Score one clustering run and return the result row.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted cluster labels, same length as ``y_true``.
    X_train : array-like
        Feature matrix.  Currently unused; only the disabled silhouette /
        Davies-Bouldin scores below would need it.
    cluster_label : str
        Name of the algorithm/configuration (second entry of the row).
    m : str
        Dataset name (first entry of the row).
    elapsed_time : float
        Fit time as measured by the caller.
    iter_num : int or str
        Number of iterations reported by the caller.

    Returns
    -------
    list
        ``[m, cluster_label, elapsed_time, iter_num, accuracy, purity,
        micro_f1, macro_f1, weighted_f1, ari, nmi, v_measure, fm]``.

    Notes
    -----
    ``accuracy`` is the percentage (0-100) of positions where ``y_pred``
    equals ``y_true`` literally; no cluster-to-class relabelling is done
    here.  The F1 scores compare the raw labels in the same way.
    """
    # np.asarray accepts Series, ndarray and list alike and avoids the
    # pandas-specific Series.ravel() call.
    true_flat = np.asarray(y_true).ravel()
    pred_flat = np.asarray(y_pred).ravel()
    accuracy = np.mean(pred_flat == true_flat) * 100

    # Use the y_true argument (not a notebook-level global) so the purity
    # always refers to the same labels as every other score in this row.
    purity = purity_score(y_true, y_pred)
    micro_f1_score = f1_score(y_true, y_pred, average='micro')
    macro_f1_score = f1_score(y_true, y_pred, average='macro')
    weighted_f1_score = f1_score(y_true, y_pred, average='weighted')
    ari_score = adjusted_rand_score(y_true, y_pred)
    nmi_score = normalized_mutual_info_score(y_true, y_pred)
#     hc_v_measure = homogeneity_completeness_v_measure(y_true, y_pred)
    v_measure = v_measure_score(y_true, y_pred)
    fm_score = fowlkes_mallows_score(y_true, y_pred)
#     s_score = silhouette_score(X_train, y_pred, metric='euclidean')
#     db_score = davies_bouldin_score(X_train, y_pred)
    return [m, cluster_label, elapsed_time, iter_num, accuracy, purity,
            micro_f1_score, macro_f1_score, weighted_f1_score, ari_score,
            nmi_score, v_measure, fm_score]

#function to remove outliers using z-score
def z_score(data):
    """Drop high outliers from ``data['mean']`` using a z-score cut-off.

    The z-score of every entry of the ``'mean'`` column is computed with
    ``np.mean`` / ``np.std`` and stored on ``data`` as a new ``'z_score'``
    column (the caller's frame is modified in place).  Entries whose
    z-score is greater than 3 are removed; entries far *below* the mean
    are kept, because only the upper side is tested.

    Returns the remaining values of the ``'mean'`` column as a Series.
    Both the flagged and the filtered series are printed.
    """
    threshold = 3.
    row_means = data['mean']
    centre = np.mean(row_means)
    spread = np.std(row_means)

    # Side effect: the z-scores are written back onto the caller's frame.
    data['z_score'] = (row_means - centre) / spread

    # Work on a copy so the 'mean' column itself is left untouched.
    flagged = row_means.copy()
    flagged.loc[data['z_score'] > threshold] = np.nan
    print('outliers_z : \n', flagged)

    kept = flagged.dropna()
    print('outliers_z_new : \n', kept)
    return kept

#function to calculate means 
def means_init(X, n_components):
    """Estimate initial component means by binning samples on their row mean.

    Steps:

    1. The mean over all attributes is computed for every row.
    2. ``z_score`` removes rows whose mean is a high outlier; the min and
       max of the remaining means define the range to split.
    3. That range is divided into ``n_components`` equal-width bins.
    4. The initial mean of component ``k`` is the attribute-wise mean of
       all rows whose row mean falls into bin ``k``.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_attributes)
        Feature matrix (DataFrame or 2-D array).  It is not modified.
    n_components : int
        Number of bins / mixture components.

    Returns
    -------
    numpy.ndarray of shape (n_components, n_attributes)
        One row of initial means per component.  A bin that receives no
        sample yields a row of NaN.

    Notes
    -----
    Rows rejected by ``z_score`` lie outside every bin and therefore do not
    contribute to any component mean.
    """
    _, n_attributes = X.shape
    print("n_components : ", n_components)

    # Copy so the helper columns added here (and by z_score) can never show
    # up on the caller's object.
    df_X = pd.DataFrame(X).copy()
    df_X['mean'] = df_X.mean(axis=1)

    no_outliers_z = z_score(df_X)

    # Find Max, Min and the bin width
    Max = max(no_outliers_z)
    Min = min(no_outliers_z)
    dev = (Max - Min) / n_components

    print('Max: ', Max , 'Min: ', Min)
    print('dev :', dev)

    # Bin edges; the last edge is pinned to Max so that accumulated float
    # error cannot move it.
    edges = Min + dev * np.arange(n_components + 1)
    edges[-1] = Max
    cluster_range = np.column_stack((edges[:-1], edges[1:]))
    print(cluster_range)

    row_means = df_X['mean']
    attributes = df_X.iloc[:, 0:n_attributes]
    es_init_means = np.zeros((n_components, n_attributes))
    for k, (low, high) in enumerate(cluster_range):
        if k == n_components - 1:
            # The last bin is closed on the right, otherwise the row whose
            # mean equals Max would belong to no bin at all.
            in_bin = (row_means >= low) & (row_means <= high)
        else:
            in_bin = (row_means >= low) & (row_means < high)
        # Boolean mask instead of chained assignment into indicator columns:
        # chained assignment is silently lost under pandas copy-on-write.
        es_init_means[k] = attributes[in_bin].mean(axis=0)
    return es_init_means
"""------------------------------------------------------------------------------------------------------------------------
#Main Program - Start here!
------------------------------------------------------------------------------------------------------------------------"""
# np.set_printoptions(threshold=sys.maxsize)
# pd.set_option('display.max_rows', dataset.shape[0]+1)

# --- Datasets to evaluate ---------------------------------------------------
# Each name is read from "dataset/<name>.csv".  Alternative selections used in
# earlier runs:
#   ["haberman", "landsat", "iris", "seed"]
#   ["ecoli"], "glass", "vertebral", "new_thyroid"
#   ["wine", "new_thyroid"]
#   ["transfusion", "wilt", "australian", "japanese", "iris"]
#   ["transfusion", "wilt", "breastCancer", "heartDisease", "adultIncome",
#    "australian", "japanese", "bank", "seismicBumps", "german", "chess",
#    "iris", "abalone", "wallRobot", "dermatology"]
#   ["ecoli", "seed", "glass"]
dataset_labels = [
    "iris",
]

# Datasets skipped by the (currently disabled) Mahalanobis run:
# skip_Mahal = ["australian","japanese", "seismicBumps","bank", "adultIncome"]

# --- Covariance types -------------------------------------------------------
# Other options tried: ["full", "tied", "diag", "spherical"], ["diag", "spherical"]
cov_type_labels = [
    "full",
]

# --- Thresholds of GMM-EM response values (disabled GMM-OR run) --------------
# th_resp_val = [0.1, 0.2, 0.3, 0.4, 0.5]
# th_resp_val = [1]

# --- Response handling modes passed as resp_type -----------------------------
# Single-mode alternatives: ['remove'] or ['donate']
resp_type_val = [
    'remove',
    'donate',
]

# --- Column headers of the result sheet --------------------------------------
# Order matches the list returned by cluster_eval; the "hc_v_measure" column
# receives the v_measure_score value.
result_labels = [
    "Dataset",
    "Algorithm",
    "Elapsed Time",
    "Iter_num",
    "accuracy",
    "purity",
    "micro_f1_score",
    "macro_f1_score",
    "weighted_f1_score",
    "ari_score",
    "nmi_score",
    "hc_v_measure",
    "fm_score",
]

# Absolute paths of the Excel files written by the main loop.
absolutePath = []
def _fit_and_score(model, label, X, y, dataset_name, timer):
    """Fit ``model`` on ``X``, time the fit and score its predictions.

    Only the ``fit`` call is timed.  Returns ``(scores, y_pred)`` where
    ``scores`` is the row produced by ``cluster_eval`` and ``y_pred`` is
    ``model.predict(X)``.
    """
    timer.tic()  # Start timer
    model.fit(X)
    elapsed_time = timer.tocvalue()  # Save elapsed time
    iter_num = model.n_iter_
    y_pred = model.predict(X)
    scores = cluster_eval(y, y_pred, X, label, dataset_name, elapsed_time, iter_num)
    return scores, y_pred


# Hard-coded initial means / medians (four attributes, three components).
# None of the enabled runs below uses them; they are kept as reference values
# for the disabled "means_init_attr" experiments.
es_means_init2_method = np.array([[5.006, 3.428, 1.462, 0.246],
                                  [5.95178571, 2.77678571, 4.3375, 1.375],
                                  [6.65681818, 2.99318182, 5.62954545, 2.05909091]])

means__init = np.array([[5.00555556, 3.34444444, 1.5962963, 0.3037037],
                        [5.97333333, 2.775, 4.50333333, 1.46833333],
                        [6.88333333, 3.09722222, 5.75833333, 2.09444444]])

medians_init = np.array([[5., 3.4, 1.5, 0.2],
                         [6., 2.8, 4.5, 1.4],
                         [6.75, 3.05, 5.7, 2.1]])

means_init_hybrid = np.array([[5.006, 3.428, 1.462, 0.246],
                              [5.95172414, 2.77241379, 4.35862069, 1.39482759],
                              [6.69047619, 3.00952381, 5.66190476, 2.06428571]])

medians_init_hybrid = np.array([[5., 3.4, 1.5, 0.2],
                                [6., 2.8, 4.45, 1.4],
                                [6.7, 3., 5.6, 2.1]])

means_init_attr = np.array([[4.95384615, 3.26981132, 3.203, 0.99],
                            [6.00857143, 2.87971014, 4.74680851, 1.58297872],
                            [7.05185185, 3.06666667, 6.76666667, 2.16666667]])

# Run tables for the resp_type experiments.  Each entry is
#   (init_params, result-row label prefix, echo predictions?, "Finish" banner).
# Labels and banners are reproduced verbatim (including their inconsistent
# separators) so result sheets stay comparable with earlier runs.
# NOTE(review): 'jaha_init1' / 'jaha_init_hybrid' are not stock scikit-learn
# init_params values; presumably provided by the locally patched sklearn.
MOD4_RUNS = [  # original note: "GMM-Auto-v2 (using weights*cv or cv/2)"
    ('random', "resp Mixing Coefficients-random_", True,
     "Finish GaussianMixtureMod4/resp Mixing Coefficients random  : "),
    ('kmeans', "resp Mixing Coefficients-kmeans_", True,
     "Finish GaussianMixtureMod4/resp Mixing Coefficients kmeans  : "),
    ('jaha_init1', "resp Mixing Coefficients-init1_", True,
     "Finish GaussianMixtureMod4/resp Mixing Coefficients init1  : "),
    ('jaha_init_hybrid', "resp Mixing Coefficients-init_hybrid_", False,
     "Finish GaussianMixtureMod4/resp Mixing Coefficients-init_hybrid  : "),
]
MOD5_RUNS = [  # original note: "GMM-Auto-v2 (using resp center values)"
    ('random', "resp center values-random", True,
     "Finish GaussianMixtureMod5/resp center values-random : "),
    ('kmeans', "resp center values-kmeans", True,
     "Finish GaussianMixtureMod5/resp center values-kmeans : "),
    ('jaha_init1', "resp center values-init1_", False,
     "Finish GaussianMixtureMod5/resp center values init1 : "),
    ('jaha_init_hybrid', "resp center values-init_hybrid_", True,
     "Finish GaussianMixtureMod5/resp center values init_hybrid: "),
]

# Disabled experiments, kept here for reference (all use the same
# fit / time / predict / cluster_eval pattern as the enabled runs):
#   MeanShift(bandwidth=n_classes)                     -> "Meanshift" (iter_num "NA")
#   FCM(n_clusters=n_classes)                          -> "FCM"       (iter_num "NA")
#   GaussianMixture(means_init=means_init_attr)        -> "GMM-means_init_attr"
#   GaussianMixtureMod2(init_params='kmeans', resp_type=...)
#       -> "Mahalanobis-<resp_type>", skipped for datasets in skip_Mahal
#          (last tested 28.01.2021)
#   GaussianMixtureMod3(init_params='random', means_init=es_means_init, resp_type=...)
#       -> "GMM_Auto-<resp_type>" (last tested 27.01.2021)
#   GaussianMixtureMod5(means_init=means_init_attr, resp_type=...)
#       -> "resp center values-means_init_attr<resp_type>"
#   GaussianMixtureMod6(means_init=means_init_attr)    -> "outliers based-init_attr_"
#   GaussianMixtureMod(init_params='random', means_init=es_means_init,
#                      th_resp=..., resp_type=...)     -> "GMM-<resp_type>_th_resp=<th_resp>"
#   es_means_init = means_init(X_train.copy(), n_classes) as an unsupervised
#       alternative to the class means, with manual row permutations:
#       heartDisease / adultIncome / australian: rows [0, 1] <- [1, 0]
#       wallRobot:   rows [0, 1, 2, 3] <- [1, 2, 3, 0]
#       dermatology: rows [0, 1, 2, 3, 4, 5] <- [3, 5, 4, 1, 0, 2]

for cov_type in cov_type_labels:
    print("\nStart Cov_Type :", cov_type)

    dataset_frames = []  # one DataFrame of result rows per dataset
    t = TicToc()  # create TicToc instance
    for m in tqdm(dataset_labels):
        print("\n\nStart Dataset : ", m)
        # Read dataset; the 'Target' column holds the class labels.
        dataset = pd.read_csv("dataset/"+m+".csv")
        X_train = dataset.drop('Target', axis=1)
        y_train = dataset['Target']

        print(np.array(y_train))

        class_values = np.unique(y_train)
        n_classes = len(class_values)

        # Since we have class labels for the training data, we can initialize
        # the GMM parameters in a supervised manner.  Iterating over the
        # labels actually present (instead of range(n_classes)) keeps this
        # correct when the labels are not exactly 0..n_classes-1.
        es_means_init = np.array([X_train[y_train == c].mean(axis=0) for c in class_values])

        # Keyword arguments shared by every mixture model below.
        common = dict(n_components=n_classes, covariance_type=cov_type,
                      max_iter=20, random_state=0)
        all_scores = []

        # --- kMeans baseline --------------------------------------------
        scores, y_train_pred = _fit_and_score(
            KMeans(n_clusters=n_classes, max_iter=20, random_state=0),
            "kMeans", X_train, y_train, m, t)
        all_scores.append(scores)

        # --- plain GaussianMixture with different initialisations --------
        gmm_baselines = [
            ("GMM-random", {"init_params": 'random', "means_init": None}),
            ("GMM-kmeans", {"init_params": 'kmeans', "means_init": None}),
            ("GMM-es_means_init", {"means_init": es_means_init}),
            ("GMM-init1", {"init_params": 'jaha_init1', "means_init": None}),
            ("GMM-init_hybrid", {"init_params": 'jaha_init_hybrid', "means_init": None}),
        ]
        for label, extra in gmm_baselines:
            scores, y_train_pred = _fit_and_score(
                GaussianMixture(**common, **extra), label, X_train, y_train, m, t)
            all_scores.append(scores)

        # --- GaussianMixtureMod4: one run per init and resp_type ---------
        for init_params, prefix, echo, banner in MOD4_RUNS:
            for resp_type in resp_type_val:
                model = GaussianMixtureMod4(init_params=init_params, means_init=None,
                                            resp_type=resp_type, **common)
                scores, y_train_pred_mod4 = _fit_and_score(
                    model, prefix + str(resp_type), X_train, y_train, m, t)
                if echo:
                    print('y_train_pred_mod4v :\n', y_train_pred_mod4)
                all_scores.append(scores)
            print(banner, m, " : ", resp_type)

        # --- GaussianMixtureMod5: one run per init and resp_type ---------
        # y_train_pred_mod5 stays a notebook-level name on purpose: later
        # cells export and relabel the prediction of the last Mod5 run.
        for init_params, prefix, echo, banner in MOD5_RUNS:
            for resp_type in resp_type_val:
                model = GaussianMixtureMod5(init_params=init_params, means_init=None,
                                            resp_type=resp_type, **common)
                scores, y_train_pred_mod5 = _fit_and_score(
                    model, prefix + str(resp_type), X_train, y_train, m, t)
                if echo:
                    print('y_train_pred_mod5 :\n', y_train_pred_mod5)
                    print('y_train :\n', y_train)
                all_scores.append(scores)
            print(banner, m, " : ", resp_type)

        # --- GaussianMixtureMod6 ("outliers based") -----------------------
        mod6_runs = [
            ("outliers based-es_means_init", {"means_init": es_means_init},
             "Finish GaussianMixtureMod6/outliers based-es_means_init: "),
            ("outliers based-init_hybrid_",
             {"init_params": 'jaha_init_hybrid', "means_init": None},
             "Finish GaussianMixtureMod6/outliers based-init_hybrid: "),
        ]
        for label, extra, banner in mod6_runs:
            scores, y_train_pred_mod6 = _fit_and_score(
                GaussianMixtureMod6(**common, **extra), label, X_train, y_train, m, t)
            print('y_train_pred_mod6 :\n', y_train_pred_mod6)
            print('y_train :\n', y_train)
            all_scores.append(scores)
            print(banner, m)

        # Collect this dataset's rows.
        dataset_frames.append(pd.DataFrame(all_scores))
        print("Finish GaussianMixtureMod : ", m)

    # DataFrame.append no longer exists in pandas >= 2.0, so the per-dataset
    # frames are concatenated instead.  reindex() then adds the single
    # trailing all-NaN row the result sheet has always ended with.
    all_result = pd.concat(dataset_frames, ignore_index=True)
    all_result = all_result.reindex(range(len(all_result) + 1))
    all_result.columns = result_labels

    # Save results to Excel (one file per covariance type).
    st = datetime.datetime.now().strftime('%H-%M-%S---%d-%m-%Y')
    result_dir = Path("result/apr")
    result_dir.mkdir(parents=True, exist_ok=True)  # to_excel does not create folders
    result_file = result_dir / ("All_result_"+cov_type+"_"+st+".xlsx")
    all_result.to_excel(result_file, index=False)
    absolutePath.append(result_file.resolve())
print("\nFinish at ",st)


# Open every result file that was written.  "start" is a Windows shell
# built-in, so on other platforms the path is printed instead.
for filePath in absolutePath:
    if os.name == "nt":
        os.system(f'start excel.exe "{filePath}"')
    else:
        print("Result saved to", filePath)

In [None]:
#Main program end here

In [None]:
import pandas as pd
import numpy as np

# Scratch check of how to detect NaN cells and empty frames in pandas.
df = pd.DataFrame(np.random.randn(10, 6))
df1 = pd.DataFrame()

# Make a few areas have NaN values
for rows, col in ((slice(1, 3), 1), (5, 3), (slice(7, 9), 5)):
    df.iloc[rows, col] = np.nan

# any(axis=None) reduces over the whole frame to a single boolean.
print("Nan" if df.isna().any(axis=None) else "ok")

# A frame built with no data reports .empty as True.
print("Nan" if df1.empty else "ok")

In [None]:

# Put the predicted labels next to the true labels and export them to Excel.
# This cell relies on y_train, y_train_pred_mod5, m and st still being defined
# in the session by an earlier cell.
y_train_test = y_train.copy()
# y_train_pred_mod5_flip = np.flip(y_train_pred_mod5)
# print(y_train_pred_mod5)
y_train_pred_mod5_df = pd.DataFrame(y_train_pred_mod5)
# print(y_train_pred_mod5_df) 
# print(y_train)

# axis=1 places the two inputs side by side as columns.
frames = [y_train_pred_mod5_df, y_train_test ]
y_pred_train = pd.concat(frames, axis=1)
# print(y_pred_train)

#Save results to Excel
# ts = time.time() 
# st = datetime.datetime.fromtimestamp(ts).strftime('%H-%M-%S--%d-%m-%Y')
# The file name embeds the dataset name (m) and the timestamp string (st).
y_pred_train.to_excel("result/apr/All_result_y_pred_train_"+m+"_"+st+".xlsx", index=False)
# print(y_pred_train)

# filePath = [Path("result/apr/All_result_y_pred_train_"+st+".xlsx", index=False).resolve()]
# os.system(f'start excel.exe "{filePath}"')

In [None]:
from sklearn.metrics import accuracy_score

# Toy labels used to compare plain accuracy with cluster purity.
y_true = [1, 1, 2, 3, 2, 3, 3]
y_pred = [1, 1, 3, 2, 2, 2, 2]

# Both scores are scaled to percentages before printing.
accuracy = 100 * accuracy_score(y_true, y_pred)
print('Accuracy = ', accuracy)

purity = 100 * purity_score(y_true, y_pred)
print('Purity   = ', purity)

In [None]:
# Manual remapping of predicted cluster ids.
# np.choose(labels, choices) replaces every label value i by choices[i].
# NOTE(review): the three sections look like per-dataset alternatives;
# presumably only the one matching the currently loaded dataset should be
# run - confirm before executing the whole cell.

#For breastCancer
# Swap the two cluster ids (0 <-> 1).
y_train_pred_mod5 = np.choose(y_train_pred_mod5,(1,0)).astype(np.int64) 

#For heartDisease
# Swap the two cluster ids, then reverse the element order of the array.
# NOTE(review): np.flip reverses sample order, not label values - confirm
# this is intended.
y_train_pred_mod5 = np.choose(y_train_pred_mod5,(1,0)).astype(np.int64) 
y_train_pred_mod5 = np.flip(y_train_pred_mod5)

#For dermatology
# Six-way remap: 0->4, 1->5, 2->0, 3->1, 4->2, 5->3.
y_train_pred_mod5 = np.choose(y_train_pred_mod5,(4,5,0,1,2,3)).astype(np.int64) 

In [None]:
# Demonstrate relabelling with np.choose: each value i in the input is
# replaced by the i-th entry of the choices tuple.
a = np.array([3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2])
print(a)
print('np.choose(a,(3,1,0,2))')
a_x = np.choose(a, (3, 1, 0, 2)).astype(np.int64)
print(a_x)
print("\n")

b = np.array([3, 2, 3, 3, 2, 3, 3, 1, 1, 1, 1, 1, 0, 0, 2, 2, 2, 2, 2])
print(b)
b_x = np.choose(b, (3, 2, 1, 0)).astype(np.int64)
print('np.choose(b,(3,2,1,0))')
print(b_x)
# The stray bare name `p` that used to follow raised NameError and was removed.
# b= np.array([1,1,1,1,0,1,0,0,1,0,0,0])
# print(b)
# b_x = np.choose(b,(0,1)).astype(np.int64) 
# print('np.choose(b,(1,0))')
# print(b_x)

In [None]:
# Show the Mod5 predictions before and after remapping the cluster ids.
# np.choose maps value i to the i-th tuple entry: 0->4, 1->5, 2->0, 3->1,
# 4->2, 5->3. The remapped copy is stored under a new name, so the original
# predictions stay untouched.
print(y_train_pred_mod5)
y_train_pred_mod5v2 = np.choose(y_train_pred_mod5,(4,5,0,1,2,3)).astype(np.int64) 
print(y_train_pred_mod5v2)

In [None]:
# Show the Mod4 predictions before and after remapping the cluster ids.
# np.choose maps value i to the i-th tuple entry: 0->4, 1->5, 2->0, 3->3,
# 4->2, 5->1. The remapped copy is stored under a new name.
print(y_train_pred_mod4)
y_train_pred_mod4v2 = np.choose(y_train_pred_mod4,(4,5,0,3,2,1)).astype(np.int64) 
print(y_train_pred_mod4v2)

In [None]:
means_init in means_init : 
 <class 'numpy.ndarray'> 
 [[5.00555556 3.34444444 1.5962963  0.3037037 ]
 [5.97333333 2.775      4.50333333 1.46833333]
 [6.88333333 3.09722222 5.75833333 2.09444444]]
medians_init in means_init : 
 <class 'numpy.ndarray'> 
 [[5.   3.4  1.5  0.2 ]
 [6.   2.8  4.5  1.4 ]
 [6.75 3.05 5.7  2.1 ]]
    
    means_init_hybrid : 
 <class 'numpy.ndarray'> 
 [[5.006      3.428      1.462      0.246     ]
 [5.95172414 2.77241379 4.35862069 1.39482759]
 [6.69047619 3.00952381 5.66190476 2.06428571]]
medians_init_hybrid : 
 <class 'numpy.ndarray'> 
 [[5.   3.4  1.5  0.2 ]
 [6.   2.8  4.45 1.4 ]
 [6.7  3.   5.6  2.1 ]]

In [None]:
 es_means_init2_method = np.array ([[5.006, 3.428, 1.462, 0.246],
                                    [5.95178571, 2.77678571, 4.3375, 1.375],
                                    [6.65681818, 2.99318182, 5.62954545, 2.05909091]])
    
means__init = np.array ([ [5.00555556, 3.34444444, 1.5962963,  0.3037037 ],
                          [5.97333333, 2.775,      4.50333333, 1.46833333],
                          [6.88333333, 3.09722222, 5.75833333, 2.09444444] ])   

medians_init = np.array ([[5.,   3.4,  1.5,  0.2 ],
                         [6.,   2.8,  4.5,  1.4 ],
                         [6.75, 3.05, 5.7,  2.1 ]])  

means__init_hybrid = np.array ([ [5.006,      3.428,      1.462,      0.246     ],
                          [5.95172414, 2.77241379, 4.35862069, 1.39482759],
                          [6.69047619, 3.00952381, 5.66190476, 2.06428571] ])   

medians_init_hybrid = np.array ([[5.,   3.4,  1.5,  0.2 ],
                                 [6.,   2.8,  4.45, 1.4 ],
                                 [6.7,  3.,   5.6,  2.1 ]])  

print('\n es_means_init2_method :\n', es_means_init2_method)
print('\n means__init_hybrid :\n', means__init_hybrid)
print('\n medians_init_hybrid :\n', medians_init_hybrid)

### End Here

In [None]:
#Import functions from Local File "myfunc"
from ipynb.fs.full.jahaUtils import JahaInit

#function to calculate means 
def means_init(X, n_components):
    """Estimate initial component means by binning samples on their row mean.

    Every sample is summarised by the mean of its attributes. The interval
    between the smallest and largest value returned by ``z_score`` is cut into
    ``n_components`` equal-width bins, and each sample is assigned to the bin
    containing its row mean (lower bound inclusive, upper bound exclusive).

    Side effects: prints intermediate values and writes the one-hot
    assignment table and the augmented data frame to
    ``result/apr/resp.xlsx`` and ``result/apr/df_X.xlsx``.

    Parameters
    ----------
    X : 2-D table of samples (rows) by attributes (columns); must expose
        ``.shape`` and be accepted by ``pd.DataFrame``.
    n_components : int
        Number of bins / mixture components.

    Returns
    -------
    numpy.ndarray of shape (n_components, n_attributes)
        Per-bin mean of the original attribute columns. A bin that receives
        no sample yields a row of NaN.
    """
    n_samples, n_attributes = X.shape
    print("n_components : ", n_components)

    # Work on a copy so the helper columns added below cannot leak into X.
    df_X = pd.DataFrame(X).copy()

    # Row-wise average of the attributes; this is the binning key.
    df_X['mean'] = df_X.mean(axis=1)

    # z_score is defined outside this function.
    # NOTE(review): presumably it returns the row means with outliers
    # removed - confirm what max()/min() iterate over below.
    no_outliers_z = z_score(df_X)

    resp = np.zeros((n_samples, n_components))
    print(resp)
    resp = pd.DataFrame(resp)
    print(resp)

    #Find Max, Min, Diff, Dev
    Max = max(no_outliers_z)
    Min = min(no_outliers_z)

    diff = Max - Min
    dev = diff/n_components

    print('Max: ', Max , 'Min: ', Min)
    print('dev :', dev)

    # Consecutive [low, high) intervals of width dev starting at Min.
    cluster_range = np.zeros((n_components, 2))
    for i in range(n_components):
        cluster_range[i] = Min, Min+dev
        Min = Min+dev
        df_X['c'+str(i)] = 0

    print(cluster_range)

    # The function used to return es_init_means without ever assigning it,
    # so it raised NameError (or returned a stale notebook global). It is
    # now computed here from the bin membership.
    es_init_means = np.zeros((n_components, n_attributes))
    for j in range(n_components):
        print(resp[j])
        in_bin = (df_X['mean'] >= cluster_range[j, 0]) & (df_X['mean'] < cluster_range[j, 1])
        # A single .loc assignment replaces the chained indexing
        # (frame[col][mask] = 1), which may write to a temporary copy.
        resp.loc[in_bin.values, j] = 1
        df_X.loc[in_bin, 'c'+str(j)] = 1
        # The first n_attributes columns are the original attributes.
        es_init_means[j] = df_X.iloc[:, 0:n_attributes][in_bin].mean(axis=0)

    resp.to_excel("result/apr/resp.xlsx", index=False)
    df_X.to_excel("result/apr/df_X.xlsx", index=False)

    return es_init_means

# Load iris, split off the 'Target' column and run means_init on the features.
dataset = pd.read_csv("dataset/iris.csv")

# Ground-truth labels and the feature table without them.
y_train = dataset['Target']
X = dataset.drop(columns='Target')

# One component per distinct class label.
n_samples = len(X)
n_components = len(np.unique(y_train))

resp = means_init(X, n_components)
# # if self.init_params == 'kmeans':
# resp = np.zeros((n_samples, n_components))
# # print(resp)
# label = KMeans(n_clusters=n_components, n_init=1, random_state=0).fit(X).labels_
# print(label)
# resp[np.arange(n_samples), label] = 1
# print(resp)

# resp = np.random.rand(n_samples, n_components)
# print(resp)
# print(resp.sum(axis=1)[:, np.newaxis])
# resp /= resp.sum(axis=1)[:, np.newaxis]
# print(resp)




In [None]:
# Hard-coded 3 x 4 table of component means, printed for inspection.
means = np.array([
    [5.005555556, 3.344444444, 1.596296296, 0.303703704],
    [5.973333333, 2.775, 4.503333333, 1.468333333],
    [6.86, 3.077142857, 5.731428571, 2.091428571],
])

print(means)

es_means_init :
 [[5.006 3.428 1.462 0.246]
 [5.936 2.77  4.26  1.326]
 [6.588 2.974 5.552 2.026]]

#Test Mahalanobis only 11.03.2021

In [None]:
# #Display full output
# from IPython.core.interactiveshell import InteractiveShell
# InteractiveShell.ast_node_interactivity = "all"
from tqdm.auto import tqdm, trange
from ipywidgets import IntProgress
from IPython.display import display
from IPython.core.display import display, HTML
display(HTML("<style>div.output_scroll { height: 40em; }</style>"))

import warnings
warnings.filterwarnings('ignore')
import datetime
import time
from pytictoc import TicToc
import sys
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, MeanShift
from fcmeans import FCM
from sklearn.mixture import GaussianMixture
from sklearn.mixture_mod import GaussianMixtureMod
from sklearn.mixture_mod2 import GaussianMixtureMod2
from sklearn.mixture_mod3 import GaussianMixtureMod3
from sklearn.mixture_mod4 import GaussianMixtureMod4
from sklearn.mixture_mod5 import GaussianMixtureMod5
from IPython.utils import io
import numpy as np
from sklearn import metrics
from scipy.optimize import linear_sum_assignment
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, adjusted_rand_score, normalized_mutual_info_score, homogeneity_completeness_v_measure,fowlkes_mallows_score,silhouette_score
from sklearn.metrics import davies_bouldin_score, v_measure_score

#Function declaration
def cluster_accuracy(y_true, y_pred):
    """Return the accuracy under the best one-to-one cluster/class matching.

    Builds the contingency table of true labels against predicted labels,
    finds the assignment that maximises the matched count and returns
    matched count / total count. Intermediate values are printed.
    """
    table = metrics.cluster.contingency_matrix(y_true, y_pred)

    # linear_sum_assignment minimises cost, so negate the counts to maximise.
    row_ind, col_ind = linear_sum_assignment(-table)
    print("row_ind : ", row_ind)
    print("col_ind : ", col_ind)

    matched = table[row_ind, col_ind].sum()
    print("contingency_matrix[row_ind, col_ind].sum() : ", matched)

    total = np.sum(table)
    print("np.sum(contingency_matrix) : ", total)

    return matched / total

def purity_score(y_true, y_pred):
    """Return cluster purity.

    For each column of the contingency table (true labels against predicted
    labels) take the largest count, sum those maxima and divide by the total
    number of samples.
    """
    table = metrics.cluster.contingency_matrix(y_true, y_pred)
    dominant_counts = table.max(axis=0)
    return dominant_counts.sum() / table.sum()

#Get the Clustering Algorithm Performance
#Get the Clustering Algorithm Performance
def cluster_eval(y_true, y_pred, X_train, cluster_label, m, elapsed_time,iter_num):
    """Score one clustering run and return a single result row.

    Parameters
    ----------
    y_true : ground-truth labels.
    y_pred : labels predicted by the clustering model.
    X_train : feature table; only needed by the silhouette / Davies-Bouldin
        scores, which are currently disabled.
    cluster_label : name of the algorithm, copied into the row.
    m : name of the dataset, copied into the row.
    elapsed_time : fit time, copied into the row.
    iter_num : number of iterations, copied into the row.

    Returns
    -------
    list
        [m, cluster_label, elapsed_time, iter_num, accuracy, purity,
        micro F1, macro F1, weighted F1, ARI, NMI, V-measure, FM score].

    Notes
    -----
    ``accuracy`` is the percentage of positions where the predicted label
    equals the true label; no cluster-to-class matching is applied first.
    """
    # np.asarray lets this work for Series, arrays and plain lists alike.
    true_labels = np.asarray(y_true).ravel()
    pred_labels = np.asarray(y_pred).ravel()
    accuracy = np.mean(pred_labels == true_labels) * 100

    # Purity used to be computed from the global y_train instead of the
    # y_true argument, so it silently ignored the labels passed in.
    purity = purity_score(y_true, y_pred)

    micro_f1_score = f1_score(y_true, y_pred, average='micro')
    macro_f1_score = f1_score(y_true, y_pred, average='macro')
    weighted_f1_score = f1_score(y_true, y_pred, average='weighted')
    ari_score = adjusted_rand_score(y_true, y_pred)
    nmi_score = normalized_mutual_info_score(y_true, y_pred)
#     hc_v_measure = homogeneity_completeness_v_measure(y_true, y_pred)
    v_measure = v_measure_score(y_true, y_pred)
    fm_score = fowlkes_mallows_score(y_true, y_pred)
#     s_score = silhouette_score(X_train, y_pred, metric='euclidean')
#     db_score = davies_bouldin_score(X_train, y_pred)

    return [m, cluster_label, elapsed_time, iter_num, accuracy, purity,
            micro_f1_score, macro_f1_score, weighted_f1_score, ari_score,
            nmi_score, v_measure, fm_score]
    
"""------------------------------------------------------------------------------------------------------------------------
#Main Program - Start here!
------------------------------------------------------------------------------------------------------------------------"""
# np.set_printoptions(threshold=sys.maxsize)
# pd.set_option('display.max_rows', dataset.shape[0]+1)

# ---- Run configuration for the benchmark loop below ----

#Define dataset_labels
# Each name is read from "dataset/<name>.csv" by the loop.
dataset_labels = ["iris"]
# dataset_labels = ["heartDisease",  "transfusion", "wilt", "balanceScale","dermatology"]  
# dataset_labels = ["heartDisease", "transfusion", "wilt", "breastCancer", "chess", 
#                   "parkinson","australian", "german", "japanese", "iris", "abalone", "wallRobot",
#                   "dermatology","creditApproval", "letters", "eegEyeState",
#                   "seismicBumps","bank", "adultIncome", "sensorlessDrive"]
#Final dataset tested on 11.03.2021
# dataset_labels = ["transfusion", "wilt", "breastCancer", "heartDisease", "eegEyeState", "adultIncome",
#                   "australian", "japanese", "bank", "seismicBumps", "german", "chess", 
#                   "iris", "abalone", "wallRobot", "dermatology"]
# dataset_labels = ["seismicBumps","bank", "adultIncome"]
# skip_Mahal = ["seismicBumps","bank", "adultIncome"]
# Datasets for which the GaussianMixtureMod2 (Mahalanobis) run is skipped.
skip_Mahal = ["seismicBumps","bank"]
# Threshold of GMM-EM Response values
# Only referenced by the commented-out GMM-OR section of the loop in this cell.
th_resp_val = [0.1, 0.2, 0.3, 0.4, 0.5]
# th_resp_val = [1]

# resp_type_val = ['donate']
# resp_type_val = ['donate']
# Each value is passed as resp_type= to the modified mixture model.
resp_type_val = ['remove', 'donate']

# Column names applied to all_result after the loop; the order matches the
# row returned by cluster_eval. The "hc_v_measure" column actually holds the
# v_measure_score value.
result_labels = ["Dataset","Algorithm", "Elapsed Time", "Iter_num","accuracy", "purity", "micro_f1_score", "macro_f1_score", "weighted_f1_score", "ari_score", 
                "nmi_score", "hc_v_measure", "fm_score"]

# all_scores is rebuilt for every dataset inside the loop.
all_scores = []
# NOTE(review): final_scores is not referenced again in this cell - confirm
# whether it is still needed.
final_scores = []
# Accumulates one block of result rows per dataset.
all_result = pd.DataFrame()
t = TicToc() # create TicToc instance
# for m in dataset_labels:
# Benchmark loop: for every dataset, fit each enabled clustering model on the
# whole table, score it against the ground-truth labels with cluster_eval and
# add one result row per model to all_result.
for m in tqdm(dataset_labels):
    #     enablePrint()
    print("\n\nStart Dataset : ",m)
    #Read dataset
    dataset = pd.read_csv("dataset/"+m+".csv")

    #Drop Target Column in data using Index
    X_train = dataset.drop('Target',axis=1)

    # #How to get Target data
    y_train =  dataset['Target']

    print(np.array(y_train))

    # Distinct class labels actually present in this dataset.
    class_labels = np.unique(y_train)
    n_classes = len(class_labels)
    #     cov_type = 'full'
    cov_type = 'tied'
    #     cov_type = 'diag'
    #Since we have class labels for the training data, we can initialize the GMM parameters in a supervised manner.
    # Iterating over the labels that occur (instead of range(n_classes)) gives
    # the same result for labels 0..n_classes-1 and avoids all-NaN rows when
    # the labels are coded differently (e.g. 1..k).
    es_means_init = np.array([X_train[y_train == label].mean(axis=0) for label in class_labels])
    #     print("es_means_init :\n",es_means_init)

    #Run the kMeans on current
    kMeans_cluster = KMeans(n_clusters = n_classes, max_iter=20, random_state=0)
    t.tic() # Start timer
    kMeans_cluster.fit(X_train)
    elapsed_time = t.tocvalue() #Save elapsed time
    iter_num = kMeans_cluster.n_iter_
    #     print("kMeans time : ", elapsed_time )
    y_train_pred = kMeans_cluster.predict(X_train)
    eval_scores = cluster_eval(y_train, y_train_pred, X_train, "kMeans", m, elapsed_time, iter_num )
    #     eval_scores = np.append([m],eval_scores,axis=0)
    # Start a fresh row list for this dataset; rows of earlier datasets are
    # already stored in all_result.
    all_scores = [eval_scores]

    #     #Run the Meanshift on current
    #     ms_cluster = MeanShift(bandwidth=n_classes)
    #     t.tic() # Start timer
    #     ms_cluster = ms_cluster.fit(X_train)
    #     elapsed_time = t.tocvalue() #Save elapsed time
    #     iter_num = "NA" #ms_cluster.n_iter_
    # #     print("GMM time : ", elapsed_time )
    #     y_train_pred = ms_cluster.predict(X_train)
    #     eval_scores = cluster_eval(y_train, y_train_pred, X_train, "Meanshift", m, elapsed_time, iter_num )
    # #     all_scores += [eval_scores]
    #     all_scores += [eval_scores]

    #     #Run the FCM on current
    #     fcm_cluster = FCM(n_clusters=n_classes)
    #     t.tic() # Start timer
    #     fcm_cluster.fit(X_train)
    #     elapsed_time = t.tocvalue() #Save elapsed time
    #     iter_num = "NA" #fcm_cluster.n_iter_
    # #     print("GMM time : ", elapsed_time )
    #     y_train_pred = fcm_cluster.predict(X_train)
    #     eval_scores = cluster_eval(y_train, y_train_pred, X_train, "FCM", m, elapsed_time, iter_num )
    # #     all_scores += [eval_scores]
    #     all_scores += [eval_scores]

    #Run the GMM on current
    gmm_cluster = GaussianMixture(n_components = n_classes, init_params='random', means_init=es_means_init, covariance_type=cov_type, max_iter=20, random_state=0)
    t.tic() # Start timer
    gmm_cluster.fit(X_train)
    elapsed_time = t.tocvalue() #Save elapsed time
    iter_num = gmm_cluster.n_iter_
    #     print("GMM time : ", elapsed_time )
    y_train_pred = gmm_cluster.predict(X_train)
    eval_scores = cluster_eval(y_train, y_train_pred, X_train, "GMM", m, elapsed_time, iter_num )
    #     all_scores += [eval_scores]
    all_scores += [eval_scores]

    #Run the GMM Mahalanobis on current - Last tested on 28.01.2021
    if m not in (skip_Mahal):
        for resp_type in resp_type_val:
            gmm_cluster_mod2 = GaussianMixtureMod2(n_components = n_classes, init_params='random', means_init=es_means_init, covariance_type=cov_type, max_iter=20, random_state=0, resp_type=resp_type)
            t.tic() # Start timer
            gmm_cluster_mod2.fit(X_train)
            elapsed_time = t.tocvalue() #Save elapsed time
            iter_num = gmm_cluster_mod2.n_iter_
            #     print("GMM time : ", elapsed_time )
            resp_labels = "Mahalanobis-"+str(resp_type)
            y_train_pred_mod2 = gmm_cluster_mod2.predict(X_train)
            eval_scores = cluster_eval(y_train, y_train_pred_mod2, X_train, resp_labels, m, elapsed_time, iter_num )
            all_scores += [eval_scores]
        # Printed once after the inner loop, so resp_type is the last value tried.
        print("Finish GaussianMixtureMod2/Mahalanobis : ",m," : ",resp_type )
    else:
        print("Skip GaussianMixtureMod2/Mahalanobis for :",m )

    # Run the GMM-Auto for every th_resp value on current dataset (using intersect)- Last tested on 27.01.2021
    #     for resp_type in resp_type_val:
    #         gmm_cluster_mod3 = GaussianMixtureMod3(n_components=n_classes, init_params='random', means_init=es_means_init, covariance_type=cov_type, max_iter=20, random_state=0, resp_type=resp_type )
    #         t.tic() # Start timer
    #         gmm_cluster_mod3.fit(X_train)
    #         elapsed_time = t.tocvalue() #Save elapsed time
    #         iter_num = gmm_cluster_mod3.n_iter_
    #         resp_labels = "GMM_Auto-"+str(resp_type)
    #         y_train_pred_mod3 = gmm_cluster_mod3.predict(X_train)
    #         eval_scores = cluster_eval(y_train, y_train_pred_mod3, X_train, resp_labels,m, elapsed_time, iter_num)
    #         all_scores += [eval_scores]
    #     print("Finish GaussianMixtureMod3 : ", m," : ",resp_type )

    #     #Run the GMM-Auto-v2 for every th_resp value on current dataset (using weights*cv or cv/2) - Last tested on 27.01.2021
    #     for resp_type in resp_type_val:
    #         gmm_cluster_mod4 = GaussianMixtureMod4(n_components=n_classes, init_params='random', means_init=es_means_init, covariance_type=cov_type, max_iter=20, random_state=0, resp_type=resp_type )
    #         t.tic() # Start timer
    #         gmm_cluster_mod4.fit(X_train)
    #         elapsed_time = t.tocvalue() #Save elapsed time
    #         iter_num = gmm_cluster_mod4.n_iter_
    #         resp_labels = "resp Mixing Coefficients -"+str(resp_type)
    #         y_train_pred_mod4 = gmm_cluster_mod4.predict(X_train)
    #         eval_scores = cluster_eval(y_train, y_train_pred_mod4, X_train, resp_labels,m, elapsed_time, iter_num)
    #         all_scores += [eval_scores]
    #     print("Finish GaussianMixtureMod4/resp Mixing Coefficients  : ", m, " : ",resp_type )

    #     #Run the GMM-Auto-v2 for every th_resp value on current dataset (using resp center values) - Last tested on 27.01.2021
    #     for resp_type in resp_type_val:
    #         gmm_cluster_mod5 = GaussianMixtureMod5(n_components=n_classes, init_params='random', means_init=es_means_init, covariance_type=cov_type, max_iter=20, random_state=0, resp_type=resp_type )
    #         t.tic() # Start timer
    #         gmm_cluster_mod5.fit(X_train)
    #         elapsed_time = t.tocvalue() #Save elapsed time
    #         iter_num = gmm_cluster_mod5.n_iter_
    #         resp_labels = "resp center values-"+str(resp_type)
    #         y_train_pred_mod5 = gmm_cluster_mod5.predict(X_train)
    #         eval_scores = cluster_eval(y_train, y_train_pred_mod5, X_train, resp_labels,m, elapsed_time, iter_num)
    #         all_scores += [eval_scores]
    #     print("Finish GaussianMixtureMod5/resp center values : ", m, " : ",resp_type )

    #     #Run the GMM-OR for every th_resp value on current dataset
    #     for resp_type in resp_type_val:
    #         for th_resp in th_resp_val:
    #             gmm_cluster_mod = GaussianMixtureMod(n_components=n_classes, init_params='random', means_init=es_means_init, covariance_type=cov_type, max_iter=20, random_state=0, th_resp=th_resp, resp_type=resp_type )
    #
    #             t.tic() # Start timer
    #             gmm_cluster_mod.fit(X_train)
    #             elapsed_time = t.tocvalue() #Save elapsed time
    #             iter_num = gmm_cluster_mod.n_iter_
    #             y_train_pred_mod = gmm_cluster_mod.predict(X_train)
    #             th_resp_labels = "GMM-"+str(resp_type)+"_th_resp="+str(th_resp)
    #     #         print(th_resp_labels," time : ", elapsed_time )
    #             eval_scores = cluster_eval(y_train, y_train_pred_mod, X_train, th_resp_labels,m, elapsed_time, iter_num)
    #             all_scores += [eval_scores]

    #Change result to Pandas DataFrame
    result = pd.DataFrame(all_scores)
    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows
    # the same way and works on older versions too.
    all_result = pd.concat([all_result, result])
#     print("Finish GaussianMixtureMod : ", m, " : ",resp_type )
# Renumber the rows 0..n-1 and add one all-NaN separator row at the end.
# This replaces DataFrame.append(pd.Series(), ignore_index=True), which was
# removed in pandas 2.0; reindexing to one extra label creates the same
# empty trailing row.
all_result = all_result.reset_index(drop=True)
all_result = all_result.reindex(range(len(all_result) + 1))
all_result.columns = result_labels


#Save results to Excel
# The timestamp (HH-MM-SS---DD-MM-YYYY) keeps the file names of runs distinct.
ts = time.time()
st = datetime.datetime.fromtimestamp(ts).strftime('%H-%M-%S---%d-%m-%Y')
# all_result.to_excel("result/All_result_"+st+".xlsx", index=False)
all_result.to_excel("result/All_result_"+cov_type+"_"+st+".xlsx", index=False)
print("\nFinish at ",st)


In [None]:
# #Display full output
# from IPython.core.interactiveshell import InteractiveShell
# InteractiveShell.ast_node_interactivity = "all"
from tqdm.auto import tqdm, trange
from ipywidgets import IntProgress
from IPython.display import display
from IPython.core.display import display, HTML
display(HTML("<style>div.output_scroll { height: 40em; }</style>"))

import warnings
warnings.filterwarnings('ignore')
import datetime
import time
from pytictoc import TicToc
import sys
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, MeanShift
from fcmeans import FCM
from sklearn.mixture import GaussianMixture
from sklearn.mixture_mod import GaussianMixtureMod
from sklearn.mixture_mod2 import GaussianMixtureMod2
from sklearn.mixture_mod3 import GaussianMixtureMod3
from sklearn.mixture_mod4 import GaussianMixtureMod4
from sklearn.mixture_mod5 import GaussianMixtureMod5
from IPython.utils import io
import numpy as np
from sklearn import metrics
from scipy.optimize import linear_sum_assignment
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, adjusted_rand_score, normalized_mutual_info_score, homogeneity_completeness_v_measure,fowlkes_mallows_score,silhouette_score
from sklearn.metrics import davies_bouldin_score, v_measure_score

#Function declaration
def cluster_accuracy(y_true, y_pred):
    """Return the accuracy under the best one-to-one cluster/class matching.

    Builds the contingency table of true labels against predicted labels,
    finds the assignment that maximises the matched count and returns
    matched count / total count. Intermediate values are printed.
    """
    table = metrics.cluster.contingency_matrix(y_true, y_pred)

    # linear_sum_assignment minimises cost, so negate the counts to maximise.
    row_ind, col_ind = linear_sum_assignment(-table)
    print("row_ind : ", row_ind)
    print("col_ind : ", col_ind)

    matched = table[row_ind, col_ind].sum()
    print("contingency_matrix[row_ind, col_ind].sum() : ", matched)

    total = np.sum(table)
    print("np.sum(contingency_matrix) : ", total)

    return matched / total

def purity_score(y_true, y_pred):
    """Return cluster purity.

    For each column of the contingency table (true labels against predicted
    labels) take the largest count, sum those maxima and divide by the total
    number of samples.
    """
    table = metrics.cluster.contingency_matrix(y_true, y_pred)
    dominant_counts = table.max(axis=0)
    return dominant_counts.sum() / table.sum()

#Get the Clustering Algorithm Performance
def cluster_eval(y_true, y_pred, X_train, cluster_label, m, elapsed_time,iter_num):
    """Score one clustering run and return a single result row.

    Parameters
    ----------
    y_true : ground-truth labels.
    y_pred : labels predicted by the clustering model.
    X_train : feature table; only needed by the silhouette / Davies-Bouldin
        scores, which are currently disabled.
    cluster_label : name of the algorithm, copied into the row.
    m : name of the dataset, copied into the row.
    elapsed_time : fit time, copied into the row.
    iter_num : number of iterations, copied into the row.

    Returns
    -------
    list
        [m, cluster_label, elapsed_time, iter_num, accuracy, purity,
        micro F1, macro F1, weighted F1, ARI, NMI, V-measure, FM score].

    Notes
    -----
    ``accuracy`` is the percentage of positions where the predicted label
    equals the true label; no cluster-to-class matching is applied first.
    """
    # np.asarray lets this work for Series, arrays and plain lists alike.
    true_labels = np.asarray(y_true).ravel()
    pred_labels = np.asarray(y_pred).ravel()
    accuracy = np.mean(pred_labels == true_labels) * 100

    # Purity used to be computed from the global y_train instead of the
    # y_true argument, so it silently ignored the labels passed in.
    purity = purity_score(y_true, y_pred)

    micro_f1_score = f1_score(y_true, y_pred, average='micro')
    macro_f1_score = f1_score(y_true, y_pred, average='macro')
    weighted_f1_score = f1_score(y_true, y_pred, average='weighted')
    ari_score = adjusted_rand_score(y_true, y_pred)
    nmi_score = normalized_mutual_info_score(y_true, y_pred)
#     hc_v_measure = homogeneity_completeness_v_measure(y_true, y_pred)
    v_measure = v_measure_score(y_true, y_pred)
    fm_score = fowlkes_mallows_score(y_true, y_pred)
#     s_score = silhouette_score(X_train, y_pred, metric='euclidean')
#     db_score = davies_bouldin_score(X_train, y_pred)

    return [m, cluster_label, elapsed_time, iter_num, accuracy, purity,
            micro_f1_score, macro_f1_score, weighted_f1_score, ari_score,
            nmi_score, v_measure, fm_score]
    
"""------------------------------------------------------------------------------------------------------------------------
#Main Program - Start here!
------------------------------------------------------------------------------------------------------------------------"""
# np.set_printoptions(threshold=sys.maxsize)
# pd.set_option('display.max_rows', dataset.shape[0]+1)

# ---- Run configuration for the loop below ----

#Define dataset_labels
# Each name is read from "dataset/<name>.csv" by the loop.
dataset_labels = ["iris"]
# dataset_labels = ["heartDisease",  "transfusion", "wilt", "balanceScale","dermatology"]  
# dataset_labels = ["heartDisease", "transfusion", "wilt", "breastCancer", "chess", 
#                   "parkinson","australian", "german", "japanese", "iris", "abalone", "wallRobot",
#                   "dermatology","creditApproval", "letters", "eegEyeState",
#                   "seismicBumps","bank", "adultIncome", "sensorlessDrive"]
#Final dataset tested on 11.03.2021
# dataset_labels = ["transfusion", "wilt", "breastCancer", "heartDisease", "eegEyeState", "adultIncome",
#                   "australian", "japanese", "bank", "seismicBumps", "german", "chess", 
#                   "iris", "abalone", "wallRobot", "dermatology"]
# dataset_labels = ["seismicBumps","bank", "adultIncome"]
# skip_Mahal = ["seismicBumps","bank", "adultIncome"]
# NOTE(review): presumably the datasets excluded from the Mahalanobis run;
# the loop in this cell does not reach that step.
skip_Mahal = ["seismicBumps","bank"]
# Threshold of GMM-EM Response values
th_resp_val = [0.1, 0.2, 0.3, 0.4, 0.5]
# th_resp_val = [1]

# resp_type_val = ['donate']
# resp_type_val = ['donate']
resp_type_val = ['remove', 'donate']

# Column names for the results table; the order matches the row returned by
# cluster_eval. The "hc_v_measure" column actually holds the v_measure_score
# value.
result_labels = ["Dataset","Algorithm", "Elapsed Time", "Iter_num","accuracy", "purity", "micro_f1_score", "macro_f1_score", "weighted_f1_score", "ari_score", 
                "nmi_score", "hc_v_measure", "fm_score"]

all_scores = []
# NOTE(review): final_scores is not referenced again in this cell - confirm
# whether it is still needed.
final_scores = []
# Empty table that result rows are meant to be collected into.
all_result = pd.DataFrame()
t = TicToc() # create TicToc instance
# for m in dataset_labels:
# These imports had been pasted into the middle of the loop body, and the last
# one was fused with the print statement on a single line (a SyntaxError).
# They are kept, but hoisted in front of the loop.
import warnings
warnings.filterwarnings('ignore')
import datetime
import time
from pytictoc import TicToc
import sys
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, MeanShift

# For every dataset, compute and print the per-class attribute means that are
# used as supervised initial means for the mixture models.
for m in tqdm(dataset_labels):
    #     enablePrint()
    print("\n\nStart Dataset : ",m)
    #Read dataset
    dataset = pd.read_csv("dataset/"+m+".csv")

    #Drop Target Column in data using Index
    X_train = dataset.drop('Target',axis=1)

    # #How to get Target data
    y_train =  dataset['Target']

    print(np.array(y_train))

    # Distinct class labels actually present in this dataset.
    class_labels = np.unique(y_train)
    n_classes = len(class_labels)
    #     cov_type = 'full'
    cov_type = 'tied'

    #Since we have class labels for the training data, we can initialize the GMM parameters in a supervised manner.
    # Iterating over the labels that occur (instead of range(n_classes)) gives
    # the same result for labels 0..n_classes-1 and avoids all-NaN rows when
    # the labels are coded differently.
    es_means_init = np.array([X_train[y_train == label].mean(axis=0) for label in class_labels])

    print("es_means_init : \n", es_means_init)

## Find means_init

In [None]:
import warnings
warnings.filterwarnings('ignore')
import datetime
import time
from pytictoc import TicToc
import sys
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, MeanShift
from sklearn.mixture import GaussianMixture

dataset_labels = ["iris"]
init_params = 'gmm'
random_state=0

# For every dataset: bin the samples by the mean of their attributes into
# n_components equal-width intervals and print the attribute means of each
# bin as candidate initial component means.
for m in tqdm(dataset_labels):
    print("\n\nStart Dataset : ",m)

    #Read dataset
    dataset = pd.read_csv("dataset/"+m+".csv")

    #Drop Target Column in data using Index
    X_train = dataset.drop('Target',axis=1)

    # #How to get Target data
    y_train =  dataset['Target']

    # n_components used to be computed before the loop, from whatever y_train
    # an earlier cell had left behind (NameError on a fresh kernel). It is
    # now derived from the labels of the dataset just loaded.
    n_components = len(np.unique(y_train))

    X = X_train

    n_samples, n_attributes = X.shape

    #     print(n_attributes)

    # Work on a copy so the helper columns added below cannot leak into
    # X_train.
    df_X = pd.DataFrame(X).copy()

    #Find rows average
    df_X['mean'] = df_X.mean(axis=1)

    #Find Max, Min, Diff, Dev
    Max = max(df_X['mean'])
    Min = min(df_X['mean'])

    diff = Max - Min
    dev = diff/n_components

    print('Max: ', Max , 'Min: ', Min)
    print('dev :', dev)

    # Consecutive [low, high) intervals of width dev starting at Min, plus one
    # 0/1 membership column per interval.
    cluster_range = np.zeros((n_components, 2))
    for i in range(n_components):
        cluster_range[i] = Min, Min+dev

        Min = Min+dev
        df_X['c'+str(i)] = 0
    #     print(cluster_range)

    # NOTE(review): the upper bound is exclusive, so a row whose mean equals
    # the overall maximum may end up in no bin - confirm this is intended.
    for j in range(n_components):
        in_bin = (df_X['mean'] >= cluster_range[j,0]) & (df_X['mean'] < cluster_range[j,1])
        # A single .loc assignment replaces the chained indexing
        # (df_X[col][mask] = 1), which may write to a temporary copy.
        df_X.loc[in_bin, 'c'+str(j)] = 1

    # Mean of the original attribute columns (the first n_attributes columns)
    # over the rows flagged for each bin.
    es_init_means = np.zeros((n_components, n_attributes))
    for k in range(n_components):
        temp_df_X = df_X.iloc[:, 0:n_attributes][df_X['c'+str(k)] == 1]
        es_init_means[k] = temp_df_X.mean(axis=0)

    #         print('es_init_means[',k,'] : ',es_init_means[k])
    print("es_init_means : \n", es_init_means)
    


In [None]:

#     if init_params == 'kmeans':
#         resp = np.zeros((n_samples, n_components))
#         label = KMeans(n_clusters= n_components, n_init=1,
#                                random_state=random_state).fit(X).labels_
#         resp[np.arange(n_samples), label] = 1
#         print('label : ', label)
#         print('resp  :  \n', resp)
    
#     elif init_params == 'gmm':
#         resp = np.zeros((n_samples, n_components))
#         gmm_cluster = GaussianMixture(n_components= n_components,  init_params='random', random_state=random_state).fit(X)
#         label = gmm_cluster.predict(X)
#         resp[np.arange(n_samples), label] = 1
#         print('label : ', label)
#         print('resp  :  \n', resp)
        
#     elif init_params == 'random':
#         resp = random_state.rand(n_samples, n_components)
#         resp /= resp.sum(axis=1)[:, np.newaxis]
#         resp1_ = resp
#     else:
#         raise ValueError("Unimplemented initialization method '%s'"
#                      % self.init_params)

    es_means_init = np.array ([[5.005555556, 3.344444444, 1.596296296, 0.303703704],
                                [5.973333333, 2.775, 4.503333333, 1.468333333],
                                [6.86, 3.077142857, 5.731428571, 2.091428571]]) 

In [None]:
# Print the augmented frame, then only its first n_attributes columns.
print(df_X)
df_X_new = df_X.iloc[:, :n_attributes]
print(df_X_new)

In [None]:
# print(n_components)
# Scratch demo of how the [low, high) bin table is built: starting at Min,
# each of the n_components rows covers an interval of width dev.
# n_components must already be defined by an earlier cell.
Min = 2.1
dev = 1
# Max is set here but not used by the rest of this cell.
Max=5.1
cluster_range = np.zeros((n_components, 2))
for i in range (n_components):
    print(i, Min)
#     cluster_range[i] = i, i+1
    # Row i holds the (lower, upper) bounds of bin i.
    cluster_range[i] = Min, Min+dev
#     min_range[i] = i+1
# #     max_range[i] = Min+dev
    # The next bin starts where this one ended.
    Min = Min+dev
print(cluster_range)
#     print(Min)

In [None]:
# Print the largest and the smallest value of the 'mean' column of df_X.
# The bare `print` expression that used to follow called nothing and was
# removed.
print(max(df_X['mean']))
print(min(df_X['mean']))

In [None]:
# Scratch demo: take the maximum of each row and walk through the results.
cv = np.array([
    [0.10605892, 0.27159891, 0.53311513, 0.29504658, 0.43389628],
    [0.10532298, 0.30813951, 0.53196672, 0.35493489, 0.52814269],
])

# One maximum per row (axis=1).
o_cv = cv.max(axis=1)
# cv = np.asarray(cv, dtype=np.int32)
print(o_cv)

# The loop rebinds cv to each row maximum in turn, so after the loop cv is the
# last maximum rather than the original 2-D array.
for i, cv in enumerate(o_cv):
    print(i,"o_cv:",o_cv[i])
    print(i,"cv :",cv)
    
    

In [None]:
# Names of all datasets to iterate over.
# A comma was missing after "wallRobot", so Python silently joined it with the
# next literal into the single name "wallRobotdermatology".
dataset_labels = ["heartDisease", "transfusion", "wilt", "breastCancer", "chess",
                  "parkinson", "australian", "german", "japanese", "iris", "abalone", "wallRobot",
                  "dermatology", "creditApproval", "letters", "eegEyeState",
                  "seismicBumps", "bank", "adultIncome", "sensorlessDrive", "creditCard"]

# Datasets for which the Mahalanobis run is skipped.
skip_Mahal = ["seismicBumps", "bank", "adultIncome", "sensorlessDrive", "creditCard"]

# Dry run: report for every dataset whether the Mahalanobis step would run or
# be skipped.
for m in tqdm(dataset_labels):
    verb = "Skip" if m in skip_Mahal else "Run"
    print(verb + " Mahalanobis for :", m)


In [None]:
# print(all_result)
# Renumber the rows 0..n-1 and add one all-NaN separator row at the end.
# This replaces DataFrame.append(pd.Series(), ignore_index=True), which was
# removed in pandas 2.0; reindexing to one extra label creates the same
# empty trailing row.
all_result = all_result.reset_index(drop=True)
all_result = all_result.reindex(range(len(all_result) + 1))
all_result.columns = result_labels

#Save results to Excel
# The timestamp (HH-MM-SS---DD-MM-YYYY) keeps the file names of runs distinct.
ts = time.time()
st = datetime.datetime.fromtimestamp(ts).strftime('%H-%M-%S---%d-%m-%Y')
print(st)
all_result.to_excel("result/All_result_"+cov_type+"_"+st+".xlsx", index=False)