26.02.2021 - Modified "gaussian_mixture_mod5" 
- Add new method to get th_resp
  - using median of resp
  - using center value of min and max of resp ((max - min) /2))

In [72]:
import numpy as np
from numpy.random import RandomState
from numpy.random import MT19937
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn import metrics
from sklearn.metrics import accuracy_score
from pytictoc import TicToc
from statistics import mean, stdev
# import time

def purity_score(y_true, y_pred):
    # compute contingency matrix (also called confusion matrix)
    contingency_matrix = metrics.cluster.contingency_matrix(y_true, y_pred)
#     print("\ncontingency_matrix : \n", contingency_matrix)
    # return purity
    return np.sum(np.amax(contingency_matrix, axis=0)) / np.sum(contingency_matrix)


# dataset_labels = ["transfusion", "wilt", "breastCancer", "heartDisease", "adultIncome", "australian", "japanese", "bank", "seismicBumps", "german", 
#                   "chess", "iris", "abalone", "wallRobot", "dermatology"] 
m = 'transfusion'

print("\n\nStart Dataset : ",m)
#Read dataset
dataset = pd.read_csv("dataset/"+m+".csv")

#Drop Target Column in data using Index
X_train = dataset.drop('Target',axis=1)

# #How to get Target data
y_train =  dataset['Target']
# print(np.array(X_train))
# print(np.array(y_train))
# mt19937 = MT19937()
# rs = RandomState(mt19937)
# rs_val = rs.standard_normal()
# print("RandomState :", rs_val)

cov_type = 'full'
n_classes = len(np.unique(y_train))   
t = TicToc() # create TicToc instance
num_repreat = 1000

all_purity = []
all_accuracy = []
all_purity2 = []
all_accuracy2 = []
all_purity3 = []
all_accuracy3 = []
print("\nk-Means Clustering :")
for repeat in range (num_repreat):       
    t.tic() # Start timer
    
    #Run the kMeans on current  
    
    kMeans_cluster = KMeans( n_clusters = n_classes)
    kMeans_cluster.fit(X_train)
    elapsed_time = t.tocvalue() #Save elapsed time
    iter_num = kMeans_cluster.n_iter_
    #print("kMeans time : ", elapsed_time )   
    y_train_pred = kMeans_cluster.predict(X_train)
    purity = purity_score(y_train, y_train_pred)
    accuracy = accuracy_score(y_train, y_train_pred)
#     print("Cluster Center :", kMeans_cluster.cluster_centers_)
#     print("\nPurity ",repeat,":", purity)
#     print("Accuracy :", accuracy)
    all_purity.append(purity)
    all_accuracy.append(accuracy)
# print("\nAll_Purity :\n", all_purity)
mean_pu = mean(all_purity)
std_pu = stdev(all_purity)
print("Purity Means :", mean_pu ,",","Std :", std_pu)

# print("\nAll_accuracy :\n", all_accuracy)
mean_ac = mean(all_accuracy)
std_ac = stdev(all_accuracy)
print("Accuracy Means :", mean_ac ,",","Std :", std_ac)

print("\nGMM Clustering-random :")
for repeat in range (num_repreat):  
    #Run the GMM on current
    gmm_cluster = GaussianMixture(n_components = n_classes, init_params='random', means_init= None, covariance_type=cov_type, max_iter=60)
    t.tic() # Start timer
    gmm_cluster.fit(X_train)
    elapsed_time = t.tocvalue() #Save elapsed time
    iter_num = gmm_cluster.n_iter_
#     print("GMM time : ", elapsed_time )       
    y_train_pred2 = gmm_cluster.predict(X_train)
    purity2 = purity_score(y_train, y_train_pred2)
    accuracy2 = accuracy_score(y_train, y_train_pred2)
#     print("Cluster Center :", kMeans_cluster.cluster_centers_)
    all_purity2.append(purity2)
    all_accuracy2.append(accuracy2)
# print("\nAll_Purity :\n", all_purity2)
mean_pu2 = mean(all_purity2)
std_pu2 = stdev(all_purity2)
print("Purity Means :", mean_pu2 ,",","Std :", std_pu2)

# print("\nAll_accuracy :\n", all_accuracy2)
mean_ac2 = mean(all_accuracy2)
std_ac2 = stdev(all_accuracy2)
print("Accuracy Means :", mean_ac2 ,",","Std :", std_ac2)

print("\nGMM Clustering-kmeans :")
for repeat in range (num_repreat):  
    #Run the GMM on current
    gmm_cluster2 = GaussianMixture(n_components = n_classes, init_params='kmeans', means_init= None, covariance_type=cov_type, max_iter=50)
    t.tic() # Start timer
    gmm_cluster2.fit(X_train)
    elapsed_time = t.tocvalue() #Save elapsed time
    iter_num = gmm_cluster2.n_iter_
#     print("GMM time : ", elapsed_time )       
    y_train_pred3 = gmm_cluster2.predict(X_train)
    purity3 = purity_score(y_train, y_train_pred3)
    accuracy3 = accuracy_score(y_train, y_train_pred3)
#     print("Cluster Center :", kMeans_cluster.cluster_centers_)
    all_purity3.append(purity3)
    all_accuracy3.append(accuracy3)
# print("\nAll_Purity :\n", all_purity3)
mean_pu3 = mean(all_purity3)
std_pu3 = stdev(all_purity3)
print("Purity Means :", mean_pu3 ,",","Std :", std_pu3)

# print("\nAll_accuracy :\n", all_accuracy3)
mean_ac3 = mean(all_accuracy3)
std_ac3 = stdev(all_accuracy3)
print("Accuracy Means :", mean_ac3 ,",","Std :", std_ac3)




Start Dataset :  transfusion

k-Means Clustering :
Purity Means : 0.7620320855614974 , Std : 0.0
Accuracy Means : 0.5571550802139037 , Std : 0.21593021473233168

GMM Clustering-random :




Purity Means : 0.7620320855614974 , Std : 0.0
Accuracy Means : 0.4989291443850267 , Std : 0.07622990286355347

GMM Clustering-kmeans :
Purity Means : 0.7620320855614974 , Std : 0.0
Accuracy Means : 0.4649465240641711 , Std : 0.06769612680687848


In [55]:
from numpy.random import MT19937
from numpy.random import RandomState

rs = RandomState(12345)
mt19937 = MT19937()
mt19937.state = rs.get_state()
rs2 = RandomState(mt19937)

# # Same output
rs.standard_normal()
rs2.standard_normal()

print(rs.standard_normal(),rs2.standard_normal())
rs.random()
rs2.random()
print(rs.random(),rs2.random())

rs.standard_exponential()
rs2.standard_exponential()
print(rs.standard_exponential(),rs2.standard_exponential())

0.47894333805754824 0.47894333805754824
0.2045602785530397 0.2045602785530397
0.9052140627545171 0.9052140627545171


In [None]:
# #Display full output
# from IPython.core.interactiveshell import InteractiveShell
# InteractiveShell.ast_node_interactivity = "all"
from tqdm.auto import tqdm, trange
from ipywidgets import IntProgress
from IPython.display import display
from IPython.core.display import display, HTML
display(HTML("<style>div.output_scroll { height: 40em; }</style>"))

import warnings
warnings.filterwarnings('ignore')
import datetime
import time
from pytictoc import TicToc
import sys
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, MeanShift
from fcmeans import FCM
from sklearn.mixture import GaussianMixture
from sklearn.mixture_mod import GaussianMixtureMod
from sklearn.mixture_mod2 import GaussianMixtureMod2
from sklearn.mixture_mod3 import GaussianMixtureMod3
from sklearn.mixture_mod4 import GaussianMixtureMod4
from sklearn.mixture_mod5 import GaussianMixtureMod5
from sklearn.mixture_mod6 import GaussianMixtureMod6
from IPython.utils import io
import numpy as np
from sklearn import metrics
from scipy.optimize import linear_sum_assignment
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, adjusted_rand_score, normalized_mutual_info_score, homogeneity_completeness_v_measure,fowlkes_mallows_score,silhouette_score
from sklearn.metrics import davies_bouldin_score, v_measure_score
from scipy.stats import chi2
# opening EXCEL through Code local path in dir
import os
from pathlib import Path
#Funtion declaration
def cluster_accuracy(y_true, y_pred):
    # compute contingency matrix (also called confusion matrix)
    contingency_matrix = metrics.cluster.contingency_matrix(y_true, y_pred)
#     print("\ncontingency_matrix : \n", contingency_matrix)

    # Find optimal one-to-one mapping between cluster labels and true labels
    row_ind, col_ind = linear_sum_assignment(-contingency_matrix)
    print("row_ind : ", row_ind)
    print("col_ind : ", col_ind)
    val1 = contingency_matrix[row_ind, col_ind].sum()
    print("contingency_matrix[row_ind, col_ind].sum() : ", val1)
    val2 = np.sum(contingency_matrix)
    print("np.sum(contingency_matrix) : ", val2)

    # Return cluster accuracy
    return contingency_matrix[row_ind, col_ind].sum() / np.sum(contingency_matrix)

def purity_score(y_true, y_pred):
    # compute contingency matrix (also called confusion matrix)
    contingency_matrix = metrics.cluster.contingency_matrix(y_true, y_pred)
#     print("\ncontingency_matrix : \n", contingency_matrix)
    
    # return purity
    return np.sum(np.amax(contingency_matrix, axis=0)) / np.sum(contingency_matrix) 

#Get the Clustering Algorithm Performance
def cluster_eval(y_true, y_pred, X_train, cluster_label, m, elapsed_time,iter_num):
    eval_scores = []
    
    accuracy = np.mean(y_pred.ravel() == y_true.ravel()) * 100
    
    purity = purity_score(y_train, y_pred)
    micro_f1_score = f1_score(y_true, y_pred, average='micro')
    macro_f1_score = f1_score(y_true, y_pred, average='macro')
    weighted_f1_score = f1_score(y_true, y_pred, average='weighted')
    ari_score = adjusted_rand_score(y_true, y_pred)
    nmi_score = normalized_mutual_info_score(y_true, y_pred)
#     hc_v_measure = homogeneity_completeness_v_measure(y_true, y_pred)
    v_measure = v_measure_score(y_true, y_pred)
    fm_score = fowlkes_mallows_score(y_true, y_pred)
#     s_score = silhouette_score(X_train, y_pred, metric='euclidean')
#     db_score = davies_bouldin_score(X_train, y_pred)
#     eval_scores = [m, cluster_label, elapsed_time, iter_num, accuracy, purity, micro_f1_score, macro_f1_score, weighted_f1_score, ari_score, 
#                    nmi_score, v_measure, fm_score]
    eval_scores = [m, cluster_label, elapsed_time, iter_num, purity,  ari_score]
    return eval_scores

#function to remove outliers using z-score
def z_score(data):
    data_z = data['mean']
    outliers_z = data_z.copy()
    mean_z = np.mean(data_z)
    std_z =np.std(data_z)
    threshold=3.

    data['z_score'] = (outliers_z - mean_z)/std_z 
    outliers_z[(data['z_score'] > threshold)] = np.nan
    print('outliers_z : \n', outliers_z)
    #Drop outliers
    no_outliers_z = outliers_z.dropna()
    print('outliers_z_new : \n', no_outliers_z)    
    return no_outliers_z

#function to calculate means 
def means_init(X, n_components):
    n_samples, n_attributes = X.shape
    print("n_components : ", n_components)
    
    df_X = pd.DataFrame(X)
    
    df_X['mean'] = df_X.mean(axis=1)
    
    no_outliers_z = z_score(df_X)

    #Find Max, Min, Diff, Dev
    Max = max(no_outliers_z)
    Min = min(no_outliers_z)
    
    diff = Max - Min
    dev = diff/n_components
    
    print('Max: ', Max , 'Min: ', Min)   
    print('dev :', dev)
    
    cluster_range = np.zeros((n_components, 2))
    for i in range (n_components):
        cluster_range[i] = Min, Min+dev
        
        Min = Min+dev
        df_X['c'+str(i)]=0
    print(cluster_range)  

    for j in range (n_components):
        df_X['c'+str(j)][(df_X['mean']>= cluster_range[j,0]) & (df_X['mean'] < cluster_range[j,1])] = 1

    es_init_means = np.zeros((n_components, n_attributes))
    for k in range (n_components):    
        temp_df_X = df_X.iloc[:, 0:n_attributes][df_X['c'+str(k)] == 1]   
        es_init_means[k] = temp_df_X.mean(axis=0) 
    return es_init_means
"""------------------------------------------------------------------------------------------------------------------------
#Main Program - Start here!
------------------------------------------------------------------------------------------------------------------------"""
# np.set_printoptions(threshold=sys.maxsize)
# pd.set_option('display.max_rows', dataset.shape[0]+1)

#Define dataset_labels
# dataset_labels = ["iris", "vertebral","new_thyroid", "haberman", "landsat", "seed", "ecoli", "glass", "wine"]
# dataset_labels = ["ecoli"]"glass","vertebral","new_thyroid",
# dataset_labels = ["wine", "new_thyroid"]
# dataset_labels = ["transfusion", "wilt", "australian", "japanese", "iris"]
# dataset_labels = ["transfusion", "wilt", "breastCancer", "heartDisease", "adultIncome", "australian", "japanese", "bank", "seismicBumps", "german", 
#                   "chess", "iris", "abalone", "wallRobot", "dermatology"] 
dataset_labels = ["iris", "breastCancer"]            
# dataset_labels = ["ecoli", "seed", "glass"]

# skip_Mahal = ["australian","japanese", "seismicBumps","bank", "adultIncome"]

# cov_type_labels = ["full", "tied", "diag", "spherical"]
cov_type_labels = ["full"]
# cov_type_labels = ["diag", "spherical"]
# Threshold of GMM-EM Response values
# th_resp_val = [0.1, 0.2, 0.3, 0.4, 0.5]
# th_resp_val = [1]

resp_type_val = ['remove']
# resp_type_val = ['donate']
# resp_type_val = ['remove', 'donate']

# result_labels = ["Dataset","Algorithm", "Elapsed Time", "Iter_num","accuracy", "purity", "micro_f1_score", "macro_f1_score", "weighted_f1_score", "ari_score", 
#                 "nmi_score", "hc_v_measure", "fm_score"]
result_labels = ["Dataset","Algorithm", "Elapsed Time", "Iter_num", "purity", "ari_score"]
num_repreat = 20

absolutePath = []
for cov_type in (cov_type_labels):
    print("\nStart Cov_Type :",cov_type)
                   
    all_scores = []
    final_scores = []
    all_result = pd.DataFrame()
    t = TicToc() # create TicToc instance
    # for m in dataset_labels:
    for m in tqdm(dataset_labels):
        for repeat in range (num_repreat):
    
        #     enablePrint()
            print("\n\nStart Dataset : ",m)
            #Read dataset
            dataset = pd.read_csv("dataset/"+m+".csv")

            #Drop Target Column in data using Index
            X_train = dataset.drop('Target',axis=1)

            # #How to get Target data
            y_train =  dataset['Target']

            print(np.array(y_train))

            n_classes = len(np.unique(y_train))


            #Run the kMeans on current
            kMeans_cluster = KMeans(init='random', n_clusters = n_classes, max_iter=20, random_state=0)
            t.tic() # Start timer
            kMeans_cluster.fit(X_train)
            elapsed_time = t.tocvalue() #Save elapsed time
            iter_num = kMeans_cluster.n_iter_
        #     print("kMeans time : ", elapsed_time )   
            y_train_pred = kMeans_cluster.predict(X_train)
            eval_scores = cluster_eval(y_train, y_train_pred, X_train, "kMeans", m, elapsed_time, iter_num )
        #     eval_scores = np.append([m],eval_scores,axis=0)  
            all_scores = [eval_scores]

    #         #Run the GMM on current
    #         gmm_cluster = GaussianMixture(n_components = n_classes, init_params='random', means_init= None, covariance_type=cov_type, max_iter=20, random_state=0)
    #         t.tic() # Start timer
    #         gmm_cluster.fit(X_train)
    #         elapsed_time = t.tocvalue() #Save elapsed time
    #         iter_num = gmm_cluster.n_iter_
    #     #     print("GMM time : ", elapsed_time )       
    #         y_train_pred = gmm_cluster.predict(X_train)
    #         eval_scores = cluster_eval(y_train, y_train_pred, X_train, "GMM-random_init", m, elapsed_time, iter_num )
    #     #     all_scores += [eval_scores]
    #         all_scores += [eval_scores]

    #         #Run the GMM on current
    #         gmm_cluster = GaussianMixture(n_components = n_classes, init_params='kmeans', means_init= None, covariance_type=cov_type, max_iter=20, random_state=0)
    #         t.tic() # Start timer
    #         gmm_cluster.fit(X_train)
    #         elapsed_time = t.tocvalue() #Save elapsed time
    #         iter_num = gmm_cluster.n_iter_
    #     #     print("GMM time : ", elapsed_time )       
    #         y_train_pred = gmm_cluster.predict(X_train)
    #         eval_scores = cluster_eval(y_train, y_train_pred, X_train, "GMM-kmeans_init", m, elapsed_time, iter_num )
    #     #     all_scores += [eval_scores]
    #         all_scores += [eval_scores]

    #         #Run the GMM on current
    #         gmm_cluster = GaussianMixture(n_components = n_classes, init_params='jaha_init_hybrid', means_init= None, covariance_type=cov_type, max_iter=20, random_state=0)
    #         t.tic() # Start timer
    #         gmm_cluster.fit(X_train)
    #         elapsed_time = t.tocvalue() #Save elapsed time
    #         iter_num = gmm_cluster.n_iter_
    #     #     print("GMM time : ", elapsed_time )       
    #         y_train_pred = gmm_cluster.predict(X_train)
    #         eval_scores = cluster_eval(y_train, y_train_pred, X_train, "GMM-hybrid_init", m, elapsed_time, iter_num )
    #     #     all_scores += [eval_scores]
    #         all_scores += [eval_scores]


    #         for resp_type in resp_type_val:
    #             gmm_cluster_mod5 = GaussianMixtureMod5(n_components=n_classes, init_params='random', means_init=None, covariance_type=cov_type, max_iter=20, random_state=0, resp_type=resp_type )
    #             t.tic() # Start timer
    #             gmm_cluster_mod5.fit(X_train)
    #             elapsed_time = t.tocvalue() #Save elapsed time
    #             iter_num = gmm_cluster_mod5.n_iter_     
    #             resp_labels = "resp center values-random"+str(resp_type)
    #             y_train_pred_mod5 = gmm_cluster_mod5.predict(X_train)
    # #             y_train_pred_mod5 = np.choose(y_train_pred_mod5,(4,5,0,1,2,3)).astype(np.int64) 
    # #             y_train_pred_mod5 = np.flip(y_train_pred_mod5)#5,1,0,3,2,5
    #             print('y_train_pred_mod5 :\n', y_train_pred_mod5)
    #             print('y_train :\n', y_train)
    #             eval_scores = cluster_eval(y_train, y_train_pred_mod5, X_train, resp_labels,m, elapsed_time, iter_num)
    #             all_scores += [eval_scores]
    #         print("Finish GaussianMixtureMod5/resp center values-random : ", m, " : ",resp_type )

    #         #Run the GMM-Auto-v2 for every th_resp value on current dataset (using resp center values) - Last tested on 27.01.2021
    #         for resp_type in resp_type_val:
    #             gmm_cluster_mod5 = GaussianMixtureMod5(n_components=n_classes, init_params='kmeans', means_init=None, covariance_type=cov_type, max_iter=20, random_state=0, resp_type=resp_type )
    #             t.tic() # Start timer
    #             gmm_cluster_mod5.fit(X_train)
    #             elapsed_time = t.tocvalue() #Save elapsed time
    #             iter_num = gmm_cluster_mod5.n_iter_     
    #             resp_labels = "resp center values-kmeans"+str(resp_type)
    #             y_train_pred_mod5 = gmm_cluster_mod5.predict(X_train)
    # #             y_train_pred_mod5 = np.choose(y_train_pred_mod5,(4,5,0,1,2,3)).astype(np.int64) 
    # #             y_train_pred_mod5 = np.flip(y_train_pred_mod5)#5,1,0,3,2,5
    #             print('y_train_pred_mod5 :\n', y_train_pred_mod5)
    #             print('y_train :\n', y_train)
    #             eval_scores = cluster_eval(y_train, y_train_pred_mod5, X_train, resp_labels,m, elapsed_time, iter_num)
    #             all_scores += [eval_scores]
    #         print("Finish GaussianMixtureMod5/resp center values-kmeans : ", m, " : ",resp_type)   


    #         #Run the GMM-Auto-v2 for every th_resp value on current dataset (using resp center values) - Last tested on 27.01.2021
    #         for resp_type in resp_type_val:
    #             gmm_cluster_mod5 = GaussianMixtureMod5(n_components=n_classes, init_params='jaha_init_hybrid', means_init=None, covariance_type=cov_type, max_iter=20, random_state=0, resp_type=resp_type )
    #             t.tic() # Start timer
    #             gmm_cluster_mod5.fit(X_train)
    #             elapsed_time = t.tocvalue() #Save elapsed time
    #             iter_num = gmm_cluster_mod5.n_iter_     
    #             resp_labels = "resp center values-hybrid_init_"+str(resp_type)
    #             y_train_pred_mod5 = gmm_cluster_mod5.predict(X_train)
    # #             y_train_pred_mod5 = np.choose(y_train_pred_mod5,(4,5,0,1,2,3)).astype(np.int64) 
    # #             y_train_pred_mod5 = np.flip(y_train_pred_mod5)#5,1,0,3,2,5
    #             print('y_train_pred_mod5 :\n', y_train_pred_mod5)
    #             print('y_train :\n', y_train)
    #             eval_scores = cluster_eval(y_train, y_train_pred_mod5, X_train, resp_labels,m, elapsed_time, iter_num)
    #             all_scores += [eval_scores]
    #         print("Finish GaussianMixtureMod5/resp center values init_hybrid: ", m, " : ",resp_type )



    #         gmm_cluster_mod6 = GaussianMixtureMod6(n_components=n_classes, init_params='jaha_init_hybrid', means_init=None, covariance_type=cov_type, max_iter=20, random_state=0 )
    #         t.tic() # Start timer
    #         gmm_cluster_mod6.fit(X_train)
    #         elapsed_time = t.tocvalue() #Save elapsed time
    #         iter_num = gmm_cluster_mod6.n_iter_     
    #         resp_labels = "outliers based-hybrid_init"
    #         y_train_pred_mod6 = gmm_cluster_mod6.predict(X_train)

    #         print('y_train_pred_mod6 :\n', y_train_pred_mod6)
    #         print('y_train :\n', y_train)
    #         eval_scores = cluster_eval(y_train, y_train_pred_mod6, X_train, resp_labels,m, elapsed_time, iter_num)
    #         all_scores += [eval_scores]
    #         print("Finish GaussianMixtureMod6/outliers based-init_hybrid: ", m )


            #Change result to Pandas DataFrame
            result = pd.DataFrame(all_scores)
            all_result = all_result.append(result)
            print("Finish GaussianMixtureMod : ", m)
            
    all_result = all_result.append(pd.Series(), ignore_index=True)  
    all_result.columns=result_labels    


    #Save results to Excel
    ts = time.time() 
    st = datetime.datetime.fromtimestamp(ts).strftime('%H-%M-%S---%d-%m-%Y')
    # all_result.to_excel("result/All_result_"+st+".xlsx", index=False)
    all_result.to_excel("result/apr/All_result_"+cov_type+"_"+st+".xlsx", index=False)
    absolutePath += [Path("result/apr/All_result_"+cov_type+"_"+st+".xlsx").resolve()]
print("\nFinish at ",st)



for k in range (len(cov_type_labels)):
    filePath = absolutePath[k]
    os.system(f'start excel.exe "{filePath}"')

In [None]:
# #Display full output
# from IPython.core.interactiveshell import InteractiveShell
# InteractiveShell.ast_node_interactivity = "all"
from tqdm.auto import tqdm, trange
from ipywidgets import IntProgress
from IPython.display import display
from IPython.core.display import display, HTML
display(HTML("<style>div.output_scroll { height: 40em; }</style>"))

import warnings
warnings.filterwarnings('ignore')
import datetime
import time
from pytictoc import TicToc
import sys
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, MeanShift
from fcmeans import FCM
from sklearn.mixture import GaussianMixture
from sklearn.mixture_mod import GaussianMixtureMod
from sklearn.mixture_mod2 import GaussianMixtureMod2
from sklearn.mixture_mod3 import GaussianMixtureMod3
from sklearn.mixture_mod4 import GaussianMixtureMod4
from sklearn.mixture_mod5 import GaussianMixtureMod5
from sklearn.mixture_mod6 import GaussianMixtureMod6
from IPython.utils import io
import numpy as np
from sklearn import metrics
from scipy.optimize import linear_sum_assignment
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, adjusted_rand_score, normalized_mutual_info_score, homogeneity_completeness_v_measure,fowlkes_mallows_score,silhouette_score
from sklearn.metrics import davies_bouldin_score, v_measure_score
from scipy.stats import chi2
# opening EXCEL through Code local path in dir
import os
from pathlib import Path
#Funtion declaration
def cluster_accuracy(y_true, y_pred):
    # compute contingency matrix (also called confusion matrix)
    contingency_matrix = metrics.cluster.contingency_matrix(y_true, y_pred)
#     print("\ncontingency_matrix : \n", contingency_matrix)

    # Find optimal one-to-one mapping between cluster labels and true labels
    row_ind, col_ind = linear_sum_assignment(-contingency_matrix)
    print("row_ind : ", row_ind)
    print("col_ind : ", col_ind)
    val1 = contingency_matrix[row_ind, col_ind].sum()
    print("contingency_matrix[row_ind, col_ind].sum() : ", val1)
    val2 = np.sum(contingency_matrix)
    print("np.sum(contingency_matrix) : ", val2)

    # Return cluster accuracy
    return contingency_matrix[row_ind, col_ind].sum() / np.sum(contingency_matrix)

def purity_score(y_true, y_pred):
    # compute contingency matrix (also called confusion matrix)
    contingency_matrix = metrics.cluster.contingency_matrix(y_true, y_pred)
#     print("\ncontingency_matrix : \n", contingency_matrix)
    
    # return purity
    return np.sum(np.amax(contingency_matrix, axis=0)) / np.sum(contingency_matrix) 

#Get the Clustering Algorithm Performance
def cluster_eval(y_true, y_pred, X_train, cluster_label, m, elapsed_time,iter_num):
    eval_scores = []
    
    accuracy = np.mean(y_pred.ravel() == y_true.ravel()) * 100
    
    purity = purity_score(y_train, y_pred)
    micro_f1_score = f1_score(y_true, y_pred, average='micro')
    macro_f1_score = f1_score(y_true, y_pred, average='macro')
    weighted_f1_score = f1_score(y_true, y_pred, average='weighted')
    ari_score = adjusted_rand_score(y_true, y_pred)
    nmi_score = normalized_mutual_info_score(y_true, y_pred)
#     hc_v_measure = homogeneity_completeness_v_measure(y_true, y_pred)
    v_measure = v_measure_score(y_true, y_pred)
    fm_score = fowlkes_mallows_score(y_true, y_pred)
#     s_score = silhouette_score(X_train, y_pred, metric='euclidean')
#     db_score = davies_bouldin_score(X_train, y_pred)
#     eval_scores = [m, cluster_label, elapsed_time, iter_num, accuracy, purity, micro_f1_score, macro_f1_score, weighted_f1_score, ari_score, 
#                    nmi_score, v_measure, fm_score]
    eval_scores = [m, cluster_label, elapsed_time, iter_num, purity,  ari_score]
    return eval_scores

#function to remove outliers using z-score
def z_score(data):
    data_z = data['mean']
    outliers_z = data_z.copy()
    mean_z = np.mean(data_z)
    std_z =np.std(data_z)
    threshold=3.

    data['z_score'] = (outliers_z - mean_z)/std_z 
    outliers_z[(data['z_score'] > threshold)] = np.nan
    print('outliers_z : \n', outliers_z)
    #Drop outliers
    no_outliers_z = outliers_z.dropna()
    print('outliers_z_new : \n', no_outliers_z)    
    return no_outliers_z

#function to calculate means 
def means_init(X, n_components):
    n_samples, n_attributes = X.shape
    print("n_components : ", n_components)
    
    df_X = pd.DataFrame(X)
    
    df_X['mean'] = df_X.mean(axis=1)
    
    no_outliers_z = z_score(df_X)

    #Find Max, Min, Diff, Dev
    Max = max(no_outliers_z)
    Min = min(no_outliers_z)
    
    diff = Max - Min
    dev = diff/n_components
    
    print('Max: ', Max , 'Min: ', Min)   
    print('dev :', dev)
    
    cluster_range = np.zeros((n_components, 2))
    for i in range (n_components):
        cluster_range[i] = Min, Min+dev
        
        Min = Min+dev
        df_X['c'+str(i)]=0
    print(cluster_range)  

    for j in range (n_components):
        df_X['c'+str(j)][(df_X['mean']>= cluster_range[j,0]) & (df_X['mean'] < cluster_range[j,1])] = 1

    es_init_means = np.zeros((n_components, n_attributes))
    for k in range (n_components):    
        temp_df_X = df_X.iloc[:, 0:n_attributes][df_X['c'+str(k)] == 1]   
        es_init_means[k] = temp_df_X.mean(axis=0) 
    return es_init_means
"""------------------------------------------------------------------------------------------------------------------------
#Main Program - Start here!
------------------------------------------------------------------------------------------------------------------------"""
# np.set_printoptions(threshold=sys.maxsize)
# pd.set_option('display.max_rows', dataset.shape[0]+1)

#Define dataset_labels
# dataset_labels = ["iris", "vertebral","new_thyroid", "haberman", "landsat", "seed", "ecoli", "glass", "wine"]
# dataset_labels = ["ecoli"]"glass","vertebral","new_thyroid",
# dataset_labels = ["wine", "new_thyroid"]
# dataset_labels = ["transfusion", "wilt", "australian", "japanese", "iris"]
# dataset_labels = ["transfusion", "wilt", "breastCancer", "heartDisease", "adultIncome", "australian", "japanese", "bank", "seismicBumps", "german", 
#                   "chess", "iris", "abalone", "wallRobot", "dermatology"] 
dataset_labels = ["heartDisease"]            
# dataset_labels = ["ecoli", "seed", "glass"]

# skip_Mahal = ["australian","japanese", "seismicBumps","bank", "adultIncome"]

# cov_type_labels = ["full", "tied", "diag", "spherical"]
cov_type_labels = ["full"]
# cov_type_labels = ["diag", "spherical"]
# Threshold of GMM-EM Response values
# th_resp_val = [0.1, 0.2, 0.3, 0.4, 0.5]
# th_resp_val = [1]

resp_type_val = ['remove']
# resp_type_val = ['donate']
# resp_type_val = ['remove', 'donate']

# result_labels = ["Dataset","Algorithm", "Elapsed Time", "Iter_num","accuracy", "purity", "micro_f1_score", "macro_f1_score", "weighted_f1_score", "ari_score", 
#                 "nmi_score", "hc_v_measure", "fm_score"]
result_labels = ["Dataset","Algorithm", "Elapsed Time", "Iter_num", "purity", "ari_score"]

absolutePath = []
for cov_type in (cov_type_labels):
    print("\nStart Cov_Type :",cov_type)
                   
    all_scores = []
    final_scores = []
    all_result = pd.DataFrame()
    t = TicToc() # create TicToc instance
    # for m in dataset_labels:
    for m in tqdm(dataset_labels):
    #     enablePrint()
        print("\n\nStart Dataset : ",m)
        #Read dataset
        dataset = pd.read_csv("dataset/"+m+".csv")

        #Drop Target Column in data using Index
        X_train = dataset.drop('Target',axis=1)

        # #How to get Target data
        y_train =  dataset['Target']

        print(np.array(y_train))

        n_classes = len(np.unique(y_train))

 
        #Run the kMeans on current
        kMeans_cluster = KMeans(n_clusters = n_classes, max_iter=20, random_state=0)
        t.tic() # Start timer
        kMeans_cluster.fit(X_train)
        elapsed_time = t.tocvalue() #Save elapsed time
        iter_num = kMeans_cluster.n_iter_
    #     print("kMeans time : ", elapsed_time )   
        y_train_pred = kMeans_cluster.predict(X_train)
        eval_scores = cluster_eval(y_train, y_train_pred, X_train, "kMeans", m, elapsed_time, iter_num )
    #     eval_scores = np.append([m],eval_scores,axis=0)  
        all_scores = [eval_scores]
 
        #Run the GMM on current
        gmm_cluster = GaussianMixture(n_components = n_classes, init_params='random', means_init= None, covariance_type=cov_type, max_iter=20, random_state=0)
        t.tic() # Start timer
        gmm_cluster.fit(X_train)
        elapsed_time = t.tocvalue() #Save elapsed time
        iter_num = gmm_cluster.n_iter_
    #     print("GMM time : ", elapsed_time )       
        y_train_pred = gmm_cluster.predict(X_train)
        eval_scores = cluster_eval(y_train, y_train_pred, X_train, "GMM-random_init", m, elapsed_time, iter_num )
    #     all_scores += [eval_scores]
        all_scores += [eval_scores]
        
        #Run the GMM on current
        gmm_cluster = GaussianMixture(n_components = n_classes, init_params='kmeans', means_init= None, covariance_type=cov_type, max_iter=20, random_state=0)
        t.tic() # Start timer
        gmm_cluster.fit(X_train)
        elapsed_time = t.tocvalue() #Save elapsed time
        iter_num = gmm_cluster.n_iter_
    #     print("GMM time : ", elapsed_time )       
        y_train_pred = gmm_cluster.predict(X_train)
        eval_scores = cluster_eval(y_train, y_train_pred, X_train, "GMM-kmeans_init", m, elapsed_time, iter_num )
    #     all_scores += [eval_scores]
        all_scores += [eval_scores]
              
        #Run the GMM on current
        gmm_cluster = GaussianMixture(n_components = n_classes, init_params='jaha_init_hybrid', means_init= None, covariance_type=cov_type, max_iter=20, random_state=0)
        t.tic() # Start timer
        gmm_cluster.fit(X_train)
        elapsed_time = t.tocvalue() #Save elapsed time
        iter_num = gmm_cluster.n_iter_
    #     print("GMM time : ", elapsed_time )       
        y_train_pred = gmm_cluster.predict(X_train)
        eval_scores = cluster_eval(y_train, y_train_pred, X_train, "GMM-hybrid_init", m, elapsed_time, iter_num )
    #     all_scores += [eval_scores]
        all_scores += [eval_scores]

                
#         for resp_type in resp_type_val:
#             gmm_cluster_mod5 = GaussianMixtureMod5(n_components=n_classes, init_params='random', means_init=None, covariance_type=cov_type, max_iter=20, random_state=0, resp_type=resp_type )
#             t.tic() # Start timer
#             gmm_cluster_mod5.fit(X_train)
#             elapsed_time = t.tocvalue() #Save elapsed time
#             iter_num = gmm_cluster_mod5.n_iter_     
#             resp_labels = "resp center values-random"+str(resp_type)
#             y_train_pred_mod5 = gmm_cluster_mod5.predict(X_train)
# #             y_train_pred_mod5 = np.choose(y_train_pred_mod5,(4,5,0,1,2,3)).astype(np.int64) 
# #             y_train_pred_mod5 = np.flip(y_train_pred_mod5)#5,1,0,3,2,5
#             print('y_train_pred_mod5 :\n', y_train_pred_mod5)
#             print('y_train :\n', y_train)
#             eval_scores = cluster_eval(y_train, y_train_pred_mod5, X_train, resp_labels,m, elapsed_time, iter_num)
#             all_scores += [eval_scores]
#         print("Finish GaussianMixtureMod5/resp center values-random : ", m, " : ",resp_type )
        
#         #Run the GMM-Auto-v2 for every th_resp value on current dataset (using resp center values) - Last tested on 27.01.2021
#         for resp_type in resp_type_val:
#             gmm_cluster_mod5 = GaussianMixtureMod5(n_components=n_classes, init_params='kmeans', means_init=None, covariance_type=cov_type, max_iter=20, random_state=0, resp_type=resp_type )
#             t.tic() # Start timer
#             gmm_cluster_mod5.fit(X_train)
#             elapsed_time = t.tocvalue() #Save elapsed time
#             iter_num = gmm_cluster_mod5.n_iter_     
#             resp_labels = "resp center values-kmeans"+str(resp_type)
#             y_train_pred_mod5 = gmm_cluster_mod5.predict(X_train)
# #             y_train_pred_mod5 = np.choose(y_train_pred_mod5,(4,5,0,1,2,3)).astype(np.int64) 
# #             y_train_pred_mod5 = np.flip(y_train_pred_mod5)#5,1,0,3,2,5
#             print('y_train_pred_mod5 :\n', y_train_pred_mod5)
#             print('y_train :\n', y_train)
#             eval_scores = cluster_eval(y_train, y_train_pred_mod5, X_train, resp_labels,m, elapsed_time, iter_num)
#             all_scores += [eval_scores]
#         print("Finish GaussianMixtureMod5/resp center values-kmeans : ", m, " : ",resp_type)   

 
        #Run the GMM-Auto-v2 for every th_resp value on current dataset (using resp center values) - Last tested on 27.01.2021
        for resp_type in resp_type_val:
            gmm_cluster_mod5 = GaussianMixtureMod5(n_components=n_classes, init_params='jaha_init_hybrid', means_init=None, covariance_type=cov_type, max_iter=20, random_state=0, resp_type=resp_type )
            t.tic() # Start timer
            gmm_cluster_mod5.fit(X_train)
            elapsed_time = t.tocvalue() #Save elapsed time
            iter_num = gmm_cluster_mod5.n_iter_     
            resp_labels = "resp center values-hybrid_init_"+str(resp_type)
            y_train_pred_mod5 = gmm_cluster_mod5.predict(X_train)
#             y_train_pred_mod5 = np.choose(y_train_pred_mod5,(4,5,0,1,2,3)).astype(np.int64) 
#             y_train_pred_mod5 = np.flip(y_train_pred_mod5)#5,1,0,3,2,5
            print('y_train_pred_mod5 :\n', y_train_pred_mod5)
            print('y_train :\n', y_train)
            eval_scores = cluster_eval(y_train, y_train_pred_mod5, X_train, resp_labels,m, elapsed_time, iter_num)
            all_scores += [eval_scores]
        print("Finish GaussianMixtureMod5/resp center values init_hybrid: ", m, " : ",resp_type )

        
        
        gmm_cluster_mod6 = GaussianMixtureMod6(n_components=n_classes, init_params='jaha_init_hybrid', means_init=None, covariance_type=cov_type, max_iter=20, random_state=0 )
        t.tic() # Start timer
        gmm_cluster_mod6.fit(X_train)
        elapsed_time = t.tocvalue() #Save elapsed time
        iter_num = gmm_cluster_mod6.n_iter_     
        resp_labels = "outliers based-hybrid_init"
        y_train_pred_mod6 = gmm_cluster_mod6.predict(X_train)

        print('y_train_pred_mod6 :\n', y_train_pred_mod6)
        print('y_train :\n', y_train)
        eval_scores = cluster_eval(y_train, y_train_pred_mod6, X_train, resp_labels,m, elapsed_time, iter_num)
        all_scores += [eval_scores]
        print("Finish GaussianMixtureMod6/outliers based-init_hybrid: ", m )


        #Change result to Pandas DataFrame
        result = pd.DataFrame(all_scores)
        all_result = all_result.append(result)
        print("Finish GaussianMixtureMod : ", m)
    all_result = all_result.append(pd.Series(), ignore_index=True) 
    all_result.columns=result_labels    


    #Save results to Excel
    ts = time.time() 
    st = datetime.datetime.fromtimestamp(ts).strftime('%H-%M-%S---%d-%m-%Y')
    # all_result.to_excel("result/All_result_"+st+".xlsx", index=False)
    all_result.to_excel("result/apr/All_result_"+cov_type+"_"+st+".xlsx", index=False)
    absolutePath += [Path("result/apr/All_result_"+cov_type+"_"+st+".xlsx").resolve()]
print("\nFinish at ",st)



for k in range (len(cov_type_labels)):
    filePath = absolutePath[k]
    os.system(f'start excel.exe "{filePath}"')

In [None]:
#Main program end here