In [8]:
#importing libraries
import pandas as pd 
import numpy as np
from scipy.io import arff
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from scipy.stats import multivariate_normal

In [9]:
dataset = arff.loadarff('php4ylQmK.arff')[0] ##converting from dataset to data frame using arff package

In [10]:
df = pd.DataFrame(dataset)
df['Class']=[int(x) for x in df['Class']]

In [11]:
df

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V14,V15,V16,V17,V18,V19,V20,V21,V22,Class
0,119.992,157.302,74.997,0.00784,0.00007,0.00370,0.00554,0.01109,0.04374,0.426,...,0.06545,0.02211,21.033,0.414783,0.815285,-4.813031,0.266482,2.301442,0.284654,2
1,122.400,148.650,113.819,0.00968,0.00008,0.00465,0.00696,0.01394,0.06134,0.626,...,0.09403,0.01929,19.085,0.458359,0.819521,-4.075192,0.335590,2.486855,0.368674,2
2,116.682,131.111,111.555,0.01050,0.00009,0.00544,0.00781,0.01633,0.05233,0.482,...,0.08270,0.01309,20.651,0.429895,0.825288,-4.443179,0.311173,2.342259,0.332634,2
3,116.676,137.871,111.366,0.00997,0.00009,0.00502,0.00698,0.01505,0.05492,0.517,...,0.08771,0.01353,20.644,0.434969,0.819235,-4.117501,0.334147,2.405554,0.368975,2
4,116.014,141.781,110.655,0.01284,0.00011,0.00655,0.00908,0.01966,0.06425,0.584,...,0.10470,0.01767,19.649,0.417356,0.823484,-3.747787,0.234513,2.332180,0.410335,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190,174.188,230.978,94.261,0.00459,0.00003,0.00263,0.00259,0.00790,0.04087,0.405,...,0.07008,0.02764,19.517,0.448439,0.657899,-6.538586,0.121952,2.657476,0.133050,1
191,209.516,253.017,89.488,0.00564,0.00003,0.00331,0.00292,0.00994,0.02751,0.263,...,0.04812,0.01810,19.147,0.431674,0.683244,-6.195325,0.129303,2.784312,0.168895,1
192,174.688,240.005,74.287,0.01360,0.00008,0.00624,0.00564,0.01873,0.02308,0.256,...,0.03804,0.10715,17.883,0.407567,0.655683,-6.787197,0.158453,2.679772,0.131728,1
193,198.764,396.961,74.904,0.00740,0.00004,0.00370,0.00390,0.01109,0.02296,0.241,...,0.03794,0.07223,19.020,0.451221,0.643956,-6.744577,0.207454,2.138608,0.123306,1


LDA CODE

In [12]:
def LDA(df):
    X = df.iloc[:,:22].values #seperate the values using integer location based indexes
    Y = df['Class'].values #predict the last coulumn at the end
    accuracy = []
    balanced_accuracy = []
   
    kfold = KFold(n_splits=10,shuffle=True,random_state=100) #KFold from sklearn to split train/tests sets

    for train,test in kfold.split(X):
        X_train, X_test = X[train], X[test]
        Y_train, Y_test = Y[train], Y[test]
        
        # using values 1 and 2 seperate healthy and unhealthy in dataset
        healthy = [key for key, value in enumerate(Y_train) if value == 1]
        unhealthy = [key for key, value in enumerate(Y_train) if value == 2]

        prior_healthy = len(healthy)/len(Y_train)
        prior_unhealthy = 1 - prior_healthy 

        #constuct new dataset 
        new_healthy = X_train[healthy, :]
        new_unhealthy = X_train[unhealthy, :]

        #mean calculation
        mean_healthy = np.mean(new_healthy, axis=0)
        mean_unhealthy = np.mean(new_unhealthy, axis=0)

        #sigma calculation
        #get the converse matrix
        sigma = np.cov(X_train, rowvar=False) 
        inverse_sigma = np.linalg.pinv(sigma)

        predicted = []

        for X_new in X_test:
            
            #delta calculations
            delta_healthy = (np.dot(np.dot(X_new.T, inverse_sigma), mean_healthy)) - (0.5 * np.dot(np.dot(mean_healthy.T, inverse_sigma), mean_healthy)) + np.log(prior_healthy)
            delta_unhealthy = (np.dot(np.dot(X_new.T, inverse_sigma), mean_unhealthy)) - (0.5 * np.dot(np.dot(mean_unhealthy.T, inverse_sigma), mean_unhealthy)) + np.log(prior_unhealthy)

            if delta_healthy - delta_unhealthy > 0:
                predicted.append(1)
            else:
                predicted.append(2)
        accuracy.append(accuracy_score(Y_test, predicted))
        balanced_accuracy.append(balanced_accuracy_score(Y_test, predicted))
    return accuracy, balanced_accuracy   

In [13]:
lda_accuracy, lda_balanced_accuracy = LDA(df)
print(lda_accuracy,lda_balanced_accuracy)

[0.8, 0.85, 1.0, 0.9, 0.8, 0.8947368421052632, 0.6842105263157895, 1.0, 1.0, 0.8947368421052632] [0.6666666666666666, 0.7, 1.0, 0.8, 0.6666666666666666, 0.8571428571428572, 0.5714285714285714, 1.0, 1.0, 0.8]


QDA CODE

In [14]:
def QDA(df):
    X = df.iloc[:,:22].values #seperate the values using integer location based indexes
    Y = df['Class'].values #predict the last coulumn at the end
    accuracy = []
    balanced_accuracy = []
   
    kfold = KFold(n_splits=10,shuffle=True,random_state=100) #KFold from sklearn to split train/tests sets

    for train,test in kfold.split(X):
        X_train, X_test = X[train], X[test]
        Y_train, Y_test = Y[train], Y[test]

        # using values 1 and 2 seperate healthy and unhealthy in dataset 
        healthy = [key for key, value in enumerate(Y_train) if value == 1]
        unhealthy = [key for key, value in enumerate(Y_train) if value == 2]

        prior_healthy = len(healthy)/len(Y_train)
        prior_unhealthy = 1 - prior_healthy 

        #constuct new dataset
        new_healthy = X_train[healthy, :]
        new_unhealthy = X_train[unhealthy, :]

        #mean calculations
        mean_healthy = np.mean(new_healthy, axis=0)
        mean_unhealthy = np.mean(new_unhealthy, axis=0)

        #sigma calculation
        #Get the covarance matrix
        sigma_healthy = np.cov(new_healthy, rowvar=False)
        sigma_unhealthy = np.cov(new_unhealthy, rowvar=False)

        predicted = []

        for X_new in X_test:
            
            #delta calculations
            delta_healthy = -0.5 * np.log(np.linalg.det(sigma_healthy)) - 0.5 * np.dot(np.dot((X_new - mean_healthy).T, np.linalg.pinv(sigma_healthy)), (X_new - mean_healthy)) + np.log(prior_healthy)
            delta_unhealthy = -0.5 * np.log(np.linalg.det(sigma_unhealthy)) - 0.5 * np.dot(np.dot((X_new - mean_unhealthy).T, np.linalg.pinv(sigma_unhealthy)), (X_new - mean_unhealthy)) + np.log(prior_unhealthy)
 
            if delta_healthy - delta_unhealthy > 0:
                predicted.append(1)
            else:
                predicted.append(2)
        accuracy.append(accuracy_score(Y_test, predicted))
        balanced_accuracy.append(balanced_accuracy_score(Y_test, predicted))
    return accuracy, balanced_accuracy 

In [15]:
qda_accuracy, qda_balanced_accuracy = QDA(df)
print(qda_accuracy,qda_balanced_accuracy)

[0.8, 0.9, 0.85, 0.75, 0.9, 0.9473684210526315, 1.0, 0.7894736842105263, 0.8947368421052632, 0.8421052631578947] [0.6666666666666666, 0.8666666666666667, 0.9210526315789473, 0.6333333333333333, 0.9285714285714286, 0.9285714285714286, 1.0, 0.875, 0.8020833333333333, 0.8285714285714285]


GDA CODE

In [16]:
   def GDA(df):
    X = df.iloc[:,:22].values #seperate the values using integer location based indexes
    Y = df['Class'].values #predict the last coulumn at the end
    accuracy = []
    balanced_accuracy = []
   
    kfold = KFold(n_splits=10,shuffle=True)#KFold from sklearn to split train/tests sets

    for train,test in kfold.split(X):
        X_train, X_test = X[train], X[test]
        Y_train, Y_test = Y[train], Y[test]

         # using values 1 and 2 seperate healthy and unhealthy in dataset
        healthy = [key for key, value in enumerate(Y_train) if value == 1]
        unhealthy = [key for key, value in enumerate(Y_train) if value == 2]

        prior_healthy = len(healthy)/len(Y_train)
        prior_unhealthy = 1 - prior_healthy 

        #constuct new dataset
        new_healthy = X_train[healthy, :]
        new_unhealthy = X_train[unhealthy, :]

        #mean calculation
        mean_healthy = np.mean(new_healthy, axis=0)
        mean_unhealthy = np.mean(new_unhealthy, axis=0)

        #sigma calculation
        #Get the covarance matrix
        sigma = np.cov(X_train, rowvar=False)

        predicted = []

        for X_new in X_test:
            
            #calculate multivariate_normal
            mvn_healthy =  multivariate_normal(mean_healthy, sigma, allow_singular=True)
            mvn_unhealthy =  multivariate_normal(mean_unhealthy, sigma, allow_singular=True)
            PX_healthy = mvn_healthy.pdf(X_new)
            PX_unhealthy = mvn_unhealthy.pdf(X_new)
            
            if ((abs(PX_healthy - PX_unhealthy) < 0.001) and (PX_healthy > 0.01)):
                predicted.append(1)
            else:
                predicted.append(2)

        accuracy.append(accuracy_score(Y_test, predicted))
        balanced_accuracy.append(balanced_accuracy_score(Y_test, predicted))
    return accuracy, balanced_accuracy 

In [17]:
gda_accuracy, gda_balanced_accuracy = GDA(df)
print(gda_accuracy, gda_balanced_accuracy)

[0.75, 0.75, 0.95, 0.75, 0.8, 0.5789473684210527, 0.7368421052631579, 0.7368421052631579, 0.631578947368421, 0.8421052631578947] [0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]


In [20]:
accuracy_map = {
    'CROSS-Validation' : ['CV1','CV2','CV3','CV','CV5','CV6','CV7','CV8','CV9','CV10'],
    'GDA' : gda_accuracy,        
    'LDA' : lda_accuracy,         
    'QDA' : qda_accuracy
}
dfa = pd.DataFrame(accuracy_map)
dfa = dfa.append(dfa.aggregate({
    "GDA":['mean'],             
    "LDA":['mean'],                
    "QDA":['mean']}))
dfa.style #display dataframe

Unnamed: 0,CROSS-Validation,GDA,LDA,QDA
0,CV1,0.75,0.8,0.8
1,CV2,0.75,0.85,0.9
2,CV3,0.95,1.0,0.85
3,CV,0.75,0.9,0.75
4,CV5,0.8,0.8,0.9
5,CV6,0.578947,0.894737,0.947368
6,CV7,0.736842,0.684211,1.0
7,CV8,0.736842,1.0,0.789474
8,CV9,0.631579,1.0,0.894737
9,CV10,0.842105,0.894737,0.842105


In [21]:
balanced_accuracy_map = {
    'CROSS-Validation' : ['CV1','CV2','CV3','CV4','CV5','CV6','CV7','CV8','CV9','CV10'],
    'GDA' : gda_balanced_accuracy,    
    'LDA' : lda_balanced_accuracy,      
    'QDA' : qda_balanced_accuracy
}
dfb = pd.DataFrame(balanced_accuracy_map)
dfb = dfb.append(dfb.aggregate({
     "GDA":['mean'],            
    "LDA":['mean'],   
    "QDA":['mean']
}))
dfb.style #display dataframe

Unnamed: 0,CROSS-Validation,GDA,LDA,QDA
0,CV1,0.5,0.666667,0.666667
1,CV2,0.5,0.7,0.866667
2,CV3,0.5,1.0,0.921053
3,CV4,0.5,0.8,0.633333
4,CV5,0.5,0.666667,0.928571
5,CV6,0.5,0.857143,0.928571
6,CV7,0.5,0.571429,1.0
7,CV8,0.5,1.0,0.875
8,CV9,0.5,1.0,0.802083
9,CV10,0.5,0.8,0.828571
