In [6]:
import ipynb
import pandas as pd
import numpy as np

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.preprocessing import OrdinalEncoder
from numpy import asarray
from statistics import mean

In [7]:
import warnings
warnings.filterwarnings('ignore')

In [8]:
from ipynb.fs.full.TrainingFunction import Training_All_2
from ipynb.fs.full.TrainingFunction import SavePredictionsToFile

from ipynb.fs.full.ParameterTuning import ParameterTuning

In [9]:
# Load the preprocessed heart dataset. The first CSV column is read as the index and
# then dropped, so the rows end up renumbered 0..n-1.
preprocessed_df = pd.read_csv("../data/preprocessedHeart.csv", index_col=0).reset_index(drop=True)

# Indicator columns (they appear to be one-hot encodings of sex, cp, exang, slope, ca, thal).
categorical_features = [ 'sex_0','sex_1','cp_0','cp_1','cp_2','cp_3','exang_0','exang_1','slope_0','slope_1',
                        'slope_2','ca_0','ca_1','ca_2','ca_3','ca_4','thal_0','thal_1','thal_2','thal_3']
target = "target"

# Convert categorical feature types
preprocessed_df[categorical_features] = preprocessed_df[categorical_features].astype('category')
preprocessed_df[target] = preprocessed_df[target].astype('category')

# Class balance of the label column (the label here is `target`, not revenue).
print('Target distribution:\n' + str(preprocessed_df[target].value_counts()))

Revenue distribution:
1    165
0    165
Name: target, dtype: int64


In [10]:
# Show the loaded frame as the cell output (pandas elides the middle rows/columns).
preprocessed_df

Unnamed: 0,thalach,oldpeak,chol,age,trestbps,target,sex_0,sex_1,cp_0,cp_1,...,slope_2,ca_0,ca_1,ca_2,ca_3,ca_4,thal_0,thal_1,thal_2,thal_3
0,0.234095,1.777495,-0.836098,-0.150692,140,0,0,1,1,0,...,0,1,0,0,0,0,0,0,0,1
1,-0.990359,1.518686,0.149501,1.172774,135,0,0,1,1,0,...,0,0,1,0,0,0,0,0,0,1
2,-0.771706,0.138373,2.062724,0.069886,132,0,0,1,1,0,...,0,0,1,0,0,0,0,0,0,1
3,-0.509323,0.742260,-0.758796,0.731619,140,0,0,1,1,0,...,1,0,1,0,0,0,0,0,0,1
4,-3.439267,-0.034166,-0.179032,1.393352,120,0,0,1,1,0,...,0,1,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
325,0.846322,-0.896862,-0.121055,0.180175,120,1,0,1,0,1,...,0,1,0,0,0,0,0,0,1,0
326,0.715131,0.138373,1.850144,0.069886,132,1,1,0,0,1,...,1,1,0,0,0,0,0,0,1,0
327,1.414819,-0.896862,-1.725070,-1.474158,120,1,0,1,0,1,...,1,1,0,0,0,0,0,0,1,0
328,1.021244,-0.896862,-1.377212,-1.805024,138,1,0,1,0,0,...,1,0,0,0,0,1,0,0,1,0


# Models

The hyperparameters below are taken from "MarketingCampaign_ParameterTuning".

In [11]:
# Classifiers to compare, each built with the hyperparameters chosen for this notebook.
knn_model = KNeighborsClassifier(
    n_neighbors=11,
    weights='distance',
    metric='manhattan',
)
svm_model = SVC(kernel='rbf')
dt_model = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=9,
    min_samples_split=8,
    min_samples_leaf=4,
)
rf_model = RandomForestClassifier(
    n_estimators=300,
    max_depth=10,
    max_features=3,
    min_samples_split=8,
    min_samples_leaf=3,
    bootstrap=True,
)
mlp_model = MLPClassifier(
    hidden_layer_sizes=(10,30,10),
    activation='relu',
    solver='adam',
    alpha=0.0001,
    learning_rate='constant',
)
gde_model = GradientBoostingClassifier(
    learning_rate=1,
    n_estimators=250,
    max_depth=5,
)

# NOTE(review): later cells index the results as KNN, SVM, DT, RF, MLP, GDE, so this
# order presumably has to match the row order returned by the training helper - confirm.
models = [knn_model, svm_model, dt_model, rf_model, mlp_model, gde_model]

In [12]:
# Train and evaluate every classifier in `models` on the dataset and keep the
# returned accuracy table in `stats`.
# NOTE(review): the two trailing 10s are presumably the number of folds and the number
# of repeated runs per fold (the log prints folds and runs 1-10) - confirm in Training_All_2.
stats = Training_All_2(preprocessed_df, target, models, 10, 10)

___________________________________________________________________________ Fold 1
____________________________ Model KNeighborsClassifier
1: 0.8787878787878788
2: 0.8787878787878788
3: 0.8787878787878788
4: 0.8787878787878788
5: 0.8787878787878788
6: 0.8787878787878788
7: 0.8787878787878788
8: 0.8787878787878788
9: 0.8787878787878788
10: 0.8787878787878788

 Average Accuracy: 0.8787878787878788

____________________________ Model SVC
1: 0.6060606060606061
2: 0.6060606060606061
3: 0.6060606060606061
4: 0.6060606060606061
5: 0.6060606060606061
6: 0.6060606060606061
7: 0.6060606060606061
8: 0.6060606060606061
9: 0.6060606060606061
10: 0.6060606060606061

 Average Accuracy: 0.6060606060606061

____________________________ Model DecisionTreeClassifier
1: 0.7878787878787878
2: 0.7878787878787878
3: 0.7878787878787878
4: 0.7272727272727273
5: 0.7878787878787878
6: 0.8484848484848485
7: 0.7878787878787878
8: 0.7878787878787878
9: 0.7878787878787878
10: 0.7878787878787878

 Average Accuracy: 0

2: 0.5454545454545454
3: 0.5454545454545454
4: 0.5454545454545454
5: 0.5454545454545454
6: 0.5454545454545454
7: 0.5454545454545454
8: 0.5454545454545454
9: 0.5454545454545454
10: 0.5454545454545454

 Average Accuracy: 0.5454545454545454

____________________________ Model DecisionTreeClassifier
1: 0.9090909090909091
2: 0.8787878787878788
3: 0.8787878787878788
4: 0.8787878787878788
5: 0.9090909090909091
6: 0.8787878787878788
7: 0.8787878787878788
8: 0.9090909090909091
9: 0.8787878787878788
10: 0.9090909090909091

 Average Accuracy: 0.8909090909090909

____________________________ Model RandomForestClassifier
1: 0.8787878787878788
2: 0.8787878787878788
3: 0.8787878787878788
4: 0.8787878787878788
5: 0.8787878787878788
6: 0.8484848484848485
7: 0.8787878787878788
8: 0.8787878787878788
9: 0.8787878787878788
10: 0.8484848484848485

 Average Accuracy: 0.8727272727272727

____________________________ Model MLPClassifier
1: 0.8181818181818182
2: 0.5151515151515151
3: 0.8181818181818182
4: 0.787

9: 0.8181818181818182
10: 0.8484848484848485

 Average Accuracy: 0.8424242424242424

____________________________ Model RandomForestClassifier
1: 0.9090909090909091
2: 0.9090909090909091
3: 0.9090909090909091
4: 0.9090909090909091
5: 0.9090909090909091
6: 0.9090909090909091
7: 0.8787878787878788
8: 0.8787878787878788
9: 0.9090909090909091
10: 0.9090909090909091

 Average Accuracy: 0.903030303030303

____________________________ Model MLPClassifier
1: 0.9393939393939394
2: 0.9393939393939394
3: 0.9393939393939394
4: 0.8787878787878788
5: 0.9090909090909091
6: 0.8787878787878788
7: 0.8484848484848485
8: 0.9393939393939394
9: 0.9393939393939394
10: 0.9393939393939394

 Average Accuracy: 0.9151515151515152

____________________________ Model GradientBoostingClassifier
1: 0.8181818181818182
2: 0.8787878787878788
3: 0.8181818181818182
4: 0.7878787878787878
5: 0.8787878787878788
6: 0.8181818181818182
7: 0.8181818181818182
8: 0.8787878787878788
9: 0.8181818181818182
10: 0.7878787878787878

 Av

In [13]:
# Show the accuracy table; later cells read it as stats[model][fold].
stats

array([[0.87878788, 0.90909091, 0.87878788, 0.93939394, 0.90909091,
        0.81818182, 0.81818182, 0.81818182, 0.84848485, 0.93939394],
       [0.60606061, 0.54545455, 0.51515152, 0.66666667, 0.54545455,
        0.48484848, 0.57575758, 0.57575758, 0.63636364, 0.45454545],
       [0.78787879, 0.85757576, 0.78787879, 0.87575758, 0.89090909,
        0.81515152, 0.81818182, 0.59090909, 0.84242424, 0.90909091],
       [0.86666667, 0.91818182, 0.84848485, 0.95454545, 0.87272727,
        0.87272727, 0.82121212, 0.7030303 , 0.9030303 , 0.90606061],
       [0.81515152, 0.75151515, 0.84545455, 0.83333333, 0.73636364,
        0.76060606, 0.76666667, 0.68484848, 0.91515152, 0.71515152],
       [0.85757576, 0.8969697 , 0.84545455, 0.89090909, 0.8969697 ,
        0.8030303 , 0.81515152, 0.69393939, 0.83030303, 0.95151515]])

# Overall Results

In [14]:
# To copy-paste into the report: one LaTeX table row per fold, one column per model.

print("      KNN     SVM     DT     RF     MLP     GDE")

# Row layout: fold number, one 5-decimal accuracy per model, then the LaTeX row
# terminator (two backslashes followed by \hline). Every backslash is escaped
# explicitly so the literal contains no invalid escape sequence.
row_template = '%d' + ' & %.5f' * len(stats) + ' \\\\ \\hline \n'

# stats is indexed as stats[model][fold]; build all rows in one pass instead of
# growing the string with repeated concatenation.
outputString = ''.join(
    row_template % (fold + 1, *(model_scores[fold] for model_scores in stats))
    for fold in range(len(stats[0]))
)

print(outputString)

      KNN     SVM     DT     RF     MLP     GDE
1 & 0.87879 & 0.60606 & 0.78788 & 0.86667 & 0.81515 & 0.85758 \\ \hline 
2 & 0.90909 & 0.54545 & 0.85758 & 0.91818 & 0.75152 & 0.89697 \\ \hline 
3 & 0.87879 & 0.51515 & 0.78788 & 0.84848 & 0.84545 & 0.84545 \\ \hline 
4 & 0.93939 & 0.66667 & 0.87576 & 0.95455 & 0.83333 & 0.89091 \\ \hline 
5 & 0.90909 & 0.54545 & 0.89091 & 0.87273 & 0.73636 & 0.89697 \\ \hline 
6 & 0.81818 & 0.48485 & 0.81515 & 0.87273 & 0.76061 & 0.80303 \\ \hline 
7 & 0.81818 & 0.57576 & 0.81818 & 0.82121 & 0.76667 & 0.81515 \\ \hline 
8 & 0.81818 & 0.57576 & 0.59091 & 0.70303 & 0.68485 & 0.69394 \\ \hline 
9 & 0.84848 & 0.63636 & 0.84242 & 0.90303 & 0.91515 & 0.83030 \\ \hline 
10 & 0.93939 & 0.45455 & 0.90909 & 0.90606 & 0.71515 & 0.95152 \\ \hline 



# Friedman Statistic

In [None]:
def column(matrix, i):
    """Return the i-th entry of every row of `matrix` as a new list.

    `matrix` is any iterable of indexable rows; an empty iterable gives [].
    """
    picked = []
    for record in matrix:
        picked.append(record[i])
    return picked

In [None]:
# Friedman test on the accuracies held in `stats`
# (rows: KNN, SVM, DT, RF, MLP, GDE; columns: cross-validation folds).
scores = np.asarray(stats, dtype=float)

# K = number of algorithms compared, N = number of folds.
# The formulas below use (K+1)/2 as the mean rank and N*(K-1) as the normaliser,
# so K has to count the algorithms (6 here) and N the folds (10 here).
K, N = scores.shape

print("__________________________________________________________________________________________Ranks:")
# Rank the algorithms inside each fold: rank 1 = highest accuracy, and tied scores
# share the average of the ranks they occupy.
rank_table = pd.DataFrame(scores).rank(axis=0, ascending=False, method='average')
ranks = rank_table.T.to_numpy().tolist()   # one row per fold, one entry per algorithm

for fold_ranks in ranks:
    print(fold_ranks)


print("__________________________________________________________________________________________R_bar:")
# Mean of the ranks 1..K.
R_bar = (K+1)/2

print(R_bar)

print("__________________________________________________________________________________________eq_2:")
# Average rank of each algorithm over all folds, in the row order of `stats`.
avg_ranks = [mean(column(ranks, j)) for j in range(K)]
# `stats` holds exactly these six models, in this order.
avg_knn, avg_svm, avg_dt, avg_rf, avg_mlp, avg_gde = avg_ranks

print('Average Ranks: %.5f | %.5f | %.5f | %.5f | %.5f | %.5f \n' % (avg_knn, avg_svm, avg_dt, avg_rf, avg_mlp, avg_gde))

# Spread of the per-algorithm average ranks around the mean rank.
eq_2 = N * sum((avg_rank - R_bar)**2 for avg_rank in avg_ranks)

print(eq_2)

print("__________________________________________________________________________________________eq_3:")

# Spread of all individual ranks around the mean rank, normalised by N*(K-1).
eq_3 = sum((entry - R_bar)**2 for row in ranks for entry in row)
eq_3 = eq_3/(N*(K-1))

print(eq_3)

print("__________________________________________________________________________________________Freidman Statistics:")

F_stat = eq_2/eq_3

print(F_stat)

print("__________________________________________________________________________________________Decision:")

# NOTE(review): presumably the critical value for 6 algorithms and 10 folds;
# re-check it if the number of models or folds changes.
F_0 = 10.800          # From class slides.

# F_stat is a ratio of sums of squares, so it is never negative.
if F_stat < F_0:
    print("Fail to reject the null (ie. cannot say that they are significantly different).")
elif F_stat >= F_0:
    print("Reject the null (ie. they are significantly different).")