In [1]:
import ipynb
import pandas as pd
import numpy as np

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.preprocessing import OrdinalEncoder
from numpy import asarray
from statistics import mean

In [2]:
import warnings
# Blanket-suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation/syntax warnings raised by later cells.
warnings.filterwarnings('ignore')

In [3]:
from ipynb.fs.full.TrainingFunction import Train_Models_CV10
from ipynb.fs.full.TrainingFunction import SavePredictionsToFile

from ipynb.fs.full.ParameterTuning import ParameterTuning

In [4]:
# Load the preprocessed dataset; the first CSV column holds the saved index.
preprocessed_df = pd.read_csv("../data/preprocessedHeart.csv", index_col=0)

# Resetting the index (reassign instead of mutating in place so the cell is re-runnable)
preprocessed_df = preprocessed_df.reset_index(drop=True)

# One-hot indicator columns that must be treated as categorical, plus the label column.
categorical_features = [ 'sex_0','sex_1','cp_0','cp_1','cp_2','cp_3','exang_0','exang_1','slope_0','slope_1',
                        'slope_2','ca_0','ca_1','ca_2','ca_3','ca_4','thal_0','thal_1','thal_2','thal_3']
target = "target"

# Convert categorical feature types
preprocessed_df[categorical_features] = preprocessed_df[categorical_features].astype('category')
preprocessed_df[target] = preprocessed_df[target].astype('category')

# Class balance of the label column (the label here is `target`, not "Revenue").
print('Target distribution:\n' + str(preprocessed_df[target].value_counts()))

Revenue distribution:
1    165
0    165
Name: target, dtype: int64


In [5]:
# Show the loaded frame as this cell's output for a visual check of the columns.
preprocessed_df

Unnamed: 0,thalach,oldpeak,chol,age,trestbps,target,sex_0,sex_1,cp_0,cp_1,...,slope_2,ca_0,ca_1,ca_2,ca_3,ca_4,thal_0,thal_1,thal_2,thal_3
0,0.234095,1.777495,-0.836098,-0.150692,140,0,0,1,1,0,...,0,1,0,0,0,0,0,0,0,1
1,-0.990359,1.518686,0.149501,1.172774,135,0,0,1,1,0,...,0,0,1,0,0,0,0,0,0,1
2,-0.771706,0.138373,2.062724,0.069886,132,0,0,1,1,0,...,0,0,1,0,0,0,0,0,0,1
3,-0.509323,0.742260,-0.758796,0.731619,140,0,0,1,1,0,...,1,0,1,0,0,0,0,0,0,1
4,-3.439267,-0.034166,-0.179032,1.393352,120,0,0,1,1,0,...,0,1,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
325,0.846322,-0.896862,-0.121055,0.180175,120,1,0,1,0,1,...,0,1,0,0,0,0,0,0,1,0
326,0.715131,0.138373,1.850144,0.069886,132,1,1,0,0,1,...,1,1,0,0,0,0,0,0,1,0
327,1.414819,-0.896862,-1.725070,-1.474158,120,1,0,1,0,1,...,1,1,0,0,0,0,0,0,1,0
328,1.021244,-0.896862,-1.377212,-1.805024,138,1,0,1,0,0,...,1,0,0,0,0,1,0,0,1,0


# Models

The hyperparameters used below are taken from "MarketingCampaign_ParameterTuning".

In [9]:
# Candidate classifiers, each built with fixed hyperparameters.
knn_model = KNeighborsClassifier(n_neighbors=11, weights='distance', metric='manhattan')
svm_model = SVC(kernel='rbf')
dt_model = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=9,
    min_samples_split=8,
    min_samples_leaf=4,
)
rf_model = RandomForestClassifier(
    n_estimators=300,
    max_depth=10,
    max_features=3,
    min_samples_split=8,
    min_samples_leaf=3,
    bootstrap=True,
)
mlp_model = MLPClassifier(
    hidden_layer_sizes=(10, 30, 10),
    activation='relu',
    solver='adam',
    alpha=0.0001,
    learning_rate='constant',
)
gde_model = GradientBoostingClassifier(n_estimators=250, learning_rate=1, max_depth=5)

# Order matters: later cells index results by position in this list.
models = [knn_model, svm_model, dt_model, rf_model, mlp_model, gde_model]

# Class name of each estimator, in the same order as `models`.
model_names = [type(estimator).__name__ for estimator in models]

In [7]:
# Evaluate every estimator in `models` on `preprocessed_df` with `target` as the label;
# the result is kept in `stats` and indexed by model class name in later cells.
# NOTE(review): the helper name suggests 10-fold cross-validation — confirm in the TrainingFunction notebook.
stats = Train_Models_CV10 (preprocessed_df, target, models)

KNeighborsClassifier Accuracy: 0.864 (0.056)
SVC Accuracy: 0.573 (0.092)
DecisionTreeClassifier Accuracy: 0.818 (0.069)
RandomForestClassifier Accuracy: 0.858 (0.038)
MLPClassifier Accuracy: 0.815 (0.044)
GradientBoostingClassifier Accuracy: 0.836 (0.063)


In [8]:
# Dump the raw mapping returned by the training helper for inspection.
print(stats)

{'KNeighborsClassifier': array([0.84848485, 0.90909091, 0.78787879, 0.84848485, 0.93939394,
       0.87878788, 0.81818182, 0.93939394, 0.78787879, 0.87878788]), 'SVC': array([0.51515152, 0.51515152, 0.48484848, 0.63636364, 0.72727273,
       0.51515152, 0.6969697 , 0.60606061, 0.45454545, 0.57575758]), 'DecisionTreeClassifier': array([0.78787879, 0.93939394, 0.78787879, 0.75757576, 0.87878788,
       0.90909091, 0.72727273, 0.81818182, 0.78787879, 0.78787879]), 'RandomForestClassifier': array([0.87878788, 0.78787879, 0.84848485, 0.90909091, 0.87878788,
       0.90909091, 0.81818182, 0.84848485, 0.84848485, 0.84848485]), 'MLPClassifier': array([0.81818182, 0.75757576, 0.87878788, 0.84848485, 0.75757576,
       0.81818182, 0.81818182, 0.78787879, 0.78787879, 0.87878788]), 'GradientBoostingClassifier': array([0.84848485, 0.81818182, 0.87878788, 0.90909091, 0.84848485,
       0.75757576, 0.87878788, 0.78787879, 0.90909091, 0.72727273])}


In [15]:
# Hand `stats` and the target CSV path to the project helper.
# NOTE(review): presumably writes the per-fold accuracies to that file — confirm in the TrainingFunction notebook.
SavePredictionsToFile("../accuracy/accuracyHeart.csv", stats)

# Overall Results

In [13]:
# To copy-paste into the report:
# one LaTeX table row per fold, one accuracy column per model (in `model_names` order).

# Informational header only; it is not part of the LaTeX output.
print("      KNN       SVM       DT         RF        MLP       GDE")

# Number of folds is taken from the data instead of being hard-coded.
n_folds = len(stats[model_names[0]])

latex_rows = []
for i in range(n_folds):
    cells = ' & '.join('%.5f' % stats[name][i] for name in model_names)
    # '\\\\' is a literal LaTeX row break (\\); every backslash is escaped explicitly
    # so the literal contains no invalid escape sequence.
    latex_rows.append(str(i + 1) + ' & ' + cells + ' \\\\ \\hline \n')

outputString = ''.join(latex_rows)

print(outputString)

      KNN       SVM       DT         RF        MLP       GDE
1 & 0.84848 & 0.51515 & 0.78788 & 0.87879 & 0.81818 & 0.84848 \\ \hline 
2 & 0.90909 & 0.51515 & 0.93939 & 0.78788 & 0.75758 & 0.81818 \\ \hline 
3 & 0.78788 & 0.48485 & 0.78788 & 0.84848 & 0.87879 & 0.87879 \\ \hline 
4 & 0.84848 & 0.63636 & 0.75758 & 0.90909 & 0.84848 & 0.90909 \\ \hline 
5 & 0.93939 & 0.72727 & 0.87879 & 0.87879 & 0.75758 & 0.84848 \\ \hline 
6 & 0.87879 & 0.51515 & 0.90909 & 0.90909 & 0.81818 & 0.75758 \\ \hline 
7 & 0.81818 & 0.69697 & 0.72727 & 0.81818 & 0.81818 & 0.87879 \\ \hline 
8 & 0.93939 & 0.60606 & 0.81818 & 0.84848 & 0.78788 & 0.78788 \\ \hline 
9 & 0.78788 & 0.45455 & 0.78788 & 0.84848 & 0.78788 & 0.90909 \\ \hline 
10 & 0.87879 & 0.57576 & 0.78788 & 0.84848 & 0.87879 & 0.72727 \\ \hline 



# Friedman Statistic

In [None]:
def column(matrix, i):
    """Return a list holding element ``i`` of every row in ``matrix``, in row order."""
    picked = []
    for row in matrix:
        picked.append(row[i])
    return picked

In [None]:
# Friedman test on the per-fold accuracies stored in `stats`
# (mapping: model class name -> per-fold accuracy array, read in `model_names` order).
#
# Notation:
#   N     = number of folds (the blocks of the test)
#   K     = number of algorithms being compared
#   R_ij  = rank of algorithm j on fold i (1 = best; ties share the average rank)
#   R_j   = average rank of algorithm j over the N folds
#   R_bar = (K + 1) / 2, the mean of the ranks 1..K
#   eq_2  = N * sum_j (R_j - R_bar)^2
#   eq_3  = 1 / (N * (K - 1)) * sum_ij (R_ij - R_bar)^2
#   Friedman statistic = eq_2 / eq_3
N = len(stats[model_names[0]])
K = len(model_names)

print("__________________________________________________________________________________________Ranks:")
ranks = []

for i in range(N):
    fold_scores = pd.Series([stats[name][i] for name in model_names])
    # Highest accuracy gets rank 1; tied accuracies share the average of the ranks they cover,
    # so every fold's ranks always sum to K * (K + 1) / 2.
    fold_ranks = fold_scores.rank(ascending=False, method='average').tolist()
    ranks.append(fold_ranks)
    print(fold_ranks)


print("__________________________________________________________________________________________R_bar:")
R_bar = (K+1)/2

print(R_bar)

print("__________________________________________________________________________________________eq_2:")
# Average rank of each algorithm across folds, in `model_names` order.
avg_ranks = [mean(column(ranks, j)) for j in range(K)]

print('Average Ranks: ' + ' | '.join('%.5f' % r for r in avg_ranks) + ' \n')

eq_2 = N * sum((avg_rank - R_bar)**2 for avg_rank in avg_ranks)

print(eq_2)

print("__________________________________________________________________________________________eq_3:")

eq_3 = sum((entry - R_bar)**2 for row in ranks for entry in row)

eq_3 = eq_3/(N*(K-1))

print(eq_3)

print("__________________________________________________________________________________________Freidman Statistics:")

# eq_3 is zero only when every algorithm ties on every fold; there is then no
# rank variation at all, so report a statistic of 0 instead of dividing by zero.
F_stat = eq_2/eq_3 if eq_3 else 0.0

print(F_stat)

print("__________________________________________________________________________________________Decision:")

# From class slides.
# NOTE(review): the critical value depends on K and N — confirm it is the entry for K=6, N=10.
F_0 = 10.800

if F_stat < F_0:
    print("Fail to reject the null (ie. cannot say that they are significantly different).")
else:
    print("Reject the null (ie. they are significantly different).")