In [9]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Load the Framingham heart-study table (one row per participant).
df = pd.read_csv('framingham.csv', encoding='latin-1')

# Impute missing values column by column:
#   * BPMeds    -> 0
#   * education -> 1
#   * glucose, totChol, BMI, heartRate -> that column's mean
# A single DataFrame.fillna(dict) call is used instead of
# `df[col].fillna(..., inplace=True)`: the latter operates on a column selection
# (chained assignment), which is deprecated in pandas 2.x and does not modify
# `df` at all once Copy-on-Write is enabled.
# NOTE: the means are computed on the full table, i.e. before the train/test
# split below, so the test rows contribute to the imputed values.
df = df.fillna({
    'BPMeds': 0,
    'glucose': df['glucose'].mean(),
    'totChol': df['totChol'].mean(),
    'education': 1,
    'BMI': df['BMI'].mean(),
    'heartRate': df['heartRate'].mean(),
})

# Rows that still contain a NaN in any other column are discarded.
df = df.dropna()

# One-hot encode any non-numeric columns (numeric columns pass through unchanged).
df = pd.get_dummies(df)


# Pull the target column out as a NumPy array, then keep only the feature
# columns in `df`.
labels = df['TenYearCHD'].to_numpy(copy=True)
df = df.drop(columns=['TenYearCHD'])

# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.
train_x, test_x, train_y, test_y = train_test_split(
    df,
    labels,
    test_size=0.25,
    random_state=42,
)

# Report the shape of each of the four pieces.
for caption, part in (
    ("Train_x Shape: ", train_x),
    ("Train_y Shape: ", train_y),
    ("Test_x Shape: ", test_x),
    ("Test_y Shape: ", test_y),
):
    print(caption, part.shape)

Train_x Shape:  (3158, 15)
Train_y Shape:  (3158,)
Test_x Shape:  (1053, 15)
Test_y Shape:  (1053,)


# Analysis & Results

Ensemble yöntemleriyle daha iyi sonuç elde etmenin Bagging, Boosting, Voting, Stacking gibi birkaç yolu var.
Ben, performansı yeterince yüksek olmayan 3 algoritmayı birleştirerek
daha iyi bir sonuç elde etmeye çalıştığım ve daha kolay olduğu için Voting kullanmaya karar verdim.

Biri diğerini oluştursa da (Random Forest, Decision Tree'lerden oluşur) Decision Tree, Random Forest ve
Gaussian Naive Bayes sınıflandırıcılarına voting uygulandığında bir performans artışı olup olmadığına bakacağız.
Decision Tree ve Random Forest yöntemlerinde ciddi bir overfitting görülüyor:
modeli train verisi üzerinde çalıştırdığımızda çok yüksek bir AUC skoru geliyor.
Grid Search ile tuning yapmaya çalıştım, ancak elde ettiğim en iyi sonuç bana yeterli gelmedi.

Voting için ağırlık kullanmamaya karar verdim. Birkaç farklı oylama kuralı denedim
(yalnızca hepsinin 1 dediğini 1 almak, çoğunluğun 1 dediğini 1 almak gibi);
ama sonuçta herhangi bir algoritmanın 1 dediğini 1 sayarak daha iyi bir sonuç elde ettim.

Bu algoritmalar random oldukları için farklı 
denemelerde farklı sonuçlar vermeye yatkınlar.
Ama ulaşabilecekleri maksimum açıklama güçleri veya belirli sayıda deneme sonucunda 
verdikleri sonuçlar karşılaştırıldığında sonuç üzerinde anlamlı bir fark olduğu görülebilir.

     Random bir çalıştırmada elde edilen performans artışı aşağıda görülüyor:
     Decision Tree
     AUC Score: 0.578496583143508
     Random Forest
     AUC Score: 0.5618158151643345
     GaussianNB
     AUC Score: 0.5984184835665473
     Voting
     AUC Score: 0.6294825903026358


## Decision Tree

In [18]:
# Fit a single decision tree and evaluate it on the held-out test split.
# min_samples_split=15: a node holding fewer than 15 samples is not split further.
# random_state is fixed because scikit-learn permutes the candidate features at
# every split; without a seed each run can grow a different tree and report
# different scores.
clf = DecisionTreeClassifier(min_samples_split=15, random_state=42)

# fit() returns the estimator itself, so clf1 refers to the same fitted object as clf.
clf1 = clf.fit(train_x, train_y)

# Hard 0/1 class predictions for the test set. A copy is stored under its own
# name so the tree's votes remain available after pred_y is reassigned.
pred_y = clf1.predict(test_x)
pred_dec_tree = pred_y.copy()

print(confusion_matrix(test_y, pred_y))
print("Accuracy:", metrics.accuracy_score(test_y, pred_y))
# NOTE: roc_auc_score receives hard labels here, not predicted probabilities,
# so the value equals (TPR + TNR) / 2 at the single default decision threshold.
print('\x1b[6;30;42m' + 'AUC Score:' + '\x1b[0m', roc_auc_score(test_y, pred_y))
print("Precision:", precision_score(test_y, pred_y))
print("Recall:", recall_score(test_y, pred_y))
print("F1 Score:", f1_score(test_y, pred_y))

[[773 105]
 [128  47]]
Accuracy: 0.7787274453941121
[6;30;42mAUC Score:[0m 0.574490725675236
Precision: 0.3092105263157895
Recall: 0.26857142857142857
F1 Score: 0.28746177370030584


## Random Forest

In [19]:
# Fit a small random forest (5 trees) and evaluate it on the held-out test split.
# random_state is fixed because each tree is grown on a bootstrap sample with
# random feature subsets; with only 5 trees the scores vary noticeably from run
# to run unless the seed is pinned.
rfc = RandomForestClassifier(n_estimators=5, random_state=42)

# fit() returns the estimator itself; rfc keeps pointing at the fitted forest.
rfc = rfc.fit(train_x, train_y)

# Hard 0/1 class predictions for the test set. A copy is stored under its own
# name so the forest's votes remain available after pred_y is reassigned.
pred_y = rfc.predict(test_x)
pred_rand_forest = pred_y.copy()

print(confusion_matrix(test_y, pred_y))
print("Accuracy:", metrics.accuracy_score(test_y, pred_y))
# NOTE: roc_auc_score receives hard labels here, not predicted probabilities,
# so the value equals (TPR + TNR) / 2 at the single default decision threshold.
print('\x1b[6;30;42m' + 'AUC Score:' + '\x1b[0m', roc_auc_score(test_y, pred_y))
print("Precision:", precision_score(test_y, pred_y))
print("Recall:", recall_score(test_y, pred_y))
print("F1 Score:", f1_score(test_y, pred_y))

[[836  42]
 [160  15]]
Accuracy: 0.8081671415004749
[6;30;42mAUC Score:[0m 0.5189391474129514
Precision: 0.2631578947368421
Recall: 0.08571428571428572
F1 Score: 0.12931034482758622


## Gaussian NB

In [20]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes with default settings, trained on the training split.
# fit() returns the estimator, so construction and fitting are chained.
gnb = GaussianNB().fit(train_x, train_y)

# Hard 0/1 class predictions for the test set, kept under a model-specific
# name; pred_y is an independent copy used for the report below.
pred_gnb = gnb.predict(test_x)
pred_y = pred_gnb.copy()

# Same report as for the other models: confusion matrix, accuracy, AUC
# (computed from the hard labels), then precision / recall / F1.
print(confusion_matrix(test_y, pred_y))
print("Accuracy:", metrics.accuracy_score(test_y, pred_y))
print('\x1b[6;30;42m' + 'AUC Score:' + '\x1b[0m', roc_auc_score(test_y, pred_y))
for caption, score_fn in (
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(caption, score_fn(test_y, pred_y))

[[810  68]
 [127  48]]
Accuracy: 0.8148148148148148
[6;30;42mAUC Score:[0m 0.5984184835665473
Precision: 0.41379310344827586
Recall: 0.2742857142857143
F1 Score: 0.32989690721649484


## Voting Algoritması

In [21]:
# Combine the three models with an "any model says 1" rule (logical OR).
# Each prediction array holds 0/1 labels, so their element-wise sum is the
# number of models that voted 1 for that test row (0..3).
vote_counts = pred_dec_tree + pred_rand_forest + pred_gnb

# A row is labelled 1 when at least one model voted 1, otherwise 0.
# The vectorised comparison replaces the per-element Python loop; the result
# is cast back to the original integer dtype so downstream metrics see 0/1.
total_pred = (vote_counts >= 1).astype(vote_counts.dtype)

print(confusion_matrix(test_y, total_pred))
print("Accuracy:", metrics.accuracy_score(test_y, total_pred))
# NOTE: roc_auc_score receives hard labels here, so the value equals
# (TPR + TNR) / 2 for the combined prediction.
print('\x1b[6;30;42m' + 'AUC Score:' + '\x1b[0m', roc_auc_score(test_y, total_pred))
print("Precision:", precision_score(test_y, total_pred))
print("Recall:", recall_score(test_y, total_pred))
print("F1 Score:", f1_score(test_y, total_pred))

[[707 171]
 [100  75]]
Accuracy: 0.7426400759734093
[6;30;42mAUC Score:[0m 0.6169053042629353
Precision: 0.3048780487804878
Recall: 0.42857142857142855
F1 Score: 0.3562945368171021
