In [67]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score
%matplotlib inline

In [68]:
df = pd.read_csv("heart_cleveland_upload.csv")

In [69]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,condition
0,69,1,0,160,234,1,2,131,0,0.1,1,1,0,0
1,69,0,0,140,239,0,0,151,0,1.8,0,2,0,0
2,66,0,0,150,226,0,0,114,0,2.6,2,0,0,0
3,65,1,0,138,282,1,2,174,0,1.4,1,1,0,1
4,64,1,0,110,211,0,2,144,1,1.8,1,0,0,0


In [70]:
df['condition'].value_counts()

0    160
1    137
Name: condition, dtype: int64

In [71]:
cor = df.corr()
cor['condition'].sort_values(ascending = False)

condition    1.000000
thal         0.520516
ca           0.463189
oldpeak      0.424052
exang        0.421355
cp           0.408945
slope        0.333049
sex          0.278467
age          0.227075
restecg      0.166343
trestbps     0.153490
chol         0.080285
fbs          0.003167
thalach     -0.423817
Name: condition, dtype: float64

In [72]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 297 entries, 0 to 296
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   age        297 non-null    int64  
 1   sex        297 non-null    int64  
 2   cp         297 non-null    int64  
 3   trestbps   297 non-null    int64  
 4   chol       297 non-null    int64  
 5   fbs        297 non-null    int64  
 6   restecg    297 non-null    int64  
 7   thalach    297 non-null    int64  
 8   exang      297 non-null    int64  
 9   oldpeak    297 non-null    float64
 10  slope      297 non-null    int64  
 11  ca         297 non-null    int64  
 12  thal       297 non-null    int64  
 13  condition  297 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 32.6 KB


In [73]:
minmax = MinMaxScaler()
columns_continuous = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
df[columns_continuous] = minmax.fit_transform(df[columns_continuous])

In [74]:
x = df.drop(columns = 'condition', axis = 1)
y = df['condition']

In [75]:
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=.8, stratify= y, random_state = 42)

In [76]:
x_train.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
91,0.6875,0,2,0.339623,0.312785,0,0,0.198473,0,0.193548,1,1,2
115,0.5,1,2,0.339623,0.273973,1,2,0.778626,0,0.0,0,3,0
125,0.4375,0,2,0.245283,0.212329,0,0,0.664122,0,0.258065,1,0,0
63,0.25,1,1,0.386792,0.175799,0,0,0.465649,0,0.0,1,0,1
159,0.8125,1,3,0.471698,0.152968,1,0,0.534351,0,0.548387,1,2,2


In [77]:
select_feature = SelectKBest(chi2, k=11).fit(x_train, y_train)

In [78]:
print('Score list:', select_feature.scores_)
print('Feature list:', x_train.columns)

Score list: [9.66566080e-01 3.28829899e+00 1.85478174e+01 3.98267675e-01
 2.40566564e-02 2.19932626e-01 5.49679327e+00 2.39168399e+00
 2.76050497e+01 9.55800306e+00 1.65312284e+01 6.91914050e+01
 5.70552040e+01]
Feature list: Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal'],
      dtype='object')


In [79]:
sorted(select_feature.scores_)

[0.024056656399247797,
 0.21993262614678885,
 0.3982676746477565,
 0.9665660801188893,
 2.391683986759997,
 3.288298993762586,
 5.496793272667693,
 9.558003062885186,
 16.531228447584517,
 18.547817389717874,
 27.605049730945666,
 57.05520404935147,
 69.19140497010156]

In [80]:
x_train_2 = select_feature.transform(x_train)
x_test_2 = select_feature.transform(x_test)

## KNN

In [81]:
knn_tune = KNeighborsClassifier()
param_knn = {'n_neighbors' :[3, 5, 7, 9, 11, 13, 15],
             'weights':['uniform','distance'],
             'p':[1, 2]}
model_knn_tuned = GridSearchCV(estimator=knn_tune, param_grid=param_knn, cv = 3, n_jobs=-1, verbose=1, scoring='recall')
model_knn_tuned.fit(x_train_2, y_train)

Fitting 3 folds for each of 28 candidates, totalling 84 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:    5.0s
[Parallel(n_jobs=-1)]: Done  84 out of  84 | elapsed:    5.1s finished


GridSearchCV(cv=3, estimator=KNeighborsClassifier(), n_jobs=-1,
             param_grid={'n_neighbors': [3, 5, 7, 9, 11, 13, 15], 'p': [1, 2],
                         'weights': ['uniform', 'distance']},
             scoring='recall', verbose=1)

In [82]:
knn_tuned = model_knn_tuned.best_estimator_
knn_tuned

KNeighborsClassifier(n_neighbors=15, p=1)

In [83]:
pred_test = knn_tuned.predict(x_test_2)
recall_knn_tuning_test = recall_score(y_test, pred_test)

In [84]:
cm_knn_tuned = confusion_matrix(y_test, pred_test, labels=[1, 0])
cm_knn_tuned = pd.DataFrame(data=cm_knn_tuned, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_knn_tuned

Unnamed: 0,Pred 1,Pred 0
Akt 1,21,7
Akt 0,0,32


In [85]:
tp_knn = cm_knn_tuned['Pred 1'][0]
tn_knn = cm_knn_tuned['Pred 0'][1]
fp_knn = cm_knn_tuned['Pred 1'][1]
fn_knn = cm_knn_tuned['Pred 0'][0]

## SVM

In [86]:
svm_tune = SVC()
param_svm = {
    "C" : [0.0001, 0.001, 0.01, 0.1, 10],
    "kernel" : ['rbf', 'linear'],
    "gamma" : [0.001, 0.01, 0.1, 1],
    "max_iter" : [50, 100, 500, 1000]
}
model_svm_tuned = GridSearchCV(estimator=svm_tune, param_grid=param_svm, cv = 3, n_jobs=-1, verbose=1, scoring='recall')
model_svm_tuned.fit(x_train_2, y_train)

Fitting 3 folds for each of 160 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed:    1.6s finished


GridSearchCV(cv=3, estimator=SVC(), n_jobs=-1,
             param_grid={'C': [0.0001, 0.001, 0.01, 0.1, 10],
                         'gamma': [0.001, 0.01, 0.1, 1],
                         'kernel': ['rbf', 'linear'],
                         'max_iter': [50, 100, 500, 1000]},
             scoring='recall', verbose=1)

In [87]:
svm_tuned = model_svm_tuned.best_estimator_
svm_tuned

SVC(C=0.0001, gamma=1, max_iter=50)

In [88]:
pred_test = svm_tuned.predict(x_test_2)
recall_svm_tuning_test = recall_score(y_test, pred_test)

In [89]:
cm_svm_tuned = confusion_matrix(y_test, pred_test, labels=[1, 0])
cm_svm_tuned = pd.DataFrame(data=cm_svm_tuned, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_svm_tuned

Unnamed: 0,Pred 1,Pred 0
Akt 1,25,3
Akt 0,5,27


In [90]:
tp_svm = cm_svm_tuned['Pred 1'][0]
tn_svm = cm_svm_tuned['Pred 0'][1]
fp_svm = cm_svm_tuned['Pred 1'][1]
fn_svm = cm_svm_tuned['Pred 0'][0]

## Logistic Regression

In [91]:
log_reg_tune = LogisticRegression()
param_logreg = {'penalty' : ['l1', 'l2', 'elasticnet', None],
                'class_weight': [None, 'weight'],
                'fit_intercept' : [True, False]}
model_logreg_tuned = GridSearchCV(estimator=log_reg_tune, param_grid=param_logreg, cv = 3, n_jobs=-1, verbose=1, scoring='recall')
model_logreg_tuned.fit(x_train_2, y_train)

Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  41 out of  48 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:    0.1s finished


GridSearchCV(cv=3, estimator=LogisticRegression(), n_jobs=-1,
             param_grid={'class_weight': [None, 'weight'],
                         'fit_intercept': [True, False],
                         'penalty': ['l1', 'l2', 'elasticnet', None]},
             scoring='recall', verbose=1)

In [92]:
logreg_tuned = model_logreg_tuned.best_estimator_
logreg_tuned

LogisticRegression()

In [93]:
pred_test = logreg_tuned.predict(x_test_2)
recall_logreg_tuning_test = recall_score(y_test, pred_test)

In [94]:
cm_logreg_tuned = confusion_matrix(y_test, pred_test, labels=[1, 0])
cm_logreg_tuned = pd.DataFrame(data=cm_logreg_tuned, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_logreg_tuned

Unnamed: 0,Pred 1,Pred 0
Akt 1,22,6
Akt 0,0,32


In [95]:
tp_logreg = cm_logreg_tuned['Pred 1'][0]
tn_logreg = cm_logreg_tuned['Pred 0'][1]
fp_logreg = cm_logreg_tuned['Pred 1'][1]
fn_logreg = cm_logreg_tuned['Pred 0'][0]

## XGBoost

In [96]:
xgb = XGBClassifier()
param_XGB = {'max_depth':[3,6,8],
             'subsample':[0.3, 0.5, 0.7, 1],
             'gamma':[0.001, 0.01, 0.1]}
model_xgb_tuned = GridSearchCV(estimator=xgb, param_grid=param_XGB, cv = 3, n_jobs=-1, verbose=1, scoring='recall')
model_xgb_tuned.fit(x_train_2, y_train)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  44 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done 101 out of 108 | elapsed:    4.5s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 108 out of 108 | elapsed:    4.6s finished


GridSearchCV(cv=3,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None,
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, subsample=None,
                                     tree_method=None,

In [97]:
xgb_tuned = model_xgb_tuned.best_estimator_
xgb_tuned

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0.001, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.3,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [98]:
pred_test = xgb_tuned.predict(x_test_2)
recall_xgb_tuning_test = recall_score(y_test, pred_test)

In [99]:
cm_xgb_tuned = confusion_matrix(y_test, pred_test, labels=[1, 0])
cm_xgb_tuned = pd.DataFrame(data=cm_xgb_tuned, index=['Akt 1', 'Akt 0'], columns=['Pred 1', 'Pred 0'])
cm_xgb_tuned

Unnamed: 0,Pred 1,Pred 0
Akt 1,24,4
Akt 0,5,27


In [100]:
tp_xgb = cm_xgb_tuned['Pred 1'][0]
tn_xgb = cm_xgb_tuned['Pred 0'][1]
fp_xgb = cm_xgb_tuned['Pred 1'][1]
fn_xgb = cm_xgb_tuned['Pred 0'][0]

## Model Evaluation

In [101]:
recall = {'Recall' : [recall_knn_tuning_test, 
                      recall_svm_tuning_test, 
                      recall_logreg_tuning_test, 
                      recall_xgb_tuning_test]}
recall_results =  pd.DataFrame(data = recall, index = ['KNN', 'SVM', 'Logreg', 'XGBoost'])
recall_results.sort_values('Recall', ascending = False)

Unnamed: 0,Recall
SVM,0.892857
XGBoost,0.857143
Logreg,0.785714
KNN,0.75


In [102]:
cm = {
    "True Positive" : [tp_knn, tp_svm, tp_logreg, tp_xgb],
    
    "True Negative" : [tn_knn, tn_svm, tn_logreg, tn_xgb],
    
    "False Positive": [fp_knn, fp_svm, fp_logreg, fp_xgb],
    
    "False Negative": [fn_knn, fn_svm, fn_logreg, fn_xgb]
}
    
cm_matrix = pd.DataFrame(data = cm, index = ['KNN', 'SVM', 'Logistic Regression', 'XGBoost'])
cm_matrix.sort_values('False Negative')

Unnamed: 0,True Positive,True Negative,False Positive,False Negative
SVM,25,27,5,3
XGBoost,24,27,5,4
Logistic Regression,22,32,0,6
KNN,21,32,0,7
