In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# data preparation
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

# classifiers
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import Perceptron

# to compare and evaluate our classifiers
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score
import itertools
from sklearn.model_selection import GridSearchCV

#ignore warnings for now
import warnings
warnings.filterwarnings('ignore')

Matplotlib created a temporary config/cache directory at /var/folders/73/0j93mrvd1s77hv6p1fbfrqz80000gn/T/matplotlib-1xg25zly because the default path (/Users/anatlevari/.matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.


In [2]:
df = pd.read_csv('/Users/anatlevari/Downloads/heart.csv')

# Classification 


In this notebook, different classification algorithms will be implemented on the Heart Failure dataset. Their accuracy is compared, before and after cross-validation (KFold) is implemented. In addition, for KNN algorithm, the optimal value of k is obtained. 

## Data Preparation
For some of the algorithms, transforming the categorical features into numerical ones is essential. This replacement is implemented below, with the separation of the data into train and test sets. 

In [3]:
# get the different values of the catagorical features
df['Sex'].unique()
df['ChestPainType'].unique()
df['RestingECG'].unique()
df['ExerciseAngina'].unique()
df['ST_Slope'].unique()

# replace non numerical features with numerical ones
df = df.replace(['F', 'M'], [0,1])
df = df.replace(['ATA', 'NAP', 'ASY', 'TA'], [0,1,2,3])
df = df.replace(['Normal', 'ST', 'LVH'], [0,1,2])
df = df.replace(['N', 'Y'], [0,1])
df = df.replace(['Up', 'Flat', 'Down'], [0,1,2])

In [4]:
# Separate input (X) to output (y)
X = df.iloc[:,:-1].values  #input
y = df.iloc[:, 11].values  # output - target

# divide the data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=0)

sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

In [5]:
# cross-validation - KFold
#cv = KFold(n_splits=25, random_state=1, shuffle=True)

## SVM Algorithm

In [6]:
svm = SVC()
svm.fit(X_train, y_train)

# accuracy of svm
svm_acc = round(svm.score(X_test,y_test)*100, 2)

# accuracy using cv
smv_cv_scores = cross_val_score(svm, X, y, scoring='accuracy', cv=10)
svm_acc_cv = round(smv_cv_scores.mean() * 100, 2)

print('Accuracy of SVM classifier on training set: {:.2f}'
     .format(round(svm.score(X_train, y_train)*100, 2)))
print('Accuracy of SVM classifier on test set: {:.2f}'
     .format(svm_acc))
print('Accuracy of SVM classifier on test set, using 10Fold cv: {:.2f}'
     .format(svm_acc_cv))

# Verify balance of the data in the division of cv  
print("Standard Deviation of the cv accuracy score:", round(smv_cv_scores.std()*100,2))

Accuracy of SVM classifier on training set: 90.46
Accuracy of SVM classifier on test set: 85.33
Accuracy of SVM classifier on test set, using 10Fold cv: 70.89
Standard Deviation of the cv accuracy score: 8.26


As seen above, the accuracy after applying cv decrease. The reason may be revealed after looking into the accuracy of each fold more closely:

In [7]:
k = 10
kf = KFold(n_splits=k, random_state=None, shuffle=True)
model = SVC()
 
acc_score = []
 
for train_index , test_index in kf.split(X):
    X_train , X_test = X[train_index,:],X[test_index,:]
    y_train , y_test = y[train_index] , y[test_index]
     
    model.fit(X_train,y_train)
    pred_values = model.predict(X_test)
     
    acc = accuracy_score(pred_values , y_test)
    acc_score.append(acc)
     
avg_acc_score = sum(acc_score)/k
 
print('accuracy of each fold - {}'.format(acc_score))
print('Avg accuracy : {}'.format(avg_acc_score))


accuracy of each fold - [0.6521739130434783, 0.717391304347826, 0.6630434782608695, 0.6630434782608695, 0.7391304347826086, 0.7391304347826086, 0.6630434782608695, 0.782608695652174, 0.7692307692307693, 0.7802197802197802]
Avg accuracy : 0.7169015766841853


We can see major difference in the accuracy of the different folds, even after applying "shuffle" option. This can be caused by uneven distribution of some of the features (not the target one). This was seen in the first notebook ("Heart Failure - Insights"). 
Thus, for this classifier, we will consider the accuracy without cv

## Logistic Regression

In [8]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

# accuracy of logreg
logreg_acc = round(logreg.score(X_test,y_test) * 100, 2)

# accurecy using cv
logreg_scores = cross_val_score(logreg, X, y, scoring='accuracy', cv=120)
logreg_acc_cv = round(logreg_scores.mean() * 100, 2)


print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(round(logreg.score(X_train, y_train)*100, 2)))
print('Accuracy of Logistic Regression classifier on test set: {:.2f}'
     .format(logreg_acc))
print('Accuracy of Logistic Regression classifier on test set, using 120Fold cv: {:.2f}'
     .format(logreg_acc_cv))

# Verify balance of the data in the division of cv  
print("Standard Deviation of the cv accuracy score:", round(logreg_scores.std()*100, 2))

Accuracy of Logistic Regression classifier on training set: 85.37
Accuracy of Logistic Regression classifier on test set: 84.62
Accuracy of Logistic Regression classifier on test set, using 120Fold cv: 84.67
Standard Deviation of the cv accuracy score: 13.17


## KNN Algorithm

In [9]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

# accuracy of knn
knn_acc = round(knn.score(X_test,y_test) * 100, 2)

# accuracy of knn using cv
knn_scores = cross_val_score(knn, X, y, scoring='accuracy', cv=50)
knn_acc_cv = round(knn_scores.mean() * 100, 2)


print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(round(knn.score(X_train, y_train)*100, 2)))
print('Accuracy of KNN classifier on test set: {:.2f}'
     .format(knn_acc))
print('Accuracy of KNN on test set, using 50Fold cv: {:.2f}'
     .format(knn_acc_cv))

# Verify balance of the data in the division of cv  
print("Standard Deviation of the cv accuracy score:", round(knn_scores.std()*100,2))

Accuracy of KNN classifier on training set: 79.08
Accuracy of KNN classifier on test set: 73.63
Accuracy of KNN on test set, using 50Fold cv: 70.94
Standard Deviation of the cv accuracy score: 12.89


As before, an examination of the accuracy on the different folds is required:

In [10]:
k = 50
kf = KFold(n_splits=k, random_state=None, shuffle=True)
model = KNeighborsClassifier()
 
acc_score = []
 
for train_index , test_index in kf.split(X):
    X_train , X_test = X[train_index,:],X[test_index,:]
    y_train , y_test = y[train_index] , y[test_index]
     
    model.fit(X_train,y_train)
    pred_values = model.predict(X_test)
     
    acc = round(accuracy_score(pred_values , y_test)*100,2)
    acc_score.append(acc)
     
avg_acc_score = round(sum(acc_score)/k,2)
 
print('accuracy of each fold - {}'.format(acc_score))
print('Avg accuracy : {}'.format(avg_acc_score))



accuracy of each fold - [57.89, 94.74, 63.16, 63.16, 73.68, 52.63, 68.42, 68.42, 73.68, 68.42, 68.42, 89.47, 78.95, 84.21, 63.16, 68.42, 73.68, 73.68, 72.22, 55.56, 77.78, 72.22, 55.56, 77.78, 66.67, 66.67, 61.11, 66.67, 66.67, 55.56, 61.11, 72.22, 55.56, 50.0, 77.78, 83.33, 66.67, 88.89, 77.78, 66.67, 83.33, 61.11, 66.67, 61.11, 66.67, 77.78, 61.11, 66.67, 83.33, 83.33]
Avg accuracy : 69.8


As we saw with the SVM classifier, here the distribution of some of the features is uneven over the different folds. Thus, we will consider the accuracy without cv.
In addition, we shall find the optimal k (=number of neighbors to compare), i.e., the number k which maximize the accuracy, and save it as the accuracy of the knn classifier. 

In [11]:
# check to find the best k for knn model

#create new a knn model
knn2 = KNeighborsClassifier()

#create a dictionary of all values we want to test for n_neighbors
param_grid = {'n_neighbors': np.arange(1, X_train.shape[0])}

#use gridsearch to test all values for n_neighbors
knn_gscv = GridSearchCV(knn2, param_grid)

#fit model to data
knn_gscv.fit(X_train, y_train)

#check mean score for the top performing value of n_neighbors
knn_gscv.best_score_

#check top performing n_neighbors value
knn_gscv.best_params_

print('The optimal number of neighbors in the kNN algorithm is:', knn_gscv.best_params_)
print('The accuracy is:', round(knn_gscv.best_score_*100,2))

The optimal number of neighbors in the kNN algorithm is: {'n_neighbors': 39}
The accuracy is: 69.0


## Naive Bayes

In [12]:
gnb = GaussianNB()
gnb.fit(X_train, y_train)

# naive bayes accuracy
gnb_acc = round(gnb.score(X_test,y_test) * 100, 2)

# naive bayes accuracy using cv
gnb_scores = cross_val_score(gnb, X, y, scoring='accuracy', cv=100)
gnb_acc_cv = round(gnb_scores.mean() * 100, 2)


print('Accuracy of Naive Bayes classifier on training set: {:.2f}'
     .format(round(gnb.score(X_train, y_train)*100, 2)))
print('Accuracy of Naive Bayes classifier on test set: {:.2f}'
     .format(gnb_acc))
print('Accuracy of Naive Bayes on test set, using 100Fold cv: {:.2f}'
     .format(gnb_acc_cv))

# Verify balance of the data in the division of cv  
print("Standard Deviation of the cv accuracy score:", gnb_scores.std())

Accuracy of Naive Bayes classifier on training set: 84.67
Accuracy of Naive Bayes classifier on test set: 88.89
Accuracy of Naive Bayes on test set, using 100Fold cv: 84.69
Standard Deviation of the cv accuracy score: 0.13200804315188575


## Decision Tree Algorithm

In [13]:
clf = DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)

# decision tree accuracy
clf_acc = round(clf.score(X_test,y_test) * 100, 2)

# decision tree accuracy using cv
clf_scores = cross_val_score(clf, X, y, scoring='accuracy', cv=80)
clf_acc_cv = round(clf_scores.mean() * 100, 2)

print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(round(clf.score(X_train, y_train)*100, 2)))
print('Accuracy of Decision Tree classifier on test set: {:.2f}'
     .format(clf_acc))
print('Accuracy of Decision Tree classifier on test set, using 80Fold cv: {:.2f}'
     .format(clf_acc_cv))

# Verify balance of the data in the division of cv  
print("Standard Deviation of the cv accuracy score:", clf_scores.std())

Accuracy of Decision Tree classifier on training set: 100.00
Accuracy of Decision Tree classifier on test set: 88.89
Accuracy of Decision Tree classifier on test set, using 80Fold cv: 77.20
Standard Deviation of the cv accuracy score: 0.11923457869534945


### Note: 
The difference between the accuracy over the train and dataset implies overfitting. This can be seen also in the next classifier - Random Forest. The overfitting and other metrics of evaluation of the decision tree classifier will be examined further in a new notebook - 'Heart Failure - Decision Tree in Details'.


## Random Forest

In [14]:
rf = RandomForestClassifier(n_estimators=250)
rf.fit(X_train,y_train)

# random forest accuracy
rf_acc = round(rf.score(X_test,y_test) * 100, 2)

# random forest accuracy using cv
rf_scores = cross_val_score(rf, X, y, scoring='accuracy', cv=80)
rf_acc_cv = round(rf_scores.mean() * 100, 2)

print('Accuracy of Random Forest classifier on training set: {:.2f}'
     .format(round(rf.score(X_train, y_train)*100, 2)))
print('Accuracy of Random Forest classifier on test set: {:.2f}'
     .format(rf_acc))
print('Accuracy of Random Forest classifier on test set, using 80Fold cv: {:.2f}'
     .format(rf_acc_cv))

# Verify balance of the data in the division of cv  
print("Standard Deviation of the cv accuracy score:", rf_scores.std())

Accuracy of Random Forest classifier on training set: 100.00
Accuracy of Random Forest classifier on test set: 94.44
Accuracy of Random Forest classifier on test set, using 80Fold cv: 86.68
Standard Deviation of the cv accuracy score: 0.11365968037300259


# Algorithm Comparison
Compare the accuracy score on the testset, with cv and without

In [15]:
models = pd.DataFrame({
    'Model': ['Support Vector Machines', 'Logistic Regression', 'KNN', 'Naive Bayes', 'Decision Tree',
              'Random Forest'],
    'Score': [svm_acc, logreg_acc, knn_acc, gnb_acc, clf_acc, rf_acc],
'Score after Cross-Validation': [svm_acc_cv, logreg_acc_cv, knn_acc_cv, gnb_acc_cv, clf_acc_cv, rf_acc_cv]})
models.sort_values(by='Score', ascending=False)

Unnamed: 0,Model,Score,Score after Cross-Validation
5,Random Forest,94.44,86.68
3,Naive Bayes,88.89,84.69
4,Decision Tree,88.89,77.2
0,Support Vector Machines,85.33,70.89
1,Logistic Regression,84.62,84.67
2,KNN,73.63,70.94


# Conclusion and Future Work:
1. The maximal accuracy was achieved by a random forest classifier and is equal to 94.44%. Thus, this classifier may be a good fit for future heart failure detection.
2. Cross-validation technique did not improve the accuracy. It revealed the bias found in some of the features. 
3. Naive Bayes classifier must be re-checked for this purpose since it requires uncorrelated features. As seen in the heat map from the previous notebook, there is some (small) correlation between a few features of the model. I've added this classifier here for practice reasons.
4. The overfitting in the Decision tree and Random Forest classifiers will be addressed separately in the next notebook. 
5. Next notebook will also use the interpretability of the decision tree classifier to clarify the results and determine the features leading to a higher risk of heart failure. 
6. It will also address other metrics to evaluate the classification models, beyond accuracy.