In [30]:
import pandas as pd  
import numpy as np  
import matplotlib.pyplot as plt  
%matplotlib inline

In [31]:
# Reading Training Data
from pathlib import Path
# Define the base directory dynamically (like home folder)
base_dir = Path.home()

# Build the full file path
file_path = base_dir / 'anaconda3' / 'envs' / 'ApplicationMLModels_PredictHeartDisease' / 'statlog+heart' /'heart_data.csv'
#print(file_path)
#file_name = 'heart_data.csv'
#complete_filepath = file_path / file_name
#filepath = complete_filepath.resolve()

# Brief overview of Dataset
heart_df = pd.read_csv(file_path)
heart_df.head()

Unnamed: 0,patient_id,heart_disease_present,age,sex,chest_pain_type,resting_blood_pressure,serum_cholesterol_mg_per_dl,fasting_blood_sugar_gt_120_mg_per_dl,resting_ekg_results,max_heart_rate_achieved,exercise_induced_angina,oldpeak_eq_st_depression,slope_of_peak_exercise_st_segment,num_major_vessels,thal
0,034N95xw,1,70,1,4,130,322,0,2,109,0,2.4,2,3,3
1,IBygtBAP,0,67,0,3,115,564,0,2,160,0,1.6,2,0,7
2,zI70cfhM,1,57,1,2,124,261,0,0,141,0,0.3,1,0,7
3,qXkCdEiv,0,64,1,4,128,263,0,0,105,1,0.2,2,1,7
4,IAA2korm,0,74,0,2,120,269,0,2,121,1,0.2,1,1,3


In [32]:
heart_df.shape

(270, 15)

This above line of executed code indicates that there are 270 rows and 15 columns within the above dataset. In the next step let's divide the data into attributes and labels. 

## Data Preparation

In [33]:
heart_df.drop(['patient_id'], axis=1, inplace=True)
heart_df.head()

X = heart_df.drop('heart_disease_present', axis=1)  
y = heart_df['heart_disease_present'] 

In the above code lines, X contains the attributes of the data frame and y contains the labels that we are willing to predict. 

In [34]:
# Data Splitting
from sklearn.model_selection import train_test_split  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)  

## 1. Simple SVM

In [35]:
# Training the Algorithm
from sklearn.svm import SVC  
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
svclassifier = SVC(kernel='linear', degree=8)  
svclassifier.fit(X_train, y_train) 

In [36]:
# Making Predictions
y_pred = svclassifier.predict(X_test)

In [37]:
#Algorithm Evaluation
from sklearn.metrics import classification_report, confusion_matrix  
print(confusion_matrix(y_test,y_pred))  
print(classification_report(y_test,y_pred))  

[[27  1]
 [ 9 17]]
              precision    recall  f1-score   support

           0       0.75      0.96      0.84        28
           1       0.94      0.65      0.77        26

    accuracy                           0.81        54
   macro avg       0.85      0.81      0.81        54
weighted avg       0.84      0.81      0.81        54



As compared to our previous Logistic Regression, this time using linear kernel helped in improving the model's performance to a high value.

### Hyperparameter Tuning

In [38]:
# Set the parameters by cross-validation
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from __future__ import print_function
print(__doc__)

tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

scores = ['precision', 'recall']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(SVC(), tuned_parameters, cv=5,   #5 fold cross-validation
                       scoring='%s_macro' % score)
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()

Automatically created module for IPython interactive environment
# Tuning hyper-parameters for precision

Best parameters set found on development set:

{'C': 100, 'kernel': 'linear'}

Grid scores on development set:

0.703 (+/-0.139) for {'C': 1, 'gamma': 0.001, 'kernel': 'rbf'}
0.710 (+/-0.100) for {'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}
0.657 (+/-0.095) for {'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}
0.704 (+/-0.127) for {'C': 10, 'gamma': 0.0001, 'kernel': 'rbf'}
0.607 (+/-0.072) for {'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}
0.776 (+/-0.135) for {'C': 100, 'gamma': 0.0001, 'kernel': 'rbf'}
0.622 (+/-0.041) for {'C': 1000, 'gamma': 0.001, 'kernel': 'rbf'}
0.738 (+/-0.093) for {'C': 1000, 'gamma': 0.0001, 'kernel': 'rbf'}
0.824 (+/-0.155) for {'C': 1, 'kernel': 'linear'}
0.817 (+/-0.145) for {'C': 10, 'kernel': 'linear'}
0.831 (+/-0.139) for {'C': 100, 'kernel': 'linear'}
0.814 (+/-0.156) for {'C': 1000, 'kernel': 'linear'}

Detailed classification report:

The model is trained o

## 2. Polynomial Kernel SVM

In [39]:
from sklearn.svm import SVC  
svclassifier = SVC(kernel='poly', degree=8)  
svclassifier.fit(X_train, y_train)  

In [40]:
# Making Predictions
y_pred = svclassifier.predict(X_test)  

In [41]:
# Algorithm Evaluation
from sklearn.metrics import classification_report, confusion_matrix  
print(confusion_matrix(y_test, y_pred))  
print(classification_report(y_test, y_pred))  

[[25  3]
 [17  9]]
              precision    recall  f1-score   support

           0       0.60      0.89      0.71        28
           1       0.75      0.35      0.47        26

    accuracy                           0.63        54
   macro avg       0.67      0.62      0.59        54
weighted avg       0.67      0.63      0.60        54



This time our Model's performance seems to have detoriated than the previous Model.

## 3. Gaussian Kernel

In [42]:
from sklearn.svm import SVC  
svclassifier = SVC(kernel='rbf')  
svclassifier.fit(X_train, y_train) 

In [43]:
# Prediction and Evaluation
y_pred = svclassifier.predict(X_test)  

In [44]:
from sklearn import metrics
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.6481481481481481


In [45]:
# Algorithm Evaluation
from sklearn.metrics import classification_report, confusion_matrix 
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
print(confusion_matrix(y_test, y_pred))  
print(classification_report(y_test, y_pred))  

[[26  2]
 [17  9]]
              precision    recall  f1-score   support

           0       0.60      0.93      0.73        28
           1       0.82      0.35      0.49        26

    accuracy                           0.65        54
   macro avg       0.71      0.64      0.61        54
weighted avg       0.71      0.65      0.61        54



Okay this time, the Model's performance seemed to have detoriated much more than the previous Model. So, far Simple SVM seems to perform the best.

## 4. Sigmoid Kernel

In [46]:
from sklearn.svm import SVC  
svclassifier = SVC(kernel='sigmoid')  
svclassifier.fit(X_train, y_train) 

In [47]:
# Prediction and Evaluation
y_pred = svclassifier.predict(X_test) 

In [48]:
from sklearn import metrics
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.5185185185185185


In [21]:
# Algorithm Evaluation
from sklearn.metrics import classification_report, confusion_matrix  
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
print(confusion_matrix(y_test, y_pred))  
print(classification_report(y_test, y_pred))  

[[30  0]
 [24  0]]
              precision    recall  f1-score   support

           0       0.56      1.00      0.71        30
           1       0.00      0.00      0.00        24

    accuracy                           0.56        54
   macro avg       0.28      0.50      0.36        54
weighted avg       0.31      0.56      0.40        54



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


So, running all the Models, our Simple SVM Model still semms to ein over all other SVM Models and it even performs better than our previous Models ran on the basis of Logistic Regression.

## 5. Decision Tree

In [22]:
from sklearn.tree import DecisionTreeClassifier  
classifier = DecisionTreeClassifier()  
classifier.fit(X_train, y_train)

In [23]:
# Prediction and Evaluation
y_pred = classifier.predict(X_test)  

In [24]:
from sklearn import metrics
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.7962962962962963


In [25]:
# Algorithm Evaluation
from sklearn.metrics import classification_report, confusion_matrix  
print(confusion_matrix(y_test, y_pred))  
print(classification_report(y_test, y_pred))  

[[25  5]
 [ 6 18]]
              precision    recall  f1-score   support

           0       0.81      0.83      0.82        30
           1       0.78      0.75      0.77        24

    accuracy                           0.80        54
   macro avg       0.79      0.79      0.79        54
weighted avg       0.80      0.80      0.80        54



This Model performed however better than the previous Models, except for Simple SVM Model.

## 6. Random Forest

In [26]:
from sklearn.ensemble import RandomForestClassifier

#Create a Gaussian Classifier
clf=RandomForestClassifier(n_estimators=100)

#Train the model using the training sets y_pred=clf.predict(X_test)
clf.fit(X_train,y_train)

y_pred=clf.predict(X_test)

In [27]:
from sklearn import metrics
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.8518518518518519


In [28]:
from sklearn.metrics import classification_report, confusion_matrix  
print(confusion_matrix(y_test, y_pred))  
print(classification_report(y_test, y_pred)) 

[[24  6]
 [ 2 22]]
              precision    recall  f1-score   support

           0       0.92      0.80      0.86        30
           1       0.79      0.92      0.85        24

    accuracy                           0.85        54
   macro avg       0.85      0.86      0.85        54
weighted avg       0.86      0.85      0.85        54



In this Model, the Model Accuracy and Model F1 Score seems to be close and therefore we can conclude a better performance in terms of Model's predictive power.