## Heart Disease Prediction using Machine Learning

In [117]:
# Importing all necessary libraries
# numpy and pandas for basic operations on data
import pandas as pd
import numpy as np
# for PCA and feature scaling
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# for model seletion and different models
from sklearn.model_selection import train_test_split
# for counting classes and resampling
from collections import Counter
from imblearn.over_sampling import RandomOverSampler
# models
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
# model evaluation metrics
from sklearn.metrics import accuracy_score, classification_report

# Reading the data
df = pd.read_csv('heart_disease_prediction.zip', compression='zip')

### 1. Exploratory Data Analysis
to understand the structure and properties of the given data set

In [118]:
df.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [119]:
df.shape

(4240, 16)

In [120]:
df.describe()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
count,4240.0,4240.0,4135.0,4240.0,4211.0,4187.0,4240.0,4240.0,4240.0,4190.0,4240.0,4240.0,4221.0,4239.0,3852.0,4240.0
mean,0.429245,49.580189,1.979444,0.494104,9.005937,0.029615,0.005896,0.310613,0.025708,236.699523,132.354599,82.897759,25.800801,75.878981,81.963655,0.151887
std,0.495027,8.572942,1.019791,0.500024,11.922462,0.169544,0.076569,0.462799,0.15828,44.591284,22.0333,11.910394,4.07984,12.025348,23.954335,0.358953
min,0.0,32.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,107.0,83.5,48.0,15.54,44.0,40.0,0.0
25%,0.0,42.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,206.0,117.0,75.0,23.07,68.0,71.0,0.0
50%,0.0,49.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,234.0,128.0,82.0,25.4,75.0,78.0,0.0
75%,1.0,56.0,3.0,1.0,20.0,0.0,0.0,1.0,0.0,263.0,144.0,90.0,28.04,83.0,87.0,0.0
max,1.0,70.0,4.0,1.0,70.0,1.0,1.0,1.0,1.0,696.0,295.0,142.5,56.8,143.0,394.0,1.0


In [121]:
# Look for nulls
df.isnull().sum()

male                 0
age                  0
education          105
currentSmoker        0
cigsPerDay          29
BPMeds              53
prevalentStroke      0
prevalentHyp         0
diabetes             0
totChol             50
sysBP                0
diaBP                0
BMI                 19
heartRate            1
glucose            388
TenYearCHD           0
dtype: int64

In [122]:
# Removing nulls and duplicates (if any)
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

In [123]:
df.describe()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
count,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0
mean,0.443685,49.551941,1.980317,0.489065,9.025424,0.030344,0.005741,0.311646,0.027064,236.847731,132.370558,82.917031,25.782802,75.730727,81.852925,0.152269
std,0.496886,8.562029,1.022656,0.499949,11.92159,0.171557,0.075561,0.463229,0.162292,44.097681,22.086866,11.974258,4.065601,11.981525,23.904164,0.359331
min,0.0,32.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,113.0,83.5,48.0,15.54,44.0,40.0,0.0
25%,0.0,42.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,206.0,117.0,75.0,23.08,68.0,71.0,0.0
50%,0.0,49.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,234.0,128.0,82.0,25.38,75.0,78.0,0.0
75%,1.0,56.0,3.0,1.0,20.0,0.0,0.0,1.0,0.0,263.0,143.875,90.0,28.0375,82.0,87.0,0.0
max,1.0,70.0,4.0,1.0,70.0,1.0,1.0,1.0,1.0,600.0,295.0,142.5,56.8,143.0,394.0,1.0


Roughly 14% of data is lost after removing the null rows. 

### 2. Feature Selection (using PCA)
to reduce the dimensonality of the dataset by reducing the number of features

In [124]:
X = df[df.columns[:-1]] # all features excluding the target labels
y = df['TenYearCHD'] # target labels

# Applying PCA for feature selection
pca = PCA(n_components=5)
X_pca = pca.fit_transform(X)

sum(pca.explained_variance_ratio_) # shows the fraction of information retained after PCA

0.966872171246028

the numer of components is selected arbitrarily as 5

### 3. Feature Scaling (using PCA)
to scale all the features to mean 0 and standadrd deviation 1

In [125]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_pca)

### 4. Train-Test split
to split the data into training and test sets

In [126]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, random_state=0)

### 5. Resampling
to resample the data (undersample or oversample) to balance the classes

In [127]:
print(f'Class distribution before sampling: {Counter(y)}')
oversampler = RandomOverSampler(sampling_strategy=0.5, random_state=42)
X_oversampled, y_oversampled = oversampler.fit_resample(X, y)
print(f'Oversampled class distribution: {Counter(y_oversampled)}')

Class distribution before sampling: Counter({0: 3101, 1: 557})
Oversampled class distribution: Counter({0: 3101, 1: 1550})


### 6. Model Pipeline
crating model pipelines for different models (not necessarily required here since we don't have any other steps involving transformation of data)

In [128]:
logistic_pipeline = Pipeline([
    ('model', LogisticRegression())
])

svm_pipeline = Pipeline([
    ('model', SVC())
])

dt_pipeline = Pipeline([
    ('model', DecisionTreeClassifier())
])

knn_pipeline = Pipeline([
    ('model', KNeighborsClassifier())
])

### 7. Modelling and Evaluation
comparing the performance of different models based on their accuracy score

In [129]:
models = {
    'Logistic Regression': logistic_pipeline,
    'SVM': svm_pipeline,
    'Decision Tree': dt_pipeline,
    'KNN': knn_pipeline
}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    
    print(f"Model: {name}")
    print(f"Train set Accuracy: {model.score(X_train,y_train)}")
    print(f"Test set Accuracy: {accuracy}")
    print(f"Classification Report:\n{report}")
    print("="*30)

Model: Logistic Regression
Train set Accuracy: 0.8545388261028072
Test set Accuracy: 0.8349726775956284
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.99      0.91       768
           1       0.39      0.05      0.08       147

    accuracy                           0.83       915
   macro avg       0.62      0.52      0.50       915
weighted avg       0.77      0.83      0.78       915

Model: SVM
Train set Accuracy: 0.8585490339044841
Test set Accuracy: 0.8349726775956284
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.99      0.91       768
           1       0.25      0.01      0.03       147

    accuracy                           0.83       915
   macro avg       0.55      0.50      0.47       915
weighted avg       0.75      0.83      0.77       915

Model: Decision Tree
Train set Accuracy: 1.0
Test set Accuracy: 0.7180327868852459
Classification Report:
  

Both Logistic Regression and SVM have the same accuracy for the test set.

### 8. Apply Model

In [130]:
model = SVC(C=100)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Train set Accuracy: {model.score(X_train,y_train)}")
print(f"Test set Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")
print("="*30)

Train set Accuracy: 0.8829748450601531
Test set Accuracy: 0.8327868852459016
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.98      0.91       768
           1       0.41      0.09      0.15       147

    accuracy                           0.83       915
   macro avg       0.63      0.53      0.53       915
weighted avg       0.78      0.83      0.78       915



After doing hyperparameter tuning, we are able to slightly improve the precision of class 1 using SVC

## Strengths and Weaknesses of Different Models

### Logistic Regression   
*Strengths* -  
* Efficiency: Logistic regression is computationally efficient and can be trained quickly on large datasets.
* Less Prone to Overfitting: It is less likely to overfit compared to more complex models, making it a good choice for datasets with limited samples.
* Works well with Linearly Separable Data: When the data can be effectively separated by a linear boundary, logistic regression can perform well.

*Weaknesses* -  
* Limited Complexity: Logistic regression assumes a linear relationship between the features and the log-odds of the target variable, which can be a limitation for capturing complex relationships.
* Doesn't Handle Non-Linear Data: It performs poorly on data that cannot be effectively separated by a linear boundary.
* Sensitive to Outliers: Outliers can have a significant impact on the model's performance.

### Support Vector Machines (SVM)  
*Strengths* -  
* Effective in High-Dimensional Spaces: SVM can handle a large number of features efficiently, making it suitable for datasets with a high-dimensional feature space.
* Handles Non-Linear Data: By using kernel functions, SVM can effectively handle non-linear data.
* Margin Maximization: SVM aims to find the hyperplane with the largest margin, which often leads to a more robust model.

*Weaknesses* -  
* Computationally Intensive: Training an SVM can be computationally expensive, especially for large datasets.
* Sensitivity to Hyperparameters: The performance of an SVM can be highly dependent on the choice of kernel and other hyperparameters.
* Difficulty in Interpreting Complex Models: SVM models with complex kernels can be challenging to interpret.

### Decision Tree  
*Strengths* -  
* Interpretability: Decision trees are easy to interpret and can be visualized, making them valuable for understanding feature importance.  
* Handles Non-Linear Relationships: Decision trees can model complex, non-linear relationships in the data.  
* No Assumptions about Data Distribution: Decision trees do not make assumptions about the distribution of the data.  

*Weaknesses* -  
* Overfitting: Decision trees overfits to our training data resulting in an accuracy of perfect 1, but doesn't do so well on test set as it cannot generalise in a better way.
* Instability: Small changes in the data can lead to different tree structures, making them less stable compared to some other models.  
* Difficulty Handling Continuous Variables: Decision trees may have difficulty handling continuous variables without proper pre-processing.  

### KNN  
*Strengths* -  
* Simple and Easy to Implement: KNN is intuitive and straightforward to implement.
* Non-Parametric: KNN doesn't make assumptions about the underlying data distribution.
* Effective for Local Patterns: It's good at capturing local patterns or clusters in the data.  

*Weaknesses* -  
* Computationally Intensive at Inference: As the dataset grows, the computational cost of finding the nearest neighbors increases.
* Sensitive to Feature Scaling: KNN is sensitive to the scale of features, so it's important to scale them appropriately.
* Lack of Interpretability: KNN doesn't provide insights into the relationships between features and the target variable as readily as other models.