## Import & Load the data

In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
import scipy.stats as stats

In [2]:
df = pd.read_csv('heart.csv')
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [3]:
df.HeartDisease.unique()

array([0, 1])

## We use Classification models

In [15]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

## Clean and prepare the data

In [5]:
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [6]:
df.isna().any()

Age               False
Sex               False
ChestPainType     False
RestingBP         False
Cholesterol       False
FastingBS         False
RestingECG        False
MaxHR             False
ExerciseAngina    False
Oldpeak           False
ST_Slope          False
HeartDisease      False
dtype: bool

## Convert text columns to numbers using label encoding and one hot encoding
## Apply Scaling

In [7]:
X = df.drop(columns=['HeartDisease'])
y = df.HeartDisease

X_encoded = pd.get_dummies(X, columns=['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope'], dtype=int)
scale = StandardScaler()
X_scaled = scale.fit_transform(X_encoded)

## Now our data is prepared for predictions

## Build a classification model using various methods (SVM, logistic regression, random forest) and check which model gives you the best accuracy
***

# Cross Validation

## Cross-validation is a statistical method used to evaluate the performance of machine learning models by dividing the available data into multiple subsets, training the model on some subsets, and testing its performance on the remaining subsets. This process is repeated multiple times, with each subset serving as both training and testing data at some point.

**Types of Cross-Validation:**
- K-Fold Cross-Validation: The dataset is divided into k subsets (folds), and the model is trained and tested k times. Each fold is used as a test set once, and the remaining k-1 folds are used as the training set.
- Leave-One-Out Cross-Validation (LOOCV): Each sample in the dataset is used as a test set once, and the remaining samples are used as the training set.
- Leave-P-Out Cross-Validation (LpOC): Similar to LOOCV, but p samples are used as the test set instead of just one.

In [25]:
from sklearn.model_selection import cross_val_score, KFold

## Let's prove KNN

In [9]:
for k in range(1,20):
    scores = cross_val_score(KNeighborsClassifier(n_neighbors=k), X_scaled, y, cv=5, scoring='accuracy')
    print(f'scores:{scores} k numbers: {k}')

scores:[0.81521739 0.76630435 0.77173913 0.80327869 0.73224044] k numbers: 1
scores:[0.73369565 0.76086957 0.80978261 0.80874317 0.71038251] k numbers: 2
scores:[0.84782609 0.81521739 0.83695652 0.83060109 0.7704918 ] k numbers: 3
scores:[0.80978261 0.76630435 0.82608696 0.81967213 0.7431694 ] k numbers: 4
scores:[0.85869565 0.82065217 0.8423913  0.81967213 0.75409836] k numbers: 5
scores:[0.83695652 0.79891304 0.84782609 0.82513661 0.76502732] k numbers: 6
scores:[0.85869565 0.80978261 0.8423913  0.82513661 0.78688525] k numbers: 7
scores:[0.85326087 0.80434783 0.8423913  0.81967213 0.78142077] k numbers: 8
scores:[0.86956522 0.82065217 0.85326087 0.82513661 0.79781421] k numbers: 9
scores:[0.86956522 0.80978261 0.85326087 0.81420765 0.79234973] k numbers: 10
scores:[0.875      0.83152174 0.8423913  0.81967213 0.78688525] k numbers: 11
scores:[0.85869565 0.80978261 0.84782609 0.81967213 0.79781421] k numbers: 12
scores:[0.86956522 0.83152174 0.85326087 0.82513661 0.78142077] k numbers

## Now SVC

In [10]:
score = cross_val_score(SVC(random_state=42), X_scaled, y, cv=5, scoring='accuracy')
score

array([0.88043478, 0.82608696, 0.83695652, 0.82513661, 0.74863388])

## Now Decision Tree

In [11]:
for n in range(1,10):
    score = cross_val_score(DecisionTreeClassifier(max_depth=n, random_state=42), X_scaled, y, cv=5, scoring='accuracy')
    print(f'scores:{scores} max depth: {n}')

scores:[0.86956522 0.85326087 0.84782609 0.81967213 0.78688525] max depth: 1
scores:[0.86956522 0.85326087 0.84782609 0.81967213 0.78688525] max depth: 2
scores:[0.86956522 0.85326087 0.84782609 0.81967213 0.78688525] max depth: 3
scores:[0.86956522 0.85326087 0.84782609 0.81967213 0.78688525] max depth: 4
scores:[0.86956522 0.85326087 0.84782609 0.81967213 0.78688525] max depth: 5
scores:[0.86956522 0.85326087 0.84782609 0.81967213 0.78688525] max depth: 6
scores:[0.86956522 0.85326087 0.84782609 0.81967213 0.78688525] max depth: 7
scores:[0.86956522 0.85326087 0.84782609 0.81967213 0.78688525] max depth: 8
scores:[0.86956522 0.85326087 0.84782609 0.81967213 0.78688525] max depth: 9


## Now RandomForest

In [14]:
for n in range(1, 101, 11):
    scores= cross_val_score(RandomForestClassifier(n_estimators=n, random_state=42), X_scaled, y, cv=5, scoring='accuracy')
    print(f'scores:{scores} n estimators: {n}')

scores:[0.76630435 0.69565217 0.78804348 0.75409836 0.68306011] n estimators: 1
scores:[0.85326087 0.7826087  0.82065217 0.84699454 0.73224044] n estimators: 12
scores:[0.86413043 0.83695652 0.83695652 0.84699454 0.76502732] n estimators: 23
scores:[0.89673913 0.81521739 0.8423913  0.81967213 0.75409836] n estimators: 34
scores:[0.89673913 0.80434783 0.82608696 0.8579235  0.76502732] n estimators: 45
scores:[0.86413043 0.81521739 0.85869565 0.84699454 0.75956284] n estimators: 56
scores:[0.90217391 0.80978261 0.8423913  0.83606557 0.7704918 ] n estimators: 67
scores:[0.88043478 0.82065217 0.84782609 0.83060109 0.74863388] n estimators: 78
scores:[0.89130435 0.80434783 0.82608696 0.83060109 0.78688525] n estimators: 89
scores:[0.89130435 0.81521739 0.83152174 0.83606557 0.74863388] n estimators: 100


## And last, LogisticRegression

In [17]:
score = cross_val_score(LogisticRegression(random_state=42), X_scaled, y, cv=5, scoring='accuracy')
score

array([0.85326087, 0.83695652, 0.82608696, 0.8579235 , 0.75956284])

## Now We select the model with the BEST accuracy

In [23]:
cross_val_score(RandomForestClassifier(n_estimators=67,random_state=42), X_scaled, y, cv=5, scoring='accuracy')

array([0.89673913, 0.81521739, 0.8423913 , 0.84153005, 0.76502732])

## Search the model manually using K-fold

In [32]:
kf = KFold(n_splits=5, shuffle=False)

X_train, X_test, y_train, y_test = None, None, None, None

for fold_index, (train_index, test_index) in enumerate(kf.split(X_scaled)):
    if fold_index == 0:  # first fold
        X_train, X_test = X_scaled[train_index], X_scaled[test_index]
        y_train, y_test = y[train_index], y[test_index]
        break
model = RandomForestClassifier(n_estimators=67, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"first fold: {accuracy}")

first fold: 0.907608695652174


## Now use PCA to reduce dimensions, retrain your model and see what impact it has on your model in terms of accuracy. Keep in mind that many times doing PCA reduces the accuracy but computation is much lighter and that's the trade off you need to consider while building models in real life

In [59]:
pca = PCA(0.90)

In [60]:
X_pca = pca.fit_transform(X_scaled)

In [61]:
X_pca.shape

(918, 12)

In [62]:
X_scaled.shape

(918, 20)

In [63]:
kf = KFold(n_splits=5, shuffle=False)

X_train, X_test, y_train, y_test = None, None, None, None

for fold_index, (train_index, test_index) in enumerate(kf.split(X_pca)):
    if fold_index == 0:  # first fold
        X_train, X_test = X_pca[train_index], X_pca[test_index]
        y_train, y_test = y[train_index], y[test_index]
        break
modelpca = RandomForestClassifier(n_estimators=67, random_state=42)
modelpca.fit(X_train, y_train)

y_pred = modelpca.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"first fold with pca: {accuracy}")

first fold with pca: 0.9021739130434783


the difference is 0.005