In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import math
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")


In [3]:
# df = pd.read_csv('heart_disease.csv')
df = pd.read_csv('heart_v2.csv')

In [4]:
df.isnull().sum()

age              0
sex              0
BP               0
cholestrol       0
heart disease    0
dtype: int64

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score

In [6]:
# now lets split the data into train and test
from sklearn.model_selection import train_test_split

X = df[["age","sex","BP","cholestrol"]]
y = df[["heart disease"]]

x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=42)
clf = RandomForestClassifier()
clf.fit(x_train,y_train)
prediction = clf.predict(x_test)
accuracy = accuracy_score(prediction,y_test)
cm = confusion_matrix(prediction,y_test)
prfs = precision_recall_fscore_support(prediction,y_test)
print('Accuracy: ',accuracy)
print('\n')
print('Confusion Matrix: ',cm)
print('\n')
print('Precision: ', prfs[0])
print('Recall:    ', prfs[1])
print('Fscore:    ', prfs[2])
print('Support:   ', prfs[3])

Accuracy:  0.6617647058823529


Confusion Matrix:  [[29 12]
 [11 16]]


Precision:  [0.725      0.57142857]
Recall:     [0.70731707 0.59259259]
Fscore:     [0.71604938 0.58181818]
Support:    [41 27]


In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_fscore_support

# Define the hyperparameter grid
param_grid = {
    'n_estimators': range(5, 15),
    'max_depth': range(5, 15),
    'max_features': range(5, 13)
}

# Initialize the classifier
clf = RandomForestClassifier(random_state=0)

# Perform GridSearchCV
grid_search = GridSearchCV(clf, param_grid, cv=3, n_jobs=-1, verbose=1, scoring='accuracy')
grid_search.fit(x_train, y_train)

# Get the best model
best_model = grid_search.best_estimator_

# Evaluate on the test set
y_pred = best_model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
prfs = precision_recall_fscore_support(y_test, y_pred)

# Print the best parameters and results
print("Best Parameters:", grid_search.best_params_)
print(f"Best Accuracy: {accuracy:.4f}\n")
print("Confusion Matrix:\n", cm)
print("\nPrecision:", prfs[0])
print("Recall:", prfs[1])
print("F-score:", prfs[2])
print("Support:", prfs[3])


Fitting 3 folds for each of 800 candidates, totalling 2400 fits
Best Parameters: {'max_depth': 5, 'max_features': 5, 'n_estimators': 7}
Best Accuracy: 0.7206

Confusion Matrix:
 [[32  8]
 [11 17]]

Precision: [0.74418605 0.68      ]
Recall: [0.8        0.60714286]
F-score: [0.77108434 0.64150943]
Support: [40 28]


### Let's test if standardization can improve the accuracy

In [8]:
from sklearn.preprocessing import StandardScaler

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit and transform X
X_standardized = scaler.fit_transform(X)

# Convert back to DataFrame for better readability (optional)
import pandas as pd
X_standardized = pd.DataFrame(X_standardized, columns=X.columns)

# Display standardized values
print(X_standardized.head())
print("**********")

x_train,x_test,y_train,y_test = train_test_split(X_standardized,y,test_size=0.25,random_state=42)

        age       sex        BP  cholestrol
0  1.712094  0.689500 -0.075410    1.402212
1  1.382140 -1.450327 -0.916759    6.093004
2  0.282294  0.689500 -0.411950    0.219823
3  1.052186  0.689500 -0.187590    0.258589
4  2.152032 -1.450327 -0.636310    0.374890
**********


In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_fscore_support

# Define the hyperparameter grid
param_grid = {
    'n_estimators': range(5, 15),
    'max_depth': range(5, 15),
    'max_features': range(5, 13)
}

# Initialize the classifier
clf = RandomForestClassifier(random_state=0)

# Perform GridSearchCV
grid_search = GridSearchCV(clf, param_grid, cv=3, n_jobs=-1, verbose=1, scoring='accuracy')
grid_search.fit(x_train, y_train)

# Get the best model
best_model = grid_search.best_estimator_

# Evaluate on the test set
y_pred = best_model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
prfs = precision_recall_fscore_support(y_test, y_pred)

# Print the best parameters and results
print("Best Parameters:", grid_search.best_params_)
print(f"Best Accuracy: {accuracy:.4f}\n")
print("Confusion Matrix:\n", cm)
print("\nPrecision:", prfs[0])
print("Recall:", prfs[1])
print("F-score:", prfs[2])
print("Support:", prfs[3])


Fitting 3 folds for each of 800 candidates, totalling 2400 fits
Best Parameters: {'max_depth': 5, 'max_features': 5, 'n_estimators': 7}
Best Accuracy: 0.7206

Confusion Matrix:
 [[32  8]
 [11 17]]

Precision: [0.74418605 0.68      ]
Recall: [0.8        0.60714286]
F-score: [0.77108434 0.64150943]
Support: [40 28]


### Not that much difference with standardiaztion
accuracy went from 72.06% to 72.06%.

### Random forest classifier helped in achieving accuracy upto 76 % which is very good cosidering the size and quality of data

In [10]:
from sklearn.metrics import roc_auc_score

roc_auc = roc_auc_score(y_test, y_pred)
print(f"ROC-AUC Score: {roc_auc:.4f}") # > 0.8 is good

ROC-AUC Score: 0.7036
