# Heart Disease Prediction

In [1]:
# importing necessary libraries
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.utils import shuffle

%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import precision_recall_fscore_support
import joblib

## Loading dataSet and exploration

In [2]:
df = pd.read_csv("/content/drive/MyDrive/Data Science My Repository/Projects/AI Health Guard Research /AI Health Guard Datasets/heart.csv")
df = shuffle(df, random_state=42)
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
179,57,1,0,150,276,0,0,112,1,0.6,1,1,1,0
228,59,1,3,170,288,0,0,159,0,0.2,1,0,3,0
111,57,1,2,150,126,1,1,173,0,0.2,2,1,3,1
246,56,0,0,134,409,0,0,150,1,1.9,1,2,3,0
60,71,0,2,110,265,1,0,130,0,0.0,2,1,2,1


In [3]:
df.shape

(303, 14)

In [4]:
df.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.366337,0.683168,0.966997,131.623762,246.264026,0.148515,0.528053,149.646865,0.326733,1.039604,1.39934,0.729373,2.313531,0.544554
std,9.082101,0.466011,1.032052,17.538143,51.830751,0.356198,0.52586,22.905161,0.469794,1.161075,0.616226,1.022606,0.612277,0.498835
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,47.5,0.0,0.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0,2.0,0.0
50%,55.0,1.0,1.0,130.0,240.0,0.0,1.0,153.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,274.5,0.0,1.0,166.0,1.0,1.6,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 303 entries, 179 to 102
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 35.5 KB


In [6]:
# check null values
null_checker = df.apply(lambda x: sum(x.isnull())).to_frame(name='count')
print(null_checker)

          count
age           0
sex           0
cp            0
trestbps      0
chol          0
fbs           0
restecg       0
thalach       0
exang         0
oldpeak       0
slope         0
ca            0
thal          0
target        0


In [7]:
df['target'].value_counts()

target
1    165
0    138
Name: count, dtype: int64

Here ,

`0` - person is not Heart Disease affected

`1` - person is Heart Disease affected

In [8]:
df.groupby('target').mean()

Unnamed: 0_level_0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,56.601449,0.826087,0.478261,134.398551,251.086957,0.15942,0.449275,139.101449,0.550725,1.585507,1.166667,1.166667,2.543478
1,52.49697,0.563636,1.375758,129.30303,242.230303,0.139394,0.593939,158.466667,0.139394,0.58303,1.593939,0.363636,2.121212


In [9]:
df['cp'].value_counts()

cp
0    143
2     87
1     50
3     23
Name: count, dtype: int64

In [10]:
df['thal'].value_counts()

thal
2    166
3    117
1     18
0      2
Name: count, dtype: int64

In [None]:
df.columns

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target'],
      dtype='object')

In [None]:
# Distribution of target variable
fig = px.histogram(df, x='target', title='Distribution of Target Variable')
fig.show()

In [None]:
# Age distribution
fig = px.histogram(df, x='age', title='Age Distribution')
fig.show()

* `0 - 30` Age group count is `1`
* `30 - 40` Age group count is `15`
* `40 - 50` Age group count is `72`
* `50 - 60` Age group count is `125`
* `60 - 70` Age group count is `80`
* `70 - 80` Age group count is `9`

So, `50 - 60` age group people get the maximum number of heart affect.

In [None]:
# Cholesterol levels by target
fig = px.box(df, x='target', y='chol', title='Cholesterol Levels by Target')
fig.show()

In [None]:
# Pair plot for selected features
selected_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'target']
fig = px.scatter_matrix(df[selected_features], dimensions=selected_features[:-1], color='target', title='Scatter Matrix of Selected Features')
fig.show()

* Higher target values (yellow) are scattered across all features but are more concentrated in lower oldpeak values and higher thalach values.
Page | 26
* There is no strong linearity among the other features except for age that is uniformly distributed, forming a diagonal line.


## Model Building

In [None]:
# Split the data into features and target variable
X = df.drop(columns = 'target', axis=1)
y = df['target']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (242, 13)
Shape of X_test: (61, 13)
Shape of y_train: (242,)
Shape of y_test: (61,)


In [None]:
# Initialize the models
models = {
    'Gradient Boosting Classifier': GradientBoostingClassifier(),
    'KNeighbors Classifier': KNeighborsClassifier(),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC()
}

In [None]:
# Dictionary to store accuracies and confusion matrices
accuracies = {}

# Loop through the models, train, test, and store results
for model_name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)

    # Test the model
    y_pred = model.predict(X_test)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    accuracies[model_name] = accuracy

In [None]:
# Print accuracies
for model_name, accuracy in accuracies.items():
    print(f"{model_name} Accuracy: {accuracy}")

Gradient Boosting Classifier Accuracy: 0.7868852459016393
KNeighbors Classifier Accuracy: 0.5901639344262295
Logistic Regression Accuracy: 0.8524590163934426
Decision Tree Accuracy: 0.819672131147541
Random Forest Accuracy: 0.819672131147541
SVM Accuracy: 0.5737704918032787


In [None]:
def get_metrics(y_test, y_pred):
    metrics = {}
    MSE = mean_squared_error(y_test, y_pred)
    RMSE = np.sqrt(MSE)
    MAE = mean_absolute_error(y_test, y_pred)
    R2 = r2_score(y_test, y_pred)

    metrics['MSE'] = MSE
    metrics['RMSE'] = RMSE
    metrics['MAE'] = MAE
    metrics['R2'] = R2

    return metrics
# Create an empty DataFrame to store metrics
metrics_df = pd.DataFrame(columns=['Model', 'MSE', 'RMSE', 'MAE', 'R2'])
# Iterate through each model in the dictionary
for model_name, model in models.items():
    metrics = get_metrics(y_test, y_pred)
    metrics['Model'] = model_name
    metrics_df = pd.concat([metrics_df, pd.DataFrame(metrics, index=[0])], ignore_index=True)

# Print the DataFrame
print(metrics_df)

                          Model      MSE      RMSE      MAE       R2
0  Gradient Boosting Classifier  0.42623  0.652863  0.42623 -0.71645
1         KNeighbors Classifier  0.42623  0.652863  0.42623 -0.71645
2           Logistic Regression  0.42623  0.652863  0.42623 -0.71645
3                 Decision Tree  0.42623  0.652863  0.42623 -0.71645
4                 Random Forest  0.42623  0.652863  0.42623 -0.71645
5                           SVM  0.42623  0.652863  0.42623 -0.71645


## Hyperparameter Tuning

In [None]:
# Define the parameter grid for RandomForestClassifier
params_RF = {"min_samples_split": [2, 6, 20],
              "min_samples_leaf": [1, 2, 4],
              "n_estimators" :[50,100,200,300,400],
              "max_depth": [None, 10, 20, 30],
              "criterion": ["gini", "entropy"]
              }

# Initialize GridSearchCV
GridSearchCV_RF = GridSearchCV(estimator=RandomForestClassifier(), param_grid=params_RF, cv=3, scoring="accuracy", return_train_score=True)

# Fit the model
GridSearchCV_RF.fit(X_train, y_train)

# Best parameters
print("Best parameters found: ", GridSearchCV_RF.best_params_)

# Best estimator
RF_model = GridSearchCV_RF.best_estimator_

# Evaluate the best model
y_pred = RF_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Best RandomForestClassifier Model after Hyperparameter Tuning:")
print(f"Accuracy: {accuracy}")
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"R2 Score: {r2}")
#10 m

Best parameters found:  {'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 20, 'n_estimators': 100}
Best RandomForestClassifier Model after Hyperparameter Tuning:
Accuracy: 0.8524590163934426
MSE: 0.14754098360655737
RMSE: 0.3841106397986879
MAE: 0.14754098360655737
R2 Score: 0.4058441558441558


In [None]:
# Define the parameter grid for LogisticRegression
params_LR = {
    'penalty': ['l1', 'l2'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear']
}

# Initialize GridSearchCV
GridSearchCV_LR = GridSearchCV(estimator=LogisticRegression(max_iter=1000), param_grid=params_LR, cv=3, scoring="accuracy", return_train_score=True)

# Fit the model
GridSearchCV_LR.fit(X_train, y_train)

# Best parameters
print("Best parameters found: ", GridSearchCV_LR.best_params_)

# Best estimator
LR_model = GridSearchCV_LR.best_estimator_

# Evaluate the best model
y_pred = LR_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Best LogisticRegression Model after Hyperparameter Tuning:")
print(f"Accuracy: {accuracy}")
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"R2 Score: {r2}")

Best parameters found:  {'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}
Best LogisticRegression Model after Hyperparameter Tuning:
Accuracy: 0.8524590163934426
MSE: 0.14754098360655737
RMSE: 0.3841106397986879
MAE: 0.14754098360655737
R2 Score: 0.4058441558441558


In [None]:
# Create a DataFrame to store model evaluation results
results = pd.DataFrame({
    "Model": ["Gradient Boosting Classifier", "K-Nearest Neighbors", "Logistic Regression", "Decision Tree", "Random Forest", "SVM"],
    "Train Score": [
        # Calculate mean cross-validated accuracy for training set
        cross_val_score(GradientBoostingClassifier().fit(X_train, y_train), X_train, y_train, cv=3).mean(),
        cross_val_score(KNeighborsClassifier().fit(X_train, y_train), X_train, y_train, cv=3).mean(),
        cross_val_score(LR_model, X_train, y_train, cv=3).mean(),
        cross_val_score(DecisionTreeClassifier().fit(X_train, y_train), X_train, y_train, cv=3).mean(),
        cross_val_score(RF_model, X_train, y_train, cv=3).mean(),
        cross_val_score(SVC().fit(X_train, y_train), X_train, y_train, cv=3).mean(),
    ],
    "Test Score": [
        # Calculate accuracy on the test set
        GradientBoostingClassifier().fit(X_train, y_train).score(X_test, y_test),
        KNeighborsClassifier().fit(X_train, y_train).score(X_test, y_test),
        LR_model.score(X_test, y_test),
        DecisionTreeClassifier().fit(X_train, y_train).score(X_test, y_test),
        RF_model.score(X_test, y_test),
        SVC().fit(X_train, y_train).score(X_test, y_test),
    ]
})
# Additional Metrics (precision, recall, F1 score)
metrics = ["precision", "recall", "f1"]
# Fit models before calculating metrics
gbc_model = GradientBoostingClassifier().fit(X_train, y_train)
knn_model = KNeighborsClassifier().fit(X_train, y_train)
dt_model = DecisionTreeClassifier().fit(X_train, y_train)
svm_model = SVC().fit(X_train, y_train)

for metric in metrics:
    results[f"{metric.capitalize()}"] = [
        precision_recall_fscore_support(y_test, model.predict(X_test), average="weighted")[metrics.index(metric)]
        for model in [gbc_model, knn_model, LR_model, dt_model, RF_model, svm_model]
    ]

result_df = results.sort_values(by="Test Score", ascending=False)
result_df = result_df.set_index("Test Score")
result_df

Unnamed: 0_level_0,Model,Train Score,Precision,Recall,F1
Test Score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.852459,Logistic Regression,0.83894,0.874275,0.852459,0.851903
0.852459,Decision Tree,0.760648,0.85706,0.852459,0.852697
0.852459,Random Forest,0.826286,0.863991,0.852459,0.852459
0.786885,Gradient Boosting Classifier,0.777058,0.772889,0.770492,0.770863
0.590164,K-Nearest Neighbors,0.603498,0.6029,0.590164,0.58862
0.57377,SVM,0.665329,0.601639,0.57377,0.56402


In [None]:
# Cross-validation for the best model
cv_scores = cross_val_score(LR_model, X, y, cv=5)
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Average Cross-Validation Score: {cv_scores.mean()}')

Cross-Validation Scores: [0.86885246 0.80327869 0.80327869 0.78333333 0.86666667]
Average Cross-Validation Score: 0.8250819672131147


## Model Saving and Predictions

In [None]:
# Save the best model
joblib.dump(LR_model, 'heart_model.pkl')
# Load the model
loaded_model = joblib.load('heart_model.pkl')

In [None]:
# Driver code for prediction
feature_names = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']
def predict_risk_level(input_data):
    data = pd.DataFrame([input_data], columns=feature_names)
    prediction = loaded_model.predict(data)
    if prediction[0] == 0:
      result = 'The person is not Heart Disease affected'
    else:
      result = 'The person is Heart Disease affected'

    print(result)
# Example prediction
input_data = (57, 1, 2, 150, 126,	1, 1,	173, 0, 0.2, 2, 1, 3)
predict_risk_level(input_data)

The person is Heart Disease affected


In [None]:
# Example prediction
input_data = (56, 0, 0, 134, 409, 0, 0, 150, 1, 1.9, 1, 2, 3)
predict_risk_level(input_data)

The person is not Heart Disease affected


In [None]:
import sklearn
print(sklearn.__version__)

1.2.2
