
# Heart Disease Prediction

This notebook trains and evaluates machine learning models (Random Forest, KNN, Logistic Regression, Naïve Bayes, SVM, and XGBoost) on a heart disease dataset.
It also allows testing the models with custom input to predict the likelihood of heart disease.

### Dataset Overview
- The dataset contains features like age, sex, chest pain type, cholesterol, and more.
- The target variable indicates the presence (1) or absence (0) of heart disease.


In [6]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import numpy as np

# Load the dataset
file_path = 'heart.csv'
data = pd.read_csv(file_path)

# Display basic information
print(data.info())
data.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1025 entries, 0 to 1024
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1025 non-null   int64  
 1   sex       1025 non-null   int64  
 2   cp        1025 non-null   int64  
 3   trestbps  1025 non-null   int64  
 4   chol      1025 non-null   int64  
 5   fbs       1025 non-null   int64  
 6   restecg   1025 non-null   int64  
 7   thalach   1025 non-null   int64  
 8   exang     1025 non-null   int64  
 9   oldpeak   1025 non-null   float64
 10  slope     1025 non-null   int64  
 11  ca        1025 non-null   int64  
 12  thal      1025 non-null   int64  
 13  target    1025 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 112.2 KB
None


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [7]:

# Separate features and target
X = data.drop('target', axis=1)
y = data['target']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [8]:
# Random Forest (RF)
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_predictions)
rf_precision = precision_score(y_test, rf_predictions)
rf_recall = recall_score(y_test, rf_predictions)
rf_f1 = f1_score(y_test, rf_predictions)
# Print metrics
print(f"Accuracy: {rf_accuracy:}")
print(f"Precision: {rf_precision:}")
print(f"Recall: {rf_recall:}")
print(f"F1 Score: {rf_f1:}")

# Print detailed classification report
print("\nClassification Report:\n")
print(classification_report(y_test, rf_predictions))


Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0

Classification Report:

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       100
           1       1.00      1.00      1.00       105

    accuracy                           1.00       205
   macro avg       1.00      1.00      1.00       205
weighted avg       1.00      1.00      1.00       205



In [9]:
# Random Forest (knn)
knn_model = KNeighborsClassifier()
knn_model.fit(X_train, y_train)
knn_predictions = knn_model.predict(X_test)
knn_accuracy = accuracy_score(y_test, knn_predictions)
knn_precision = precision_score(y_test, knn_predictions)
knn_recall = recall_score(y_test, knn_predictions)
knn_f1 = f1_score(y_test, knn_predictions)
# Print metrics
print(f"Accuracy: {knn_accuracy:}")
print(f"Precision: {knn_precision:}")
print(f"Recall: {knn_recall:}")
print(f"F1 Score: {knn_f1:}")

# Print detailed classification report
print("\nClassification Report:\n")
print(classification_report(y_test, knn_predictions))

Accuracy: 0.8634146341463415
Precision: 0.8737864077669902
Recall: 0.8571428571428571
F1 Score: 0.8653846153846154

Classification Report:

              precision    recall  f1-score   support

           0       0.85      0.87      0.86       100
           1       0.87      0.86      0.87       105

    accuracy                           0.86       205
   macro avg       0.86      0.86      0.86       205
weighted avg       0.86      0.86      0.86       205



In [10]:
# Random Forest (svm)
svm_model = SVC()
svm_model.fit(X_train, y_train)
svm_predictions = svm_model.predict(X_test)
svm_accuracy = accuracy_score(y_test, svm_predictions)
svm_precision = precision_score(y_test, svm_predictions)
svm_recall = recall_score(y_test, svm_predictions)
svm_f1 = f1_score(y_test, svm_predictions)
# Print metrics
print(f"Accuracy: {svm_accuracy:}")
print(f"Precision: {svm_precision:}")
print(f"Recall: {svm_recall:}")
print(f"F1 Score: {svm_f1:}")

# Print detailed classification report
print("\nClassification Report:\n")
print(classification_report(y_test, svm_predictions))

Accuracy: 0.926829268292683
Precision: 0.9166666666666666
Recall: 0.9428571428571428
F1 Score: 0.9295774647887324

Classification Report:

              precision    recall  f1-score   support

           0       0.94      0.91      0.92       100
           1       0.92      0.94      0.93       105

    accuracy                           0.93       205
   macro avg       0.93      0.93      0.93       205
weighted avg       0.93      0.93      0.93       205



In [None]:
import pickle
# Create instances of the top 3 models
rf_model = RandomForestClassifier()
xgb_model = XGBClassifier()
knn_model = KNeighborsClassifier()

# Create a VotingClassifier with the top 3 models
voting_classifier = VotingClassifier(
    estimators=[('rf', rf_model), ('xgb', xgb_model), ('knn', knn_model)],
    voting='hard'  # Use 'hard' voting to let each model vote for the final prediction
)

# Assuming you have your data and labels in X_train, X_test, y_train, y_test
# Fit the voting classifier to the training data
voting_classifier.fit(X_train, y_train)

# Evaluate the voting classifier on the test data
accuracy = voting_classifier.score(X_test, y_test)
print("Ensemble Voting Classifier Accuracy:", accuracy)
with open('voting_classifier.pkl', 'wb') as file:
    pickle.dump(voting_classifier, file)

print("Voting Classifier saved successfully!")

In [None]:

# Initialize models
models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'KNN': KNeighborsClassifier(),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Naive Bayes': GaussianNB(),
    'SVM': SVC(probability=True, random_state=42),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}

# Train and evaluate each model
for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"Results for {name}:")
    print(classification_report(y_test, y_pred))
    print("-" * 50)


Training Random Forest...
Results for Random Forest:
              precision    recall  f1-score   support

           0       0.93      0.92      0.92       112
           1       0.93      0.94      0.93       126

    accuracy                           0.93       238
   macro avg       0.93      0.93      0.93       238
weighted avg       0.93      0.93      0.93       238

--------------------------------------------------
Training KNN...
Results for KNN:
              precision    recall  f1-score   support

           0       0.85      0.79      0.82       112
           1       0.83      0.87      0.85       126

    accuracy                           0.84       238
   macro avg       0.84      0.83      0.83       238
weighted avg       0.84      0.84      0.84       238

--------------------------------------------------
Training Logistic Regression...
Results for Logistic Regression:
              precision    recall  f1-score   support

           0       0.84      0.82     

Parameters: { "use_label_encoder" } are not used.



In [None]:

# Test with custom input
def predict_custom_input(models, scaler):
    print("Enter the following details to predict heart disease:")
    features = [
        'age', 'sex', 'chest pain type', 'resting bp s', 'cholesterol',
        'fasting blood sugar', 'resting ecg', 'max heart rate',
        'exercise angina', 'oldpeak', 'ST slope'
    ]
    custom_data = []
    for feature in features:
        value = float(input(f"Enter {feature}: "))
        custom_data.append(value)

    custom_data = np.array(custom_data).reshape(1, -1)
    custom_data = scaler.transform(custom_data)

    for name, model in models.items():
        prediction = model.predict(custom_data)
        probability = model.predict_proba(custom_data)[0][1] if hasattr(model, "predict_proba") else None
        print(f"{name} Prediction: {'Heart Disease' if prediction[0] == 1 else 'No Heart Disease'}")
        if probability is not None:
            print(f"Probability of Heart Disease: {probability:.2f}")
        print("-" * 30)

# Call the function to test
predict_custom_input(models, scaler)


Enter the following details to predict heart disease:
Enter age: 1
Enter sex: 1
Enter chest pain type: 1
Enter resting bp s: 1
Enter cholesterol: 1
Enter fasting blood sugar: 1
Enter resting ecg: 1
Enter max heart rate: 1
Enter exercise angina: 1
Enter oldpeak: 1
Enter ST slope: 1
Random Forest Prediction: Heart Disease
Probability of Heart Disease: 0.66
------------------------------
KNN Prediction: Heart Disease
Probability of Heart Disease: 0.60
------------------------------
Logistic Regression Prediction: No Heart Disease
Probability of Heart Disease: 0.30
------------------------------
Naive Bayes Prediction: Heart Disease
Probability of Heart Disease: 1.00
------------------------------
SVM Prediction: Heart Disease
Probability of Heart Disease: 0.73
------------------------------
XGBoost Prediction: Heart Disease
Probability of Heart Disease: 0.99
------------------------------




In [None]:
import joblib

# Assume Random Forest has the highest accuracy (replace with your best model)
best_model = models['XGBoost']

# Save the model to a file
model_filename = 'best_heart_disease_model.pkl'
joblib.dump(best_model, model_filename)
print(f"Model saved as {model_filename}")


Model saved as best_heart_disease_model.pkl
