
# Heart Disease Prediction

This notebook trains and evaluates machine learning models (Random Forest, KNN, Logistic Regression, Naïve Bayes, SVM, and XGBoost) on a heart disease dataset.
It also allows testing the models with custom input to predict the likelihood of heart disease.

### Dataset Overview
- The dataset contains features like age, sex, chest pain type, cholesterol, and more.
- The target variable indicates the presence (1) or absence (0) of heart disease.


In [None]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
import numpy as np

# Load the dataset
file_path = 'heart_statlog_cleveland_hungary_final.csv'
data = pd.read_csv(file_path)

# Display basic information
print(data.info())
data.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1190 entries, 0 to 1189
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   age                  1190 non-null   int64  
 1   sex                  1190 non-null   int64  
 2   chest pain type      1190 non-null   int64  
 3   resting bp s         1190 non-null   int64  
 4   cholesterol          1190 non-null   int64  
 5   fasting blood sugar  1190 non-null   int64  
 6   resting ecg          1190 non-null   int64  
 7   max heart rate       1190 non-null   int64  
 8   exercise angina      1190 non-null   int64  
 9   oldpeak              1190 non-null   float64
 10  ST slope             1190 non-null   int64  
 11  target               1190 non-null   int64  
dtypes: float64(1), int64(11)
memory usage: 111.7 KB
None


Unnamed: 0,age,sex,chest pain type,resting bp s,cholesterol,fasting blood sugar,resting ecg,max heart rate,exercise angina,oldpeak,ST slope,target
0,40,1,2,140,289,0,0,172,0,0.0,1,0
1,49,0,3,160,180,0,0,156,0,1.0,2,1
2,37,1,2,130,283,0,1,98,0,0.0,1,0
3,48,0,4,138,214,0,0,108,1,1.5,2,1
4,54,1,3,150,195,0,0,122,0,0.0,1,0


In [None]:

# Separate features and target
X = data.drop('target', axis=1)
y = data['target']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.9, random_state=42, stratify=y)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [4]:

# Initialize models
models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'KNN': KNeighborsClassifier(),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Naive Bayes': GaussianNB(),
    'SVM': SVC(probability=True, random_state=42),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}

# Train and evaluate each model
for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"Results for {name}:")
    print(classification_report(y_test, y_pred))
    print("-" * 50)


Training Random Forest...
Results for Random Forest:
              precision    recall  f1-score   support

           0       0.93      0.92      0.92       112
           1       0.93      0.94      0.93       126

    accuracy                           0.93       238
   macro avg       0.93      0.93      0.93       238
weighted avg       0.93      0.93      0.93       238

--------------------------------------------------
Training KNN...
Results for KNN:
              precision    recall  f1-score   support

           0       0.85      0.79      0.82       112
           1       0.83      0.87      0.85       126

    accuracy                           0.84       238
   macro avg       0.84      0.83      0.83       238
weighted avg       0.84      0.84      0.84       238

--------------------------------------------------
Training Logistic Regression...
Results for Logistic Regression:
              precision    recall  f1-score   support

           0       0.84      0.82     

Parameters: { "use_label_encoder" } are not used.



In [5]:

# Test with custom input
def predict_custom_input(models, scaler):
    print("Enter the following details to predict heart disease:")
    features = [
        'age', 'sex', 'chest pain type', 'resting bp s', 'cholesterol',
        'fasting blood sugar', 'resting ecg', 'max heart rate',
        'exercise angina', 'oldpeak', 'ST slope'
    ]
    custom_data = []
    for feature in features:
        value = float(input(f"Enter {feature}: "))
        custom_data.append(value)

    custom_data = np.array(custom_data).reshape(1, -1)
    custom_data = scaler.transform(custom_data)

    for name, model in models.items():
        prediction = model.predict(custom_data)
        probability = model.predict_proba(custom_data)[0][1] if hasattr(model, "predict_proba") else None
        print(f"{name} Prediction: {'Heart Disease' if prediction[0] == 1 else 'No Heart Disease'}")
        if probability is not None:
            print(f"Probability of Heart Disease: {probability:.2f}")
        print("-" * 30)

# Call the function to test
predict_custom_input(models, scaler)


Enter the following details to predict heart disease:
Enter age: 1
Enter sex: 1
Enter chest pain type: 1
Enter resting bp s: 1
Enter cholesterol: 1
Enter fasting blood sugar: 1
Enter resting ecg: 1
Enter max heart rate: 1
Enter exercise angina: 1
Enter oldpeak: 1
Enter ST slope: 1
Random Forest Prediction: Heart Disease
Probability of Heart Disease: 0.66
------------------------------
KNN Prediction: Heart Disease
Probability of Heart Disease: 0.60
------------------------------
Logistic Regression Prediction: No Heart Disease
Probability of Heart Disease: 0.30
------------------------------
Naive Bayes Prediction: Heart Disease
Probability of Heart Disease: 1.00
------------------------------
SVM Prediction: Heart Disease
Probability of Heart Disease: 0.73
------------------------------
XGBoost Prediction: Heart Disease
Probability of Heart Disease: 0.99
------------------------------




In [6]:
import joblib

# Assume Random Forest has the highest accuracy (replace with your best model)
best_model = models['XGBoost']

# Save the model to a file
model_filename = 'best_heart_disease_model.pkl'
joblib.dump(best_model, model_filename)
print(f"Model saved as {model_filename}")


Model saved as best_heart_disease_model.pkl
