<a href="https://colab.research.google.com/github/anjaliravi2304/Heart-Disease-Prediction-Using-Machine-Learning-Classification-Models/blob/main/EdiGlobeMajorProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ==============================================================================
# Heart Disease Prediction Project Script
# Data Science Pipeline: EDA, Preprocessing, Modeling, Evaluation, Deployment Prep
# ==============================================================================

# ------------------------------------------------------------------------------
# 1. SETUP AND DATA LOADING
# ------------------------------------------------------------------------------

# Core Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

# Machine Learning Libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Deep Learning (AI Model)
import tensorflow as tf
from tensorflow import keras

# Define the dataset path (using the uploaded file name)
FILE_PATH = 'heart.csv'
TARGET_COLUMN = 'target'
RANDOM_SEED = 42

try:
    # Load the data into a pandas DataFrame
    df = pd.read_csv(FILE_PATH)
    print(f"Dataset loaded successfully. Shape: {df.shape}")
except FileNotFoundError:
    print(f"Error: The file '{FILE_PATH}' was not found. Please ensure it is uploaded.")
    exit()

# ------------------------------------------------------------------------------
# 2. DATA UNDERSTANDING AND PREPROCESSING
# ------------------------------------------------------------------------------

# Check for missing values (Previously confirmed clean)
# print("\nMissing Values Check:\n", df.isnull().sum().to_string())

# Convert categorical features (currently represented as numbers) to 'object' type
# This is crucial for proper One-Hot Encoding later
categorical_cols = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']
for col in categorical_cols:
    df[col] = df[col].astype('object')

# --- Exploratory Data Analysis (EDA) ---
# Check class balance
print("\nTarget Variable Distribution:\n", df[TARGET_COLUMN].value_counts())

# --- Feature Engineering (One-Hot Encoding) ---
# Convert categorical features into dummy/indicator variables
df_encoded = pd.get_dummies(df, drop_first=True)
print(f"\nDataFrame shape after One-Hot Encoding: {df_encoded.shape}")

# --- Split Data ---
X = df_encoded.drop(TARGET_COLUMN, axis=1) # Features
y = df_encoded[TARGET_COLUMN]             # Target

# Split into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_SEED
)

# --- Feature Scaling (Normalization) ---
# Initialize the StandardScaler
scaler = StandardScaler()

# Fit scaler ONLY on the training data and transform both sets
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("\nData split and scaled successfully.")


# ------------------------------------------------------------------------------
# 3. MODEL TRAINING AND EVALUATION FUNCTION
# ------------------------------------------------------------------------------

def get_metrics(y_true, y_pred, y_proba):
    """Calculates standard classification metrics."""
    return {
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred),
        'Recall': recall_score(y_true, y_pred),
        'F1-Score': f1_score(y_true, y_pred),
        'ROC-AUC': roc_auc_score(y_true, y_proba)
    }

results = {}

# --- A. Logistic Regression ---
log_reg = LogisticRegression(random_state=RANDOM_SEED, solver='liblinear')
log_reg.fit(X_train_scaled, y_train)
y_pred_log_reg = log_reg.predict(X_test_scaled)
y_proba_log_reg = log_reg.predict_proba(X_test_scaled)[:, 1]
results['Logistic Regression'] = get_metrics(y_test, y_pred_log_reg, y_proba_log_reg)

# --- B. Decision Tree ---
dt_classifier = DecisionTreeClassifier(random_state=RANDOM_SEED)
dt_classifier.fit(X_train_scaled, y_train)
y_pred_dt = dt_classifier.predict(X_test_scaled)
y_proba_dt = dt_classifier.predict_proba(X_test_scaled)[:, 1]
results['Decision Tree'] = get_metrics(y_test, y_pred_dt, y_proba_dt)

# --- C. Random Forest ---
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=RANDOM_SEED)
rf_classifier.fit(X_train_scaled, y_train)
y_pred_rf = rf_classifier.predict(X_test_scaled)
y_proba_rf = rf_classifier.predict_proba(X_test_scaled)[:, 1]
results['Random Forest'] = get_metrics(y_test, y_pred_rf, y_proba_rf)

# --- D. Neural Network (AI Model) ---
input_dim = X_train_scaled.shape[1]
model = keras.Sequential([
    keras.layers.Dense(units=32, activation='relu', input_shape=(input_dim,)),
    keras.layers.Dense(units=16, activation='relu'),
    keras.layers.Dense(units=1, activation='sigmoid')
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# Train the NN (verbose=0 suppresses output)
model.fit(X_train_scaled, y_train, epochs=100, batch_size=32, verbose=0)

y_proba_nn = model.predict(X_test_scaled).flatten()
y_pred_nn = (y_proba_nn > 0.5).astype("int32")
results['Neural Network'] = get_metrics(y_test, y_pred_nn, y_proba_nn)


# ------------------------------------------------------------------------------
# 4. FINAL MODEL COMPARISON
# ------------------------------------------------------------------------------

# Create a final comparison DataFrame
df_comparison = pd.DataFrame(results).T
df_comparison = df_comparison.sort_values(by='ROC-AUC', ascending=False)

print("\n" + "="*50)
print("             FINAL MODEL COMPARISON (TEST SET)")
print("="*50)
print(df_comparison.to_string(float_format="{:.4f}".format))
print("="*50)

# Save the comparison table to CSV
df_comparison.to_csv('model_comparison_results.csv')
print("\nComparison results saved to 'model_comparison_results.csv'")


# ------------------------------------------------------------------------------
# 5. FEATURE IMPORTANCE (From Random Forest)
# ------------------------------------------------------------------------------

# Get feature importances
importances = rf_classifier.feature_importances_
feature_names = X.columns
sorted_indices = np.argsort(importances)[::-1]
sorted_importances = importances[sorted_indices]
sorted_feature_names = feature_names[sorted_indices]

# Plotting Feature Importance
plt.figure(figsize=(12, 6))
plt.title("Random Forest Feature Importance")
plt.bar(range(X_train_scaled.shape[1]), sorted_importances, align='center')
plt.xticks(range(X_train_scaled.shape[1]), sorted_feature_names, rotation=90)
plt.tight_layout()
plt.savefig('random_forest_feature_importance.png')
plt.close()
print("\nFeature importance plot saved as 'random_forest_feature_importance.png'")
#

# ------------------------------------------------------------------------------
# 6. MODEL SAVING FOR DEPLOYMENT
# ------------------------------------------------------------------------------

# NOTE: Replace 'rf_classifier' with your best model variable (e.g., 'log_reg', 'dt_classifier')
# if a different model performed best.

# --- A. Save the Scikit-learn Best Model (e.g., Random Forest) ---
with open('best_heart_model.pkl', 'wb') as file:
    pickle.dump(rf_classifier, file)
print("\nRandom Forest model (rf_classifier) saved as 'best_heart_model.pkl'")

# --- B. Save the StandardScaler (ESSENTIAL for preprocessing new data) ---
with open('scaler.pkl', 'wb') as file:
    pickle.dump(scaler, file)
print("Scaler object saved as 'scaler.pkl'")

# --- C. Save the Neural Network Model (Optional, if it was the best model) ---
# Use the corrected saving format with the .keras extension
try:
    tf.keras.models.save_model(model, 'best_nn_model.keras')
    print("Neural Network model saved as 'best_nn_model.keras'")
except Exception as e:
    # This might fail if user doesn't have the right TensorFlow version/setup,
    # but the scikit-learn save should succeed.
    print(f"Failed to save Neural Network model due to: {e}")

# ==============================================================================
# SCRIPT END
# ==============================================================================

Dataset loaded successfully. Shape: (303, 14)

Target Variable Distribution:
 target
1    165
0    138
Name: count, dtype: int64

DataFrame shape after One-Hot Encoding: (303, 23)

Data split and scaled successfully.


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step

             FINAL MODEL COMPARISON (TEST SET)
                     Accuracy  Precision  Recall  F1-Score  ROC-AUC
Logistic Regression    0.8689     0.8750  0.8750    0.8750   0.9310
Random Forest          0.8525     0.8966  0.8125    0.8525   0.9289
Neural Network         0.8361     0.8235  0.8750    0.8485   0.8922
Decision Tree          0.7213     0.7778  0.6562    0.7119   0.7247

Comparison results saved to 'model_comparison_results.csv'

Feature importance plot saved as 'random_forest_feature_importance.png'

Random Forest model (rf_classifier) saved as 'best_heart_model.pkl'
Scaler object saved as 'scaler.pkl'
Neural Network model saved as 'best_nn_model.keras'


In [2]:
# Main Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

# Machine Learning Libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Deep Learning (AI Model)
import tensorflow as tf
from tensorflow import keras

FILE_PATH = 'heart.csv'
TARGET_COLUMN = 'target'
RANDOM_SEED = 42

try:
    df = pd.read_csv(FILE_PATH)
    print(f"Dataset loaded successfully. Shape: {df.shape}")
except FileNotFoundError:
    print(f"Error: The file '{FILE_PATH}' was not found. Please ensure it is uploaded.")
    exit()


categorical_cols = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']
for col in categorical_cols:
    df[col] = df[col].astype('object')

print("\nTarget Variable Distribution:\n", df[TARGET_COLUMN].value_counts())
df_encoded = pd.get_dummies(df, drop_first=True)
print(f"\nDataFrame shape after One-Hot Encoding: {df_encoded.shape}")

X = df_encoded.drop(TARGET_COLUMN, axis=1)
y = df_encoded[TARGET_COLUMN]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_SEED
)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("\nData split and scaled successfully.")
def get_metrics(y_true, y_pred, y_proba):
    """Calculates standard classification metrics."""
    return {
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred),
        'Recall': recall_score(y_true, y_pred),
        'F1-Score': f1_score(y_true, y_pred),
        'ROC-AUC': roc_auc_score(y_true, y_proba)
    }
results = {}

# --- A. Logistic Regression ---
log_reg = LogisticRegression(random_state=RANDOM_SEED, solver='liblinear')
log_reg.fit(X_train_scaled, y_train)
y_pred_log_reg = log_reg.predict(X_test_scaled)
y_proba_log_reg = log_reg.predict_proba(X_test_scaled)[:, 1]
results['Logistic Regression'] = get_metrics(y_test, y_pred_log_reg, y_proba_log_reg)

# --- B. Decision Tree ---
dt_classifier = DecisionTreeClassifier(random_state=RANDOM_SEED)
dt_classifier.fit(X_train_scaled, y_train)
y_pred_dt = dt_classifier.predict(X_test_scaled)
y_proba_dt = dt_classifier.predict_proba(X_test_scaled)[:, 1]
results['Decision Tree'] = get_metrics(y_test, y_pred_dt, y_proba_dt)

# --- C. Random Forest ---
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=RANDOM_SEED)
rf_classifier.fit(X_train_scaled, y_train)
y_pred_rf = rf_classifier.predict(X_test_scaled)
y_proba_rf = rf_classifier.predict_proba(X_test_scaled)[:, 1]
results['Random Forest'] = get_metrics(y_test, y_pred_rf, y_proba_rf)

# --- D. Neural Network (AI Model) ---
input_dim = X_train_scaled.shape[1]
model = keras.Sequential([
    keras.layers.Dense(units=32, activation='relu', input_shape=(input_dim,)),
    keras.layers.Dense(units=16, activation='relu'),
    keras.layers.Dense(units=1, activation='sigmoid')
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train_scaled, y_train, epochs=100, batch_size=32, verbose=0)

y_proba_nn = model.predict(X_test_scaled).flatten()
y_pred_nn = (y_proba_nn > 0.5).astype("int32")
results['Neural Network'] = get_metrics(y_test, y_pred_nn, y_proba_nn)

df_comparison = pd.DataFrame(results).T
df_comparison = df_comparison.sort_values(by='ROC-AUC', ascending=False)

print("\n" + "="*50)
print("             FINAL MODEL COMPARISON (TEST SET)")
print("="*50)
print(df_comparison.to_string(float_format="{:.4f}".format))
print("="*50)

df_comparison.to_csv('model_comparison_results.csv')
print("\nComparison results saved to 'model_comparison_results.csv'")


importances = rf_classifier.feature_importances_
feature_names = X.columns
sorted_indices = np.argsort(importances)[::-1]
sorted_importances = importances[sorted_indices]
sorted_feature_names = feature_names[sorted_indices]

plt.figure(figsize=(12, 6))
plt.title("Random Forest Feature Importance")
plt.bar(range(X_train_scaled.shape[1]), sorted_importances, align='center')
plt.xticks(range(X_train_scaled.shape[1]), sorted_feature_names, rotation=90)
plt.tight_layout()
plt.savefig('random_forest_feature_importance.png')
plt.close()
print("\nFeature importance plot saved as 'random_forest_feature_importance.png'")

with open('best_heart_model.pkl', 'wb') as file:
    pickle.dump(rf_classifier, file)
print("\nRandom Forest model (rf_classifier) saved as 'best_heart_model.pkl'")

with open('scaler.pkl', 'wb') as file:
    pickle.dump(scaler, file)
print("Scaler object saved as 'scaler.pkl'")

try:
    tf.keras.models.save_model(model, 'best_nn_model.keras')
    print("Neural Network model saved as 'best_nn_model.keras'")
except Exception as e:
    print(f"Failed to save Neural Network model due to: {e}")

Dataset loaded successfully. Shape: (303, 14)

Target Variable Distribution:
 target
1    165
0    138
Name: count, dtype: int64

DataFrame shape after One-Hot Encoding: (303, 23)

Data split and scaled successfully.


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step

             FINAL MODEL COMPARISON (TEST SET)
                     Accuracy  Precision  Recall  F1-Score  ROC-AUC
Logistic Regression    0.8689     0.8750  0.8750    0.8750   0.9310
Random Forest          0.8525     0.8966  0.8125    0.8525   0.9289
Neural Network         0.8689     0.8529  0.9062    0.8788   0.8922
Decision Tree          0.7213     0.7778  0.6562    0.7119   0.7247

Comparison results saved to 'model_comparison_results.csv'

Feature importance plot saved as 'random_forest_feature_importance.png'

Random Forest model (rf_classifier) saved as 'best_heart_model.pkl'
Scaler object saved as 'scaler.pkl'
Neural Network model saved as 'best_nn_model.keras'
