## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score, roc_curve


## 2. Load Dataset

In [None]:
# Read the heart-disease table into a DataFrame.
# Replace the path below with the actual location of your CSV file.
df = pd.read_csv(filepath_or_buffer="heart.csv")

## 3. EDA and Visualization

In [None]:
# Preview the first five rows of the dataset.
print(df.head(n=5))

In [None]:
# Preview the last five rows of the dataset.
print(df.tail(n=5))

In [None]:
# DataFrame.info() writes its summary (columns, non-null counts, dtypes)
# straight to stdout and returns None, so it is called directly; wrapping it
# in print() would append a stray "None" line to the output.
df.info()

In [None]:
# Descriptive statistics for the columns that describe() covers by default.
summary_stats = df.describe()
print(summary_stats)

In [None]:
# checking unique categories
# Human-readable labels for the integer codes stored in each categorical column.
mappings = {
    "sex": {0: "female", 1: "male"},
    "cp": {0: "typical angina", 1: "atypical angina", 2: "non-anginal pain", 3: "asymptomatic"},
    "fbs": {0: "fbs <= 120 mg/dl", 1: "fbs > 120 mg/dl"},
    "exang": {0: "no", 1: "yes"},
    "restecg": {0: "normal", 1: "ST-T wave abnormality", 2: "left ventricular hypertrophy"}
}

category_columns = list(mappings.keys())

for col in category_columns:
    unique_codes = df[col].unique()
    # Use .get() with a placeholder so that a code missing from the mapping
    # (or a NaN) is reported instead of aborting the cell with a KeyError.
    labels = [mappings[col].get(code, "unknown code") for code in unique_codes]
    print(f"Unique categories in '{col}':")
    for code, label in zip(unique_codes, labels):
        print(f"  {code}: {label}")
    print()

In [None]:
# Count the missing entries in every column.
missing_per_column = df.isna().sum()
print(missing_per_column)

In [None]:
# Number of rows per target class (shown as the cell's output).
df["target"].value_counts()
# equivalent attribute-style access: df.target.value_counts()
# for percentages instead of counts: df["target"].value_counts(normalize=True) * 100

In [None]:
# Count plot showing how many rows fall into each target class.
custom_palette = ['#1f77b4', '#ff7f0e']  # blue and orange
ax = sns.countplot(data=df, x='target', palette=custom_palette)
ax.set_xlabel("Target")
ax.set_ylabel("Count")
ax.set_title("Heart Disease Class Distribution")

# Build a custom legend: one proxy rectangle per bar, reusing the colour
# that the first two drawn bars actually ended up with.
labels = ['No Disease', 'Heart Disease']
handles = []
for bar in ax.patches[:2]:
    handles.append(plt.Rectangle((0, 0), 1, 1, color=bar.get_facecolor()))
ax.legend(handles, labels, title="Condition")
plt.show()

In [None]:
# Correlation Matrix
# Only numeric columns can be correlated; selecting them explicitly keeps this
# cell working if the CSV ever contains a text column (DataFrame.corr() raises
# on non-numeric data in pandas >= 2.0).
numeric_df = df.select_dtypes(include='number')
plt.figure(figsize=(12, 10))
sns.heatmap(numeric_df.corr(), annot=True, fmt=".2f", cmap='coolwarm')
plt.title("Correlation Matrix")
plt.show()

In [None]:
# Numerical predictor columns (everything except 'target' with an
# int64/float64 dtype).
predictors = df.drop(columns=['target'])
features = predictors.select_dtypes(include=['int64', 'float64']).columns

# One density histogram per feature, split by target class, on a 3-column grid.
n_cols = 3
n_rows = (len(features) + 2) // 3  # ceiling division so every feature gets a slot
plt.figure(figsize=(16, 12))
for position, feature in enumerate(features, start=1):
    plt.subplot(n_rows, n_cols, position)
    sns.histplot(
        data=df,
        x=feature,
        hue='target',
        kde=True,
        element='step',
        stat='density',
        common_norm=False,
    )
    plt.title(f'{feature} Distribution by Target')
    plt.xlabel(feature)
    plt.ylabel('Density')

plt.tight_layout()
plt.show()


## 4. Data Preprocessing

In [None]:
# Separate the label column from the predictor columns.
y = df['target']
X = df.drop(columns='target')
print("Shape of X (features):", X.shape)
print("Shape of y (target):", y.shape)

## 5. Train-Test Split

In [None]:
# Hold out 20% of the rows for testing. stratify=y keeps the class proportions
# of the target the same in both partitions, so test metrics are not skewed by
# an unlucky split; random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

In [None]:
# Feature scaling: learn the scaling statistics from the training split only,
# then apply that same fitted transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

for split_name, split_data in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"Shape of {split_name}:", split_data.shape)

## 6. Baseline Models (Most-Frequent, Stratified, and Uniform Dummy Classifiers)

In [None]:
from sklearn.dummy import DummyClassifier
# Most Frequent: always predicts the majority class seen during fit.
baseline_mf = DummyClassifier(strategy="most_frequent")
baseline_mf.fit(X_train, y_train)
y_pred_mf = baseline_mf.predict(X_test)
acc_mf = accuracy_score(y_test, y_pred_mf)
print("Baseline Model - Most Frequent")
print("Accuracy:", acc_mf)
# This baseline never predicts the minority class, so that class's precision is
# undefined. zero_division=0 reports it as 0 explicitly instead of emitting an
# UndefinedMetricWarning.
print(
    "Classification Report:\n",
    classification_report(y_test, y_pred_mf, zero_division=0),
)

In [None]:
# Stratified: a dummy classifier using the "stratified" strategy, seeded for
# repeatable predictions.
baseline_strat = DummyClassifier(strategy="stratified", random_state=42).fit(X_train, y_train)
y_pred_strat = baseline_strat.predict(X_test)
acc_strat = accuracy_score(y_test, y_pred_strat)
strat_report = classification_report(y_test, y_pred_strat)
print("Baseline Model - Stratified")
print("Accuracy:", acc_strat)
print("Classification Report:\n", strat_report)

In [None]:
# Uniform: a dummy classifier using the "uniform" strategy, seeded for
# repeatable predictions.
baseline_uniform = DummyClassifier(random_state=42, strategy="uniform")
y_pred_uniform = baseline_uniform.fit(X_train, y_train).predict(X_test)
acc_uniform = accuracy_score(y_test, y_pred_uniform)
uniform_report = classification_report(y_test, y_pred_uniform)
print("Baseline Model - Uniform")
print("Accuracy:", acc_uniform)
print("Classification Report:\n", uniform_report)

## 7. Build and Train Multiple Models

In [None]:
# Candidate classifiers, keyed by display name. Every estimator with a
# stochastic component is given a fixed seed so that the reported metrics,
# ROC curves and feature importances are reproducible from run to run
# (matching the random_state=42 used elsewhere in this notebook).
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    # probability=True is required for predict_proba (used for ROC AUC below).
    "Support Vector Machine": SVC(probability=True, random_state=42),
    "Naive Bayes": GaussianNB(),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
}

# One (name, accuracy, roc_auc) tuple per model, in insertion order.
results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Column 1 of predict_proba is the probability of the positive class.
    y_proba = model.predict_proba(X_test)[:, 1]
    acc = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_proba)
    print(f"\n{name}")
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Accuracy:", acc)
    print("ROC AUC Score:", auc)
    results.append((name, acc, auc))

# Add baselines to the front of the results; no ROC AUC was computed for them,
# so that field is None.
baseline_rows = [
    ("Baseline (Most Frequent)", acc_mf, None),
    ("Baseline (Stratified)", acc_strat, None),
    ("Baseline (Uniform)", acc_uniform, None),
]
results = baseline_rows + results

## 8. Model Comparison Table

In [None]:
# Tabulate the collected metrics and list the models from highest to lowest
# ROC AUC.
comparison_columns = ["Model", "Accuracy", "ROC AUC"]
results_df = pd.DataFrame(data=results, columns=comparison_columns)
ranked_results = results_df.sort_values(by="ROC AUC", ascending=False)
print("\nModel Comparison:")
print(ranked_results)

In [None]:
# ROC curve for every trained model, drawn on one shared figure.
plt.figure(figsize=(10, 8))
for name, model in models.items():
    try:
        # Column 1 of predict_proba holds the positive-class probability.
        y_proba = model.predict_proba(X_test)[:, 1]
        fpr, tpr, _ = roc_curve(y_test, y_proba)
        auc_score = roc_auc_score(y_test, y_proba)
        curve_label = f"{name} (AUC = {auc_score:.2f})"
        plt.plot(fpr, tpr, label=curve_label)
    except AttributeError:
        # Estimators without predict_proba cannot be plotted here.
        print(f"Skipping ROC curve for {name}: no predict_proba method.")

# Dashed diagonal = performance of a random classifier.
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
roc_axes = plt.gca()
roc_axes.set_xlabel("False Positive Rate")
roc_axes.set_ylabel("True Positive Rate")
roc_axes.set_title("ROC Curves of All Models")
roc_axes.legend()
plt.show()

## 9. Feature Importance (Random Forest Example)

In [None]:
# Horizontal bar chart of the fitted Random Forest's feature importances,
# labelled with the predictor column names (all columns except 'target').
features = df.columns.drop('target')
rf_model = models["Random Forest"]
importances = rf_model.feature_importances_

plt.figure(figsize=(10, 6))
sns.barplot(y=features, x=importances)
plt.gca().set_title("Feature Importance (Random Forest)")
plt.show()

## 10. Cross-Validation

In [None]:
from sklearn.model_selection import cross_val_score, StratifiedKFold

# 5-fold stratified CV with shuffling; the seed fixes the fold assignment.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

print("Cross-Validation Results:")
for name, model in models.items():
    scores = cross_val_score(estimator=model, X=X_train, y=y_train, scoring='accuracy', cv=cv)
    mean_acc = scores.mean()
    std_acc = scores.std()
    print(f"{name}: {mean_acc:.4f} ± {std_acc:.4f}")

In [None]:
# Display the cross-validation result in a tabular format.
# First gather the per-fold accuracies of every model ...
fold_scores = {}
for name, model in models.items():
    scores = cross_val_score(estimator=model, X=X_train, y=y_train, scoring='accuracy', cv=cv)
    fold_scores[name] = scores

# ... then reduce each model's folds to one summary row.
results = [
    {
        'Model': model_name,
        'Mean Accuracy': model_scores.mean(),
        'Std Dev': model_scores.std(),
    }
    for model_name, model_scores in fold_scores.items()
]

# Best mean accuracy first, values rounded to four decimals for display.
results_df = (
    pd.DataFrame(results)
    .sort_values(by='Mean Accuracy', ascending=False)
    .round(4)
)

print("Cross-Validation Results:")
display(results_df)


## 11. Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

# Search space for the Random Forest: 4 * 4 * 3 * 3 = 144 combinations,
# each evaluated on every fold of `cv`.
param_grid = {
    'n_estimators': [100, 200, 300, 400],
    'max_depth': [None, 10, 20, 40],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# n_jobs=-1 spreads the candidate fits across all available CPU cores; the
# selected parameters and scores are unchanged because the estimator is seeded.
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

print("Best Parameters (Random Forest):", grid_search.best_params_)
print("Best Cross-Validated Accuracy:", grid_search.best_score_)

In [None]:
# Score the grid search's best estimator on the held-out test split.
best_rf = grid_search.best_estimator_
y_rf_pred = best_rf.predict(X_test)
best_rf_accuracy = accuracy_score(y_test, y_rf_pred)
best_rf_report = classification_report(y_test, y_rf_pred)
print("Test Accuracy (Best RF):", best_rf_accuracy)
print("Classification Report (Best RF):\n", best_rf_report)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Plot the test-set confusion matrix of the tuned Random Forest.
cm_display = ConfusionMatrixDisplay.from_estimator(
    estimator=best_rf, X=X_test, y=y_test, cmap='Blues'
)
cm_display.ax_.set_title("Confusion Matrix - Best RF Model")
plt.show()

## 12. Save the Best Model

In [None]:
import pickle

# Persist the tuned Random Forest found by the grid search rather than the
# untuned default instance stored in `models["Random Forest"]`, so the saved
# artifact is the model that was actually selected as best.
best_model = grid_search.best_estimator_
with open('heart_disease_model.pkl', 'wb') as file:
    pickle.dump(best_model, file)

# ✅ Save the fitted scaler as well: inputs must be transformed with the same
# scaling at deployment time before they are passed to the model.
with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

# or Joblib
# import joblib
# best_model = models["Gradient Boosting"]
# joblib.dump(best_model, 'heart_disease_model.pkl')
# ✅ Save the scaler for use in deployment
#joblib.dump(scaler, 'scaler.pkl')
          