# Heart Disease Prediction - Machine Learning Project

## Project Overview
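This project predicts heart disease with five machine-learning models (Logistic Regression, Decision Tree, Random Forest, SVM, and KNN) and compares them using accuracy metrics, overfitting checks, confusion matrices, ROC curves, learning curves, and cross-validation.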

**Features:** age, sex, cp, trestbps, chol, fbs, restecg, thalach, exang, oldpeak, slope, ca, thal. **Label:** target (0 = no disease, 1 = disease)

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, learning_curve
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides library warnings (e.g. convergence or
# deprecation notices) — confirm that is intended.
warnings.filterwarnings('ignore')
# Apply the named matplotlib style sheet to all figures drawn below.
plt.style.use('seaborn-v0_8-darkgrid')
# Seed NumPy's global random number generator.
np.random.seed(42)
print("Libraries imported!")

## 2. Load Dataset

In [None]:
# Load heart.csv into `df`.
# In Google Colab, prompt for an upload first so the file lands in the working
# directory; anywhere else, heart.csv must already be there.
try:
    from google.colab import files
except ImportError:
    # Only a missing Colab package means "not in Colab"; any other error
    # should surface instead of being swallowed.
    files = None

if files is not None:
    print("Upload heart.csv file")
    try:
        uploaded = files.upload()
    except Exception as exc:
        # Keep the best-effort fallback, but tell the user why it happened.
        print(f"Upload failed ({exc}); trying local heart.csv instead")

df = pd.read_csv('heart.csv')
print(f"Dataset loaded: {df.shape}")
df.head()

## 3. Exploratory Data Analysis (EDA)

In [None]:
# Print a concise summary of the DataFrame (columns, non-null counts, dtypes).
df.info()

In [None]:
# Descriptive statistics per column; shown as the cell's output.
df.describe()

In [None]:
# Total number of missing cells across the whole table.
missing_total = df.isnull().sum().sum()
print("Missing values:", missing_total)

# How many rows fall into each target class.
class_counts = df['target'].value_counts()
print("\nTarget distribution:")
print(class_counts)

## 4. Visualizations

In [None]:
# Show the class balance as a bar chart (counts) and a pie chart (shares).
class_labels = ['No Disease', 'Disease']
class_colors = ['#2ecc71', '#e74c3c']

# value_counts() orders classes by frequency, not by value, so the counts
# must be aligned with the fixed label order explicitly; otherwise the
# majority class is always drawn as 'No Disease'.
# Assumes target is encoded 0 = no disease, 1 = disease.
target_counts = df['target'].value_counts().reindex([0, 1], fill_value=0)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
axes[0].bar(class_labels, target_counts.values, color=class_colors)
axes[0].set_title('Target Distribution', fontsize=14, fontweight='bold')
axes[0].set_ylabel('Count')
axes[1].pie(target_counts.values, labels=class_labels, autopct='%1.1f%%',
            colors=class_colors, explode=(0.05, 0.05))
axes[1].set_title('Target Proportion', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
corr_matrix = df.corr()
heatmap_options = dict(
    annot=True,
    cmap='coolwarm',
    center=0,
    fmt='.2f',
    linewidths=1,
    square=True,
)

plt.figure(figsize=(14, 10))
sns.heatmap(corr_matrix, **heatmap_options)
plt.title('Feature Correlation Heatmap', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

In [None]:
# Rank features by the strength (absolute value) of their correlation with
# the target. The target's self-correlation is removed by label rather than
# by position, so a feature that ties at 1.0 can never be dropped by mistake.
target_corr = (
    df.corr()['target']
    .drop('target')
    .abs()
    .sort_values(ascending=False)
)
plt.figure(figsize=(12, 6))
target_corr.plot(kind='barh', color='steelblue')
plt.title('Feature Correlation with Target', fontsize=14, fontweight='bold')
plt.xlabel('Absolute Correlation')
plt.tight_layout()
plt.show()

## 5. Data Preprocessing

In [None]:
# Separate the label column from the predictor columns.
y = df['target']
X = df.drop(columns='target')

print(f"Features: {X.shape}")
print(f"Target: {y.shape}")

In [None]:
# Two-stage stratified split: hold out 30% first, then halve that hold-out
# into validation and test parts.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

# Report each part's row count and its share of the full dataset.
total_rows = len(df)
for split_name, split_frame in (("Training", X_train), ("Validation", X_val), ("Test", X_test)):
    n_rows = split_frame.shape[0]
    print(f"{split_name}: {n_rows} ({n_rows/total_rows*100:.1f}%)")

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to every split.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(part) for part in (X_train, X_val, X_test)
)
print("Feature scaling completed!")

## 6. Model Training

In [None]:
# Candidate classifiers, keyed by the display name used in tables and plots.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42, C=0.5),
    'Decision Tree': DecisionTreeClassifier(
        random_state=42, max_depth=5, min_samples_split=10
    ),
    'Random Forest': RandomForestClassifier(
        n_estimators=50, random_state=42, max_depth=8, min_samples_split=8
    ),
    'SVM': SVC(kernel='rbf', random_state=42, C=0.8, probability=True),
    'KNN': KNeighborsClassifier(n_neighbors=7, weights='uniform'),
}

trained_models = {}  # display name -> fitted estimator
results = []         # one metrics dict per model

for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train_scaled, y_train)

    # Accuracy on each of the three splits.
    test_predictions = model.predict(X_test_scaled)
    acc_train = accuracy_score(y_train, model.predict(X_train_scaled))
    acc_val = accuracy_score(y_val, model.predict(X_val_scaled))
    acc_test = accuracy_score(y_test, test_predictions)

    # 5-fold cross-validation on the scaled training split.
    fold_scores = cross_val_score(model, X_train_scaled, y_train, cv=5)

    summary = {
        'Model': name,
        'Train Accuracy': acc_train,
        'Validation Accuracy': acc_val,
        'Test Accuracy': acc_test,
        'CV Score': fold_scores.mean(),
        'Precision': precision_score(y_test, test_predictions),
        'Recall': recall_score(y_test, test_predictions),
        'F1 Score': f1_score(y_test, test_predictions),
        # Gap between train and test accuracy.
        'Overfitting': acc_train - acc_test,
    }
    results.append(summary)
    trained_models[name] = model
    print(f"  Train: {acc_train:.4f}, Val: {acc_val:.4f}, Test: {acc_test:.4f}")

print("\nAll models trained!")

## 7. Model Evaluation

In [None]:
# Build the comparison table and pick the best model.
# Models are ranked on the validation split (cross-validation score breaks
# ties) so the test split is never used for selection; the winner's test
# accuracy then remains an unbiased estimate of its performance.
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(
    ['Validation Accuracy', 'CV Score'], ascending=False
).reset_index(drop=True)
print("Model Performance Summary:")
print(results_df.to_string(index=False))
best_row = results_df.iloc[0]
print(
    f"\nBest (by validation accuracy): {best_row['Model']}"
    f" - Test Accuracy: {best_row['Test Accuracy']:.4f}"
)

In [None]:
# Grouped bar charts: accuracy per split on the left, precision/recall/F1 on
# the right. Each panel is described once and drawn by the same loop.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
x_pos = np.arange(len(results_df))
width = 0.25

panels = [
    (axes[0], 'Accuracy', 'Model Accuracy Comparison',
     [('Train Accuracy', 'Train'),
      ('Validation Accuracy', 'Validation'),
      ('Test Accuracy', 'Test')]),
    (axes[1], 'Score', 'Performance Metrics',
     [('Precision', 'Precision'),
      ('Recall', 'Recall'),
      ('F1 Score', 'F1')]),
]

for ax, y_label, title, series in panels:
    # Three bars per model, shifted left / centred / right of the tick.
    for offset, (column, legend_label) in zip((-width, 0, width), series):
        ax.bar(x_pos + offset, results_df[column], width, label=legend_label, alpha=0.8)
    ax.set_xlabel('Models', fontweight='bold')
    ax.set_ylabel(y_label, fontweight='bold')
    ax.set_title(title, fontweight='bold')
    ax.set_xticks(x_pos)
    ax.set_xticklabels(results_df['Model'], rotation=45, ha='right')
    ax.legend()
    ax.set_ylim([0.7, 1.0])

plt.tight_layout()
plt.show()

## 8. Overfitting Analysis

In [None]:
# Left panel: train-minus-test accuracy gap per model, coloured by severity.
# Right panel: train vs test accuracy scatter with the y = x reference line.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
gap_ax, scatter_ax = axes[0], axes[1]

colors = []
for gap in results_df['Overfitting']:
    if gap < 0.05:
        colors.append('green')
    elif gap < 0.1:
        colors.append('orange')
    else:
        colors.append('red')

gap_ax.barh(results_df['Model'], results_df['Overfitting'], color=colors, alpha=0.7)
gap_ax.axvline(x=0.05, color='orange', linestyle='--', label='Warning')
gap_ax.axvline(x=0.1, color='red', linestyle='--', label='Overfitting')
gap_ax.set_xlabel('Overfitting Gap', fontweight='bold')
gap_ax.set_title('Overfitting Analysis', fontweight='bold')
gap_ax.legend()

scatter_ax.scatter(results_df['Train Accuracy'], results_df['Test Accuracy'], s=200, alpha=0.6)
accuracy_pair = results_df[['Train Accuracy', 'Test Accuracy']]
min_acc = accuracy_pair.min().min()
max_acc = accuracy_pair.max().max()
scatter_ax.plot([min_acc, max_acc], [min_acc, max_acc], 'r--', label='Perfect', linewidth=2)
for model_label, train_score, test_score in zip(
    results_df['Model'], results_df['Train Accuracy'], results_df['Test Accuracy']
):
    scatter_ax.annotate(model_label, (train_score, test_score),
                        xytext=(5, 5), textcoords='offset points', fontsize=9)
scatter_ax.set_xlabel('Train Accuracy', fontweight='bold')
scatter_ax.set_ylabel('Test Accuracy', fontweight='bold')
scatter_ax.set_title('Train vs Test', fontweight='bold')
scatter_ax.legend()

plt.tight_layout()
plt.show()

# Text version of the same thresholds used for the bar colours.
print("\nOverfitting Summary:")
for model_label, gap in zip(results_df['Model'], results_df['Overfitting']):
    if gap < 0.05:
        status = "Good"
    elif gap < 0.1:
        status = "Slight"
    else:
        status = "Significant"
    print(f"{model_label:25s} - {gap:.4f} - {status}")

## 9. Confusion Matrices

In [None]:
# One confusion-matrix heatmap per trained model, evaluated on the test split.
# The grid is sized from the number of models (three per row) instead of
# being fixed at 2x3, so adding or removing a model cannot overflow the grid
# or leave the wrong panel blank.
n_models = len(trained_models)
n_cols = 3
n_rows = max(1, -(-n_models // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 6 * n_rows), squeeze=False)
axes = axes.ravel()  # squeeze=False guarantees a 2-D array to flatten

for ax, (name, model) in zip(axes, trained_models.items()):
    y_pred = model.predict(X_test_scaled)
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax,
                xticklabels=['No Disease', 'Disease'],
                yticklabels=['No Disease', 'Disease'])
    ax.set_title(f'{name}\nAcc: {accuracy_score(y_test, y_pred):.4f}', fontweight='bold')
    ax.set_ylabel('Actual')
    ax.set_xlabel('Predicted')

# Remove whatever panels are left over in the last row.
for unused_ax in axes[n_models:]:
    fig.delaxes(unused_ax)
plt.tight_layout()
plt.show()

## 10. ROC Curves

In [None]:
# ROC curve and AUC for every trained model on the test split.
plt.figure(figsize=(12, 8))
for name, model in trained_models.items():
    # Use the positive-class probability when the model offers one;
    # otherwise fall back to the raw decision score.
    if not hasattr(model, "predict_proba"):
        test_scores = model.decision_function(X_test_scaled)
    else:
        test_scores = model.predict_proba(X_test_scaled)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, test_scores)
    area = auc(fpr, tpr)
    curve_label = f'{name} (AUC={area:.3f})'
    plt.plot(fpr, tpr, linewidth=2, label=curve_label)

# Diagonal reference line.
plt.plot([0, 1], [0, 1], 'k--', linewidth=2, label='Random')
plt.xlabel('False Positive Rate', fontweight='bold')
plt.ylabel('True Positive Rate', fontweight='bold')
plt.title('ROC Curves', fontweight='bold')
plt.legend(loc='lower right')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## 11. Learning Curves

In [None]:
# Learning curve (training vs cross-validated accuracy against training-set
# size) for every trained model.
# The grid is sized from the number of models (three per row) instead of
# being fixed at 2x3, so adding or removing a model cannot overflow the grid
# or leave the wrong panel blank.
n_models = len(trained_models)
n_cols = 3
n_rows = max(1, -(-n_models // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 6 * n_rows), squeeze=False)
axes = axes.ravel()  # squeeze=False guarantees a 2-D array to flatten

for ax, (name, model) in zip(axes, trained_models.items()):
    train_sizes, train_scores, val_scores = learning_curve(
        model, X_train_scaled, y_train, cv=5,
        train_sizes=np.linspace(0.1, 1.0, 10), scoring='accuracy', n_jobs=-1
    )
    # Mean and spread across the CV folds at each training size.
    train_mean = train_scores.mean(axis=1)
    train_std = train_scores.std(axis=1)
    val_mean = val_scores.mean(axis=1)
    val_std = val_scores.std(axis=1)
    ax.plot(train_sizes, train_mean, 'o-', color='r', label='Training')
    ax.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.1, color='r')
    ax.plot(train_sizes, val_mean, 'o-', color='g', label='Validation')
    ax.fill_between(train_sizes, val_mean - val_std, val_mean + val_std, alpha=0.1, color='g')
    ax.set_title(f'{name}', fontweight='bold')
    ax.set_xlabel('Training Examples')
    ax.set_ylabel('Accuracy')
    ax.legend(loc='lower right', fontsize=9)
    ax.set_ylim([0.65, 1.05])

# Remove whatever panels are left over in the last row.
for unused_ax in axes[n_models:]:
    fig.delaxes(unused_ax)
plt.tight_layout()
plt.show()

## 12. Best Model Analysis

In [None]:
# Detailed test-split report for the model in the first row of results_df.
best_model_name = results_df.iloc[0]['Model']
best_model = trained_models[best_model_name]
print(f"\nBest Model: {best_model_name}")

y_pred = best_model.predict(X_test_scaled)
report_text = classification_report(y_test, y_pred, target_names=['No Disease', 'Disease'])
print("\nClassification Report:")
print(report_text)

# Confusion-matrix cells: rows are actual classes, columns are predictions.
cm = confusion_matrix(y_test, y_pred)
cell_labels = (
    ("\nTrue Negatives", (0, 0)),
    ("False Positives", (0, 1)),
    ("False Negatives", (1, 0)),
    ("True Positives", (1, 1)),
)
for cell_name, (row_idx, col_idx) in cell_labels:
    print(f"{cell_name}: {cm[row_idx, col_idx]}")

## 13. Feature Importance

In [None]:
# Horizontal bar chart of feature importances for each tree-based model,
# most important feature at the top.
tree_models = ['Decision Tree', 'Random Forest']
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

for ax, model_name in zip(axes, tree_models):
    if model_name not in trained_models:
        continue  # leave this panel empty
    importances = trained_models[model_name].feature_importances_
    ranked = pd.DataFrame({'Feature': X.columns, 'Importance': importances})
    ranked = ranked.sort_values('Importance', ascending=False)
    ax.barh(ranked['Feature'], ranked['Importance'], color='steelblue', alpha=0.8)
    ax.set_xlabel('Importance', fontweight='bold')
    ax.set_title(f'Feature Importance - {model_name}', fontweight='bold')
    # barh draws the first row at the bottom; flip so it ends up on top.
    ax.invert_yaxis()

plt.tight_layout()
plt.show()

## 14. Cross-Validation

In [None]:
# 5-fold cross-validated accuracy on the scaled training split, per model.
cv_results = {}  # model name -> array of per-fold accuracies
print("Cross-Validation (5-Fold):")
for name, model in trained_models.items():
    fold_scores = cross_val_score(
        model, X_train_scaled, y_train, cv=5, scoring='accuracy'
    )
    cv_results[name] = fold_scores

    formatted_folds = [f'{s:.4f}' for s in fold_scores]
    print(f"\n{name}:")
    print(f"  Mean: {fold_scores.mean():.4f} (+/- {fold_scores.std():.4f})")
    print(f"  Scores: {formatted_folds}")

In [None]:
# Box plot of the per-fold CV accuracies, one box per model.
fold_score_sets = list(cv_results.values())
model_names = cv_results.keys()

plt.figure(figsize=(12, 6))
bp = plt.boxplot(fold_score_sets, labels=model_names, patch_artist=True)
# patch_artist=True makes the boxes fillable.
for box in bp['boxes']:
    box.set_facecolor('lightblue')

plt.ylabel('CV Accuracy', fontweight='bold')
plt.title('Cross-Validation Scores Distribution', fontweight='bold')
plt.xticks(rotation=45, ha='right')
plt.grid(True, alpha=0.3, axis='y')
plt.tight_layout()
plt.show()

## 15. Final Summary

In [None]:
# Console recap: dataset size, split sizes, the models in ranked order, and
# headline metrics for the first three rows of results_df.
banner = "=" * 80
heading_pad = " " * 25

print("\n" + banner)
print(heading_pad + "PROJECT SUMMARY")
print(banner)

print(f"\nDataset: {len(df)} samples, {X.shape[1]} features")
print(f"Train: {X_train.shape[0]} | Val: {X_val.shape[0]} | Test: {X_test.shape[0]}")

print("\nModels Trained:")
for number, model_label in enumerate(results_df['Model'], start=1):
    print(f"  {number}. {model_label}")

print("\nTop 3 Models:")
top_three = results_df.head(3)
for position, record in top_three.iterrows():
    # The row label doubles as the zero-based rank.
    print(f"\n  {position+1}. {record['Model']}")
    print(f"     Test Accuracy: {record['Test Accuracy']:.4f}")
    print(f"     F1 Score: {record['F1 Score']:.4f}")
    print(f"     Overfitting: {record['Overfitting']:.4f}")

print("\n" + banner)
print(heading_pad + "PROJECT COMPLETE!")
print(banner)

## 16. Save Results

In [None]:
# Write the comparison table to disk without the row index.
output_path = 'model_results.csv'
results_df.to_csv(output_path, index=False)
print(f"Results saved to {output_path}")