In [None]:
# ==============================================
# 1. Import Libraries
# ==============================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix

import pickle

In [None]:
# ==============================================
# 2. Load Dataset
# ==============================================
# Read the CSV (path is relative to the notebook's working directory) into
# the DataFrame `df`, which every later cell works from.
df = pd.read_csv("heart.csv")
# Last expression of the cell: renders a preview of the first rows.
df.head()

In [None]:
# ==============================================
# 3. Exploratory Data Analysis (EDA)
# ==============================================
# Jupyter only renders the *last* bare expression of a cell, so intermediate
# tables are passed to display() explicitly (display is injected into the
# namespace by the IPython kernel; no import is required in a notebook).

# Check dataset info (info() prints to stdout itself)
df.info()

# Summary statistics
display(df.describe())

# Check null values
display(df.isnull().sum())

# Correlation heatmap.
# Restricted to numeric columns: DataFrame.corr() raises on recent pandas
# versions if any column holds non-numeric values.
plt.figure(figsize=(12,8))
sns.heatmap(df.select_dtypes(include="number").corr(), annot=True, cmap='coolwarm')
plt.title("Feature Correlation")
plt.show()

# Distribution of target
sns.countplot(x='target', data=df)
plt.title("Target Distribution")
plt.show()

# High-impact feature visualizations: one box plot per feature, grouped by target
for column, title in [
    ('chol', "Cholesterol vs Heart Disease"),
    ('thalach', "Max Heart Rate vs Heart Disease"),
]:
    sns.boxplot(x='target', y=column, data=df)
    plt.title(title)
    plt.show()

# Chest pain type counts, split by target
sns.countplot(x='cp', hue='target', data=df)
plt.title("Chest Pain Type vs Heart Disease")
plt.show()

In [None]:
# ==============================================
# 4. Data Preprocessing & Feature Engineering
# ==============================================
# Encode categorical variables.
# One LabelEncoder per column: re-fitting a single shared encoder replaces its
# classes_ on every fit, so the mappings of earlier columns would be lost and
# could neither be inverted nor reused on new data.
categorical_cols = ['sex', 'cp', 'thal']
label_encoders = {}
for col in categorical_cols:
    label_encoders[col] = LabelEncoder()
    df[col] = label_encoders[col].fit_transform(df[col])

# Backward compatibility: `le` used to end up as the encoder fitted on 'thal'.
le = label_encoders['thal']

# Separate features and target
X = df.drop('target', axis=1)
y = df['target']

# Scale numeric features.
# NOTE(review): the scaler is fitted on every row here. If X_scaled is split
# into train/test afterwards, statistics of the test rows have already
# influenced the scaling (data leakage).
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
# ==============================================
# 5. Split Data into Train/Test
# ==============================================
# Split the *unscaled* features first and fit the scaler on the training rows
# only. Fitting the scaler on the full dataset before splitting lets the mean
# and variance of the test rows leak into training and makes the reported
# test metrics optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Re-bind `scaler` to the train-only fit so that any later use of it (e.g.
# persisting it for deployment) matches what the models were trained on.
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)  # ndarray, as before
X_test = scaler.transform(X_test_raw)    # scaled with training statistics only


In [None]:
# ==============================================
# 6. Model Training & Hyperparameter Tuning
# ==============================================
# Candidate estimators. The random forest is seeded so that a
# Restart & Run All reproduces the same model, metrics and pickle.
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM": SVC()
}

# Hyperparameter grid searched for each estimator (keys match `models`).
params = {
    "Logistic Regression": {"C": [0.1, 1, 10]},
    "Random Forest": {"n_estimators": [100, 200], "max_depth": [4, 6, 8]},
    "SVM": {"C": [0.1, 1, 10], "kernel": ["linear", "rbf"]}
}

# Best estimator found by the grid search, keyed by model name.
best_models = {}

for name in models:
    print(f"--- Training {name} ---")
    # 5-fold cross-validated grid search on the training split, ranked by F1.
    clf = GridSearchCV(models[name], params[name], cv=5, scoring='f1')
    clf.fit(X_train, y_train)
    print(f"Best params: {clf.best_params_} (CV F1 = {clf.best_score_:.3f})")

    # Evaluate the refitted best estimator on the held-out test split.
    y_pred = clf.predict(X_test)

    print(classification_report(y_test, y_pred))

    # Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f"{name} Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.show()

    best_models[name] = clf.best_estimator_

# Save the best model (Random Forest as example).
# The context manager guarantees the file is flushed and closed.
with open("best_model.pkl", "wb") as model_file:
    pickle.dump(best_models["Random Forest"], model_file)

In [None]:
# ==============================================
# 7. Feature Importance (Random Forest)
# ==============================================
# Label each importance score of the tuned forest with its feature column
# and rank the scores from most to least important.
features = X.columns
importances = best_models["Random Forest"].feature_importances_
feat_importance = (
    pd.Series(importances, index=features)
    .sort_values(ascending=False)
)

# Horizontal bar chart: score on the x-axis, feature name on the y-axis.
plt.figure(figsize=(10,6))
sns.barplot(
    x=feat_importance,
    y=feat_importance.index,
)
plt.title("Feature Importance (Random Forest)")
plt.show()


In [None]:
# ==============================================
# 8. Prediction Function
# ==============================================
def predict_heart_disease(model, input_data, scaler=None):
    """Classify a single record and return a human-readable verdict.

    Parameters
    ----------
    model : object
        Fitted classifier exposing ``predict``.
    input_data : array-like
        Feature values of one record; reshaped to a single row before
        prediction.
    scaler : object, optional
        Fitted transformer exposing ``transform``. When given, it is applied
        to the row before prediction so that raw (unscaled) feature values
        can be passed. Leave as ``None`` when ``input_data`` is already
        scaled.

    Returns
    -------
    str
        ``"Heart Disease Detected"`` if the predicted label equals 1,
        otherwise ``"No Heart Disease"``.
    """
    # Estimators expect a 2-D (n_samples, n_features) input; build one row.
    input_array = np.asarray(input_data).reshape(1, -1)
    if scaler is not None:
        input_array = scaler.transform(input_array)
    prediction = model.predict(input_array)
    return "Heart Disease Detected" if prediction[0] == 1 else "No Heart Disease"

# Example usage
# X_test already holds scaled feature rows, so the record is passed to the
# model as-is. The returned message is the cell's displayed output.
example_input = X_test[0]  # First test record
predict_heart_disease(best_models["Random Forest"], example_input)

In [None]:
# ==============================================
# 9. Save Preprocessing Objects (Optional)
# ==============================================
# Save scaler for deployment.
# The context manager guarantees the file is flushed and closed; passing a
# bare open(...) to pickle.dump leaves the handle open until it is
# garbage-collected.
with open("scaler.pkl", "wb") as scaler_file:
    pickle.dump(scaler, scaler_file)