# Heart Disease — EDA & Baseline Models
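This notebook walks through a compact exploratory analysis of the heart disease dataset and fits two simple baseline models (Linear Regression and Random Forest Regressor). Both models are trained as regressors on the `output` column; their predictions are later thresholded at 0.5 to obtain rough classification metrics.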

In [None]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# Optional: lift pandas' column-display limit (max_columns=None) so wide frames
# are rendered with every column instead of being elided in the middle.
pd.set_option('display.max_columns', None)


In [None]:
# Load data
# The path is relative to the notebook's working directory; point `csv_path`
# at the file's actual location if it lives elsewhere.
csv_path = 'heart-1.csv'
data = pd.read_csv(filepath_or_buffer=csv_path)

# Report the frame's dimensions, then preview its first rows as the cell output.
print("Shape:", data.shape)
data.head()

In [None]:
# Basic info / null checks
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly — wrapping it in print() would append a stray "None" line.
data.info()
# Per-column count of missing values.
print("\nNull counts:\n", data.isnull().sum())
# include='all' summarises every column, not only the numeric ones.
display(data.describe(include='all'))

In [None]:
# Quick distributions for a few numeric features
# One 20-bin histogram per column, each drawn on its own figure through the
# explicit Figure/Axes interface rather than the implicit pyplot state.
num_cols = ['age','trtbps','chol','thalachh','oldpeak']
for col in num_cols:
    fig, ax = plt.subplots()
    data[col].hist(bins=20, ax=ax)
    ax.set_title(f'Distribution: {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Count')
    plt.show()

In [None]:
# Train/test split
# `output` is the target column; every remaining column is used as a feature.
y = data['output']
X = data.drop('output', axis=1)

# Hold out 20% of the rows for evaluation. The split is stratified on the
# target and uses a fixed random_state so it is repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

X_train.shape, X_test.shape

In [None]:
# Linear Regression (as a regressor)
# fit() hands back the fitted estimator, so construction and training are chained.
lr = LinearRegression().fit(X_train, y_train)
lr_pred_reg = lr.predict(X_test)

# Score the held-out predictions with mean squared error and R².
lr_r2  = r2_score(y_true=y_test, y_pred=lr_pred_reg)
lr_mse = mean_squared_error(y_true=y_test, y_pred=lr_pred_reg)

print("Linear Regression — MSE:", lr_mse)
print("Linear Regression — R² :", lr_r2)

In [None]:
# Random Forest Regressor (baseline)
# 200 trees with no depth cap; random_state pins the forest's randomness and
# n_jobs=-1 lets scikit-learn parallelise over all available cores.
rf = RandomForestRegressor(n_estimators=200, max_depth=None, random_state=42, n_jobs=-1)
rf = rf.fit(X_train, y_train)
rf_pred_reg = rf.predict(X_test)

# Same two regression metrics as for the linear baseline, on the same test set.
rf_r2  = r2_score(y_true=y_test, y_pred=rf_pred_reg)
rf_mse = mean_squared_error(y_true=y_test, y_pred=rf_pred_reg)

print("Random Forest Regressor — MSE:", rf_mse)
print("Random Forest Regressor — R² :", rf_r2)

In [None]:
# (Optional) Treat the regression outputs as probabilities and threshold at 0.5 to get rough classification metrics
def threshold_preds(preds, thr=0.5):
    """Binarise continuous predictions: 1 where ``preds >= thr``, otherwise 0.

    Parameters
    ----------
    preds : array-like or scalar
        Continuous scores. Objects that already expose ``astype`` (e.g. NumPy
        arrays, pandas Series) are compared as-is and keep their container
        type; plain lists, tuples and Python scalars are first converted with
        ``np.asarray`` so they no longer fail on the comparison / ``astype``.
    thr : float, default 0.5
        Cut-off. A score exactly equal to ``thr`` maps to 1.

    Returns
    -------
    Same container kind as the (possibly converted) input, holding integer
    0/1 labels.
    """
    # Plain Python containers support neither element-wise `>=` nor `.astype`.
    if not hasattr(preds, "astype"):
        preds = np.asarray(preds)
    return (preds >= thr).astype(int)

# Rough classification view of each regressor: binarise its test-set predictions
# at 0.5, then print accuracy, precision/recall/F1, the confusion matrix and the
# per-class report against y_test.
for name, scores in {'LR': lr_pred_reg, 'RF': rf_pred_reg}.items():
    y_hat = threshold_preds(scores, thr=0.5)
    # The fourth element returned here is bound to `_` and not used.
    prec, rec, f1, _ = precision_recall_fscore_support(
        y_test, y_hat, average='binary', zero_division=0
    )
    acc = accuracy_score(y_test, y_hat)
    print(f"\n{name} thresholded @0.5 — Acc: {acc:.4f}  Prec: {prec:.4f}  Rec: {rec:.4f}  F1: {f1:.4f}")
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_hat))
    print("\nClassification report:\n", classification_report(y_test, y_hat, zero_division=0))

### Next ideas
- Try feature scaling and regularization (e.g., Ridge/Lasso) for the Linear Regression baseline.
- Tune RF hyperparameters with `RandomizedSearchCV`/`GridSearchCV`.
- Try Gradient Boosting models (XGBoost, LightGBM) or calibrated classifiers.
- Handle class imbalance if present (e.g., class weights, resampling).