**Heart Disease Prediction Model**

**Import Libraries**

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, recall_score
import joblib
import pymongo
from dotenv import load_dotenv
import os
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

load_dotenv('../backend/.env')
print("Done")


Done


**Load Data**

In [2]:
data = pd.read_csv('../data/raw/heart_disease_uci.csv')
print(f"Shape: {data.shape}")
print("Columns:", data.columns.tolist())
print(f"Target counts:\n{data['num'].value_counts()}")
data.head()


Shape: (920, 16)
Columns: ['id', 'age', 'sex', 'dataset', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalch', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num']
Target counts:
num
0    411
1    265
2    109
3    107
4     28
Name: count, dtype: int64


Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


**Explore Data**

In [3]:
print("Data info:")
print(data.info())
print("\nMissing values:")
print(data.isnull().sum())
print("\nBasic stats:")
print(data.describe())


Data info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        920 non-null    int64  
 1   age       920 non-null    int64  
 2   sex       920 non-null    object 
 3   dataset   920 non-null    object 
 4   cp        920 non-null    object 
 5   trestbps  861 non-null    float64
 6   chol      890 non-null    float64
 7   fbs       830 non-null    object 
 8   restecg   918 non-null    object 
 9   thalch    865 non-null    float64
 10  exang     865 non-null    object 
 11  oldpeak   858 non-null    float64
 12  slope     611 non-null    object 
 13  ca        309 non-null    float64
 14  thal      434 non-null    object 
 15  num       920 non-null    int64  
dtypes: float64(5), int64(3), object(8)
memory usage: 115.1+ KB
None

Missing values:
id            0
age           0
sex           0
dataset       0
cp            0
trestbps     59


**Feature Engineering**

In [4]:
def clean_and_prep(df):
    new_df = df.copy()
    
    new_df = new_df.rename(columns={'num': 'target'})
    
    new_df['sex'] = new_df['sex'].map({'Male': 1, 'Female': 0})
    
    cp_map = {'typical angina': 0, 'atypical angina': 1, 'non-anginal': 2, 'asymptomatic': 3}
    new_df['cp'] = new_df['cp'].map(cp_map)
    
    new_df['fbs'] = new_df['fbs'].map({True: 1, False: 0})
    new_df['exang'] = new_df['exang'].map({True: 1, False: 0})
    
    restecg_map = {'normal': 0, 'st-t abnormality': 1, 'lv hypertrophy': 2}
    new_df['restecg'] = new_df['restecg'].map(restecg_map)
    
    slope_map = {'upsloping': 0, 'flat': 1, 'downsloping': 2}
    new_df['slope'] = new_df['slope'].map(slope_map)
    
    thal_map = {'normal': 1, 'fixed defect': 2, 'reversable defect': 3}
    new_df['thal'] = new_df['thal'].map(thal_map)
    
    if 'thalch' in new_df.columns and 'thalach' not in new_df.columns:
        new_df['thalach'] = new_df['thalch']
    
    new_df['target'] = (new_df['target'] > 0).astype(int)
    
    cols = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 
            'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target']
    new_df = new_df[cols].dropna()
    
    return new_df

def make_features(df):
    new_df = df.copy()
    
    new_df['chol_ratio'] = new_df['chol'] / new_df['age']
    new_df['hr_ratio'] = new_df['thalach'] / new_df['age']
    new_df['oldpeak_ratio'] = new_df['oldpeak'] / new_df['age']
    
    new_df['cp_hr'] = new_df['cp'] * new_df['thalach']
    new_df['sex_age'] = new_df['sex'] * new_df['age']
    
    new_df['age_bin'] = pd.cut(new_df['age'], bins=[0, 40, 55, 70, 100], 
                               labels=['young', 'middle', 'senior', 'old'])
    
    return new_df

clean_data = clean_and_prep(data)
print(f"After cleaning: {clean_data.shape}")

clean_data = make_features(clean_data)
print(f"After feature engineering: {clean_data.shape}")
print(f"Target distribution: {clean_data['target'].value_counts()}")


After cleaning: (299, 14)
After feature engineering: (299, 20)
Target distribution: target
0    160
1    139
Name: count, dtype: int64


**Prepare Data**

In [5]:
X = clean_data.drop('target', axis=1)
y = clean_data['target']

cat_features = ['age_bin']
num_features = [col for col in X.columns if col not in cat_features]

print(f"Categorical: {cat_features}")
print(f"Numerical: {num_features}")

prep = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_features),
        ('cat', OneHotEncoder(drop='first', sparse_output=False), cat_features)
    ]
)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
                                                    random_state=42, stratify=y)
print(f"Train: {X_train.shape}, Test: {X_test.shape}")


Categorical: ['age_bin']
Numerical: ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'chol_ratio', 'hr_ratio', 'oldpeak_ratio', 'cp_hr', 'sex_age']
Train: (239, 19), Test: (60, 19)


**Train Models**

In [6]:
models = {
    'lr': {
        'model': LogisticRegression(random_state=42),
        'params': {
            'model__C': [0.1, 1, 10],
            'model__penalty': ['l2']
        }
    },
    'rf': {
        'model': RandomForestClassifier(random_state=42),
        'params': {
            'model__n_estimators': [100, 200],
            'model__max_depth': [5, 10, None],
            'model__min_samples_split': [2, 5]
        }
    },
    'xgb': {
        'model': XGBClassifier(random_state=42, eval_metric='logloss'),
        'params': {
            'model__n_estimators': [100, 200],
            'model__max_depth': [3, 5, 7],
            'model__learning_rate': [0.01, 0.1]
        }
    }
}

best_model = None
best_score = 0
best_name = ""
results = {}

for name, config in models.items():
    print(f"\nTraining {name}...")
    
    pipe = Pipeline([
        ('prep', prep),
        ('model', config['model'])
    ])
    
    search = GridSearchCV(pipe, config['params'], cv=5, scoring='roc_auc', n_jobs=-1)
    search.fit(X_train, y_train)
    
    pred = search.predict(X_test)
    pred_prob = search.predict_proba(X_test)[:, 1]
    
    auc = roc_auc_score(y_test, pred_prob)
    rec = recall_score(y_test, pred)
    
    results[name] = {
        'model': search.best_estimator_,
        'auc': auc,
        'recall': rec,
        'params': search.best_params_
    }
    
    print(f"AUC: {auc:.4f}")
    print(f"Recall: {rec:.4f}")
    print(f"Best params: {search.best_params_}")
    
    if auc > best_score:
        best_score = auc
        best_model = search.best_estimator_
        best_name = name

print(f"\nBest: {best_name} with AUC: {best_score:.4f}")



Training lr...
AUC: 0.9330
Recall: 0.7857
Best params: {'model__C': 1, 'model__penalty': 'l2'}

Training rf...
AUC: 0.9163
Recall: 0.6786
Best params: {'model__max_depth': 5, 'model__min_samples_split': 5, 'model__n_estimators': 100}

Training xgb...
AUC: 0.9062
Recall: 0.6429
Best params: {'model__learning_rate': 0.01, 'model__max_depth': 3, 'model__n_estimators': 200}

Best: lr with AUC: 0.9330


**Save Model**

In [7]:
joblib.dump(best_model, '../model.joblib')
print(f"Saved {best_name} model to model.joblib")

print("\nAll results:")
for name, res in results.items():
    print(f"{name}: AUC={res['auc']:.4f}, Recall={res['recall']:.4f}")


Saved lr model to model.joblib

All results:
lr: AUC=0.9330, Recall=0.7857
rf: AUC=0.9163, Recall=0.6786
xgb: AUC=0.9062, Recall=0.6429


**Save to Database**

In [8]:
def save_data(df, collection='patients'):
    try:
        uri = os.getenv('MONGO_URI')
        if not uri:
            print("No MONGO_URI found")
            return
        
        client = pymongo.MongoClient(uri)
        db = client.heart_disease_db
        coll = db[collection]
        
        coll.delete_many({})
        
        records = df.to_dict('records')
        for rec in records:
            rec['created_at'] = datetime.utcnow()
            rec['source'] = 'notebook'
        
        result = coll.insert_many(records)
        print(f"Saved {len(result.inserted_ids)} records to {collection}")
        
        coll.create_index([('created_at', -1)])
        client.close()
        
    except Exception as e:
        print(f"Database error: {e}")

save_data(clean_data)


Saved 299 records to patients


**Feature Importance**

In [9]:
def get_importance(model, feature_names):
    try:
        if hasattr(model.named_steps['model'], 'feature_importances_'):
            imp = model.named_steps['model'].feature_importances_
        elif hasattr(model.named_steps['model'], 'coef_'):
            imp = np.abs(model.named_steps['model'].coef_[0])
        else:
            return None
        
        names = model.named_steps['prep'].get_feature_names_out()
        
        df = pd.DataFrame({
            'feature': names,
            'importance': imp
        }).sort_values('importance', ascending=False)
        
        return df
    except Exception as e:
        print(f"Error: {e}")
        return None

importance = get_importance(best_model, X.columns)
if importance is not None:
    print("\nTop 10 features:")
    print(importance.head(10))
else:
    print("Could not get feature importance")

print("\nDone!")



Top 10 features:
                feature  importance
19  cat__age_bin_senior    1.313660
11              num__ca    1.062286
12            num__thal    0.781825
0              num__age    0.662419
14        num__hr_ratio    0.483439
3         num__trestbps    0.481944
20   cat__age_bin_young    0.467620
1              num__sex    0.444718
6          num__restecg    0.442851
15   num__oldpeak_ratio    0.394705

Done!
