In [5]:
# ============================================
# IMPROVED MODEL WITH SMOTE BALANCING
# ============================================

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings('ignore')

rule = "=" * 70
print(rule)
print("HEALTH PREDICTION MODEL TRAINING (WITH SMOTE)")
print(rule)

# ============================================
# 1. LOAD & PREPROCESS
# ============================================
# Read the survey data and derive the two 0/1 prediction targets.
df = pd.read_csv('CVD_cleaned.csv')
print(f"\nDataset: {len(df)} rows")

# Diabetes is positive for ANY answer whose text contains 'Yes' (substring
# match); missing answers count as 0.
# NOTE(review): confirm that every category containing 'Yes' should be positive.
diabetes_answers = df['Diabetes'].astype(str)
df['Diabetes_Binary'] = diabetes_answers.str.contains('Yes', regex=False, na=False).astype(int)

# Heart disease is positive only for the exact answer 'Yes'.
df['Heart_Disease_Binary'] = df['Heart_Disease'].eq('Yes').astype(int)

diabetes_rate = df['Diabetes_Binary'].mean()
heart_rate = df['Heart_Disease_Binary'].mean()
print(f"Diabetes: {diabetes_rate:.1%} | Heart Disease: {heart_rate:.1%}")

# ============================================
# 2. ENCODE FEATURES
# ============================================

# Yes/No answers -> 1/0. Anything other than the exact string 'Yes'
# (including a missing value) is encoded as 0.
binary_cols = ['Exercise', 'Skin_Cancer', 'Other_Cancer', 'Depression', 'Arthritis', 'Smoking_History']
for col in binary_cols:
    df[f'{col}_enc'] = (df[col] == 'Yes').astype(int)


def _encode_ordinal(frame, source_col, mapping):
    """Map an ordered categorical column to integer ranks, reporting unmapped labels.

    Args:
        frame: DataFrame that contains ``source_col``.
        source_col: Name of the categorical column to encode.
        mapping: Dict from category label to integer rank.

    Returns:
        Series of ranks aligned with ``frame``. Labels absent from ``mapping``
        become NaN (pandas ``Series.map`` behaviour) and are printed so they
        are not silently hidden by the median imputation further down.
    """
    encoded = frame[source_col].map(mapping)
    # Rows that had a value but received no rank: the label is not in `mapping`.
    unmapped = frame.loc[encoded.isna() & frame[source_col].notna(), source_col]
    if not unmapped.empty:
        labels = sorted(unmapped.astype(str).unique())
        print(f"WARNING: {len(unmapped)} rows in '{source_col}' have unmapped values {labels}")
    return encoded


# Ordinal scales: a larger rank means better self-reported health.
health_map = {'Poor': 1, 'Fair': 2, 'Good': 3, 'Very Good': 4, 'Excellent': 5}
df['General_Health_enc'] = _encode_ordinal(df, 'General_Health', health_map)

# A larger rank means a more recent checkup.
checkup_map = {'Never': 0, '5 or more years ago': 1, 'Within the past 5 years': 2,
               'Within the past 2 years': 3, 'Within the past year': 4}
df['Checkup_enc'] = _encode_ordinal(df, 'Checkup', checkup_map)

# Male -> 1, every other value -> 0.
df['Sex_enc'] = (df['Sex'] == 'Male').astype(int)

# Age brackets in ascending order.
age_map = {'18-24': 1, '25-29': 2, '30-34': 3, '35-39': 4, '40-44': 5, '45-49': 6,
           '50-54': 7, '55-59': 8, '60-64': 9, '65-69': 10, '70-74': 11, '75-79': 12, '80+': 13}
df['Age_enc'] = _encode_ordinal(df, 'Age_Category', age_map)

# Column order matters: any hand-built feature vector must follow it.
feature_columns = [
    'General_Health_enc', 'Checkup_enc', 'Exercise_enc', 'Skin_Cancer_enc',
    'Other_Cancer_enc', 'Depression_enc', 'Arthritis_enc', 'Sex_enc', 'Age_enc',
    'Height_(cm)', 'Weight_(kg)', 'BMI', 'Smoking_History_enc',
    'Alcohol_Consumption', 'Fruit_Consumption', 'Green_Vegetables_Consumption',
    'FriedPotato_Consumption'
]

features = df[feature_columns]

# Make the imputation visible instead of silently filling gaps.
missing_per_column = features.isna().sum()
missing_per_column = missing_per_column[missing_per_column > 0]
if not missing_per_column.empty:
    print(f"WARNING: imputing missing values with column medians: {missing_per_column.to_dict()}")

# NOTE(review): medians are taken over the full dataset, i.e. before the
# train/test split; consider imputing from training rows only.
X = features.fillna(features.median())
y_diabetes = df['Diabetes_Binary']
y_heart = df['Heart_Disease_Binary']

# ============================================
# 3. SPLIT & SCALE
# ============================================

# A single call splits the features and both targets together, so all six
# outputs share the same row partition. Stratification uses the diabetes
# label only.
split_parts = train_test_split(
    X, y_diabetes, y_heart,
    test_size=0.2,
    random_state=42,
    stratify=y_diabetes,
)
X_train, X_test, y_dia_train, y_dia_test, y_heart_train, y_heart_test = split_parts

# Fit the scaler on the training rows only, then apply it to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"\nOriginal training: {len(X_train)} samples")
for target_name, target_train in (("Diabetes", y_dia_train), ("Heart Disease", y_heart_train)):
    print(f"  {target_name}=Yes: {target_train.sum()} ({target_train.mean():.1%})")

# ============================================
# 4. APPLY SMOTE (BALANCE THE DATA)
# ============================================

def _report_balance(label, features, target):
    """Print the row count and positive-class share of a resampled training set."""
    print(f"{label} after SMOTE: {features.shape[0]} samples")
    print(f"  {label}=Yes: {target.sum()} ({target.mean():.1%})")


print("\nApplying SMOTE to balance classes...")

# The same sampler is reused for both targets; each fit_resample call
# starts from the scaled training matrix.
# NOTE(review): the feature matrix includes 0/1 and ordinal-encoded columns;
# check whether interpolated synthetic rows are acceptable for those columns.
smote = SMOTE(random_state=42, k_neighbors=5)

# Oversample for the diabetes target
X_dia_balanced, y_dia_balanced = smote.fit_resample(X_train_scaled, y_dia_train)
_report_balance("Diabetes", X_dia_balanced, y_dia_balanced)

# Oversample for the heart disease target
X_heart_balanced, y_heart_balanced = smote.fit_resample(X_train_scaled, y_heart_train)
_report_balance("Heart Disease", X_heart_balanced, y_heart_balanced)

# ============================================
# 5. TRAIN DIABETES MODEL (on balanced data)
# ============================================

dia_rule = "=" * 70
print("\n" + dia_rule)
print("TRAINING DIABETES MODEL")
print(dia_rule)

# Train on the SMOTE-balanced rows; evaluate on the untouched (scaled) test rows.
diabetes_params = {
    'n_estimators': 100,
    'max_depth': 12,
    'min_samples_split': 5,
    'random_state': 42,
    'n_jobs': -1,
}
diabetes_model = RandomForestClassifier(**diabetes_params)
diabetes_model.fit(X_dia_balanced, y_dia_balanced)

# Column 1 of predict_proba is the probability of the positive class.
dia_prob = diabetes_model.predict_proba(X_test_scaled)[:, 1]
dia_pred = diabetes_model.predict(X_test_scaled)

dia_auc = roc_auc_score(y_dia_test, dia_prob)
dia_acc = accuracy_score(y_dia_test, dia_pred)

print(f"\n✅ Accuracy: {dia_acc:.2%} | ROC-AUC: {dia_auc:.3f}")
print("\nClassification Report:")
dia_report = classification_report(y_dia_test, dia_pred, target_names=['No', 'Yes'])
print(dia_report)

# Row 1 of the confusion matrix holds the actual positives:
# [missed, caught].
cm = confusion_matrix(y_dia_test, dia_pred)
dia_caught = cm[1, 1]
dia_actual = cm[1].sum()
print(f"Caught {dia_caught}/{dia_actual} diabetic cases ({dia_caught/dia_actual:.1%} recall)")

# ============================================
# 6. TRAIN HEART DISEASE MODEL (on balanced data)
# ============================================

heart_rule = "=" * 70
print("\n" + heart_rule)
print("TRAINING HEART DISEASE MODEL")
print(heart_rule)

# Train on the SMOTE-balanced rows; evaluate on the untouched (scaled) test rows.
heart_params = {
    'n_estimators': 100,
    'max_depth': 12,
    'min_samples_split': 5,
    'random_state': 42,
    'n_jobs': -1,
}
heart_model = RandomForestClassifier(**heart_params)
heart_model.fit(X_heart_balanced, y_heart_balanced)

# Column 1 of predict_proba is the probability of the positive class.
heart_prob = heart_model.predict_proba(X_test_scaled)[:, 1]
heart_pred = heart_model.predict(X_test_scaled)

heart_auc = roc_auc_score(y_heart_test, heart_prob)
heart_acc = accuracy_score(y_heart_test, heart_pred)

print(f"\n✅ Accuracy: {heart_acc:.2%} | ROC-AUC: {heart_auc:.3f}")
print("\nClassification Report:")
heart_report = classification_report(y_heart_test, heart_pred, target_names=['No', 'Yes'])
print(heart_report)

# Row 1 of the confusion matrix holds the actual positives:
# [missed, caught].
cm = confusion_matrix(y_heart_test, heart_pred)
heart_caught = cm[1, 1]
heart_actual = cm[1].sum()
print(f"Caught {heart_caught}/{heart_actual} heart disease cases ({heart_caught/heart_actual:.1%} recall)")

# ============================================
# 7. SUMMARY
# ============================================

print("\n" + "="*70)
print("FINAL RESULTS")
print("="*70)
print(f"Diabetes:      Accuracy={dia_acc:.2%} | AUC={dia_auc:.3f}")
print(f"Heart Disease: Accuracy={heart_acc:.2%} | AUC={heart_auc:.3f}")
print("\n✅ Models ready (not saved yet)")
print("="*70)

# Test examples
print("\n" + "="*70)
print("SAMPLE PREDICTIONS")
print("="*70)


def _scale_profile(profile):
    """Scale one named feature profile with the fitted scaler.

    The profile is turned into a one-row DataFrame and reordered by
    ``feature_columns``, so the scaler receives the same column names and
    order it was fitted with. A missing key raises ``KeyError`` instead of
    silently shifting values into the wrong feature.
    """
    return scaler.transform(pd.DataFrame([profile])[feature_columns])


def _risk_line(label, scaled_row):
    """Format the positive-class probabilities of both models for one scaled row."""
    dia_risk = diabetes_model.predict_proba(scaled_row)[0][1] * 100
    heart_risk = heart_model.predict_proba(scaled_row)[0][1] * 100
    return f"{label}: Diabetes={dia_risk:.1f}% | Heart={heart_risk:.1f}%"


# Healthy person (values use the same encodings as the training features)
healthy_profile = {
    'General_Health_enc': 4, 'Checkup_enc': 4, 'Exercise_enc': 1,
    'Skin_Cancer_enc': 0, 'Other_Cancer_enc': 0, 'Depression_enc': 0,
    'Arthritis_enc': 0, 'Sex_enc': 0, 'Age_enc': 3,
    'Height_(cm)': 165, 'Weight_(kg)': 60, 'BMI': 22,
    'Smoking_History_enc': 0, 'Alcohol_Consumption': 2,
    'Fruit_Consumption': 30, 'Green_Vegetables_Consumption': 20,
    'FriedPotato_Consumption': 5,
}
healthy = _scale_profile(healthy_profile)
print("\n" + _risk_line("Healthy 30F", healthy))

# At-risk person
atrisk_profile = {
    'General_Health_enc': 2, 'Checkup_enc': 2, 'Exercise_enc': 0,
    'Skin_Cancer_enc': 0, 'Other_Cancer_enc': 0, 'Depression_enc': 0,
    'Arthritis_enc': 1, 'Sex_enc': 1, 'Age_enc': 9,
    'Height_(cm)': 170, 'Weight_(kg)': 95, 'BMI': 32.9,
    'Smoking_History_enc': 1, 'Alcohol_Consumption': 15,
    'Fruit_Consumption': 5, 'Green_Vegetables_Consumption': 3,
    'FriedPotato_Consumption': 20,
}
atrisk = _scale_profile(atrisk_profile)
print(_risk_line("At-risk 60M", atrisk))

print("\n" + "="*70)

HEALTH PREDICTION MODEL TRAINING (WITH SMOTE)

Dataset: 308854 rows
Diabetes: 13.9% | Heart Disease: 8.1%

Original training: 247083 samples
  Diabetes=Yes: 34254 (13.9%)
  Heart Disease=Yes: 19954 (8.1%)

Applying SMOTE to balance classes...
Diabetes after SMOTE: 425658 samples
  Diabetes=Yes: 212829 (50.0%)
Heart Disease after SMOTE: 454258 samples
  Heart Disease=Yes: 227129 (50.0%)

TRAINING DIABETES MODEL

✅ Accuracy: 77.32% | ROC-AUC: 0.790

Classification Report:
              precision    recall  f1-score   support

          No       0.93      0.80      0.86     53208
         Yes       0.33      0.60      0.42      8563

    accuracy                           0.77     61771
   macro avg       0.63      0.70      0.64     61771
weighted avg       0.84      0.77      0.80     61771

Caught 5129/8563 diabetic cases (59.9% recall)

TRAINING HEART DISEASE MODEL

✅ Accuracy: 82.30% | ROC-AUC: 0.822

Classification Report:
              precision    recall  f1-score   support

     

In [6]:
# ============================================
# SAVE MODELS
# ============================================
import pickle

# Everything needed to score new data later: both classifiers, the fitted
# scaler, and the ordered feature names (for Streamlit).
artifacts = {
    'diabetes_model.pkl': diabetes_model,
    'heart_model.pkl': heart_model,
    'scaler.pkl': scaler,
    'feature_columns.pkl': feature_columns,
}

# Write each object to its own pickle file, in the order listed above.
for artifact_path, artifact in artifacts.items():
    with open(artifact_path, 'wb') as f:
        pickle.dump(artifact, f)

print("\n✅ ALL MODELS SAVED!")
print("Files: diabetes_model.pkl, heart_model.pkl, scaler.pkl, feature_columns.pkl")


✅ ALL MODELS SAVED!
Files: diabetes_model.pkl, heart_model.pkl, scaler.pkl, feature_columns.pkl
