In [1]:
# 📦 Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import shap
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 📥 Load Dataset
df = pd.read_csv('Lifestyle_and_Health_Risk_Prediction_Synthetic_Dataset.csv')  # Replace with your dataset path

In [3]:
# 🧹 Preprocessing
target_col = 'health_risk'
X = df.drop(columns=[target_col])
y = df[target_col]

In [4]:
# Encode target label
target_encoder = LabelEncoder()
y_encoded = target_encoder.fit_transform(y)

In [5]:
# Encode categorical features
cat_features = ['exercise', 'sugar_intake', 'smoking', 'alcohol', 'married', 'profession']
encoders = {}
for col in cat_features:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    encoders[col] = le

In [6]:
# 🧪 Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42
)

In [7]:
# 🚀 Define Models to compare
models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42),
    'LightGBM': LGBMClassifier(random_state=42),
    'CatBoost': CatBoostClassifier(verbose=0, random_state=42)
}

results = {}
all_reports = {}

for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    results[name] = acc
    all_reports[name] = classification_report(
        y_test, y_pred, target_names=target_encoder.classes_, output_dict=True
    )
    print(f"Accuracy: {acc:.4f}")
    print(classification_report(y_test, y_pred, target_names=target_encoder.classes_))


Training Random Forest...
Accuracy: 0.9870
              precision    recall  f1-score   support

        high       0.99      0.99      0.99       708
         low       0.97      0.98      0.98       292

    accuracy                           0.99      1000
   macro avg       0.98      0.99      0.98      1000
weighted avg       0.99      0.99      0.99      1000


Training XGBoost...
Accuracy: 0.9950
              precision    recall  f1-score   support

        high       1.00      1.00      1.00       708
         low       0.99      0.99      0.99       292

    accuracy                           0.99      1000
   macro avg       0.99      0.99      0.99      1000
weighted avg       1.00      0.99      1.00      1000


Training LightGBM...
[LightGBM] [Info] Number of positive: 1218, number of negative: 2782
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000355 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memo

In [8]:
# Select best model
best_model_name = max(results, key=results.get)
best_model = models[best_model_name]
print(f"\n✅ Best Model: {best_model_name} with Accuracy = {results[best_model_name]:.4f}")


✅ Best Model: CatBoost with Accuracy = 0.9960


In [9]:
# Save best model and encoders
joblib.dump(best_model, 'health_risk_model.pkl')
joblib.dump(encoders, 'encoders.pkl')
joblib.dump(target_encoder, 'target_encoder.pkl')

['target_encoder.pkl']

In [10]:
# 🧠 SHAP explainability
print("Generating SHAP explainer and summary plot...")

explainer = shap.Explainer(best_model, X_train)
joblib.dump(explainer, 'shap_explainer.pkl')

shap_values = explainer(X_train)

plt.figure(figsize=(10, 6))
shap.summary_plot(shap_values, X_train, show=False)
plt.savefig('shap_summary_plot.png')
plt.close()

print("✅ SHAP explainer and summary plot saved!")

Generating SHAP explainer and summary plot...




✅ SHAP explainer and summary plot saved!
