# 🧪 Notebook 6 — Model Deployment & Inference

**Objective:**  

Prepare the final, tuned model for deployment. This notebook will:
 
- Load the best model(s) from Notebook 5.  
  
- Build preprocessing + prediction pipeline.  
  
- Create functions for inference on new data.  
  
- Test predictions with sample inputs.  
  
- Visualize feature importance for interpretability.  
  
- Save the pipeline for deployment.
 
 
This ensures that new data can be fed into the model consistently and predictions are reproducible.

---


In [28]:
## 6.1 Load Tuned Models & Training Schema

import os
import joblib
import pandas as pd

# Load tuned models
tuned_dir = "models/tuned_v2"
tuned_models = {
    "Logistic Regression": joblib.load(f"{tuned_dir}/best_logistic_regression_pipeline.pkl"),
    "Random Forest": joblib.load(f"{tuned_dir}/best_random_forest_pipeline.pkl"),
    "XGBoost": joblib.load(f"{tuned_dir}/best_xgboost_pipeline.pkl"),
    "LightGBM": joblib.load(f"{tuned_dir}/best_lightgbm_pipeline.pkl"),
}

print(f"✅ Loaded {len(tuned_models)} tuned models from {tuned_dir}")


✅ Loaded 4 tuned models from models/tuned_v2


In [29]:
import numpy as np

def get_expected_features(model_pipeline):
    """
    Extract original feature names from the pipeline.
    Falls back to None if unavailable.
    """
    if "preprocessor" in model_pipeline.named_steps:
        preprocessor = model_pipeline.named_steps["preprocessor"]
        if hasattr(preprocessor, "feature_names_in_"):
            return list(preprocessor.feature_names_in_)
    return None

def align_input(sample_data: pd.DataFrame, expected_features):
    """
    Align new input to match training schema.
    Missing cols -> filled with 0
    Extra cols -> dropped
    """
    return sample_data.reindex(columns=expected_features, fill_value=0)

def predict_pipeline(model_pipeline, new_data: pd.DataFrame):
    """
    Run inference using a preprocessing + model pipeline.
    Returns predicted class and probability.
    """
    pred_class = model_pipeline.predict(new_data)
    pred_proba = model_pipeline.predict_proba(new_data)[:, 1]
    return pred_class, pred_proba

def predict_from_dict(model_pipeline, patient_dict: dict):
    """
    Convenience wrapper: pass patient record as dict.
    Auto-aligns to training schema.
    """
    df = pd.DataFrame([patient_dict])
    expected_features = get_expected_features(model_pipeline)
    if expected_features is not None:
        df = align_input(df, expected_features)
    return predict_pipeline(model_pipeline, df)



In [30]:
# Example raw patient record (simple input)
sample_patient = {
    "age": 55,
    "sex": 1,
    "cp": 3,
    "trestbps": 140,
    "chol": 220,
    "fbs": 0,
    "restecg": 1,
    "thalch": 150,
    "exang": 0,
    "oldpeak": 1.5,
}

for name, model in tuned_models.items():
    pred_class, pred_proba = predict_from_dict(model, sample_patient)
    print(f"\n🔹 {name}")
    print("Prediction:", int(pred_class[0]))
    print("Probability:", round(pred_proba[0], 3))


🔹 Logistic Regression
Prediction: 0
Probability: 0.021

🔹 Random Forest
Prediction: 0
Probability: 0.417

🔹 XGBoost
Prediction: 0
Probability: 0.354

🔹 LightGBM
Prediction: 0
Probability: 0.221


In [31]:
# ## 6.4 Feature Importance (Optional)

import matplotlib.pyplot as plt
import seaborn as sns

for name, model in tuned_models.items():
    final_estimator = model[-1]  # last step in pipeline
    
    if hasattr(final_estimator, "feature_importances_"):
        importances = final_estimator.feature_importances_
        expected_features = get_expected_features(model)
        feature_names = expected_features if expected_features is not None else np.arange(len(importances))
        
        if len(importances) == len(feature_names):
            fi_df = pd.DataFrame({"Feature": feature_names, "Importance": importances})
            fi_df = fi_df.sort_values("Importance", ascending=False)
            
            plt.figure(figsize=(10,6))
            sns.barplot(x="Importance", y="Feature", data=fi_df, palette="viridis")
            plt.title(f"{name} - Feature Importances")
            plt.show()
        else:
            print(f"⚠️ Skipping {name}: mismatch between features and importances")


⚠️ Skipping Random Forest: mismatch between features and importances
⚠️ Skipping XGBoost: mismatch between features and importances
⚠️ Skipping LightGBM: mismatch between features and importances


In [32]:
## 6.5 Save Pipelines for Deployment
os.makedirs("models/deployment", exist_ok=True)

for name, model in tuned_models.items():
    file_name = f"models/deployment/final_{name.replace(' ', '_').lower()}_pipeline.pkl"
    joblib.dump(model, file_name)

print("✅ Final tuned pipelines saved for deployment in 'models/deployment'")

✅ Final tuned pipelines saved for deployment in 'models/deployment'


In [33]:
## 6.6 Enhanced Inference with Risk Bands & Explanations
import numpy as np
import pandas as pd

def risk_band(prob):
    """
    Translate probability into risk band.
    """
    if prob < 0.2:
        return "Low"
    elif prob < 0.5:
        return "Medium"
    else:
        return "High"

def enhanced_predict(model_pipeline, new_data: pd.DataFrame, top_n=3):
    """
    Enhanced prediction: class, probability, risk band,
    top contributing features, and recommendation.
    """
    # Align new data to match training schema
    expected_features = get_expected_features(model_pipeline)
    if expected_features is not None:
        new_data = align_input(new_data, expected_features)

    # Predict class + probability
    pred_class = int(model_pipeline.predict(new_data)[0])
    pred_proba = float(model_pipeline.predict_proba(new_data)[:, 1][0])
    band = risk_band(pred_proba)

    # Recommendation
    recommendation = (
        "Maintain healthy lifestyle" if band == "Low"
        else "Recommend further testing"
    )

    # Get feature names from preprocessor
    preprocessor = model_pipeline.named_steps.get("preprocessor")
    feature_names = (
        preprocessor.get_feature_names_out()
        if hasattr(preprocessor, "get_feature_names_out")
        else [f"f{i}" for i in range(new_data.shape[1])]
    )

    contributions = None
    # Logistic Regression → use coefficients × scaled input
    if "log_reg" in model_pipeline.named_steps:
        model = model_pipeline.named_steps["log_reg"]
        X_scaled = preprocessor.transform(new_data)
        contributions = (X_scaled.toarray() if hasattr(X_scaled, "toarray") else X_scaled)[0] * model.coef_[0]

    # Tree-based models → approximate with feature importances
    elif any(k in model_pipeline.named_steps for k in ["rf", "xgb", "lgbm"]):
        model = list(model_pipeline.named_steps.values())[-1]
        importances = model.feature_importances_
        contributions = importances * pred_proba

    # Build top contributions dataframe
    if contributions is not None and len(contributions) == len(feature_names):
        contrib_df = pd.DataFrame({
            "Feature": feature_names,
            "Contribution": contributions
        }).reindex(feature_names)
        contrib_df = contrib_df.reindex(contrib_df.Contribution.abs().sort_values(ascending=False).index)
        top_contrib = contrib_df.head(top_n)
    else:
        top_contrib = pd.DataFrame(columns=["Feature", "Contribution"])

    return {
        "Prediction": pred_class,
        "Probability": round(pred_proba, 3),
        "Risk Band": band,
        "Recommendation": recommendation,
        "Top Contributions": top_contrib
    }

In [35]:
## 6.7 Test Enhanced Inference with Sample Data

# Example patient
sample_data = pd.DataFrame({
    "age": [68],
    "sex": [1],
    "cp": [4],
    "trestbps": [180],
    "chol": [300],
    "fbs": [1],
    "restecg": [2],
    "thalch": [120],
    "exang": [1],
    "oldpeak": [3.0],
})

for name, model in tuned_models.items():
    result = enhanced_predict(model, sample_data)
    print(f"\n🔹 {name} Enhanced Prediction")
    print("Prediction:", result["Prediction"])
    print("Probability:", result["Probability"])
    print("Risk Band:", result["Risk Band"])
    print("Recommendation:", result["Recommendation"])
    print("Top Contributions:\n", result["Top Contributions"])


🔹 Logistic Regression Enhanced Prediction
Prediction: 0
Probability: 0.059
Risk Band: Low
Recommendation: Maintain healthy lifestyle
Top Contributions:
               Feature  Contribution
num__id           NaN           NaN
num__age          NaN           NaN
num__trestbps     NaN           NaN

🔹 Random Forest Enhanced Prediction
Prediction: 0
Probability: 0.491
Risk Band: Medium
Recommendation: Recommend further testing
Top Contributions:
               Feature  Contribution
num__id           NaN           NaN
num__age          NaN           NaN
num__trestbps     NaN           NaN

🔹 XGBoost Enhanced Prediction
Prediction: 1
Probability: 0.514
Risk Band: High
Recommendation: Recommend further testing
Top Contributions:
               Feature  Contribution
num__id           NaN           NaN
num__age          NaN           NaN
num__trestbps     NaN           NaN

🔹 LightGBM Enhanced Prediction
Prediction: 0
Probability: 0.364
Risk Band: Medium
Recommendation: Recommend further testi