# 07 – Deployment with Streamlit

This notebook demonstrates how to deploy the best model using a simple Streamlit app. The app allows a user to enter input features and receive a predicted click-through rate (CTR).


In [None]:
import streamlit as st
import joblib
import pandas as pd
import os
import numpy as np

# ===============================
# Paths
# ===============================
# The project root is resolved as the parent of the current working directory,
# so the app must be launched from a direct sub-folder of the project.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Folders holding the serialized model(s) and the fitted preprocessor.
models_dir = os.path.join(project_root, "models")
processed_dir = os.path.join(project_root, "data", "processed")

# File name of the model to serve; change to the best model if needed.
MODEL_NAME = "Stacking_model.pkl"

# Full paths of the two artifacts loaded at start-up.
preprocessor_path = os.path.join(processed_dir, "preprocessor.joblib")
model_path = os.path.join(models_dir, MODEL_NAME)

# ===============================
# Load model and preprocessor
# ===============================
@st.cache_resource
def load_artifacts():
    """Load the trained model and the saved preprocessor from disk.

    The function is wrapped in ``st.cache_resource``, so the deserialized
    objects are intended to be reused across Streamlit reruns instead of
    being reloaded each time.

    Returns:
        A ``(model, preprocessor)`` tuple: the object stored at
        ``model_path`` followed by the object stored at
        ``preprocessor_path``, both read with ``joblib.load``.
    """
    # Load in a fixed order: model first, then preprocessor.
    artifacts = tuple(
        joblib.load(artifact_path)
        for artifact_path in (model_path, preprocessor_path)
    )
    return artifacts

# Page configuration is issued before any other Streamlit call: Streamlit
# documents ``st.set_page_config`` as having to be the first Streamlit command
# of a page, and the cached loader below may itself render a spinner.
st.set_page_config(page_title="CTR Prediction", layout="centered")

# Load (or fetch from Streamlit's resource cache) the model and preprocessor.
model, preprocessor = load_artifacts()

st.title("CTR Prediction App")

st.write(
    """
This application predicts the probability that a user will click on an advertisement
based on user, ad, and behavioral features.

The pipeline uses target encoding / label encoding for categorical variables,
median imputation + standardization for numeric variables, and an ensemble model.
"""
)

# ===============================
# Helper: robust transform using saved preprocessor
# ===============================
def _as_list(value) -> list:
    """Return *value* as a plain list; ``None`` becomes an empty list."""
    return [] if value is None else list(value)


def transform_input(input_df: pd.DataFrame, preproc: dict) -> pd.DataFrame:
    """Convert raw input rows into the feature frame passed to the model.

    The steps are applied in this order:

    1. Drop columns whose name contains one of the ID keywords.
    2. Add any column referenced by the preprocessor but absent from the
       input, filled with NaN, so later lookups cannot raise ``KeyError``.
    3. Map high-cardinality columns through their saved mapping; values
       without a mapping entry receive ``global_mean``.
    4. Map low-cardinality columns through their saved label mapping; values
       without a mapping entry receive ``-1``.
    5. Coerce numeric columns to numbers, fill missing values with the saved
       imputer statistic and standardise with the saved mean and scale.
    6. Reindex to ``top_features`` (missing columns filled with 0) and cast
       everything to ``float32``.

    Args:
        input_df: Raw feature rows; it is copied, never modified.
        preproc: Saved preprocessor dictionary. The keys read are
            ``numeric_cols``, ``high_card_cols``, ``low_card_cols``,
            ``high_card_mappings`` (per column, a dict holding a
            ``"mapping"`` entry), ``low_card_mappings``, ``global_mean``,
            ``top_features``, ``numeric_imputer_statistics``,
            ``numeric_scaler_mean`` and ``numeric_scaler_scale``. The column
            collections may be lists, tuples, NumPy arrays or pandas indexes.

    Returns:
        A ``float32`` DataFrame whose columns are exactly ``top_features``,
        in that order.

    Warns:
        RuntimeWarning: If the imputer/scaler statistics do not line up with
            ``numeric_cols``; numeric columns are then only NaN-filled with 0
            and left unscaled.
    """
    import warnings  # local import keeps this block self-contained

    df = input_df.copy()

    # Normalise every sequence to a list: concatenating with ``+`` below
    # fails (or adds element-wise) for tuples, NumPy arrays and pandas
    # indexes, which are common ways of persisting column names.
    numeric_cols = _as_list(preproc.get("numeric_cols"))
    high_card_cols = _as_list(preproc.get("high_card_cols"))
    low_card_cols = _as_list(preproc.get("low_card_cols"))
    top_features = _as_list(preproc.get("top_features"))
    enc_mappings = preproc.get("high_card_mappings", {})
    label_mappings = preproc.get("low_card_mappings", {})
    global_mean = float(preproc.get("global_mean", 0.0))

    # 1) Drop ID-like columns (must mirror training behavior).
    #    ``str(c)`` keeps the substring test valid for non-string labels.
    id_keywords = ["user", "userid", "nick", "adgroup_id", "campaign_id", "customer", "pid"]
    id_like_cols = [c for c in df.columns if any(k in str(c) for k in id_keywords)]
    df = df.drop(columns=id_like_cols, errors="ignore")

    # 2) Ensure all columns referenced by encoders exist (avoid KeyError).
    #    ``dict.fromkeys`` de-duplicates while keeping a deterministic order.
    for c in dict.fromkeys(high_card_cols + low_card_cols + numeric_cols):
        if c not in df.columns:
            df[c] = np.nan

    # Start encoded frame
    encoded_df = df.copy()

    # 3) Target encoding for high-cardinality columns
    for c in high_card_cols:
        mapping = enc_mappings.get(c, {}).get("mapping", {})
        encoded_df[c] = df[c].map(mapping).fillna(global_mean).astype(np.float32)

    # 4) Label encoding for low-cardinality columns
    for c in low_card_cols:
        mapping = label_mappings.get(c, {})
        encoded_df[c] = df[c].map(mapping).fillna(-1).astype(np.int32)

    # 5) Numeric imputation + scaling
    stats = _as_list(preproc.get("numeric_imputer_statistics"))
    means = _as_list(preproc.get("numeric_scaler_mean"))
    scales = _as_list(preproc.get("numeric_scaler_scale"))

    if len(stats) == len(means) == len(scales) == len(numeric_cols):
        for c, median, mean, scale in zip(numeric_cols, stats, means, scales):
            median = float(median)
            mean = float(mean)
            scale = float(scale)
            if scale == 0:
                # A zero scale would divide by zero; leave the column centred only.
                scale = 1.0
            encoded_df[c] = pd.to_numeric(encoded_df[c], errors="coerce").fillna(median)
            encoded_df[c] = ((encoded_df[c] - mean) / scale).astype(np.float32)
    else:
        # Best-effort fallback for an inconsistent preprocessor: fill NaNs
        # with 0 and skip scaling, but say so instead of failing silently.
        warnings.warn(
            "Numeric imputer/scaler statistics do not match numeric_cols; "
            "numeric features are zero-filled and left unscaled.",
            RuntimeWarning,
            stacklevel=2,
        )
        for c in numeric_cols:
            encoded_df[c] = pd.to_numeric(encoded_df[c], errors="coerce").fillna(0).astype(np.float32)

    # 6) Final alignment to top_features (same order, missing filled with 0)
    final_df = encoded_df.reindex(columns=top_features, fill_value=0)

    # Ensure float dtype for models expecting continuous features
    final_df = final_df.astype(np.float32)

    return final_df

# ===============================
# UI Inputs (only meaningful non-ID features)
# ===============================
st.header("Input Features")


def _count_input(label: str):
    """Render a non-negative integer input (minimum 0, step 1) and return its value."""
    return st.number_input(label, min_value=0, step=1)


# --- User profile -----------------------------------------------------------
st.subheader("User Information")
age_level = st.selectbox("Age Level", list(range(7)))
# Gender options stay strings to match the original category type.
gender = st.selectbox("Gender Code", ["1", "2"])
shopping_level = st.selectbox("Shopping Level", list(range(1, 4)))
pvalue_level = st.selectbox("Consumption Level", list(range(1, 4)))

# --- Advertisement ----------------------------------------------------------
st.subheader("Ad Information")
price = st.number_input("Price", min_value=0.0, step=1.0)

# cate_id / brand are optional categorical fields: keep them only if they
# survived ID dropping during training. If training removed them, do NOT
# include them here.
cate_id = _count_input("Category ID (cate_id)")
brand = _count_input("Brand ID (brand)")

# --- Aggregated behaviour counters -------------------------------------------
st.subheader("User Behavior (Aggregated)")
buy = _count_input("Buy count")
cart = _count_input("Cart count")
fav = _count_input("Favorite count")
pv = _count_input("Page views")

# ===============================
# Predict
# ===============================
if st.button("Predict CTR"):
    # Raw (pre-preprocessing) values keyed by the feature names of the fused
    # data. Only non-ID columns are sent, since IDs were removed in training.
    raw_values = {
        "price": price,
        "age_level": age_level,
        "final_gender_code": gender,
        "shopping_level": shopping_level,
        "pvalue_level": pvalue_level,
        "buy": buy,
        "cart": cart,
        "fav": fav,
        "pv": pv,
        "cate_id": cate_id,
        "brand": brand,
    }
    # One-row frame: every value becomes a single-element column.
    input_df = pd.DataFrame({name: [value] for name, value in raw_values.items()})

    try:
        features_df = transform_input(input_df, preprocessor)

        # Probability of the positive (click) class for the single row.
        class_probabilities = model.predict_proba(features_df)
        proba = float(class_probabilities[0, 1])

        st.success(f"Predicted Click Probability: {proba:.4f}")

        # A low cut-off is used as a safer default for imbalanced CTR data (optional).
        threshold = 0.10
        above_threshold = proba >= threshold
        if not above_threshold:
            st.warning("Low click likelihood")
        else:
            st.info("High click likelihood (above threshold)")

        # Optional debugging aid: the exact frame that was fed to the model.
        with st.expander("Show transformed features (debug)"):
            st.write(features_df)

    except Exception as exc:
        # Top-level UI boundary: report the failure in the page with its traceback.
        st.error("Prediction failed. Check feature compatibility with preprocessor/top_features.")
        st.exception(exc)


2025-12-21 17:46:42.908 
  command:

    streamlit run d:\projects\Ai\project_fusion_ecu\.venv\Lib\site-packages\ipykernel_launcher.py [ARGUMENTS]
2025-12-21 17:46:44.013 Session state does not function when running a script without `streamlit run`


### Modifications Summary
The deployment application now loads a precomputed preprocessor along with the trained model.
User inputs are transformed using target encoding for high-cardinality categories, label encoding for low-cardinality categories, and median imputation followed by standardisation for numeric features, before prediction.
