# 0) Imports & versions

In [1]:
import os, sys, warnings
import numpy as np
import pandas as pd
import h2o

from sklearn.metrics import (
    roc_auc_score, average_precision_score, classification_report,
    confusion_matrix, brier_score_loss
)

# Install a global filter that hides UserWarning messages.
warnings.filterwarnings("ignore", category=UserWarning)

# Evidently is an optional dependency. Every name the report cell reads is bound
# to None up front, so a failed import leaves placeholders instead of undefined
# names (previously `Report` stayed unbound and the report cell raised NameError).
Report = None
DatasetDriftMetric = DatasetMissingValuesMetric = ColumnSummaryMetric = BinaryClassificationQualityMetric = None

try:
    import evidently
    from evidently.report import Report
    from evidently import metrics as evm
    # handle 0.4.x vs 0.7.x naming
    DatasetDriftMetric  = getattr(evm, "DatasetDriftMetric", None)
    DatasetMissingValuesMetric = getattr(evm, "DatasetMissingValuesMetric", None)
    ColumnSummaryMetric = getattr(evm, "ColumnSummaryMetric", None)
    # Either class name is accepted; the first one found on the module wins.
    BinaryClassificationQualityMetric = (
        getattr(evm, "BinaryClassificationQualityMetric", None)
        or getattr(evm, "ClassificationPerformanceMetric", None)
    )
    print("Evidently:", getattr(evidently, "__version__", "(unknown)"))
except Exception as e:
    # Reset everything, including Report, so a partially successful import
    # cannot leave a half-configured state behind.
    Report = None
    DatasetDriftMetric = DatasetMissingValuesMetric = ColumnSummaryMetric = BinaryClassificationQualityMetric = None
    print("Evidently not available -> reports will be skipped:", e)

Evidently: 0.4.40


# 1) Configuration (paths, IDs, dtypes, sampling)

In [2]:

# Project root. Set the INSTACART_ROOT environment variable to run the notebook
# on another machine; the original absolute path remains the default.
ROOT        = os.environ.get(
    "INSTACART_ROOT",
    "/home/cc/MLOps/MLOps_Final_Instacart_Reorder_Prediction-main",
)
PATH_TRAIN  = f"{ROOT}/train_data.csv"   # processed CSVs from your 02 notebook
PATH_TEST   = f"{ROOT}/test_data.csv"

MODEL_ID    = "XGBoost_3_AutoML_1_20250820_184414"  # H2O model key looked up in the cluster
TARGET_COL  = "reordered"   # binary label column
ID_COL      = "order_id"    # carried through to the scored output when present
CATEGORICAL_FORCE = ["aisle_id", "department_id", TARGET_COL]  # treated as H2O factors

# keep notebook responsive; raise/remove once stable
REF_MAX = 300_000   # max rows sampled from the train CSV (reference data)
CUR_MAX = 200_000   # max rows sampled from the test CSV (current data)

# engineered feature names used in training
ENGINEERED = [
    "times_bought_by_user",
    "avg_user_product_position",
    "last_order_number",
    "num_orders",
    "avg_days_since_prior_order",
    "num_items",
    "user_reorder_prop",
    "product_total_orders",
    "product_reorder_prop",
    "avg_add_to_cart_order",
]

# 2) H2O: attach to cluster and fetch model

In [4]:
# Restarting the cluster is opt-in. The next cell fetches the trained model from
# the running cluster, so an unconditional shutdown on re-run would discard it.
# On a fresh kernel there is no connection yet and h2o.cluster() returns None,
# which is why the old unconditional shutdown only ever fired on re-runs.
RESTART_H2O = False

if RESTART_H2O and h2o.cluster() is not None:
    try:
        h2o.cluster().shutdown(prompt=False)
    except Exception as err:
        # Best effort: report the failure instead of silently swallowing it.
        print("H2O shutdown failed (continuing):", err)

# Connects to the server on localhost:54329, or starts one if none is running.
h2o.init(ip="localhost", port=54329, start_h2o=True, nthreads=-1, max_mem_size="8G")
print("H2O server:", h2o.cluster().version)

Checking whether there is an H2O instance running at http://localhost:54329. connected.
Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html


0,1
H2O_cluster_uptime:,19 hours 48 mins
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.7
H2O_cluster_version_age:,4 months and 24 days
H2O_cluster_name:,H2O_from_python_cc_19f0xo
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,7.702 Gb
H2O_cluster_total_cores:,48
H2O_cluster_allowed_cores:,48


H2O server: 3.46.0.7


In [5]:
# Location of the saved model on disk, built from the configuration constants.
MODEL_PATH = os.path.join(ROOT, "models", MODEL_ID)

# First choice: the model is still in the running cluster.
try:
    model = h2o.get_model(MODEL_ID)
    print("Found in cluster:", model.model_id)
except Exception:
    model = None
    print("Model not present in this H2O session:", MODEL_ID)

# Fallback: load the saved model from disk so the downstream cells do not fail
# on `model is None`.
# NOTE(review): h2o.load_model expects a binary model written by h2o.save_model
# and resolves the path on the H2O server; this assumes server and notebook
# share a filesystem (the server is on localhost). Confirm the artifact type.
if model is None and os.path.exists(MODEL_PATH):
    try:
        model = h2o.load_model(MODEL_PATH)
        print("Loaded from disk:", model.model_id)
    except Exception as err:
        print("Could not load model from disk:", MODEL_PATH, "->", err)

Found in cluster: XGBoost_3_AutoML_1_20250820_184414


# 3) Data loading helpers (sample to keep memory in check)

In [6]:

def load_df(path, n=None):
    """Read a CSV into a DataFrame, optionally down-sampled to ``n`` rows.

    Parameters
    ----------
    path : path of the CSV file to read.
    n    : maximum number of rows to keep. ``None`` keeps every row.

    Returns
    -------
    pandas.DataFrame
        The full frame when ``n`` is ``None`` or the file has at most ``n``
        rows. Otherwise a random sample of ``n`` rows drawn with a fixed seed
        (42) and re-indexed from 0, so repeated calls with the same arguments
        return the same rows in the same order.
    """
    frame = pd.read_csv(path)
    needs_sampling = n is not None and len(frame) > n
    if not needs_sampling:
        return frame
    sampled = frame.sample(n, random_state=42)
    return sampled.reset_index(drop=True)

def ensure_object_cats(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` whose categorical-like columns hold strings.

    Two passes are applied to the copy:
    1. every column with ``object`` or ``category`` dtype is cast to ``str``;
    2. every column listed in the module-level ``CATEGORICAL_FORCE`` that is
       present and not already ``object`` dtype is cast to ``str`` as well.

    The input frame is not modified.
    """
    out = df.copy()

    # Pass 1: stringify potential categoricals.
    for name in out.columns:
        dtype_label = str(out[name].dtype)
        if out[name].dtype == "object" or dtype_label.startswith("category"):
            out[name] = out[name].astype(str)

    # Pass 2: columns that must be treated as categorical regardless of dtype.
    for name in CATEGORICAL_FORCE:
        if name not in out.columns:
            continue
        if out[name].dtype == "object":
            continue
        out[name] = out[name].astype(str)

    return out


# 4) Load processed train/test (from 02_Data_Preparation)

In [7]:
assert all(os.path.exists(p) for p in (PATH_TRAIN, PATH_TEST)), "Processed train/test CSVs not found."

# Load each split, drop the extras to match the modeling notebook, and make sure
# every engineered column exists before the categorical columns are stringified.
_prepared = []
for _path, _cap in ((PATH_TRAIN, REF_MAX), (PATH_TEST, CUR_MAX)):
    _frame = load_df(_path, _cap).drop(columns=["Unnamed: 0", "product_name"], errors="ignore")
    # IMPORTANT: keep 'aisle_id' & 'department_id' (categoricals) and the ENGINEERED cols
    # Any missing engineered cols -> create as NaN (rare if CSVs are the latest)
    for _col in ENGINEERED:
        if _col not in _frame.columns:
            _frame[_col] = np.nan
    _prepared.append(ensure_object_cats(_frame))

ref_full, cur_full = _prepared

print("Frames:", ref_full.shape, cur_full.shape)

Frames: (300000, 24) (200000, 24)


# 5) Model schema alignment (NO target column inside features)

In [8]:
def expected_features_no_target(h2o_model, target=TARGET_COL):
    """List the model's column names with target and prediction columns removed.

    Names are read from ``h2o_model._model_json["output"]["names"]``. Dropped
    entries are ``target``, the model's ``response_column_name``, the literal
    names ``"predict"``, ``"p0"``, ``"p1"``, and ``None``. The order of the
    remaining names is preserved. Missing or empty ``output`` / ``names``
    entries yield an empty list.
    """
    output = h2o_model._model_json.get("output", {}) or {}
    all_names = output.get("names", []) or []
    response = output.get("response_column_name", None)

    excluded = {target, response, "predict", "p0", "p1", None}

    features = []
    for name in all_names:
        if name in excluded:
            continue
        features.append(name)
    return features

def coerce_to_expected(df: pd.DataFrame, exp_cols, cat_cols=("aisle_id","department_id")) -> pd.DataFrame:
    """Return a copy of ``df`` that contains every column in ``exp_cols``.

    Steps, all applied to a copy (the input is untouched):
    - column names are stripped of surrounding whitespace;
    - each expected column that is absent is created with a constant:
      ``"0"`` when the name is in ``cat_cols``, otherwise ``0.0``;
    - columns are reordered so ``exp_cols`` come first, in the given order,
      followed by the remaining columns in their existing order.

    ``exp_cols`` must be a list because it is concatenated with a list.
    """
    out = df.copy()
    out.columns = [name.strip() for name in out.columns]  # normalize names

    # Fill gaps: categoricals get the string '0', everything else 0.0.
    for name in exp_cols:
        if name in out.columns:
            continue
        out[name] = "0" if name in cat_cols else 0.0

    # Expected features first; any extra columns (e.g. target) trail behind.
    extras = [name for name in out.columns if name not in exp_cols]
    return out[exp_cols + extras]

def median_fill_expected(train_df: pd.DataFrame, test_df: pd.DataFrame, exp_cols, cat_cols=("aisle_id","department_id")):
    """Coerce the non-categorical expected columns to numeric and fill NaNs.

    For every name in ``exp_cols`` that is not in ``cat_cols``:
    - both frames' columns are converted with ``pd.to_numeric(errors="coerce")``
      (values that cannot be parsed become NaN);
    - NaNs in BOTH frames are replaced by the median of the *training* column;
    - when the training column has no numeric value at all, so its median is
      NaN, ``0.0`` is used instead. Previously a NaN median was passed to
      ``fillna`` and the gaps stayed unfilled.

    Parameters
    ----------
    train_df, test_df : frames containing every numeric expected column.
        Both are modified in place and also returned.
    exp_cols : iterable of expected feature names.
    cat_cols : names to leave untouched (treated as categorical).

    Returns
    -------
    (train_df, test_df) : the same two objects that were passed in.
    """
    num_exp = [c for c in exp_cols if c not in cat_cols]
    for c in num_exp:
        train_num = pd.to_numeric(train_df[c], errors="coerce")
        # Per-column median (NaNs skipped); avoids stacking the whole frame.
        fill_value = train_num.median()
        if pd.isna(fill_value):
            fill_value = 0.0
        train_df[c] = train_num.fillna(fill_value)
        test_df[c]  = pd.to_numeric(test_df[c], errors="coerce").fillna(fill_value)
    return train_df, test_df

# Fail early with a readable message instead of an AttributeError on None.
if model is None:
    raise RuntimeError(f"H2O model {MODEL_ID!r} is not loaded; cannot derive the expected feature list.")

exp = expected_features_no_target(model)  # <-- no target column here

# Record schema gaps BEFORE coercion. coerce_to_expected() creates every absent
# expected column, so a check made afterwards can only ever print [].
# Names are stripped first because coerce_to_expected() strips them too.
_ref_names = {name.strip() for name in ref_full.columns}
_cur_names = {name.strip() for name in cur_full.columns}
missing_train = [c for c in exp if c not in _ref_names]
missing_test  = [c for c in exp if c not in _cur_names]

ref_full = coerce_to_expected(ref_full, exp)
cur_full = coerce_to_expected(cur_full, exp)
ref_full, cur_full = median_fill_expected(ref_full, cur_full, exp)

# Columns listed here were filled with constants ('0' / 0.0) by the coercion.
print("Missing vs model (train):", missing_train)
print("Missing vs model (test) :", missing_test)

Missing vs model (train): []
Missing vs model (test) : []


# 6) Scoring (features only), metrics at fixed threshold

In [12]:
# Probability cut-off used to turn scores into 0/1 predictions.
FIXED_THRESHOLD = 0.5

def enforce_factor_cols(hf, cols):
    """Convert the listed columns of ``hf`` to factors, in place.

    Each name in ``cols`` that appears in ``hf.columns`` is replaced by the
    result of its ``asfactor()`` call; names that are absent are ignored.
    Returns ``None``.
    """
    present = (name for name in cols if name in hf.columns)
    for name in present:
        hf[name] = hf[name].asfactor()

def _binary_labels(target: pd.Series) -> np.ndarray:
    """Map a target column to 0/1 ints; a row is 1 when its value parses to the number 1.

    Accepts 1, 1.0, "1", "1.0" and padded strings; everything else, including
    missing or unparseable values, maps to 0.
    """
    as_number = pd.to_numeric(target.astype(str).str.strip(), errors="coerce")
    return (as_number == 1).to_numpy().astype(int)


def score_and_metrics_features(df_features: pd.DataFrame, df_with_target: pd.DataFrame, h2o_model, threshold=0.7448951904827723):
    """Score ``df_features`` with ``h2o_model`` and evaluate against the labels.

    Only the columns named in the module-level ``exp`` list are uploaded to
    H2O, so the target never reaches the model. Columns listed in
    ``CATEGORICAL_FORCE`` that are present are converted to factors first.
    The two frames are matched by row position, not by index or key.

    Parameters
    ----------
    df_features    : frame holding at least the ``exp`` feature columns.
    df_with_target : frame with ``TARGET_COL`` (and optionally ``ID_COL``),
                     row-aligned with ``df_features``.
    h2o_model      : object exposing ``predict(H2OFrame)``.
    threshold      : probability cut-off for the positive class. The callers
                     in this notebook pass a threshold explicitly.

    Returns
    -------
    (scored, metrics)
        ``scored``  : DataFrame indexed like ``df_with_target`` with ``ID_COL``
                      (if available), ``proba``, ``y_pred`` and ``TARGET_COL``.
        ``metrics`` : dict with ``auc``, ``ap``, ``brier``, ``report`` (text)
                      and ``cm`` (confusion matrix).

    Raises
    ------
    ValueError
        If the two frames have different row counts. This is checked before
        the expensive scoring call.
    """
    if len(df_features) != len(df_with_target):
        raise ValueError(
            "df_features and df_with_target must have the same number of rows "
            f"(got {len(df_features)} and {len(df_with_target)})"
        )

    # H2OFrame with FEATURES ONLY (prevents duplicate target parsing)
    hf = h2o.H2OFrame(df_features[exp])
    enforce_factor_cols(hf, CATEGORICAL_FORCE)

    pred = h2o_model.predict(hf).as_data_frame(use_multi_thread=True)
    # Use the 'p1' column when present, otherwise fall back to the last column.
    p1   = pred["p1"].values if "p1" in pred.columns else pred.iloc[:, -1].values

    # Numeric comparison instead of `astype(str) == "1"`: a float-typed target
    # ("1.0") no longer silently becomes all-negative.
    y = _binary_labels(df_with_target[TARGET_COL])
    yhat = (p1 >= threshold).astype(int)

    scored = pd.DataFrame(index=df_with_target.index)
    if ID_COL in df_with_target.columns:
        scored[ID_COL] = df_with_target[ID_COL].values
    scored["proba"]    = p1
    scored["y_pred"]   = yhat
    scored[TARGET_COL] = y

    metrics = dict(
        auc   = roc_auc_score(y, p1),
        ap    = average_precision_score(y, p1),
        brier = brier_score_loss(y, p1),
        report= classification_report(y, yhat, digits=4),
        cm    = confusion_matrix(y, yhat),
    )
    return scored, metrics

# Labels are read from a fresh load of the same CSVs. load_df() samples with a
# fixed seed and resets the index, so (as long as the files are unchanged) these
# rows line up positionally with ref_full / cur_full.
ref_with_target = load_df(PATH_TRAIN, REF_MAX)
cur_with_target = load_df(PATH_TEST,  CUR_MAX)

# Make the target dtype consistent: any non-object target column becomes int.
for labelled in (ref_with_target, cur_with_target):
    if TARGET_COL not in labelled.columns:
        continue
    if labelled[TARGET_COL].dtype == "object":
        continue
    labelled[TARGET_COL] = labelled[TARGET_COL].astype(int)

ref_scored, m_ref = score_and_metrics_features(ref_full, ref_with_target, model, FIXED_THRESHOLD)
cur_scored, m_cur = score_and_metrics_features(cur_full, cur_with_target, model, FIXED_THRESHOLD)

# Headline metrics first, then the per-class reports at the fixed threshold.
ref_summary = (m_ref["auc"], m_ref["ap"], m_ref["brier"])
cur_summary = (m_cur["auc"], m_cur["ap"], m_cur["brier"])
print("REF  -> AUC: %.4f | PR-AUC: %.4f | Brier: %.4f" % ref_summary)
print("CUR  -> AUC: %.4f | PR-AUC: %.4f | Brier: %.4f" % cur_summary)
print("\nREF @thr report:\n", m_ref["report"])
print("\nCUR @thr report:\n", m_cur["report"])

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
xgboost prediction progress: |███████████████████████████████████████████████████| (done) 100%
Export File progress: |██████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
xgboost prediction progress: |███████████████████████████████████████████████████| (done) 100%
Export File progress: |██████████████████████████████████████████████████████████| (done) 100%
REF  -> AUC: 0.4689 | PR-AUC: 0.5673 | Brier: 0.5892
CUR  -> AUC: 0.4648 | PR-AUC: 0.5737 | Brier: 0.5984

REF @thr report:
               precision    recall  f1-score   support

           0     0.4107    1.0000    0.5823    123218
           1     0.0000    0.0000    0.0000    176782

    accuracy                         0.4107    300000
   macro avg     0.2054    0.5000    0.2911    300000
weighted avg     0.1687    0.4107

# 7) Evidently reports 

In [13]:
def build_report(ref_df, cur_df, out_html):
    """Build an Evidently HTML report comparing ``ref_df`` with ``cur_df``.

    The report contains whichever of these metrics are available in the
    installed Evidently version: dataset drift, dataset missing values, a
    summary of the ``proba`` column, and a binary classification quality
    metric on ``TARGET_COL`` vs ``y_pred``.

    If Evidently could not be imported, or neither the drift nor the
    missing-values metric is available, a message is printed and nothing is
    written.

    Parameters
    ----------
    ref_df   : reference data passed as ``reference_data``.
    cur_df   : current data passed as ``current_data``.
    out_html : path of the HTML file to write.

    Returns
    -------
    None
    """
    # `Report` is bound only when the Evidently import succeeded, so look it up
    # defensively: a missing name now means "skip" rather than NameError.
    report_cls = globals().get("Report")
    if not (report_cls and (DatasetDriftMetric or DatasetMissingValuesMetric)):
        print("Evidently is not installed/compatible; skipping report.")
        return

    metrics = []
    if DatasetDriftMetric:
        metrics.append(DatasetDriftMetric())
    if DatasetMissingValuesMetric:
        metrics.append(DatasetMissingValuesMetric())
    if ColumnSummaryMetric:
        metrics.append(ColumnSummaryMetric(column_name="proba"))
    if BinaryClassificationQualityMetric:
        # some versions want 'prediction' (pred label), others support proba via params
        # NOTE(review): constructor keywords differ across Evidently versions;
        # confirm this signature against the installed release.
        metrics.append(BinaryClassificationQualityMetric(target=TARGET_COL, prediction="y_pred"))

    rep = report_cls(metrics=metrics)
    rep.run(reference_data=ref_df, current_data=cur_df)
    rep.save_html(out_html)
    print("Saved:", out_html)

# Compare the scored train sample (reference) with the scored test sample (current).
build_report(ref_scored, cur_scored, "baseline_vs_test.html")


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide



Saved: baseline_vs_test.html


# 8) "Changed" test: tweak 2 features & re-monitor

In [14]:
# =========================================================
# Perturb two features on a copy of the current data, re-score it, and compare
# against the reference to see how the monitoring reacts.
# =========================================================
changed = cur_full.copy()

# Tweak 1: shift the order hour by +4 (mod 24), if the column is present.
# The shift used to be applied to a separately loaded frame that was never
# used, so it had no effect on what was scored.
if "order_hour_of_day" in changed.columns:
    hours = pd.to_numeric(changed["order_hour_of_day"], errors="coerce").fillna(0).astype(int)
    changed["order_hour_of_day"] = (hours + 4) % 24

# Tweak 2: bump user-product frequency by +2 (if present).
if "times_bought_by_user" in changed.columns:
    changed["times_bought_by_user"] = pd.to_numeric(changed["times_bought_by_user"], errors="coerce").fillna(0) + 2

# Labels for the perturbed frame: same file and sample size as cur_full, so the
# rows match by position.
chg_with_target = load_df(PATH_TEST, CUR_MAX)
chg_scored, m_chg = score_and_metrics_features(changed, chg_with_target, model, FIXED_THRESHOLD)
print("\nCHGD -> AUC: %.4f | PR-AUC: %.4f | Brier: %.4f" % (m_chg["auc"], m_chg["ap"], m_chg["brier"]))
print("\nCHGD @thr report:\n", m_chg["report"])
build_report(ref_scored, chg_scored, "baseline_vs_changed.html")

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
xgboost prediction progress: |███████████████████████████████████████████████████| (done) 100%
Export File progress: |██████████████████████████████████████████████████████████| (done) 100%

CHGD -> AUC: 0.4613 | PR-AUC: 0.5700 | Brier: 0.5984

CHGD @thr report:
               precision    recall  f1-score   support

           0     0.4015    1.0000    0.5730     80302
           1     0.0000    0.0000    0.0000    119698

    accuracy                         0.4015    200000
   macro avg     0.2008    0.5000    0.2865    200000
weighted avg     0.1612    0.4015    0.2301    200000




invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide



Saved: baseline_vs_changed.html
