In [None]:
# 1. Create virtual environment
# Runs `python3 -m venv` in a subshell to create a `.venv` directory next to the notebook.
# NOTE(review): this does not switch the running kernel to `.venv`; the H2O start-up log
# further down shows packages being loaded from /home/cc/jlenv, so confirm whether this
# environment is actually used by anything.
!python3 -m venv .venv

In [1]:
!pip install -q polars pyarrow

In [13]:
import numpy as np
import pandas as pd
import h2o
import os

In [3]:
import os
import warnings

# Evidently project id and API token.
# SECURITY: the token must never be stored in the notebook. It is read from the
# EVIDENTLY_TOKEN environment variable instead; the token that used to be hard-coded
# here has been exposed and should be revoked and re-issued.
# The project id is not a secret, so the previous value is kept as the default and can
# be overridden with EVIDENTLY_PROJECT_ID.
PROJECT_ID = os.environ.get("EVIDENTLY_PROJECT_ID", "01988537-d38a-7fa5-8ebe-87045673a6ce")
EVIDENTLY_TOKEN = os.environ.get("EVIDENTLY_TOKEN", "")

if not EVIDENTLY_TOKEN:
    # Warn instead of raising: the scoring/metrics cells below do not need the token.
    warnings.warn(
        "EVIDENTLY_TOKEN is not set; export it in the environment before using the Evidently API."
    )

In [4]:
# Best-effort shutdown of whatever H2O cluster this kernel is currently attached to, so
# that the h2o.init() below does not reuse an old server. Every error raised inside the
# try block is swallowed on purpose; if h2o itself cannot be imported, the unconditional
# import further down will still fail loudly.
try:
    import h2o
    h2o.cluster().shutdown(prompt=False)
except Exception:
    pass

# Start fresh with larger heap (adjust "8G" to what your VM has: 6G/8G/12G)
# The server is pinned to localhost:54329 and start_h2o=True asks the client to launch
# a local server when none is found (see the start-up log in the cell output).
import h2o
h2o.init(ip="localhost", port=54329, start_h2o=True, nthreads=-1, max_mem_size="8G")
# Print the server version as a quick confirmation that the connection works.
print("H2O server:", h2o.cluster().version)

Checking whether there is an H2O instance running at http://localhost:54329..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.27" 2025-04-15; OpenJDK Runtime Environment (build 11.0.27+6-post-Ubuntu-0ubuntu120.04); OpenJDK 64-Bit Server VM (build 11.0.27+6-post-Ubuntu-0ubuntu120.04, mixed mode, sharing)
  Starting server from /home/cc/jlenv/lib/python3.8/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmp87gf9y4f
  JVM stdout: /tmp/tmp87gf9y4f/h2o_cc_started_from_python.out
  JVM stderr: /tmp/tmp87gf9y4f/h2o_cc_started_from_python.err
  Server is running at http://127.0.0.1:54329
Connecting to H2O server at http://127.0.0.1:54329 ... successful.
Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html


0,1
H2O_cluster_uptime:,02 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.7
H2O_cluster_version_age:,4 months and 23 days
H2O_cluster_name:,H2O_from_python_cc_ko0ng6
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,8 Gb
H2O_cluster_total_cores:,48
H2O_cluster_allowed_cores:,48


H2O server: 3.46.0.7


In [5]:
def downcast(df: pd.DataFrame) -> pd.DataFrame:
    """Shrink 64-bit numeric columns to 32-bit where this is lossless in range.

    float64 columns become float32 and int64 columns become int32, but only when
    every value fits in the narrower type. Columns holding values outside the
    32-bit range are left untouched, because a blind ``astype`` would silently
    wrap integers around and turn large floats into ``inf``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object, returned for call chaining.
    """
    int32_range = np.iinfo(np.int32)
    float32_max = np.finfo(np.float32).max

    for c in df.select_dtypes(include=["float64"]).columns:
        col = df[c]
        # Largest finite magnitude; NaN when the column has no finite values at all.
        peak = col.abs().replace(np.inf, np.nan).max()
        # `not (peak > limit)` is also True for NaN, so empty / all-NaN columns are downcast.
        if not (peak > float32_max):
            df[c] = col.astype("float32")

    for c in df.select_dtypes(include=["int64"]).columns:
        col = df[c]
        lo, hi = col.min(), col.max()
        # Same NaN-tolerant comparison: an empty column yields NaN min/max and is downcast.
        if not (lo < int32_range.min or hi > int32_range.max):
            df[c] = col.astype("int32")

    return df

def extract_preds(pdf: pd.DataFrame):
    """Split a prediction frame into hard labels and positive-class scores.

    Labels come from the ``predict`` column when it exists: cast to int, or, if
    that cast fails, 1 where the string form equals ``"1"`` and 0 elsewhere.
    Without a ``predict`` column the last column is thresholded at 0.5.
    Scores are the ``p1`` column when present, otherwise the last column.

    Returns a ``(labels, scores)`` tuple of numpy arrays.
    """
    if "predict" not in pdf.columns:
        # No label column: derive labels from the last column.
        labels = (pdf.iloc[:, -1].values >= 0.5).astype(int)
    else:
        raw_labels = pdf["predict"]
        try:
            labels = raw_labels.astype(int).values
        except Exception:
            # Non-numeric labels: only the literal "1" counts as positive.
            labels = (raw_labels.astype(str) == "1").astype(int).values

    if "p1" in pdf.columns:
        scores = pdf["p1"].values
    else:
        scores = pdf.iloc[:, -1].values
    return labels, scores

def h2o_predict_binary_batched(h2o_model, X: pd.DataFrame, batch_rows: int = 50_000):
    """Score ``X`` with an H2O binary classifier in fixed-size row batches.

    Each batch is uploaded as an H2OFrame, scored, converted back to pandas and
    then both server-side frames are removed, so only one batch lives on the
    H2O server at a time.

    Parameters
    ----------
    h2o_model
        Object exposing ``predict(H2OFrame)``.
    X : pd.DataFrame
        Feature rows to score.
    batch_rows : int, default 50_000
        Maximum number of rows sent per batch; must be positive.

    Returns
    -------
    (yhat, proba) : tuple of np.ndarray
        Hard labels and positive-class scores, concatenated in row order.
        Two empty arrays are returned when ``X`` has no rows.

    Raises
    ------
    ValueError
        If ``batch_rows`` is not positive.
    """
    import numpy as np, h2o

    if batch_rows <= 0:
        raise ValueError(f"batch_rows must be a positive integer, got {batch_rows!r}")

    yhat_parts, p_parts = [], []
    n = len(X)
    for start in range(0, n, batch_rows):
        end = min(start + batch_rows, n)
        hf = h2o.H2OFrame(X.iloc[start:end])
        pred_hf = None
        try:
            pred_hf = h2o_model.predict(hf)
            # Multi-threaded conversion (requires polars + pyarrow).
            try:
                pred_pdf = pred_hf.as_data_frame(use_multi_thread=True)
            except TypeError:
                # Older h2o releases do not accept the keyword.
                pred_pdf = pred_hf.as_data_frame()

            if "predict" in pred_pdf:
                try:
                    yhat = pred_pdf["predict"].astype(int).values
                except Exception:
                    # Non-numeric labels: only the literal "1" counts as positive.
                    yhat = (pred_pdf["predict"].astype(str) == "1").astype(int).values
                proba = pred_pdf["p1"].values if "p1" in pred_pdf else pred_pdf.iloc[:, -1].values
            else:
                # No label column: treat the last column as the score and threshold it.
                proba = pred_pdf.iloc[:, -1].values
                yhat = (proba >= 0.5).astype(int)
            yhat_parts.append(yhat)
            p_parts.append(proba)
        finally:
            # Free the server-side keys even when scoring/conversion failed. Each frame
            # is removed independently so one failed removal does not skip the other.
            for frame in (hf, pred_hf):
                if frame is None:
                    continue
                try:
                    h2o.remove(frame.frame_id)
                except Exception:
                    pass  # cleanup is best-effort

    if not yhat_parts:
        # np.concatenate rejects an empty list; return empty results for empty input.
        return np.empty(0, dtype=int), np.empty(0, dtype=float)
    return np.concatenate(yhat_parts), np.concatenate(p_parts)

In [6]:
# Location of the saved H2O model artifact that is passed to h2o.load_model() below.
# NOTE(review): absolute, machine-specific path — consider deriving it from a configurable base directory.
model_path = "/home/cc/MLOps/MLOps_Final_Instacart_Reorder_Prediction-main/models/XGBoost_grid_1_AutoML_1_20250818_235405_model_3"

def ensure_model_loaded(model_path: str):
    """Return a usable H2O model, reusing the notebook-global ``h2o_model`` when possible.

    If a global ``h2o_model`` exists and ``h2o.get_model`` accepts its
    ``model_id``, that object is returned unchanged (``model_path`` is not
    consulted). In every other case — the global is undefined, or the lookup
    raises for any reason — the model is loaded from ``model_path`` and its id
    is printed.
    """
    try:
        cached = h2o_model  # NameError here means no earlier cell has loaded a model
        h2o.get_model(cached.model_id)
    except Exception:
        # Covers both "global not defined" and "server does not know this model".
        cached = None

    if cached is not None:
        return cached

    loaded = h2o.load_model(model_path)
    print("Loaded model:", loaded.model_id)
    return loaded

# Bind the notebook-wide model handle; ensure_model_loaded() reuses an existing
# `h2o_model` global when the server still knows it, otherwise it loads from disk.
h2o_model = ensure_model_loaded(model_path)

# Diagnostic only: list the model ids the server reports via the REST endpoint.
# Any failure is ignored so that the listing can never break the notebook run.
try:
    md = h2o.api("GET /3/Models")["models"]
    print("Models on server:", [m["model_id"]["name"] for m in md])
except Exception:
    pass

Server: 3.46.0.7
Loaded model: XGBoost_grid_1_AutoML_1_20250818_235405_model_3
Models on server: ['XGBoost_grid_1_AutoML_1_20250818_235405_model_3']


In [7]:
# --- Dataset locations and column roles ---
train_path = "/home/cc/MLOps/MLOps_Final_Instacart_Reorder_Prediction-main/train_data.csv"
test_path = "/home/cc/MLOps/MLOps_Final_Instacart_Reorder_Prediction-main/test_data.csv"
TARGET_COL = "reordered"
ID_COL = "order_id"

# Reference frame = training CSV, current frame = test CSV.
df_ref, df_cur = pd.read_csv(train_path), pd.read_csv(test_path)

# Non-feature columns, limited to those actually present in the reference frame.
drop_cols = [name for name in (TARGET_COL, ID_COL) if name in df_ref.columns]

# Feature matrix and label vector for each split.
X_ref = df_ref.drop(columns=drop_cols)
y_ref = df_ref[TARGET_COL].values
X_cur = df_cur.drop(columns=drop_cols)
y_cur = df_cur[TARGET_COL].values

# Narrow numeric dtypes before the frames are shipped to H2O.
X_ref, X_cur = downcast(X_ref), downcast(X_cur)

In [8]:
# Column-level sanity check between the two feature frames (order of appearance is kept).
missing_in_test = [name for name in X_ref.columns if name not in X_cur.columns]
extra_in_test = [name for name in X_cur.columns if name not in X_ref.columns]

# Show at most the first 20 offending names plus the total count.
print(f"Missing in test: {missing_in_test[:20]}  ... {len(missing_in_test)}")
print(f"Extra in test: {extra_in_test[:20]}  ... {len(extra_in_test)}")

Missing in test: []  ... 0
Extra in test: []  ... 0


In [9]:
# 1) What columns does the model expect?
# Column metadata is read from the model's JSON description.
# NOTE(review): `_model_json` is a private attribute of the H2O model object — confirm it is stable across h2o versions.
out = h2o_model._model_json["output"]
# Prefer "names"; fall back to "original_names" when "names" is missing or empty.
expected_all = out.get("names") or out.get("original_names")
# Response column as recorded by the model; defaults to "reordered" when neither key is set.
target_from_model = out.get("response_column_name") or out.get("response_column") or "reordered"
# Everything except the response column is treated as a feature, in the model's own order.
expected_feats = [c for c in expected_all if c != target_from_model]
print(f"Model expects {len(expected_feats)} features. First 10:", expected_feats[:10])

# 2) Align any pandas DF to the model’s expected columns
def align_to_model_schema(X: pd.DataFrame, expected_cols: list) -> pd.DataFrame:
    """Return a copy of ``X`` reshaped to exactly the columns in ``expected_cols``.

    Columns the model expects but ``X`` lacks are created as NaN, extra columns
    are dropped, and the result follows the order of ``expected_cols``.
    Numeric columns are cast to float32 with NaN replaced by 0.0; every other
    column is cast to pandas ``string`` dtype with missing values replaced by
    ``"missing"``. The input frame is not modified.
    """
    aligned = X.copy()

    # Create placeholders for expected columns that are absent.
    absent = [name for name in expected_cols if name not in aligned.columns]
    for name in absent:
        aligned[name] = np.nan

    # Selecting by the expected list both drops extras and fixes the order.
    aligned = aligned[expected_cols]

    # Partition columns by dtype kind.
    numeric, categorical = [], []
    for name in aligned.columns:
        bucket = numeric if pd.api.types.is_numeric_dtype(aligned[name]) else categorical
        bucket.append(name)

    # Neutral fills: 0.0 for numerics, "missing" for categoricals.
    aligned[numeric] = aligned[numeric].astype("float32").fillna(0.0)
    for name in categorical:
        aligned[name] = aligned[name].astype("string").fillna("missing")
    return aligned

# 3) Apply to BOTH reference & current sets
# The `_m` frames contain exactly the model's feature columns, in the model's order,
# with the imputation performed by align_to_model_schema().
X_ref_m = align_to_model_schema(X_ref, expected_feats)
X_cur_m = align_to_model_schema(X_cur, expected_feats)

Model expects 12 features. First 10: ['aisle_id', 'department_id', 'times_bought_by_user', 'avg_user_product_position', 'last_order_number', 'num_orders', 'avg_days_since_prior_order', 'num_items', 'user_reorder_prop', 'product_total_orders']


In [10]:
# 4) Re-score in batches (your existing helper)
yhat_ref, p_ref = h2o_predict_binary_batched(h2o_model, X_ref_m, batch_rows=50_000)
yhat_cur, p_cur = h2o_predict_binary_batched(h2o_model, X_cur_m, batch_rows=50_000)

# 5) Attach predictions for Evidently
# assign() returns a new frame, so df_ref / df_cur themselves stay untouched.
ref_for_eval = df_ref.assign(prediction=yhat_ref, prediction_proba=p_ref)
cur_for_eval = df_cur.assign(prediction=yhat_cur, prediction_proba=p_cur)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
xgboost prediction progress: |███████████████████████████████████████████████████| (done) 100%
Export File progress: |██████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
xgboost prediction progress: |███████████████████████████████████████████████████| (done) 100%
Export File progress: |██████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
xgboost prediction progress: |███████████████████████████████████████████████████| (done) 100%
Export File progress: |██████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
xgboost prediction progress: |████████████████████

In [12]:
# Cap each evaluation frame at MAX_ROWS randomly chosen rows (or all rows when the frame
# is smaller). The fixed random_state makes the sample repeatable between runs.
MAX_ROWS = 100_000
ref_eval = ref_for_eval.sample(n=min(MAX_ROWS, len(ref_for_eval)), random_state=32021)
cur_eval = cur_for_eval.sample(n=min(MAX_ROWS, len(cur_for_eval)), random_state=32021)

In [10]:
from IPython.display import IFrame
# Embed a previously generated HTML report in the notebook output.
# NOTE(review): no cell visible in this notebook writes baseline_vs_test.html; it is
# presumably produced by an earlier Evidently run — confirm the file exists before running.
IFrame("baseline_vs_test.html", width="100%", height="100%")

In [15]:
# ---- Metrics & reporting helpers ----
import numpy as np, pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, average_precision_score

def compute_binary_metrics(y_true: np.ndarray, y_pred: np.ndarray, proba: np.ndarray):
    """Compute threshold-based and ranking metrics for a binary classifier.

    Always returns ``accuracy``, ``precision``, ``recall`` and ``f1`` (the last
    three with ``zero_division=0``). ``roc_auc`` and ``avg_precision`` are added
    only when ``y_true`` contains more than one distinct value; ``proba`` is
    interpreted as P(class=1). Any error while computing the ranking metrics is
    swallowed and whatever was computed so far is kept.
    """
    metrics = {"accuracy": float(accuracy_score(y_true, y_pred))}
    label_scorers = (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    for key, scorer in label_scorers:
        metrics[key] = float(scorer(y_true, y_pred, zero_division=0))

    try:
        # Ranking metrics are undefined when only one class is present.
        if np.unique(y_true).size > 1:
            metrics["roc_auc"] = float(roc_auc_score(y_true, proba))
            metrics["avg_precision"] = float(average_precision_score(y_true, proba))
    except Exception:
        pass
    return metrics

def show_metrics_table(title, m):
    """Print ``title`` as a banner followed by one right-aligned ``name: value`` row per metric.

    Names are padded to 14 characters and values are shown with four decimals.
    """
    print("\n=== {} ===".format(title))
    for name, value in m.items():
        print("{:>14}: {:.4f}".format(name, value))

def attach_preds_for_evidently(df: pd.DataFrame, yhat: np.ndarray, p1: np.ndarray, target_col: str):
    """Return a copy of ``df`` extended with prediction columns.

    Adds ``prediction`` (``yhat`` as int), ``proba_1`` (``p1`` as float) and
    ``proba_0`` (``1 - proba_1``). The input frame is left unchanged.

    Raises
    ------
    ValueError
        If ``target_col`` is not a column of the resulting frame.
    """
    # assign() copies the frame and appends/overwrites the two columns in this order.
    enriched = df.assign(prediction=yhat.astype(int), proba_1=p1.astype(float))
    # Per-class probabilities for the binary case: p0 is the complement of p1.
    enriched["proba_0"] = 1.0 - enriched["proba_1"]

    # The target must be present alongside the predictions.
    if target_col in enriched.columns:
        return enriched
    raise ValueError(f"Target column '{target_col}' not found in dataframe given to Evidently.")


In [16]:
# ---- Load data, align to model schema, score, metrics ----
# Prerequisites: this cell relies on names created by earlier cells and fails with a
# NameError on a kernel where they have not been run — train_path, test_path, TARGET_COL,
# ID_COL, h2o_model, downcast, align_to_model_schema, h2o_predict_binary_batched,
# compute_binary_metrics and show_metrics_table.
import pandas as pd, numpy as np, h2o

df_ref = pd.read_csv(train_path)
df_cur = pd.read_csv(test_path)

# Non-feature columns, limited to those present in the reference frame.
drop_cols = [c for c in [TARGET_COL, ID_COL] if c in df_ref.columns]
X_ref, y_ref = df_ref.drop(columns=drop_cols), df_ref[TARGET_COL].values
X_cur, y_cur = df_cur.drop(columns=drop_cols), df_cur[TARGET_COL].values

# Narrow numeric dtypes before sending the frames to H2O.
X_ref = downcast(X_ref)
X_cur = downcast(X_cur)

# Get expected feature order from the model & align
out = h2o_model._model_json["output"]
expected_all = out.get("names") or out.get("original_names")
if not expected_all:
    # Fail with a clear message instead of "TypeError: 'NoneType' object is not iterable".
    raise ValueError("The model JSON exposes neither 'names' nor 'original_names'.")

# Same lookup as the schema-introspection cell, ending in a fallback to TARGET_COL so the
# response column can never be mistaken for a feature when the model JSON omits its name.
target_from_model = (
    out.get("response_column_name")
    or out.get("response_column")
    or out.get("responseName")
    or TARGET_COL
)
expected_feats = [c for c in expected_all if c != target_from_model]

X_ref_m = align_to_model_schema(X_ref, expected_feats)
X_cur_m = align_to_model_schema(X_cur, expected_feats)

# Batched H2O predictions (uses your helper)
yhat_ref, p_ref = h2o_predict_binary_batched(h2o_model, X_ref_m, batch_rows=50_000)
yhat_cur, p_cur = h2o_predict_binary_batched(h2o_model, X_cur_m, batch_rows=50_000)

# Metrics
m_ref = compute_binary_metrics(y_ref, yhat_ref, p_ref)
m_cur = compute_binary_metrics(y_cur, yhat_cur, p_cur)
show_metrics_table("Reference (TRAIN) metrics", m_ref)
show_metrics_table("Current (TEST) metrics", m_cur)


NameError: name 'train_path' is not defined