### High-level modeling plan

In 03_modeling.ipynb, we'll:

- Load preprocessed data

- Set up MLflow experiment

- Each model run = one MLflow run

- Log params, metrics, and model artifact

- Train and compare multiple models

- Compare on validation metrics: roc_auc, auc_pr (PR-AUC), f1, precision, recall


In [4]:
import numpy as np
import pandas as pd
from pathlib import Path
from scipy import sparse

from sklearn.metrics import (
    roc_auc_score,
    average_precision_score,
    f1_score,
    precision_recall_fscore_support,
)


In [2]:
class Config:
    """Filesystem locations used by this notebook."""

    # Directory the feature matrices (.npz) and label files (.csv) are read from.
    features_dir = Path("..") / "data" / "features"


In [6]:
import mlflow
import mlflow.sklearn

# Tracking location: with no tracking URI set, MLflow uses its default local
# store. Uncomment the line below to write runs to ../mlflow instead.
# mlflow.set_tracking_uri("file:../mlflow")

# Name of the MLflow experiment that every run started after this cell is
# recorded under.
experiment_name = "hubspot_prospect_conversion"
# Activates the experiment; per the captured log output, MLflow creates it
# when no experiment with this name exists yet.
mlflow.set_experiment(experiment_name)
print("Using MLflow experiment:", experiment_name)


  return FileStore(store_uri, store_uri)
2025/11/08 17:07:33 INFO mlflow.tracking.fluent: Experiment with name 'hubspot_prospect_conversion' does not exist. Creating a new experiment.


Using MLflow experiment: hubspot_prospect_conversion


In [3]:
def _load_labels(csv_path, column="is_customer"):
    """Read a 1-D label array from a CSV file, parsing the file only once.

    Parameters
    ----------
    csv_path : path-like
        Location of the label CSV.
    column : str, default "is_customer"
        Preferred label column name.

    Returns
    -------
    numpy.ndarray
        Values of ``column`` when the file has such a column, otherwise the
        values of the file's first column.
    """
    frame = pd.read_csv(csv_path)
    if column in frame.columns:
        return frame[column].values
    return frame.iloc[:, 0].values


# Sparse feature matrices, one per split.
X_train = sparse.load_npz(Config.features_dir / "X_train.npz")
X_val   = sparse.load_npz(Config.features_dir / "X_val.npz")
X_test  = sparse.load_npz(Config.features_dir / "X_test.npz")

# Every split goes through the same column-selection rule, so train, val and
# test labels cannot end up being read from different columns.
y_train = _load_labels(Config.features_dir / "y_train.csv")
y_val   = _load_labels(Config.features_dir / "y_val.csv")
y_test  = _load_labels(Config.features_dir / "y_test.csv")

# Fail here, with a clear message, rather than deep inside a model's fit().
assert X_train.shape[0] == y_train.shape[0], "X_train / y_train row counts differ"
assert X_val.shape[0] == y_val.shape[0], "X_val / y_val row counts differ"
assert X_test.shape[0] == y_test.shape[0], "X_test / y_test row counts differ"

print("X shapes:", X_train.shape, X_val.shape, X_test.shape)
print("y shapes:", y_train.shape, y_val.shape, y_test.shape)
print("Train class balance:", np.bincount(y_train))


X shapes: (3640, 174) (780, 174) (780, 174)
y shapes: (3640,) (780,) (780,)
Train class balance: [3500  140]


In [5]:
def evaluate_model(model, X, y, threshold=0.5):
    """Score a fitted binary classifier that exposes ``predict_proba``.

    Column 1 of ``model.predict_proba(X)`` is used as the positive-class
    score. Ranking metrics (ROC-AUC, PR-AUC) are computed on those scores;
    precision, recall and F1 are computed on the labels obtained by marking
    every score ``>= threshold`` as positive.

    Returns a dict with keys ``roc_auc``, ``auc_pr``, ``precision``,
    ``recall`` and ``f1``.
    """
    positive_scores = model.predict_proba(X)[:, 1]
    hard_labels = (positive_scores >= threshold).astype(int)

    metrics = {
        "roc_auc": roc_auc_score(y, positive_scores),
        "auc_pr": average_precision_score(y, positive_scores),  # AUC-PR
    }

    # zero_division=0 is passed so undefined precision/recall do not warn.
    prec, rec, f_score, _support = precision_recall_fscore_support(
        y, hard_labels, average="binary", zero_division=0
    )
    metrics["precision"] = prec
    metrics["recall"] = rec
    metrics["f1"] = f_score
    return metrics

In [7]:
def run_experiment(
    model_name: str,
    model,
    X_train,
    y_train,
    X_val,
    y_val,
    params: dict | None = None,
    tags: dict | None = None,
):
    """Fit ``model`` inside one MLflow run and record what happened.

    The run is named ``model_name``. Non-empty ``params`` and ``tags`` are
    logged first, then the model is fitted on the training split, evaluated
    with ``evaluate_model`` on both the training and validation splits, and
    the resulting metrics are logged with ``train_`` / ``val_`` prefixes.
    The fitted model is stored as the run's ``model`` artifact.

    Returns
    -------
    tuple
        ``(model, train_metrics, val_metrics)``; ``model`` is the same object
        that was passed in, after ``fit`` has been called on it.
    """
    with mlflow.start_run(run_name=model_name):
        # Optional run metadata, recorded before fitting.
        if params:
            mlflow.log_params(params)
        if tags:
            mlflow.set_tags(tags)

        model.fit(X_train, y_train)

        train_metrics = evaluate_model(model, X_train, y_train)
        val_metrics = evaluate_model(model, X_val, y_val)

        # One log_metrics call per split, train first, keys prefixed by split.
        for prefix, scores in (("train", train_metrics), ("val", val_metrics)):
            mlflow.log_metrics(
                {f"{prefix}_{metric}": value for metric, value in scores.items()}
            )

        mlflow.sklearn.log_model(model, artifact_path="model")

        print(f"[{model_name}] Val metrics:", val_metrics)
        return model, train_metrics, val_metrics


In [8]:
from sklearn.linear_model import LogisticRegression

# Hyperparameters for the logistic-regression baseline. The same dict is used
# to build the estimator and is logged to MLflow as the run's params.
logreg_params = {
    # The training labels are heavily skewed (see the class-balance printout
    # from the loading cell), hence the class re-weighting.
    "class_weight": "balanced",
    # NOTE(review): presumably set high to avoid convergence warnings — confirm.
    "max_iter": 2000,
}

logreg = LogisticRegression(**logreg_params)

# Fit, evaluate and log the baseline as one MLflow run; logreg_val is read
# again by the results table further down.
logreg_model, logreg_train, logreg_val = run_experiment(
    model_name="logreg_baseline",
    model=logreg,
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    params=logreg_params,
    tags={"model_type": "logistic_regression"},
)




[logreg_baseline] Val metrics: {'roc_auc': 0.9869333333333333, 'auc_pr': 0.7687074196746351, 'precision': 0.3783783783783784, 'recall': 0.9333333333333333, 'f1': 0.5384615384615384}


In [9]:
from xgboost import XGBClassifier

# Compute scale_pos_weight = (#negative / #positive)
# The value is stored at module level and reused by the CatBoost cell.
neg = (y_train == 0).sum()
pos = (y_train == 1).sum()
scale_pos_weight = neg / pos

# Hyperparameters for the XGBoost baseline. The same dict is used to build the
# classifier and is logged to MLflow as the run's params.
xgb_params = {
    "n_estimators": 300,
    "learning_rate": 0.05,
    "max_depth": 4,
    "subsample": 0.9,
    "colsample_bytree": 0.9,
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "random_state": 42,
    "scale_pos_weight": scale_pos_weight,
}

xgb_clf = XGBClassifier(**xgb_params)

# Fit, evaluate and log the model as one MLflow run; xgb_val is read again by
# the results table further down.
xgb_model, xgb_train, xgb_val = run_experiment(
    model_name="xgboost_baseline",
    model=xgb_clf,
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    params=xgb_params,
    tags={"model_type": "xgboost"},
)




[xgboost_baseline] Val metrics: {'roc_auc': 0.9786666666666666, 'auc_pr': 0.7713532594501465, 'precision': 0.6410256410256411, 'recall': 0.8333333333333334, 'f1': 0.7246376811594203}


In [10]:
# CatBoost is an optional dependency. Only the import itself is guarded: an
# ImportError raised later (while training or logging) is a real failure and
# must not be reported as "CatBoost not installed".
try:
    from catboost import CatBoostClassifier
except ImportError:
    # cat_model / cat_train / cat_val are deliberately left undefined so that
    # downstream cells can detect that this experiment was skipped.
    print("CatBoost not installed; skipping CatBoost experiment.")
else:
    # Dense copies of the sparse feature matrices for this experiment.
    X_train_dense = X_train.toarray()
    X_val_dense = X_val.toarray()

    # Hyperparameters for the CatBoost baseline. The same dict is used to
    # build the classifier and is logged to MLflow as the run's params.
    cat_params = {
        "depth": 4,
        "learning_rate": 0.05,
        "iterations": 300,
        "loss_function": "Logloss",
        "eval_metric": "AUC",
        "random_seed": 42,
        "verbose": False,
        # Same negative/positive ratio computed in the XGBoost cell.
        "scale_pos_weight": scale_pos_weight,
    }

    cat_clf = CatBoostClassifier(**cat_params)

    cat_model, cat_train, cat_val = run_experiment(
        model_name="catboost_baseline",
        model=cat_clf,
        X_train=X_train_dense,
        y_train=y_train,
        X_val=X_val_dense,
        y_val=y_val,
        params=cat_params,
        tags={"model_type": "catboost"},
    )




[catboost_baseline] Val metrics: {'roc_auc': 0.9893777777777777, 'auc_pr': 0.8148504145666479, 'precision': 0.5094339622641509, 'recall': 0.9, 'f1': 0.6506024096385542}


In [11]:
# One row of validation metrics per model that was actually trained.
result_rows = [
    {"model": "logreg", **logreg_val},
    {"model": "xgboost", **xgb_val},
]
# The CatBoost row exists only when that experiment ran and defined cat_val.
if "cat_val" in locals():
    result_rows.append({"model": "catboost", **cat_val})

results = pd.DataFrame(result_rows)

results


Unnamed: 0,model,roc_auc,auc_pr,precision,recall,f1
0,logreg,0.986933,0.768707,0.378378,0.933333,0.538462
1,xgboost,0.978667,0.771353,0.641026,0.833333,0.724638
2,catboost,0.989378,0.81485,0.509434,0.9,0.650602


In [12]:
# Rank the models by validation PR-AUC, best first. This returns a sorted copy
# for display; `results` itself is left in its original order.
results.sort_values("auc_pr", ascending=False)


Unnamed: 0,model,roc_auc,auc_pr,precision,recall,f1
2,catboost,0.989378,0.81485,0.509434,0.9,0.650602
1,xgboost,0.978667,0.771353,0.641026,0.833333,0.724638
0,logreg,0.986933,0.768707,0.378378,0.933333,0.538462
