### High-level modeling plan

In `03_modeling.ipynb`, we'll:

- Load preprocessed data

- Set up MLflow experiment

- Each model run = one MLflow run

- Log params, metrics, and model artifact

- Train and compare multiple models

- Compare on validation metrics: `roc_auc`, `auc_pr` (PR-AUC), `f1`, `precision`, `recall`


In [15]:
import numpy as np
import pandas as pd
from pathlib import Path

from scipy import sparse

from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import (
    roc_auc_score,
    average_precision_score,
    f1_score,
    precision_recall_fscore_support,
)

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    RandomForestClassifier,
    ExtraTreesClassifier,
    AdaBoostClassifier,
    GradientBoostingClassifier,
)
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import (
    LinearDiscriminantAnalysis,
    QuadraticDiscriminantAnalysis,
)

from xgboost import XGBClassifier

import mlflow
import mlflow.sklearn

In [16]:
class Config:
    """Notebook-wide settings (currently only the location of the feature files)."""

    # Directory the X_*.npz matrices and y_*.csv label files are read from;
    # a relative path, so it resolves against the current working directory.
    features_dir: Path = Path("../data/features")

In [17]:
def _load_dense(filename):
    """Read a SciPy sparse ``.npz`` file from the features directory as a dense array."""
    return sparse.load_npz(Config.features_dir / filename).toarray()


def _load_labels(filename):
    """Read the first column of a CSV in the features directory as a NumPy array."""
    return pd.read_csv(Config.features_dir / filename).iloc[:, 0].values


# Feature matrices for the three splits, densified after loading.
X_train = _load_dense("X_train.npz")
X_val = _load_dense("X_val.npz")
X_test = _load_dense("X_test.npz")

# Matching label vectors (first CSV column of each file).
y_train = _load_labels("y_train.csv")
y_val = _load_labels("y_val.csv")
y_test = _load_labels("y_test.csv")

# Quick sanity report: split sizes and how many training rows fall in each class.
print("X shapes:", *[matrix.shape for matrix in (X_train, X_val, X_test)])
print("y shapes:", *[labels.shape for labels in (y_train, y_val, y_test)])
print("Train class balance:", np.bincount(y_train))

X shapes: (3640, 174) (780, 174) (780, 174)
y shapes: (3640,) (780,) (780,)
Train class balance: [3500  140]


In [18]:
def evaluate_model(model, X, y):
    """Score a fitted binary classifier on one data split.

    Ranking scores for the threshold-free metrics (ROC-AUC, PR-AUC) come from
    the first available source, in this order: column 1 of ``predict_proba``,
    ``decision_function``, or the hard labels returned by ``predict``.
    Precision, recall and F1 are always computed from ``predict`` with
    ``average="binary"`` and ``zero_division=0``.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``predict``; ``predict_proba`` or ``decision_function``
        are used when present.
    X : feature matrix passed unchanged to the model's prediction methods.
    y : true labels aligned with the rows of ``X``.

    Returns
    -------
    dict
        Keys ``"roc_auc"``, ``"auc_pr"``, ``"precision"``, ``"recall"``, ``"f1"``.
    """
    # Hard labels are needed for precision/recall/F1 in every case, so compute
    # them once up front and reuse them if they also have to serve as scores.
    y_pred = model.predict(X)

    if hasattr(model, "predict_proba"):
        # NOTE(review): assumes column 1 is the positive class (labels 0/1) — confirm.
        scores = model.predict_proba(X)[:, 1]
    elif hasattr(model, "decision_function"):
        scores = model.decision_function(X)
    else:
        # No continuous score available: fall back to the hard labels
        # (previously this triggered a second, redundant predict() call).
        scores = y_pred

    roc_auc = roc_auc_score(y, scores)
    pr_auc = average_precision_score(y, scores)
    precision, recall, f1_val, _ = precision_recall_fscore_support(
        y, y_pred, average="binary", zero_division=0
    )
    return {
        "roc_auc": roc_auc,
        "auc_pr": pr_auc,
        "precision": precision,
        "recall": recall,
        "f1": f1_val,
    }

In [19]:
# Name of the MLflow experiment used for the runs in this notebook.
experiment_name = "hubspot_prospect_conversion_gridsearch"
# Select that experiment as the active one; the captured log output shows MLflow
# creating it when it does not exist yet.
mlflow.set_experiment(experiment_name)
print("Using MLflow experiment:", experiment_name)

2025/11/08 17:18:37 INFO mlflow.tracking.fluent: Experiment with name 'hubspot_prospect_conversion_gridsearch' does not exist. Creating a new experiment.


Using MLflow experiment: hubspot_prospect_conversion_gridsearch


In [20]:
# Class counts in the training labels (0 = negative, 1 = positive).
neg = (y_train == 0).sum()
pos = (y_train == 1).sum()
if pos == 0:
    # Without this guard the NumPy division below silently yields inf/nan,
    # which would then be passed on to the XGBoost parameter grid.
    raise ValueError(
        "y_train contains no positive (label 1) samples; cannot compute scale_pos_weight"
    )
# Negative-to-positive ratio, used as XGBoost's scale_pos_weight to offset class imbalance.
scale_pos_weight = neg / pos

In [21]:
# Candidate classifiers and their hyperparameter grids:
#   display name -> (unfitted estimator, param_grid dict)
# An empty grid means the estimator is evaluated with its constructor defaults only.
# Every estimator that accepts a seed is pinned to random_state=42 for reproducibility.
models_grids = {
    "LogisticRegression": (
        LogisticRegression(max_iter=2000, solver="lbfgs"),
        {
            "C": [0.1, 1.0, 10.0],
            "class_weight": ["balanced", None],
        },
    ),
    "SVC": (
        # probability=True enables predict_proba; its internal calibration shuffles
        # the data, so seed it like the other stochastic estimators.
        SVC(probability=True, random_state=42),
        {
            "C": [0.1, 1.0],
            "kernel": ["linear", "rbf"],
            "class_weight": ["balanced", None],
        },
    ),
    "KNeighborsClassifier": (
        KNeighborsClassifier(),
        {
            "n_neighbors": [3, 5, 7],
            "weights": ["uniform", "distance"],
        },
    ),
    "DecisionTreeClassifier": (
        DecisionTreeClassifier(random_state=42),
        {
            "max_depth": [3, 5, None],
            "class_weight": ["balanced", None],
        },
    ),
    "RandomForestClassifier": (
        RandomForestClassifier(random_state=42, n_jobs=-1),
        {
            "n_estimators": [100, 200],
            "max_depth": [3, 5, None],
            "class_weight": ["balanced", None],
        },
    ),
    "ExtraTreesClassifier": (
        ExtraTreesClassifier(random_state=42, n_jobs=-1),
        {
            "n_estimators": [100, 200],
            "max_depth": [3, 5, None],
            "class_weight": ["balanced", None],
        },
    ),
    "AdaBoostClassifier": (
        AdaBoostClassifier(random_state=42),
        {
            "n_estimators": [50, 100],
            "learning_rate": [0.05, 0.1, 0.5],
        },
    ),
    "GradientBoostingClassifier": (
        GradientBoostingClassifier(random_state=42),
        {
            "n_estimators": [50, 100],
            "learning_rate": [0.05, 0.1],
            "max_depth": [2, 3],
        },
    ),
    "XGBClassifier": (
        # use_label_encoder is deprecated in XGBoost and only triggers an
        # "unused parameter" warning on recent releases, so it is not passed.
        XGBClassifier(
            objective="binary:logistic",
            eval_metric="logloss",
            random_state=42,
            n_jobs=-1,
        ),
        {
            "n_estimators": [100, 200],
            "learning_rate": [0.05, 0.1],
            "max_depth": [3, 4],
            "subsample": [0.8, 1.0],
            "colsample_bytree": [0.8, 1.0],
            # Single value: the negative/positive ratio of the training labels.
            "scale_pos_weight": [scale_pos_weight],
        },
    ),
    "GaussianNB": (
        GaussianNB(),
        {},  # no hyperparams to tune
    ),
    "LinearDiscriminantAnalysis": (
        LinearDiscriminantAnalysis(),
        {},  # keep simple
    ),
    "QuadraticDiscriminantAnalysis": (
        QuadraticDiscriminantAnalysis(),
        # No grid. NOTE(review): the training split has 174 features but only 140
        # positive rows, so the positive-class covariance cannot be full rank;
        # consider adding a reg_param grid — TODO confirm.
        {},
    ),
}