# Modeling Pipeline and AutoML
## Load data and packages

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import mlflow
import mlflow.sklearn

import h2o
from h2o.automl import H2OAutoML

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import confusion_matrix

In [4]:
# Read the processed train and test CSVs (train first, then test).
train_data, test_data = map(
    pd.read_csv,
    ('data/processed/train_data.csv', 'data/processed/test_data.csv'),
)

In [5]:
# Remove columns that are not used for modeling from both splits.
# NOTE(review): 'Unnamed: 0' is presumably the index written by an earlier
# to_csv call -- confirm against the processing step.
# errors='ignore' keeps this cell idempotent: because it reassigns
# train_data/test_data, re-running it once the columns are already gone would
# otherwise raise KeyError.
train_data = train_data.drop(columns=['Unnamed: 0', 'product_name'], errors='ignore')
test_data = test_data.drop(columns=['Unnamed: 0', 'product_name'], errors='ignore')

In [6]:
# Columns excluded from the feature matrix: the label plus the two ID columns.
non_feature_cols = ['reordered', 'product_id', 'user_id']

# Labels come straight from the 'reordered' column of each split.
y_train, y_test = train_data['reordered'], test_data['reordered']

# Features are everything that is left after removing the columns above.
X_train = train_data.drop(columns=non_feature_cols)
X_test = test_data.drop(columns=non_feature_cols)

## Preprocessing

In [8]:
# The two ID-coded columns are handled as categorical features; every other
# feature column is handled as numeric, in its original column order.
categorical_cols = ['aisle_id', 'department_id']
numeric_cols = list(filter(lambda col: col not in categorical_cols, X_train.columns))

In [9]:
# Transformers
# Numeric features: fill missing values with the column mean, then standardize.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical features: fill missing values with the most frequent value, then
# map each category to an integer code.
# OrdinalEncoder raises on categories it did not see during fit by default, so
# a single unseen aisle/department ID in the test data would abort predict().
# Unseen categories are encoded as -1 instead; known categories keep the same
# codes as before.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

# Route each column group through its own transformer. Columns not listed in
# either group are dropped (ColumnTransformer's default remainder behavior).
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols)
])

## Modeling

Our model predicts `reordered`, a binary variable whose positive class is comparatively rare in this data, so accuracy alone can look good even when few reorders are caught. We therefore use the F1 score, which balances precision and recall, as our main metric, and report accuracy alongside it.

### Baseline Model (Random Forest)

In [11]:
# Baseline model: the shared preprocessor followed by a random forest with a
# fixed seed so repeated runs give the same forest.
baseline_classifier = RandomForestClassifier(random_state=32021)
rf_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', baseline_classifier),
    ]
)

In [12]:
# Fit and evaluate the baseline inside one MLflow run so the fitted pipeline,
# its test-set metrics and the confusion-matrix image are stored together.
with mlflow.start_run(run_name='Baseline_RandomForest'):
    # Pipeline.fit returns the fitted pipeline, so fit and predict can be chained.
    y_pred = rf_pipeline.fit(X_train, y_train).predict(X_test)

    f1, accuracy = f1_score(y_test, y_pred), accuracy_score(y_test, y_pred)

    mlflow.log_param('Model', 'RandomForest')
    for metric_name, metric_value in (('f1_score', f1), ('accuracy', accuracy)):
        mlflow.log_metric(metric_name, metric_value)
    mlflow.sklearn.log_model(rf_pipeline, 'baseline_rf_pipeline')

    # Render the confusion matrix to a PNG and attach it to the run.
    cm_path = 'confusion_matrix_rf.png'
    cm = confusion_matrix(y_test, y_pred)
    fig, ax = plt.subplots(figsize=(5, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set(
        xlabel='Predicted',
        ylabel='Actual',
        title='Confusion Matrix for Baseline RandomForest',
    )
    fig.savefig(cm_path)
    plt.close(fig)  # release the figure; only the saved file is needed
    mlflow.log_artifact(cm_path)

print('Baseline RF logged.')



Baseline RF logged.


### H2O AutoML

In [14]:
# Connect to an H2O instance on localhost; as the cell output shows, a local
# H2O server is started when none is found running.
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: java version "13" 2019-09-17; Java(TM) SE Runtime Environment (build 13+33); Java HotSpot(TM) 64-Bit Server VM (build 13+33, mixed mode, sharing)
  Starting server from /opt/anaconda3/envs/MLOps/lib/python3.13/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/kf/8ms7_h8j11bf86d6c5ykldrw0000gn/T/tmpf5xrsdms
  JVM stdout: /var/folders/kf/8ms7_h8j11bf86d6c5ykldrw0000gn/T/tmpf5xrsdms/h2o_quynhanhnd2402_started_from_python.out
  JVM stderr: /var/folders/kf/8ms7_h8j11bf86d6c5ykldrw0000gn/T/tmpf5xrsdms/h2o_quynhanhnd2402_started_from_python.err
  Server is running at http://127.0.0.1:54321
 successful.o H2O server at http://127.0.0.1:54321 ...
Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html


0,1
H2O_cluster_uptime:,03 secs
H2O_cluster_timezone:,America/Chicago
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.7
H2O_cluster_version_age:,4 months and 22 days
H2O_cluster_name:,H2O_from_python_quynhanhnd2402_ct59tv
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,2 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [15]:
# Re-attach the label to the features and upload each split as an H2OFrame
# (train first, then test).
train_h2o, test_h2o = (
    h2o.H2OFrame(pd.concat([feature_part, label_part], axis=1))
    for feature_part, label_part in ((X_train, y_train), (X_test, y_test))
)

# Convert the label and both ID-coded columns to factors in each frame.
for h2o_frame in (train_h2o, test_h2o):
    for factor_col in ('reordered', 'aisle_id', 'department_id'):
        h2o_frame[factor_col] = h2o_frame[factor_col].asfactor()

# Split the training frame 80/20 with a fixed seed; the second part is used as
# the validation frame.
train, valid = train_h2o.split_frame(ratios=[0.8], seed=32021)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [16]:
# Response column, and predictor columns (numeric first, then categorical).
target = 'reordered'
features = [*numeric_cols, *categorical_cols]

In [17]:
# AutoML settings: at most 20 models, fixed seed, no stacked ensembles or deep
# learning, and nfolds=0 so the explicit validation frame is used instead of
# cross-validation.
automl_settings = dict(
    max_models=20,
    seed=32021,
    exclude_algos=['StackedEnsemble', 'DeepLearning'],
    nfolds=0,
)
aml = H2OAutoML(**automl_settings)
aml.train(
    x=features,
    y=target,
    training_frame=train,
    validation_frame=valid,
)

AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%


Unnamed: 0,number_of_trees
,115.0

Unnamed: 0,0,1,Error,Rate
0,368610.0,12871.0,0.0337,(12871.0/381481.0)
1,12996.0,13637.0,0.488,(12996.0/26633.0)
Total,381606.0,26508.0,0.0634,(25867.0/408114.0)

metric,threshold,value,idx
max f1,0.2344261,0.5132384,175.0
max f2,0.1347738,0.5819668,235.0
max f0point5,0.3465599,0.5751454,126.0
max accuracy,0.375018,0.9473652,116.0
max precision,0.9088273,1.0,0.0
max recall,0.0028556,1.0,396.0
max specificity,0.9088273,1.0,0.0
max absolute_mcc,0.2476601,0.4796831,169.0
max min_per_class_accuracy,0.0837108,0.8214621,278.0
max mean_per_class_accuracy,0.0850156,0.8226431,277.0

group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
1,0.0100021,0.5272979,13.6493597,13.6493597,0.8907398,0.6287511,0.8907398,0.6287511,0.1365224,0.1365224,1264.9359681,1264.9359681,0.1353532
2,0.0200018,0.4263993,10.5737116,12.111724,0.690027,0.4720297,0.7903957,0.5504,0.1057335,0.2422558,957.3711575,1111.1724017,0.2377707
3,0.0300014,0.3592944,8.3320547,10.8519371,0.5437393,0.3910023,0.7081836,0.4972718,0.0833177,0.3255735,733.2054682,985.1937138,0.3162074
4,0.0400011,0.3111608,6.5672662,9.780835,0.4285714,0.3341029,0.6382848,0.456482,0.0656704,0.3912439,556.7266174,878.0835012,0.3757648
5,0.0500007,0.2750092,5.3206496,8.8888416,0.3472188,0.2922579,0.5800745,0.4236388,0.0532047,0.4444486,432.064961,788.8841646,0.4219862
6,0.1000015,0.1712266,3.6435465,6.2661941,0.2377732,0.2154344,0.4089238,0.3195366,0.18218,0.6266286,264.3546478,526.6194062,0.5633935
7,0.1499998,0.1213348,2.1027267,4.878417,0.1372213,0.1439148,0.3183593,0.2609979,0.1051327,0.7317613,110.2726734,387.841696,0.6223771
8,0.2000005,0.091254,1.3944901,4.0074246,0.0910026,0.1052213,0.2615194,0.2220533,0.0697255,0.8014869,39.4490068,300.7424567,0.643479
9,0.2999995,0.0570096,0.843323,2.9527326,0.0550342,0.07216,0.1926916,0.1720893,0.0843315,0.8858183,-15.667705,195.2732642,0.6267176
10,0.400001,0.0382527,0.4971203,2.338822,0.0324414,0.0468264,0.1526285,0.1407732,0.0497128,0.9355311,-50.2879685,133.8822039,0.572918

Unnamed: 0,0,1,Error,Rate
0,89993.0,5107.0,0.0537,(5107.0/95100.0)
1,3724.0,2889.0,0.5631,(3724.0/6613.0)
Total,93717.0,7996.0,0.0868,(8831.0/101713.0)

metric,threshold,value,idx
max f1,0.2036946,0.3955096,185.0
max f2,0.1016175,0.4871172,256.0
max f0point5,0.3282532,0.4218082,124.0
max accuracy,0.4524168,0.9380512,76.0
max precision,0.8508968,1.0,0.0
max recall,0.001264,1.0,399.0
max specificity,0.8508968,1.0,0.0
max absolute_mcc,0.2110715,0.3511625,181.0
max min_per_class_accuracy,0.0660766,0.7634961,291.0
max mean_per_class_accuracy,0.0669451,0.7649903,290.0

group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
1,0.0100086,0.4977545,9.7451803,9.7451803,0.6335953,0.6002116,0.6335953,0.6002116,0.0975352,0.0975352,874.5180283,874.5180283,0.093613
2,0.0200073,0.402268,7.5769551,8.6616004,0.4926254,0.4456633,0.563145,0.5229754,0.0757599,0.173295,657.6955108,766.1600429,0.163947
3,0.030006,0.3438383,6.4729277,7.9322819,0.4208456,0.371053,0.5157274,0.4723512,0.064721,0.238016,547.2927717,693.2281901,0.2224745
4,0.0400047,0.300698,5.0664271,7.2159943,0.3294002,0.3222146,0.469157,0.4348263,0.0506578,0.2886738,406.6427068,621.5994271,0.2659609
5,0.0500034,0.2664456,4.7639538,6.7256826,0.3097345,0.2837124,0.4372788,0.4046094,0.0476334,0.3363073,376.395381,572.5682602,0.3062126
6,0.1000069,0.1694614,3.2116344,4.9686585,0.2088085,0.2121755,0.3230436,0.3083924,0.1605928,0.4969,221.1634408,396.8658505,0.4244921
7,0.1500005,0.1203659,2.1868817,4.0415211,0.1421829,0.1427555,0.2627646,0.2531874,0.1093301,0.6062302,118.6881654,304.1521107,0.4879547
8,0.2000039,0.0912312,1.5392862,3.4159316,0.1000786,0.1048753,0.2220911,0.2161075,0.0769696,0.6831998,53.9286171,241.5931622,0.516796
9,0.300001,0.0571934,1.0751867,2.6357089,0.0699046,0.0722682,0.171364,0.1681626,0.1075155,0.7907153,7.518671,163.5708888,0.5248372
10,0.399998,0.0383338,0.7198156,2.1567473,0.0467997,0.0468491,0.1402237,0.137835,0.0719794,0.8626947,-28.0184425,115.6747333,0.4948713

Unnamed: 0,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_auc,training_pr_auc,training_lift,training_classification_error,validation_rmse,validation_logloss,validation_auc,validation_pr_auc,validation_lift,validation_classification_error
,2025-08-19 00:05:17,7 min 29.831 sec,0.0,0.5,0.6931472,0.5,0.0652587,1.0,0.9347413,0.5,0.6931472,0.5,0.0650163,1.0,0.9349837
,2025-08-19 00:05:19,7 min 31.419 sec,5.0,0.2510640,0.2563795,0.7795201,0.2572316,7.6486749,0.1085015,0.2515575,0.2570448,0.7750008,0.2347029,6.3003724,0.1171040
,2025-08-19 00:05:21,7 min 34.255 sec,10.0,0.2306797,0.2029434,0.8158275,0.3102914,9.0132323,0.0996780,0.2326565,0.2061320,0.8040217,0.2722811,7.7054914,0.1073904
,2025-08-19 00:05:25,7 min 37.612 sec,15.0,0.2269240,0.1909913,0.8321137,0.3436830,9.9141801,0.0947186,0.2301916,0.1967611,0.8144463,0.2906628,8.4760405,0.1036347
,2025-08-19 00:05:28,7 min 40.847 sec,20.0,0.2245113,0.1854449,0.8438584,0.3711360,10.7250332,0.0891295,0.2287394,0.1932464,0.8213715,0.3078056,9.1257192,0.0954155
,2025-08-19 00:05:31,7 min 44.137 sec,25.0,0.2226561,0.1817028,0.8518789,0.3918111,11.2280624,0.0834595,0.2277992,0.1911337,0.8263922,0.3172777,9.3523513,0.0945995
,2025-08-19 00:05:34,7 min 46.921 sec,30.0,0.2215166,0.1795136,0.8568037,0.4034662,11.3857282,0.0839912,0.2272511,0.1900325,0.8289673,0.3221677,9.4278953,0.0982667
,2025-08-19 00:05:37,7 min 49.727 sec,35.0,0.2203680,0.1774516,0.8614107,0.4159644,11.6184731,0.0803869,0.2267705,0.1891608,0.8309821,0.3269241,9.4278953,0.1024943
,2025-08-19 00:05:40,7 min 52.840 sec,40.0,0.2195199,0.1758876,0.8649049,0.4244504,11.8437101,0.0815458,0.2264187,0.1885433,0.8322078,0.3310781,9.5940922,0.1018159
,2025-08-19 00:05:44,7 min 56.530 sec,45.0,0.2182544,0.1735399,0.8704687,0.4384324,12.1064865,0.0788603,0.2259275,0.1876199,0.8344410,0.3363075,9.6243098,0.0998004

variable,relative_importance,scaled_importance,percentage
times_bought_by_user,25957.6367188,1.0,0.2315935
num_orders,16035.4169922,0.6177533,0.1430677
user_reorder_prop,13584.8310547,0.5233462,0.1212036
num_items,12737.8144531,0.4907155,0.1136465
avg_days_since_prior_order,12266.5117188,0.4725589,0.1094416
last_order_number,10680.7441406,0.4114683,0.0952934
product_reorder_prop,7650.6098633,0.2947345,0.0682586
product_total_orders,4036.3142090,0.1554962,0.0360119
avg_add_to_cart_order,3637.2243652,0.1401216,0.0324512
avg_user_product_position,3061.6958008,0.1179497,0.0273164


#### Insights and best model

In [19]:
# Grab the leader model and the leaderboard from the finished AutoML run, then
# show the top 10 leaderboard rows followed by the leader's summary.
best_model = aml.leader
lb = aml.leaderboard

print(lb.head(rows=10))
print(best_model)

model_id                                              auc    logloss     aucpr    mean_per_class_error      rmse        mse
XGBoost_grid_1_AutoML_1_20250818_235405_model_3  0.844614   0.183354  0.3579                  0.308417  0.223709  0.0500456
XGBoost_grid_1_AutoML_1_20250818_235405_model_2  0.844427   0.184076  0.353576                0.307824  0.224014  0.0501824
XGBoost_grid_1_AutoML_1_20250818_235405_model_4  0.833544   0.18809   0.332579                0.315052  0.226253  0.0511903
XGBoost_3_AutoML_1_20250818_235405               0.833281   0.188579  0.325574                0.312446  0.226773  0.0514258
XGBoost_grid_1_AutoML_1_20250818_235405_model_1  0.833269   0.188456  0.330874                0.31309   0.226351  0.0512346
XGBoost_grid_1_AutoML_1_20250818_235405_model_6  0.831525   0.189903  0.323041                0.30752   0.227193  0.0516165
XGBoost_2_AutoML_1_20250818_235405               0.830977   0.189128  0.328011                0.312985  0.226665  0.0513769
XGBoost_

#### Predict on test set and log results

In [21]:
# Score the AutoML leader on the test frame.
preds = best_model.predict(test_h2o)
# Align the predicted labels with y_test's dtype before handing them to
# sklearn, so the metrics never end up comparing e.g. '1' against 1 if the
# factor column comes back from H2O as strings. This is a no-op when the
# dtypes already match.
pred_labels = preds.as_data_frame()['predict'].astype(y_test.dtype)

f1 = f1_score(y_test, pred_labels)
accuracy = accuracy_score(y_test, pred_labels)

# Confusion matrix plot, saved through the figure object itself rather than
# pyplot's implicit "current figure".
cm = confusion_matrix(y_test, pred_labels)
fig, ax = plt.subplots(figsize=(5,4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title("Confusion Matrix for H2O AutoML's best model")
cm_path = 'confusion_matrix_aml.png'
fig.savefig(cm_path)
plt.close(fig)

# Log automl's best model results with mlflow
with mlflow.start_run(run_name='H2O_AutoML_Best'):
    # Save the leader to disk and attach the saved file to the run.
    model_path = h2o.save_model(best_model, path='models', force=True)
    mlflow.log_artifact(model_path, 'h2o_aml_model')

    mlflow.log_param('model_type', 'H2O_AutoML_Leader')
    # The baseline run records its model under the 'Model' key; log the same
    # key here as well so both runs line up in one column when compared.
    mlflow.log_param('Model', 'H2O_AutoML_Leader')
    mlflow.log_metric('f1_score', f1)
    mlflow.log_metric('accuracy', accuracy)

    mlflow.log_artifact(cm_path)

print('H2O AutoML results logged.')

xgboost prediction progress: |███████████████████████████████████████████████████| (done) 100%





H2O AutoML results logged.
