# Modeling Pipeline and AutoML
## Load data and packages

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import mlflow
import mlflow.sklearn

import h2o
from h2o.automl import H2OAutoML

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import confusion_matrix

In [3]:
# Load the processed train and test tables from disk.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('data/processed/train_data.csv', 'data/processed/test_data.csv')
)

In [4]:
# Remove columns that are not used downstream: 'Unnamed: 0' (presumably the
# index written out when the CSV was saved — confirm) and 'product_name'.
# errors='ignore' keeps this cell idempotent: the frames are reassigned in
# place, so re-running the cell would otherwise raise KeyError because the
# columns are already gone.
unused_cols = ['Unnamed: 0', 'product_name']
train_data = train_data.drop(columns=unused_cols, errors='ignore')
test_data = test_data.drop(columns=unused_cols, errors='ignore')

In [5]:
# Separate predictors from the `reordered` target; the identifier columns
# (`product_id`, `user_id`) are excluded from the feature matrices.
non_feature_cols = ['reordered', 'product_id', 'user_id']

X_train, y_train = train_data.drop(columns=non_feature_cols), train_data['reordered']
X_test, y_test = test_data.drop(columns=non_feature_cols), test_data['reordered']

## Preprocessing

In [7]:
# Columns handled as categorical; every other feature column is treated as numeric.
categorical_cols = ['aisle_id', 'department_id']
numeric_cols = list(filter(lambda name: name not in categorical_cols, X_train.columns))

In [8]:
# Transformers

# Numeric features: fill missing values with the column mean, then standardize.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical features: fill missing values with the most frequent category,
# then map categories to integer codes.
# OrdinalEncoder raises on categories it did not see during fit by default;
# encoding them as -1 instead lets the fitted pipeline score rows whose
# aisle/department id was absent from the training data.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols)
])

## Modeling

Since our model predicts `reordered`, a binary variable, we use the F1 score as our main metric because it balances precision and recall. We also report accuracy.

### Baseline Model (Random Forest)

In [10]:
# Baseline model: shared preprocessing followed by a class-weighted random forest.
baseline_steps = [
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(class_weight='balanced', random_state=32021)),
]
rf_pipeline = Pipeline(steps=baseline_steps)

In [11]:
# Fit the baseline pipeline, score it on the test set, and record the
# parameters, metrics, fitted model and confusion-matrix image in one MLflow run.
with mlflow.start_run(run_name='Baseline_RandomForest_Balanced'):
    rf_pipeline.fit(X_train, y_train)
    y_pred = rf_pipeline.predict(X_test)

    f1 = f1_score(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)

    # Same 'model_type' key as the H2O AutoML run in this notebook, so both
    # runs land in a single param column when compared in MLflow.
    mlflow.log_param('model_type', 'RandomForest Balanced Data')
    mlflow.log_metric('f1_score', f1)
    mlflow.log_metric('accuracy', accuracy)
    mlflow.sklearn.log_model(rf_pipeline, 'baseline_rf_pipeline_balanced')

    # Confusion matrix plot, written to disk so it can be attached as an artifact.
    cm = confusion_matrix(y_test, y_pred)
    fig, ax = plt.subplots(figsize=(5,4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    ax.set_title('Confusion Matrix for Baseline RandomForest with Balanced Data')
    cm_path = 'confusion_matrix_rf_balanced.png'
    # Save through the figure handle rather than pyplot's implicit "current figure".
    fig.savefig(cm_path)
    plt.close(fig)
    mlflow.log_artifact(cm_path)

print('Baseline RF logged.')



Baseline RF logged.


### H2O AutoML

In [13]:
# Connect to a local H2O cluster; as the output below shows, the client starts
# a local server when no running instance is found.
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: java version "13" 2019-09-17; Java(TM) SE Runtime Environment (build 13+33); Java HotSpot(TM) 64-Bit Server VM (build 13+33, mixed mode, sharing)
  Starting server from /opt/anaconda3/envs/MLOps/lib/python3.13/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/kf/8ms7_h8j11bf86d6c5ykldrw0000gn/T/tmpcmazrn_h
  JVM stdout: /var/folders/kf/8ms7_h8j11bf86d6c5ykldrw0000gn/T/tmpcmazrn_h/h2o_quynhanhnd2402_started_from_python.out
  JVM stderr: /var/folders/kf/8ms7_h8j11bf86d6c5ykldrw0000gn/T/tmpcmazrn_h/h2o_quynhanhnd2402_started_from_python.err
  Server is running at http://127.0.0.1:54321
 successful.o H2O server at http://127.0.0.1:54321 ...
Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html


0,1
H2O_cluster_uptime:,03 secs
H2O_cluster_timezone:,America/Chicago
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.7
H2O_cluster_version_age:,4 months and 24 days
H2O_cluster_name:,H2O_from_python_quynhanhnd2402_xtprzf
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,2 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [14]:
# Rebuild full tables (features + target) and hand them to H2O.
train_h2o = h2o.H2OFrame(pd.concat([X_train, y_train], axis=1))
test_h2o = h2o.H2OFrame(pd.concat([X_test, y_test], axis=1))

# Mark the target and the id-coded columns as factors in both frames so H2O
# treats them as categorical rather than numeric.
for frame in (train_h2o, test_h2o):
    for factor_col in ('reordered', 'aisle_id', 'department_id'):
        frame[factor_col] = frame[factor_col].asfactor()

# Hold out part of the training frame for validation (seeded 0.8 split ratio).
train, valid = train_h2o.split_frame(ratios=[0.8], seed=32021)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [15]:
# Response column and predictor columns passed to AutoML.
target = 'reordered'
features = [*numeric_cols, *categorical_cols]

In [16]:
# AutoML settings: at most 20 models, fixed seed, stacked ensembles and deep
# learning excluded, nfolds=0, with the held-out `valid` split passed as the
# validation frame.
automl_settings = dict(
    max_models=20,
    seed=32021,
    exclude_algos=['StackedEnsemble', 'DeepLearning'],
    nfolds=0,
)
aml = H2OAutoML(**automl_settings)
aml.train(x=features, y=target, training_frame=train, validation_frame=valid)

AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%


Unnamed: 0,number_of_trees
,950.0

Unnamed: 0,0,1,Error,Rate
0,363681.0,17800.0,0.0467,(17800.0/381481.0)
1,9094.0,17539.0,0.3415,(9094.0/26633.0)
Total,372775.0,35339.0,0.0659,(26894.0/408114.0)

metric,threshold,value,idx
max f1,0.7448952,0.5660298,76.0
max f2,0.6160068,0.6775733,120.0
max f0point5,0.8572871,0.5719385,38.0
max accuracy,0.867541,0.9464292,34.0
max precision,0.9914,1.0,0.0
max recall,0.0831171,1.0,337.0
max specificity,0.9914,1.0,0.0
max absolute_mcc,0.7106549,0.5395199,87.0
max min_per_class_accuracy,0.5695906,0.8822169,136.0
max mean_per_class_accuracy,0.5068911,0.8871452,158.0

group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
1,0.0100021,0.9377016,12.4030485,12.4030485,0.8094072,0.9583445,0.8094072,0.9583445,0.1240566,0.1240566,1140.3048511,1140.3048511,0.1220172
2,0.0200018,0.9081735,10.1644308,11.2838768,0.6633178,0.9227257,0.7363714,0.9405373,0.1016408,0.2256974,916.4430836,1028.3876793,0.2200563
3,0.0300014,0.8809353,9.1581259,10.575351,0.5976476,0.8944638,0.6901339,0.9251807,0.0915781,0.3172756,815.8125899,957.5351034,0.3073301
4,0.0400011,0.8547484,7.7425402,9.8671917,0.5052683,0.8674936,0.6439204,0.9107598,0.0774227,0.3946983,674.2540223,886.7191712,0.3794603
5,0.0500007,0.8301569,6.6611379,9.2260124,0.4346974,0.8423419,0.6020778,0.8970769,0.0666091,0.4613074,566.1137903,822.6012373,0.4400219
6,0.1000015,0.7125741,5.0230178,7.1245151,0.3277957,0.7707225,0.4649368,0.8338997,0.2511546,0.712462,402.3017806,612.451509,0.6552193
7,0.1499998,0.6043082,2.7913697,5.6801805,0.1821612,0.6576616,0.3706813,0.7751556,0.1395637,0.8520257,179.1369739,468.0180495,0.7510377
8,0.2000005,0.5074584,1.4290332,4.6173806,0.0932569,0.5546571,0.3013244,0.7200303,0.0714527,0.9234784,42.9033172,361.7380643,0.7739873
9,0.2999995,0.3509789,0.5617147,3.265503,0.0366568,0.4254288,0.2131026,0.6218306,0.0561709,0.9796493,-43.8285337,226.5503025,0.7270994
10,0.400001,0.2364938,0.1520647,2.4871339,0.0099236,0.2907452,0.1623072,0.5390582,0.0152067,0.994856,-84.7935251,148.713392,0.6363847

Unnamed: 0,0,1,Error,Rate
0,89085.0,6015.0,0.0632,(6015.0/95100.0)
1,3554.0,3059.0,0.5374,(3554.0/6613.0)
Total,92639.0,9074.0,0.0941,(9569.0/101713.0)

metric,threshold,value,idx
max f1,0.7208742,0.3900045,79.0
max f2,0.5179457,0.4798408,149.0
max f0point5,0.85326,0.402683,37.0
max accuracy,0.9294689,0.9373138,14.0
max precision,0.9907848,0.875,0.0
max recall,0.000283,1.0,399.0
max specificity,0.9907848,0.9999685,0.0
max absolute_mcc,0.7208742,0.3453975,79.0
max min_per_class_accuracy,0.3911641,0.760623,196.0
max mean_per_class_accuracy,0.408304,0.761635,189.0

group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
1,0.0100086,0.9294897,9.4127865,9.4127865,0.6119843,0.9534547,0.6119843,0.9534547,0.0942084,0.0942084,841.2786537,841.2786537,0.0900549
2,0.0200073,0.8972503,7.0929979,8.2534622,0.4611603,0.9128294,0.5366093,0.933152,0.0709209,0.1651293,609.2997896,725.3462189,0.1552134
3,0.030006,0.8686361,6.1250835,7.5442351,0.3982301,0.882362,0.490498,0.9162276,0.061243,0.2263723,512.5083471,654.4235073,0.2100211
4,0.0400047,0.8408852,5.1420454,6.9438352,0.3343166,0.8542513,0.4514623,0.9007373,0.0514139,0.2777862,414.2045383,594.3835242,0.2543161
5,0.0500034,0.8157306,4.5975935,6.4746792,0.2989184,0.8278759,0.4209595,0.8861679,0.0459701,0.3237562,359.7593519,547.467916,0.2927888
6,0.1000069,0.6953411,3.3265516,4.9006154,0.21628,0.7543129,0.3186197,0.8202404,0.166339,0.4900953,232.6551647,390.0615403,0.4172141
7,0.1500005,0.5940122,2.1959559,3.9991804,0.1427729,0.6429488,0.2600118,0.7611509,0.1097838,0.599879,119.5955852,299.9180402,0.4811619
8,0.2000039,0.5028522,1.5392862,3.3841766,0.1000786,0.5470529,0.2200265,0.7076238,0.0769696,0.6768486,53.9286171,238.4176614,0.5100032
9,0.300001,0.353024,1.0721623,2.6135304,0.069708,0.424868,0.169922,0.6133749,0.1072131,0.7840617,7.2162275,161.3530424,0.517721
10,0.399998,0.241352,0.7304011,2.1427597,0.047488,0.2941726,0.1393142,0.5335763,0.073038,0.8570997,-26.9598902,114.2759664,0.4888872

Unnamed: 0,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_auc,training_pr_auc,training_lift,training_classification_error,validation_rmse,validation_logloss,validation_auc,validation_pr_auc,validation_lift,validation_classification_error
,2025-08-20 19:02:48,0.009 sec,0.0,0.5,0.6931472,0.5,0.0652587,1.0,0.9347413,0.5,0.6931472,0.5,0.0650163,1.0,0.9349837
,2025-08-20 19:02:49,1.470 sec,5.0,0.4342310,0.5615538,0.7949805,0.2402775,6.6968517,0.1181043,0.4342583,0.5615934,0.7895348,0.2306444,6.3696594,0.1226195
,2025-08-20 19:02:50,2.622 sec,10.0,0.4265230,0.5410358,0.8079150,0.2564837,7.1133350,0.1132478,0.4264283,0.5408048,0.8019098,0.2490667,6.7687454,0.1114312
,2025-08-20 19:02:51,3.717 sec,15.0,0.4222904,0.5303542,0.8158948,0.2703884,7.5454381,0.1137746,0.4225003,0.5306705,0.8078438,0.2622701,7.2371184,0.1078623
,2025-08-20 19:02:53,4.820 sec,20.0,0.4196822,0.5236869,0.8211285,0.2803916,7.8570159,0.1082590,0.4201081,0.5244145,0.8120444,0.2730507,7.6148385,0.1066924
,2025-08-20 19:02:54,6.059 sec,25.0,0.4180131,0.5197145,0.8242925,0.2856386,8.0032526,0.1114493,0.4187498,0.5211256,0.8140637,0.2763296,7.6299474,0.1149411
,2025-08-20 19:02:55,7.211 sec,30.0,0.4168568,0.5172191,0.8267617,0.2887827,8.1310542,0.1056028,0.4177922,0.5190549,0.8155406,0.2779580,7.5392945,0.1118736
,2025-08-20 19:02:56,8.275 sec,35.0,0.4151652,0.5133580,0.8292165,0.2941624,8.2812122,0.1080286,0.4162470,0.5154993,0.8168217,0.2815014,7.9321235,0.1151377
,2025-08-20 19:02:57,9.359 sec,40.0,0.4144179,0.5114126,0.8316694,0.2983929,8.4050925,0.1069799,0.4156060,0.5137859,0.8183680,0.2843816,7.9472323,0.1146264
,2025-08-20 19:02:58,10.526 sec,45.0,0.4137660,0.5097859,0.8335788,0.3012642,8.5252189,0.1070828,0.4150759,0.5124200,0.8188630,0.2860295,7.9321235,0.1141349

variable,relative_importance,scaled_importance,percentage
times_bought_by_user,196662.3750000,1.0,0.1779123
num_orders,151453.6093750,0.7701199,0.1370138
user_reorder_prop,130255.1640625,0.6623288,0.1178364
avg_days_since_prior_order,111143.2187500,0.5651473,0.1005466
num_items,108882.3125000,0.5536510,0.0985013
last_order_number,89529.0156250,0.4552422,0.0809932
product_reorder_prop,83566.0937500,0.4249216,0.0755988
product_total_orders,54696.7265625,0.2781250,0.0494819
avg_add_to_cart_order,54422.9921875,0.2767331,0.0492342
avg_user_product_position,43611.2226562,0.2217568,0.0394533


#### Insights and best model

In [18]:
# Grab the leaderboard and the leading model, then print the top ten rows
# followed by the leader's summary.
lb = aml.leaderboard
best_model = aml.leader

print(lb.head(rows=10))
print(best_model)

model_id                                              auc    logloss     aucpr    mean_per_class_error      rmse        mse
XGBoost_3_AutoML_1_20250820_184414               0.840823   0.376077  0.340135                0.300338  0.348989  0.121794
XGBoost_grid_1_AutoML_1_20250820_184414_model_2  0.839789   0.185635  0.345569                0.293925  0.225061  0.0506526
XGBoost_grid_1_AutoML_1_20250820_184414_model_5  0.837796   0.186441  0.341477                0.301487  0.225301  0.0507605
XGBoost_grid_1_AutoML_1_20250820_184414_model_6  0.831625   0.189317  0.321966                0.311297  0.227037  0.0515459
XGBoost_grid_1_AutoML_1_20250820_184414_model_4  0.823079   0.193015  0.299415                0.319264  0.229271  0.0525651
XGBoost_grid_1_AutoML_1_20250820_184414_model_3  0.822749   0.261164  0.299321                0.322373  0.274199  0.0751849
GBM_grid_1_AutoML_1_20250820_184414_model_3      0.821725   0.194081  0.294721                0.325819  0.230053  0.0529246
XGBoost_2

#### Predict on test set and log results

In [20]:
# Score the AutoML leader on the test frame and pull the predicted labels into pandas.
preds = best_model.predict(test_h2o)
pred_labels = preds.as_data_frame()['predict']

accuracy = accuracy_score(y_test, pred_labels)
f1 = f1_score(y_test, pred_labels)

# Draw the confusion matrix and write it to disk so it can be attached to the run.
# NOTE(review): the "Balanced" wording here mirrors the baseline run's naming;
# the AutoML call in this notebook does not pass balance_classes — confirm the
# label is intended.
cm_path = 'confusion_matrix_aml_balanced.png'
cm = confusion_matrix(y_test, pred_labels)
fig, ax = plt.subplots(figsize=(5, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set(
    xlabel='Predicted',
    ylabel='Actual',
    title="Confusion Matrix for H2O AutoML's best model with Balanced data",
)
plt.savefig(cm_path)
plt.close(fig)

# Record the saved H2O model, the run parameter, the metrics and the plot in MLflow.
with mlflow.start_run(run_name='H2O_AutoML_Balanced'):
    # Persist the model under 'models' first, then attach the saved file to the run.
    model_path = h2o.save_model(best_model, path='models', force=True)
    mlflow.log_artifact(model_path, 'h2o_aml_model_balanced')

    mlflow.log_param('model_type', 'H2O_AutoML_Leader_Balanced')
    for metric_name, metric_value in (('f1_score', f1), ('accuracy', accuracy)):
        mlflow.log_metric(metric_name, metric_value)

    mlflow.log_artifact(cm_path)

print('H2O AutoML results logged.')

xgboost prediction progress: |███████████████████████████████████████████████████| (done) 100%





H2O AutoML results logged.
