# Model Selection
This notebook uses AutoGluon to select the best model for predicting whether a customer will complete an offer, given that they have viewed it.

In [13]:
pip install autogluon bokeh==2.0.1

Collecting autogluon
  Using cached autogluon-0.6.2-py3-none-any.whl (9.8 kB)
Collecting bokeh==2.0.1
  Using cached bokeh-2.0.1-py3-none-any.whl
Collecting autogluon.features==0.6.2
  Using cached autogluon.features-0.6.2-py3-none-any.whl (60 kB)
Collecting autogluon.timeseries[all]==0.6.2
  Using cached autogluon.timeseries-0.6.2-py3-none-any.whl (103 kB)
Collecting autogluon.core[all]==0.6.2
  Using cached autogluon.core-0.6.2-py3-none-any.whl (226 kB)
Collecting autogluon.multimodal==0.6.2
  Using cached autogluon.multimodal-0.6.2-py3-none-any.whl (303 kB)
Collecting autogluon.text==0.6.2
  Using cached autogluon.text-0.6.2-py3-none-any.whl (62 kB)
Collecting autogluon.vision==0.6.2
  Using cached autogluon.vision-0.6.2-py3-none-any.whl (49 kB)
Collecting autogluon.tabular[all]==0.6.2
  Using cached autogluon.tabular-0.6.2-py3-none-any.whl (292 kB)
Collecting autogluon.common==0.6.2
  Using cached autogluon.common-0.6.2-py3-none-any.whl (44 kB)
Collecting scipy<1.10.0,>=1.5.4
  Usi

In [3]:
import os
import pickle
import numpy as np
import pandas as pd
from autogluon.tabular import TabularPredictor
from sklearn.model_selection import train_test_split

In [5]:
import sagemaker

# Set up the SageMaker context used by this notebook.
# `session` is used further down to upload the dataset splits to S3;
# `role` and `region` are not referenced by the other cells visible here.
role = sagemaker.get_execution_role() 
session = sagemaker.Session() 
region = session.boto_region_name
# Default bucket of the session (presumably where upload_data() puts the
# files when no bucket is passed -- confirm against the SageMaker SDK docs).
bucket = session.default_bucket()

# Show the bucket name so the upload destination is visible in the notebook
print(bucket)

sagemaker-us-east-1-256735873794


In [32]:
# Load the processed dataset from the feather file (path is relative to the notebook)
processed_dataset = pd.read_feather('../data/processed/processed_dataset.feather')
# Display the frame as the cell output (pandas truncates the rendered rows/columns)
processed_dataset

Unnamed: 0,age,became_member_on,income,membership_days,gender_F,gender_M,gender_O,age_group,person,offer_id,...,reward,difficulty,duration,email,mobile,social,web,offer_bogo,offer_discount,offer_informational
0,55.0,2017-07-15,112000.0,376,1,0,0,45-54,0610b486422d4921ae7d2bf64640c50b,9b98b8c7a33c4b65b9aebfe6a799e6d9,...,5.0,5.0,7.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0
1,75.0,2017-05-09,100000.0,443,1,0,0,65+,78afa995795e4d85b5d9ceeca43f5fef,5a8bc65990b245e5a138643cd4eb9837,...,0.0,0.0,3.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0
2,75.0,2017-05-09,100000.0,443,1,0,0,65+,78afa995795e4d85b5d9ceeca43f5fef,9b98b8c7a33c4b65b9aebfe6a799e6d9,...,5.0,5.0,7.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0
3,75.0,2017-05-09,100000.0,443,1,0,0,65+,78afa995795e4d85b5d9ceeca43f5fef,ae264e3637204a6fb9bb56bc8210ddfd,...,10.0,10.0,7.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0
4,75.0,2017-05-09,100000.0,443,1,0,0,65+,78afa995795e4d85b5d9ceeca43f5fef,f19421c1d4aa40978ebb69ca19b0e20d,...,5.0,5.0,5.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47011,83.0,2016-03-07,50000.0,871,1,0,0,65+,9dc1421481194dcd9400aec7c9ae6366,4d5c57ea9a6940dd891ad53e9dbe8da0,...,10.0,10.0,5.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
47012,83.0,2016-03-07,50000.0,871,1,0,0,65+,9dc1421481194dcd9400aec7c9ae6366,9b98b8c7a33c4b65b9aebfe6a799e6d9,...,5.0,5.0,7.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0
47013,83.0,2016-03-07,50000.0,871,1,0,0,65+,9dc1421481194dcd9400aec7c9ae6366,ae264e3637204a6fb9bb56bc8210ddfd,...,10.0,10.0,7.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0
47014,62.0,2017-07-22,82000.0,369,1,0,0,55-64,e4052622e5ba45a8b96b59aba68cf068,2298d6c36e964ae4a3e7e9706d1fb8c2,...,3.0,7.0,7.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0


In [33]:
# Prepare the modelling dataset and split it into train / validation / test.
#
# NOTE: this cell rebinds `processed_dataset`. Re-run the loading cell before
# re-running this one, otherwise the `drop` below raises a KeyError because the
# non-training columns have already been removed.
target_column = 'offer_completed_after_view'
non_train_features = ['became_member_on', 'age_group', 'person', 'offer_id', 'offer_viewed']

# Remove the rows whose target is NaN
print(f'Nan features in target: {processed_dataset[target_column].isna().sum()}')
# Take an explicit copy of the filtered rows: assigning a column on the bare
# boolean-mask slice is what triggered pandas' SettingWithCopyWarning.
processed_dataset = processed_dataset[processed_dataset[target_column].notna()].copy()
processed_dataset[target_column] = processed_dataset[target_column].astype(bool)

# Remove the columns that cannot be used as training features
processed_dataset = processed_dataset.drop(columns=non_train_features)

# Set the target column as the first since it is how Sagemaker training expects it
column_order = [target_column] + [col for col in processed_dataset.columns if col != target_column]
processed_dataset = processed_dataset[column_order]

# Define the train, validation and test size ratios
train_ratio = 0.7
validation_ratio = 0.15
test_ratio = 0.15

# Split the dataset into train (70%) and a temporary hold-out (30%)
train_dataset, temp_dataset = train_test_split(processed_dataset, test_size=1 - train_ratio, random_state=42)

# Share of the hold-out that goes to the test set (0.15 / 0.30 = 0.5)
val_test_ratio = test_ratio / (test_ratio + validation_ratio)

# Split the hold-out (temp_dataset) into validation (15%) and test (15%)
val_dataset, test_dataset = train_test_split(temp_dataset, test_size=val_test_ratio, random_state=42)

# Sanity check: the three splits must partition the cleaned dataset
assert len(train_dataset) + len(val_dataset) + len(test_dataset) == len(processed_dataset)

print('Training set:', train_dataset.shape)
print('Validation set:', val_dataset.shape)
print('Test set:', test_dataset.shape)

Nan features in target: 80
Training set: (32855, 17)
Validation set: (7040, 17)
Test set: (7041, 17)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [None]:
# Save the dataset splits locally as CSV and upload them to S3
prefix = 'data'
data_dir = '../data/processed'

train_dataset_path = os.path.join(data_dir, 'train.csv')
val_dataset_path = os.path.join(data_dir, 'validation.csv')
test_dataset_path = os.path.join(data_dir, 'test.csv')

# The files are written without index and without a header row, so the column
# names are not stored in the CSVs (they are printed at the end of this cell).
train_dataset.to_csv(train_dataset_path, index=False, header=False)
val_dataset.to_csv(val_dataset_path, index=False, header=False)
test_dataset.to_csv(test_dataset_path, index=False, header=False)

# Upload train.csv, validation.csv and test.csv to S3 under `prefix` using session.upload_data()
train_location = session.upload_data(train_dataset_path, key_prefix=prefix)
val_location = session.upload_data(val_dataset_path, key_prefix=prefix)
test_location = session.upload_data(test_dataset_path, key_prefix=prefix)

# Print the upload locations: a bare tuple expression in the middle of a cell
# is not displayed (only the last expression of a cell is).
print(train_location, val_location, test_location)

# Save columns since we will not have them available in S3
print(train_dataset.columns.tolist())

['offer_completed_after_view', 'age', 'income', 'membership_days', 'gender_F', 'gender_M', 'gender_O', 'reward', 'difficulty', 'duration', 'email', 'mobile', 'social', 'web', 'offer_bogo', 'offer_discount', 'offer_informational']


In [None]:
# AutoGluon predictor: the constructor and fit arguments are kept in two
# separate dicts so the model definition and the training run can be read apart.
predictor_settings = dict(
    label='offer_completed_after_view',
    problem_type='binary',
    # Use PR_AUC since it is an unbalanced binary classification problem
    eval_metric='average_precision',
)

fit_settings = dict(
    train_data=train_dataset,
    tuning_data=val_dataset,
    presets='best_quality',
    time_limit=30 * 60,  # 30 minutes of time limit
    use_bag_holdout=True,
    verbosity=2,
)

# Build the predictor and train it in a single expression
predictor = TabularPredictor(**predictor_settings).fit(**fit_settings)

In [None]:
# Note: the `email` feature is dropped because it is constant

In [27]:
# Persist the fitted predictor object to disk so it can be reloaded without retraining
with open('../data/predictor.pickle', 'wb') as predictor_file:
    pickle.dump(predictor, predictor_file, protocol=pickle.HIGHEST_PROTOCOL)

In [6]:
# Reload the previously pickled predictor.
# CAUTION: pickle.load can execute arbitrary code; only load files written by this notebook.
with open('../data/predictor.pickle', 'rb') as predictor_file:
    predictor = pickle.load(predictor_file)

In [10]:
# View the summary of the fit: prints the per-model leaderboard shown below and
# keeps the returned summary in `fit_summary`.
# NOTE(review): show_plot=True presumably relies on bokeh, which would explain
# the bokeh pin in the install cell -- confirm against the AutoGluon docs.
fit_summary = predictor.fit_summary(show_plot=True)

*** Summary of fit() ***
Estimated performance of each model:
                      model  score_val  pred_time_val    fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0       WeightedEnsemble_L2   0.641629       4.227954  647.191162                0.002674           3.095848            2       True         14
1    NeuralNetFastAI_BAG_L1   0.639141       2.107751  539.882275                2.107751         539.882275            1       True         10
2         LightGBMXT_BAG_L1   0.637496       1.073841   56.094465                1.073841          56.094465            1       True          3
3           LightGBM_BAG_L1   0.636824       0.454645   48.013108                0.454645          48.013108            1       True          4
4           CatBoost_BAG_L1   0.635632       0.204501  172.661479                0.204501         172.661479            1       True          7
5      LightGBMLarge_BAG_L1   0.635194       0.619803   52.933703         



The best models are a weighted ensemble, a neural network, and LightGBM with the extra-trees parameter (extremely randomized trees). For simplicity and interpretability we will use LightGBM, which is also available in SageMaker.