# Capstone Model Development with AutoGluon Template: Model Selection

### Open up Sagemaker Studio and use starter template

1. Notebook should be using a `ml.t3.medium` instance (2 vCPU + 4 GiB)
2. Notebook should be using kernal: `Python 3 (MXNet 1.8 Python 3.7 CPU Optimized)`

### Install packages

In [1]:
!pip install -U pip
!pip install -U pydantic==1.10.2
!pip install -U setuptools wheel
!pip install -U "mxnet<2.0.0" bokeh==2.0.1
!pip install autogluon --no-cache-dir
# Without --no-cache-dir, smaller aws instances may have trouble installing

[0m

In [2]:
import pandas as pd
from tqdm import tqdm
#!pip install ipywidgets
from autogluon.tabular import TabularPredictor

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Create the train dataset in pandas by reading the csv
# Set the parsing of the datetime column so you can use some of the `dt` features in pandas later
local_data_path = "cleaned.csv"
dataset = pd.read_csv(local_data_path)
dataset.head()

Unnamed: 0.1,Unnamed: 0,age_upon_outcome,animal_id_outcome,date_of_birth,outcome_type,sex_upon_outcome,age_upon_outcome_(days),age_upon_outcome_(years),age_upon_outcome_age_group,outcome_datetime,...,age_upon_intake_(days),age_upon_intake_(years),age_upon_intake_age_group,intake_datetime,intake_month,intake_year,intake_weekday,intake_hour,intake_number,time_in_shelter_days
0,0,10 years,A006100,2007-07-09 00:00:00,Not Adopted,Neutered Male,3650,10.0,"(7.5, 10.0]",2017-12-07 14:07:00,...,3650,10.0,"(7.5, 10.0]",2017-12-07 00:00:00,12,2017,Thursday,14,1.0,0.588194
1,1,7 years,A006100,2007-07-09 00:00:00,Not Adopted,Neutered Male,2555,7.0,"(5.0, 7.5]",2014-12-20 16:35:00,...,2555,7.0,"(5.0, 7.5]",2014-12-19 10:21:00,12,2014,Friday,10,2.0,1.259722
2,2,6 years,A006100,2007-07-09 00:00:00,Not Adopted,Neutered Male,2190,6.0,"(5.0, 7.5]",2014-03-08 17:10:00,...,2190,6.0,"(5.0, 7.5]",2014-03-07 14:26:00,3,2014,Friday,14,3.0,1.113889
3,3,10 years,A047759,2004-04-02 00:00:00,Not Adopted,Neutered Male,3650,10.0,"(7.5, 10.0]",2014-04-07 15:12:00,...,3650,10.0,"(7.5, 10.0]",2014-04-02 15:55:00,4,2014,Wednesday,15,1.0,4.970139
4,4,16 years,A134067,1997-10-16 00:00:00,Not Adopted,Neutered Male,5840,16.0,"(15.0, 17.5]",2013-11-16 11:54:00,...,5840,16.0,"(15.0, 17.5]",2013-11-16 09:02:00,11,2013,Saturday,9,1.0,0.119444


In [6]:
# Simple output of the train dataset to view some of the min/max/varition of the dataset features.
dataset.describe()

Unnamed: 0.1,Unnamed: 0,age_upon_outcome_(days),age_upon_outcome_(years),outcome_month,outcome_year,outcome_hour,outcome_number,dob_year,dob_month,age_upon_intake_(days),age_upon_intake_(years),intake_month,intake_year,intake_hour,intake_number,time_in_shelter_days
count,79672.0,79672.0,79672.0,79672.0,79672.0,79672.0,79672.0,79672.0,79672.0,79672.0,79672.0,79672.0,79672.0,79672.0,79672.0,79672.0
mean,39835.5,782.046127,2.142592,6.655425,2015.472563,14.297306,1.12682,2013.25487,6.31031,769.341701,2.107785,6.584032,2015.436101,13.487022,1.12682,16.757116
std,22999.469661,1058.528519,2.900078,3.414284,1.305944,3.774317,0.456057,3.216517,3.289077,1056.00904,2.893175,3.366579,1.303157,3.121173,0.456057,41.679359
min,0.0,0.0,0.0,1.0,2013.0,0.0,1.0,1991.0,1.0,0.0,0.0,1.0,2013.0,0.0,1.0,0.0
25%,19917.75,90.0,0.246575,4.0,2014.0,12.0,1.0,2012.0,4.0,60.0,0.164384,4.0,2014.0,11.0,1.0,1.102083
50%,39835.5,365.0,1.0,7.0,2015.0,15.0,1.0,2014.0,6.0,365.0,1.0,7.0,2015.0,13.0,1.0,4.987153
75%,59753.25,1095.0,3.0,10.0,2017.0,17.0,1.0,2015.0,9.0,1095.0,3.0,10.0,2017.0,16.0,1.0,13.610764
max,79671.0,9125.0,25.0,12.0,2018.0,23.0,13.0,2018.0,12.0,9125.0,25.0,12.0,2018.0,23.0,13.0,1606.194444


In [4]:
from sklearn.model_selection import train_test_split

In [5]:
# Create the test pandas dataframe in pandas by reading the csv, remember to parse the datetime!
df_train, df_test = train_test_split(dataset, test_size=0.2, random_state=0)

In [6]:
df_train.dropna(inplace=True)

Requirements:
* We are prediting `count`, so it is the label we are setting.
* Ignore `casual` and `registered` columns as they are also not present in the test dataset. 
* Use the `root_mean_squared_error` as the metric to use for evaluation.
* Set a time limit of 10 minutes (600 seconds).
* Use the preset `best_quality` to focus on creating the best model.

In [18]:
predictor = TabularPredictor(label="outcome_type", eval_metric='root_mean_squared_error').fit(train_data=df_train, time_limit=600, presets="best_quality")

No path specified. Models will be saved in: "AutogluonModels/ag-20230317_021917/"
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=0, num_bag_folds=8, num_bag_sets=20
Beginning AutoGluon training ... Time limit = 600s
AutoGluon will save models to "AutogluonModels/ag-20230317_021917/"
AutoGluon Version:  0.7.0
Python Version:     3.8.10
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Wed Feb 1 21:34:38 UTC 2023
Train Data Rows:    63729
Train Data Columns: 35
Label Column: outcome_type
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  ['Adopted', 'Not Adopted']
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = Not Adopte

### Review AutoGluon's training run with ranking of models that did the best.

In [19]:
predictor.fit_summary()

*** Summary of fit() ***
Estimated performance of each model:
                     model  score_val  pred_time_val    fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0      WeightedEnsemble_L2  -0.333263      52.387286  534.831772                0.097580           8.677265            2       True          7
1        LightGBMXT_BAG_L1  -0.341656       6.598824  124.240484                6.598824         124.240484            1       True          1
2          LightGBM_BAG_L1  -0.343237       5.760779   96.055327                5.760779          96.055327            1       True          2
3  RandomForestGini_BAG_L1  -0.352328      17.227569   60.343557               17.227569          60.343557            1       True          3
4  RandomForestEntr_BAG_L1  -0.353395      16.935670   59.031128               16.935670          59.031128            1       True          4
5          CatBoost_BAG_L1  -0.354304       0.918040  165.882312                

{'model_types': {'LightGBMXT_BAG_L1': 'StackerEnsembleModel_LGB',
  'LightGBM_BAG_L1': 'StackerEnsembleModel_LGB',
  'RandomForestGini_BAG_L1': 'StackerEnsembleModel_RF',
  'RandomForestEntr_BAG_L1': 'StackerEnsembleModel_RF',
  'CatBoost_BAG_L1': 'StackerEnsembleModel_CatBoost',
  'ExtraTreesGini_BAG_L1': 'StackerEnsembleModel_XT',
  'WeightedEnsemble_L2': 'WeightedEnsembleModel'},
 'model_performance': {'LightGBMXT_BAG_L1': -0.34165574467337356,
  'LightGBM_BAG_L1': -0.3432365910953746,
  'RandomForestGini_BAG_L1': -0.3523279852346311,
  'RandomForestEntr_BAG_L1': -0.3533952439767361,
  'CatBoost_BAG_L1': -0.3543043150274465,
  'ExtraTreesGini_BAG_L1': -0.4119692372238927,
  'WeightedEnsemble_L2': -0.33326271435722216},
 'model_best': 'WeightedEnsemble_L2',
 'model_paths': {'LightGBMXT_BAG_L1': 'AutogluonModels/ag-20230317_021917/models/LightGBMXT_BAG_L1/',
  'LightGBM_BAG_L1': 'AutogluonModels/ag-20230317_021917/models/LightGBM_BAG_L1/',
  'RandomForestGini_BAG_L1': 'AutogluonModels

In [7]:
predictor2 = TabularPredictor(label="outcome_type", eval_metric='accuracy').fit(train_data=df_train, time_limit=600, presets="best_quality")

No path specified. Models will be saved in: "AutogluonModels/ag-20230317_023946/"
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=0, num_bag_folds=8, num_bag_sets=20
Beginning AutoGluon training ... Time limit = 600s
AutoGluon will save models to "AutogluonModels/ag-20230317_023946/"
AutoGluon Version:  0.7.0
Python Version:     3.8.10
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Wed Feb 1 21:34:38 UTC 2023
Train Data Rows:    63729
Train Data Columns: 35
Label Column: outcome_type
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  ['Adopted', 'Not Adopted']
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = Not Adopte

In [8]:
predictor2.fit_summary()

*** Summary of fit() ***
Estimated performance of each model:
                     model  score_val  pred_time_val    fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0      WeightedEnsemble_L2   0.888512      45.528670  331.754561                0.102317          11.900229            2       True          7
1        LightGBMXT_BAG_L1   0.882785       7.770231  124.832316                7.770231         124.832316            1       True          1
2          LightGBM_BAG_L1   0.881247       3.155086   70.849587                3.155086          70.849587            1       True          2
3  RandomForestGini_BAG_L1   0.875724      19.127375   63.447088               19.127375          63.447088            1       True          3
4          CatBoost_BAG_L1   0.874798       0.889215  180.353478                0.889215         180.353478            1       True          5
5  RandomForestEntr_BAG_L1   0.873668      15.373661   60.725343               1

{'model_types': {'LightGBMXT_BAG_L1': 'StackerEnsembleModel_LGB',
  'LightGBM_BAG_L1': 'StackerEnsembleModel_LGB',
  'RandomForestGini_BAG_L1': 'StackerEnsembleModel_RF',
  'RandomForestEntr_BAG_L1': 'StackerEnsembleModel_RF',
  'CatBoost_BAG_L1': 'StackerEnsembleModel_CatBoost',
  'ExtraTreesGini_BAG_L1': 'StackerEnsembleModel_XT',
  'WeightedEnsemble_L2': 'WeightedEnsembleModel'},
 'model_performance': {'LightGBMXT_BAG_L1': 0.8827849173845502,
  'LightGBM_BAG_L1': 0.8812471559258737,
  'RandomForestGini_BAG_L1': 0.8757237678294026,
  'RandomForestEntr_BAG_L1': 0.8736681887366818,
  'CatBoost_BAG_L1': 0.8747979726655055,
  'ExtraTreesGini_BAG_L1': 0.8278177909585903,
  'WeightedEnsemble_L2': 0.8885122942459477},
 'model_best': 'WeightedEnsemble_L2',
 'model_paths': {'LightGBMXT_BAG_L1': 'AutogluonModels/ag-20230317_023946/models/LightGBMXT_BAG_L1/',
  'LightGBM_BAG_L1': 'AutogluonModels/ag-20230317_023946/models/LightGBM_BAG_L1/',
  'RandomForestGini_BAG_L1': 'AutogluonModels/ag-20230

In [7]:
predictor3 = TabularPredictor(label="outcome_type", eval_metric='f1_macro').fit(train_data=df_train, time_limit=600, presets="best_quality")

No path specified. Models will be saved in: "AutogluonModels/ag-20230317_030146/"
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=0, num_bag_folds=8, num_bag_sets=20
Beginning AutoGluon training ... Time limit = 600s
AutoGluon will save models to "AutogluonModels/ag-20230317_030146/"
AutoGluon Version:  0.7.0
Python Version:     3.8.10
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Wed Feb 1 21:34:38 UTC 2023
Train Data Rows:    63729
Train Data Columns: 35
Label Column: outcome_type
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  ['Adopted', 'Not Adopted']
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = Not Adopte

In [8]:
predictor3.fit_summary()

*** Summary of fit() ***
Estimated performance of each model:
                     model  score_val  pred_time_val    fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0      WeightedEnsemble_L2   0.887252      52.052655  485.267195                0.112653          19.100602            2       True          6
1        LightGBMXT_BAG_L1   0.883520      12.300206  235.646189               12.300206         235.646189            1       True          1
2          LightGBM_BAG_L1   0.880077       4.887456  105.332001                4.887456         105.332001            1       True          2
3  RandomForestGini_BAG_L1   0.872277      19.455845   65.872528               19.455845          65.872528            1       True          3
4  RandomForestEntr_BAG_L1   0.871489      15.296495   59.315874               15.296495          59.315874            1       True          4
5          CatBoost_BAG_L1   0.859721       0.796608   61.815477                

{'model_types': {'LightGBMXT_BAG_L1': 'StackerEnsembleModel_LGB',
  'LightGBM_BAG_L1': 'StackerEnsembleModel_LGB',
  'RandomForestGini_BAG_L1': 'StackerEnsembleModel_RF',
  'RandomForestEntr_BAG_L1': 'StackerEnsembleModel_RF',
  'CatBoost_BAG_L1': 'StackerEnsembleModel_CatBoost',
  'WeightedEnsemble_L2': 'WeightedEnsembleModel'},
 'model_performance': {'LightGBMXT_BAG_L1': 0.8835199131260898,
  'LightGBM_BAG_L1': 0.8800770086999878,
  'RandomForestGini_BAG_L1': 0.8722771955945687,
  'RandomForestEntr_BAG_L1': 0.8714892895478014,
  'CatBoost_BAG_L1': 0.8597211926347066,
  'WeightedEnsemble_L2': 0.8872521914315419},
 'model_best': 'WeightedEnsemble_L2',
 'model_paths': {'LightGBMXT_BAG_L1': 'AutogluonModels/ag-20230317_030146/models/LightGBMXT_BAG_L1/',
  'LightGBM_BAG_L1': 'AutogluonModels/ag-20230317_030146/models/LightGBM_BAG_L1/',
  'RandomForestGini_BAG_L1': 'AutogluonModels/ag-20230317_030146/models/RandomForestGini_BAG_L1/',
  'RandomForestEntr_BAG_L1': 'AutogluonModels/ag-2023031