In [1]:
# Load IPython's autoreload extension; mode 2 picks up edits to imported modules live,
# so helper code can be changed without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import warnings
import pandas as pd
from autogluon import TabularPrediction as task
# NOTE(review): this suppresses every warning category for the rest of the session,
# which keeps the output clean but can also hide warnings that matter.
warnings.filterwarnings('ignore')
# One-time setup if the packages are missing (pip expects space-separated names, not commas):
# %pip install mxnet mxboard autogluon

In [3]:
# Absolute locations used throughout the notebook. Both are resolved against the
# current working directory at the time this cell runs.
MODEL_DIR = os.path.abspath('models')  # where the fitted AutoGluon models are stored
DATA_DIR = os.path.abspath('../../Data/home_credit_default_risk')  # input CSV and submission output

# Automated Modeling with *autogluon*
* The goal is to automate model selection and training with [AutoGluon](https://auto.gluon.ai/stable/index.html#).
* *AutoGluon* makes it easy to automatically experiment with a variety of algorithms, from tree ensembles to deep learning and even model stacking.

In [4]:
# Read the combined feature matrix; its 'dataset' column marks which rows are train and test.
# low_memory=False makes pandas parse the file in a single pass instead of internal chunks,
# which avoids mixed-dtype columns at the cost of higher peak memory.
feature_matrix = pd.read_csv(
    os.path.join(DATA_DIR, 'feature_matrix.csv'),
    low_memory=False,
)

In [5]:
# Split the combined matrix back into its two parts using the 'dataset' marker column.
# Training rows lose the applicant ID and the marker; the label column TARGET stays.
df_train = (
    feature_matrix
    .loc[feature_matrix['dataset'] == 'train']
    .drop(columns=['SK_ID_CURR', 'dataset'])
)
# Test rows keep SK_ID_CURR (needed for the submission file) but drop the marker and TARGET.
df_test = (
    feature_matrix
    .loc[feature_matrix['dataset'] == 'test']
    .drop(columns=['dataset', 'TARGET'])
)

In [6]:
%%time
train_data = task.Dataset(df_train)

predictor = task.fit(train_data=train_data, label='TARGET', output_directory=MODEL_DIR, 
                     eval_metric='roc_auc', auto_stack=True, verbosity=0, visualizer='tensorboard')

CPU times: user 18d 14h 4min 7s, sys: 16d 8h 26min 44s, total: 34d 22h 30min 52s
Wall time: 1d 5h 56min 41s


In [7]:
# Fetch the per-model results table (silent=True presumably suppresses its own printout)
# and display it ranked by validation score, best model first.
lboard = predictor.leaderboard(silent=True)
lboard.sort_values('score_val', ascending=False)

Unnamed: 0,model,score_val,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,weighted_ensemble_k0_l2,0.78743,3098.737486,95757.520068,0.09342,101.301855,2,True,24
1,weighted_ensemble_k0_l1,0.786499,601.329862,46704.633752,0.093324,101.587764,1,True,12
2,CatboostClassifier_STACKER_l1,0.786261,2511.553999,53790.34002,8.020795,680.118535,1,True,21
3,LightGBMClassifierXT_STACKER_l1,0.785994,2511.152501,53834.121477,7.619297,723.899992,1,True,20
4,LightGBMClassifier_STACKER_l1,0.78599,2511.691034,53782.29231,8.157829,672.070825,1,True,19
5,LightGBMClassifierCustom_STACKER_l1,0.785596,2510.629085,54090.092252,7.095881,979.870767,1,True,23
6,LightGBMClassifierCustom_STACKER_l0,0.782958,10.546562,1941.627757,10.546562,1941.627757,0,True,11
7,CatboostClassifier_STACKER_l0,0.782336,7.888541,1890.238214,7.888541,1890.238214,0,True,9
8,LightGBMClassifierXT_STACKER_l0,0.780601,11.507542,860.345474,11.507542,860.345474,0,True,8
9,LightGBMClassifier_STACKER_l0,0.780356,10.297791,824.519218,10.297791,824.519218,0,True,7


* The stacked ensemble (`weighted_ensemble_k0_l2`) achieved the highest validation score, 0.78743 AUROC. The resulting submission scored 0.78149 AUROC on the Kaggle public leaderboard and 0.78391 on the private leaderboard.
* Training took about 1 day and 6 hours on an AWS m4.16xlarge EC2 instance (64 cores, 256 GB of memory); inference on the test set took about an hour and a half.

In [8]:
%%time
test_data = task.Dataset(df_test.drop('SK_ID_CURR', axis=1))
predictor = task.load(MODEL_DIR)

pred_probablities = predictor.predict_proba(test_data, as_pandas=True)

CPU times: user 1d 20h 50min 45s, sys: 7h 21min 5s, total: 2d 4h 11min 50s
Wall time: 1h 37min 5s


In [9]:
# Build the submission file: one row per test applicant with its ID and predicted probability.
#
# pd.concat(axis=1) pairs rows by index label, not by position. df_test still carries the
# row labels it had inside feature_matrix (it was selected with a boolean mask and never
# re-indexed), while the index of the prediction object is not guaranteed to match. If the
# labels differ, concat silently produces NaN-padded, misaligned rows. Predictions are
# presumably returned in the same row order as the test data, so both indexes are dropped
# first to force positional pairing.
applicant_ids = df_test['SK_ID_CURR'].reset_index(drop=True)
probabilities = pred_probablities.reset_index(drop=True)
assert len(applicant_ids) == len(probabilities), 'expected exactly one prediction per test row'

# NOTE: this rebinds pred_probablities to the combined frame, so re-run the prediction cell
# before re-running this one; otherwise SK_ID_CURR would be added a second time.
pred_probablities = pd.concat([applicant_ids, probabilities], axis=1)
pred_probablities.to_csv(os.path.join(DATA_DIR, 'submission.csv'), index=False)