## AutoGluon
- AutoML의 SOTA라 불리는 AutoGluon을 이용한 모델 핸들링

In [1]:
# autogluon
# tabular : 표 데이터 
from autogluon.tabular import TabularDataset, TabularPredictor



# cross validation
from sklearn.model_selection import cross_val_predict

# visualization
import matplotlib.pyplot as plt
%matplotlib inline 
from sklearn.metrics import PredictionErrorDisplay
import seaborn as sns

import pandas as pd 
import numpy as np

# 평가 지표
import evaluation

# 모델 저장
import pickle
import joblib
 

In [10]:
# Columns removed from the raw frame before training.
# NOTE(review): presumably dropped to avoid leaking neighbouring-period targets and
# to keep only the *_yoy / *_mom derived indicators — confirm with the data owner.
EXCLUDED_COLUMNS = [
    'date',
    'previous interest rate',
    'next interest rate',
    'previous change',
    'CPI',
    'corePCE',
    'PPI',
    'industrial production',
]

df = pd.read_csv('dataframes/mice_selected_df.csv')
# errors='ignore' keeps the behaviour of the former boolean column mask:
# a listed column that is not present in the CSV is skipped silently.
df = df.drop(columns=EXCLUDED_COLUMNS, errors='ignore')
dataset = TabularDataset(df)

# Regression on the 'present change' column, scored with RMSE.
regressor = TabularPredictor(label='present change', problem_type='regression', eval_metric='rmse')

# The cell ends with the fit() call so the fitted predictor is shown as the cell output.
regressor.fit(
    train_data=dataset,
    presets='best_quality',
    auto_stack=True,
    fit_weighted_ensemble=True,
    num_bag_sets=20,
    refit_full=True,
    set_best_to_refit_full=True,
    verbosity=2,
)

No path specified. Models will be saved in: "AutogluonModels\ag-20241018_061124"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.1.1
Python Version:     3.11.10
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.19045
CPU Count:          16
Memory Avail:       19.49 GB / 31.81 GB (61.3%)
Disk Space Avail:   381.19 GB / 465.13 GB (82.0%)
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=20
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on subsets of the data. Then holdout validation data is used to detect stacked overf

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x20c86878210>

In [28]:
# Collect the per-model leaderboard as a DataFrame without printing it.
# `display=False` is the AutoGluon 1.x spelling of the legacy `silent=True` keyword.
leaderboard = regressor.leaderboard(display=False)


In [29]:
# Show at most the first 20 leaderboard rows as the cell output.
leaderboard.iloc[:20]

Unnamed: 0,model,score_val,eval_metric,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,-0.401211,root_mean_squared_error,0.539135,83.742895,0.0,0.022546,2,True,109
1,XGBoost_r34_BAG_L1,-0.412819,root_mean_squared_error,0.0349,3.393745,0.0349,3.393745,1,True,105
2,XGBoost_r98_BAG_L1,-0.412834,root_mean_squared_error,0.043945,9.105457,0.043945,9.105457,1,True,47
3,CatBoost_r128_BAG_L1,-0.415939,root_mean_squared_error,0.01895,15.617385,0.01895,15.617385,1,True,61
4,XGBoost_r95_BAG_L1,-0.417138,root_mean_squared_error,0.020945,1.492482,0.020945,1.492482,1,True,104
5,CatBoost_r9_BAG_L1,-0.417245,root_mean_squared_error,0.018265,19.744932,0.018265,19.744932,1,True,16
6,XGBoost_r33_BAG_L1,-0.417784,root_mean_squared_error,0.030837,4.574801,0.030837,4.574801,1,True,19
7,CatBoost_r180_BAG_L1,-0.41788,root_mean_squared_error,0.015516,10.388194,0.015516,10.388194,1,True,87
8,XGBoost_r22_BAG_L1,-0.41863,root_mean_squared_error,0.023939,1.371989,0.023939,1.371989,1,True,81
9,XGBoost_r49_BAG_L1,-0.419388,root_mean_squared_error,0.026951,2.590258,0.026951,2.590258,1,True,68


In [12]:
# Permutation-based feature importance, evaluated on `df`.
# NOTE(review): `df` is the same frame that was passed to fit(); importance measured
# on training rows may be optimistic — consider a held-out frame if one is available.
feature_importance = regressor.feature_importance(data=df)
feature_importance

Computing feature importance via permutation shuffling for 11 features using 837 rows with 5 shuffle sets...
	10.87s	= Expected runtime (2.17s per shuffle set)
	5.08s	= Actual runtime (Completed 5 of 5 shuffle sets)


Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
interest rate,0.150238,0.007701,8.256509e-07,5,0.166095,0.134381
PCE_yoy,0.111597,0.005622,7.703431e-07,5,0.123173,0.100021
coreCPI_yoy,0.100942,0.00201,1.884875e-08,5,0.10508,0.096804
CPI_yoy,0.082154,0.002837,1.704734e-07,5,0.087996,0.076312
corePCE_yoy,0.074394,0.00413,1.134795e-06,5,0.082897,0.065891
corePCE_mom,0.067258,0.009347,4.363319e-05,5,0.086504,0.048012
PPI_yoy,0.052886,0.00574,1.639705e-05,5,0.064706,0.041067
coreCPI_mom,0.052485,0.001615,1.075462e-07,5,0.055811,0.049159
CPI_mom,0.049312,0.005515,1.846926e-05,5,0.060668,0.037956
coreCPI,0.037006,0.001833,7.200728e-07,5,0.04078,0.033232


In [24]:
# Columns removed from the inference frame; this must stay identical to the set
# of columns removed from the training frame so both frames expose the same features.
INFER_EXCLUDED_COLUMNS = [
    'date',
    'previous interest rate',
    'next interest rate',
    'previous change',
    'CPI',
    'corePCE',
    'PPI',
    'industrial production',
]

infer_df = pd.read_csv('dataframes/infer_df.csv')
# errors='ignore' keeps the behaviour of the former boolean column mask:
# a listed column that is not present in the CSV is skipped silently.
infer_df = infer_df.drop(columns=INFER_EXCLUDED_COLUMNS, errors='ignore')
infer_dataset = TabularDataset(infer_df)

# Predict with the fitted regressor and show the resulting Series as the cell output.
inference = regressor.predict(infer_dataset)
inference

0   -0.122593
Name: present change, dtype: float32