# Description

We will use the preprocessed rossman data and compare regression accuracies/errors for tabnet and node vs XGBoost

In [9]:
import pandas as pd
import numpy as np
import os
from matplotlib import pyplot as plt

ROOT = os.path.dirname(os.getcwd())
data = pd.read_pickle(os.path.join(ROOT, 'data', 'rossman-store', 'preprocessed_data.pkl.gz'))
X, y = data.drop(columns='Sales'), data[['Sales']]

### Baseline Model: XGBoost

In [11]:
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

model = XGBRegressor()

#fit
model.fit(X_train, y_train)

#pred
pred = model.predict(X_test)

#evaluation
rmse = np.sqrt(MSE(y_test, pred))
print("RMSE : % f" %(rmse))

RMSE :  385.070164


### TabNet

In [12]:
from pytorch_tabnet.tab_model import TabNetRegressor

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=123)

model = TabNetRegressor()  #TabNetRegressor()

model.fit( 
    X_train.to_numpy(), y_train.to_numpy(),
    eval_set=[(X_valid.to_numpy(), y_valid.to_numpy())],
    eval_metric=['mse'],
    max_epochs = 20
)

preds = model.predict(X_test)

Device used : cpu
epoch 0  | loss: 22534658.39655| val_0_mse: 1627233.58968|  0:01:55s
epoch 1  | loss: 421563.11474| val_0_mse: 332605.24908|  0:03:21s
epoch 2  | loss: 342492.92584| val_0_mse: 471196.45377|  0:05:01s
epoch 3  | loss: 307602.68359| val_0_mse: 364083.52808|  0:07:04s
epoch 4  | loss: 284833.07014| val_0_mse: 255097.89137|  0:08:37s
epoch 5  | loss: 284051.66318| val_0_mse: 247855.33958|  0:10:03s
epoch 6  | loss: 272274.25667| val_0_mse: 209717.50339|  0:11:30s
epoch 7  | loss: 258828.57009| val_0_mse: 294852.47212|  0:13:04s
epoch 8  | loss: 242005.27901| val_0_mse: 196595.04643|  0:14:38s
epoch 9  | loss: 237728.35137| val_0_mse: 228982.06771|  0:16:06s
epoch 10 | loss: 236031.13718| val_0_mse: 286886.18106|  0:17:34s
epoch 11 | loss: 236148.03693| val_0_mse: 509205.0715|  0:19:01s
epoch 12 | loss: 229612.14246| val_0_mse: 434332.60184|  0:20:37s
epoch 13 | loss: 229659.49169| val_0_mse: 333211.76792|  0:22:05s
epoch 14 | loss: 224012.00069| val_0_mse: 221364.86363| 

KeyError: 0

In [16]:
preds = model.predict(X_test.to_numpy())
rmse = np.sqrt(MSE(y_test, preds))
print("RMSE : % f" %(rmse))

RMSE :  425.734993
