# Description

We use the preprocessed Rossmann data to compare the regression error (RMSE) of TabNet and NODE against an XGBoost baseline.

In [None]:
!pip install pytorch_tabnet
!pip install xgboost

In [None]:
import pandas as pd
import numpy as np
import os
from matplotlib import pyplot as plt

ROOT = os.path.dirname(os.getcwd())

# Read the preprocessed table (first CSV column is the index) and keep only
# the first 10000 rows.
data = pd.read_csv('preprocessed_data.csv', index_col=0).iloc[:10000, :] #data was too big for node
display(data)

# Features are every column except the target; the target stays a one-column frame.
X = data.drop(columns='Sales')
y = data[['Sales']]

### Baseline Model: XGBoost

In [None]:
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE


# Hold out 20% of the rows as the test split (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# `eval_metric` and `early_stopping_rounds` are estimator parameters. Passing
# them to `fit()` has been deprecated since XGBoost 1.6 and is no longer
# accepted by recent 2.x releases, which is what an unpinned install resolves to.
# NOTE(review): 'logloss' is a classification metric and is not meaningful for
# this regression target; it is kept only because the following cell reads it
# from `evals_result()`.
# NOTE(review): early_stopping_rounds (7000) exceeds n_estimators (1000), so
# training can never stop early; the test split is also used as the eval set.
model = XGBRegressor(
    n_estimators=1000,
    eval_metric=['logloss', 'rmse'],
    early_stopping_rounds=7000,
)

#fit
# eval_set order defines the history keys: validation_0 = train, validation_1 = test.
model.fit(X_train, y_train,
          eval_set=[(X_train, y_train), (X_test, y_test)],
          verbose=10)

#pred
pred = model.predict(X_test)

#evaluation
rmse = np.sqrt(MSE(y_test, pred))
print("RMSE : % f" %(rmse))

In [None]:
# Per-round metric history recorded during fit(); fetch it once.
# validation_0 is the train split, validation_1 the test split (eval_set order).
evals = model.evals_result()

xgb_boost_losses = pd.concat(
    [pd.Series(evals['validation_0']['logloss']),
     pd.Series(evals['validation_1']['logloss'])],
    keys=['Train NLL (XGB)', 'Test NLL (XGB)'], axis=1)

# Store the RMSE values themselves. The previous `1 - rmse` transform (an
# accuracy-to-error conversion) does not apply to RMSE and contradicted the
# column labels as well as the TabNet error table built from raw RMSE.
xgb_boost_errors = pd.concat(
    [pd.Series(evals['validation_0']['rmse']),
     pd.Series(evals['validation_1']['rmse'])],
    keys=['Train RMSE (XGB)', 'Test RMSE (XGB)'], axis=1)

display(xgb_boost_losses)
display(xgb_boost_errors)

xgb_boost_losses.to_csv('xgb_losses.csv')
xgb_boost_errors.to_csv('xgb_errors.csv')

### TabNet

In [None]:
from pytorch_tabnet.tab_model import TabNetRegressor

# Same 80/20 train/test split as the baseline, then carve a further 20% of the
# training rows off as a validation split (both with the same fixed seed).
split_options = dict(test_size=0.2, random_state=123)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, **split_options)

model = TabNetRegressor(n_d=32, n_a=32, n_steps=5, gamma=1.2, lambda_sparse=0.001)  #TabNetRegressor()

# TabNet is fed plain numpy arrays; build each (features, targets) pair once.
train_arrays = (X_train.to_numpy(), y_train.to_numpy())
test_arrays = (X_test.to_numpy(), y_test.to_numpy())

# eval_set order defines the history keys: val_0 = train, val_1 = test.
model.fit(
    *train_arrays,
    eval_set=[train_arrays, test_arrays],
    eval_metric=['rmse'],
    max_epochs=1000,
    batch_size=4096,
    virtual_batch_size=512,
    patience=0,  #Remove stopping criteria
)

In [None]:
# Score the fitted TabNet model on the held-out test split.
test_features = X_test.to_numpy()
preds = model.predict(test_features)
rmse = np.sqrt(MSE(y_test, preds))
print("RMSE : % f" % rmse)

In [None]:
# Map each output column to the history key it is read from
# (val_0 / val_1 follow the eval_set order used when fitting: train, test).
rmse_sources = {
    'Train RMSE (TabNet)': 'val_0_rmse',
    'Test RMSE (TabNet)': 'val_1_rmse',
}
tabnet_errors = pd.concat(
    [pd.Series(model.history[history_key]) for history_key in rmse_sources.values()],
    keys=list(rmse_sources),
    axis=1,
)

display(tabnet_errors)

tabnet_errors.to_csv('tabnet_errors.csv')

In [None]:
# Bar chart of TabNet's global feature importances, one bar per input column.
feature_names = [column for column in data.columns if column != 'Sales']

plt.figure(figsize=(8, 5))
plt.bar(x=feature_names, height=model.feature_importances_)
plt.title('Global Feature Importances | TabNet')
plt.xticks(rotation=90)

In [None]:
# Inspect the shape of TabNet's global feature-importance array
# (the bar plot above pairs it with the non-target columns of `data`).
model.feature_importances_.shape

In [None]:
# Per-sample explanations for the test split: an aggregate matrix plus one
# attention mask per decision step (indexed 0..n_steps-1 below).
explain_matrix, masks = model.explain(X_test.to_numpy())

feature_names = [column for column in data.columns if column != 'Sales']

# One panel per mask instead of a hard-coded 5, so the cell keeps working if
# n_steps changes. squeeze=False guarantees an array even for a single panel;
# ravel() restores the 1-D `axs` layout used before.
n_masks = len(masks)
fig, axs = plt.subplots(1, n_masks, figsize=(20,20), squeeze=False)
axs = axs.ravel()

for step, ax in enumerate(axs):
    # Show only the first 50 test rows so each panel stays readable.
    ax.imshow(masks[step][:50])
    ax.set_title(f"mask {step}")
    # Pin one tick per feature column before labelling; calling
    # set_xticklabels on the automatic ticks attaches names to the wrong columns.
    ax.set_xticks(range(len(feature_names)))
    ax.set_xticklabels(feature_names, rotation=45)

## NODE

In [None]:
# Fetch the NODE reference implementation into ./node; it is imported below as `node.lib`.
!git clone https://github.com/Qwicen/node.git

In [None]:
!pip install -r node/requirements.txt

In [None]:
from node import lib

from category_encoders import LeaveOneOutEncoder
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

%load_ext autoreload
%autoreload 2
%env CUDA_VISIBLE_DEVICES=0
import os, sys
import time
sys.path.insert(0, '..')
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import torch, torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
# Rebuild the train/validation/test frames from X and y: 20% test first, then
# 20% of the remaining rows as validation, both with the same fixed seed.
split_options = dict(test_size=0.2, random_state=123)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, **split_options)

In [None]:


# Number of input columns fed to the first NODE layer.
in_features = X_train.shape[1]

# NODE regressor: a dense block of tree layers followed by a reduction that
# turns the per-tree outputs into one prediction per row, i.e. shape (batch,).
model = nn.Sequential(
    lib.DenseBlock(in_features, 128, num_layers=8, tree_dim=3, depth=6, flatten_output=False,
                   choice_function=lib.entmax15, bin_function=lib.entmoid15),
    lib.Lambda(lambda x: x[..., 0].mean(dim=-1)),  # average first channels of every tree

).to(device)

# Disabled no-grad forward pass over the first 5000 training rows, kept for
# reference. It used to live in a bare triple-quoted string, which Jupyter
# echoed as the cell's output.
# with torch.no_grad():
#     tmp = torch.tensor(X_train[:5000].to_numpy(), device=device)
#     #tmp = tmp.type(torch.int)
#     res = model(tmp.int())

In [None]:
from qhoptim.pyt import QHAdam
# Keyword arguments forwarded to the QHAdam optimizer by the trainer.
optimizer_params = dict(nus=(0.7, 1.0), betas=(0.95, 0.998))

In [None]:
from tqdm import tqdm
from IPython.display import clear_output
# Histories filled in by the training loop.
loss_history = []
rmse_history = []

# Best validation MSE seen so far and the trainer step at which it occurred.
best_mse, best_step_mse = float('inf'), 0

# Stop after this many trainer steps without improvement; evaluate every
# `report_frequency` steps.
early_stopping_rounds, report_frequency = 5000, 50

In [None]:
# Collect the trainer configuration in one place, then build the trainer.
# NOTE(review): the experiment name is fixed and warm_start is False - confirm
# how lib.Trainer behaves when this cell is re-run against an existing experiment.
trainer_config = dict(
    model=model,
    loss_function=F.mse_loss,
    experiment_name='xyz_200k32',
    warm_start=False,
    Optimizer=QHAdam,
    optimizer_params=optimizer_params,
    verbose=True,
    n_last_checkpoints=5,
)
trainer = lib.Trainer(**trainer_config)

In [None]:
def _as_float_tensor(values, flatten=False):
    """Convert a DataFrame, ndarray or CPU tensor into a float32 tensor.

    Args:
        values: Any array-like accepted by ``np.asarray``.
        flatten: When True, return a 1-D tensor (used for the targets).

    Returns:
        A ``torch.float32`` tensor holding the same values.
    """
    tensor = torch.as_tensor(np.asarray(values), dtype=torch.float32)
    return tensor.reshape(-1) if flatten else tensor


# Going through np.asarray (instead of calling .to_numpy()) keeps this cell
# re-runnable: on a second run the names already hold tensors.
X_train = _as_float_tensor(X_train)
X_valid = _as_float_tensor(X_valid)
# The targets come from a one-column frame, i.e. shape (n, 1), while the model
# emits shape (batch,). Left as is, F.mse_loss would broadcast the pair to
# (batch, batch) and optimise the wrong quantity, so flatten the targets.
# NOTE(review): trainer.evaluate_mse presumably compares elementwise as well,
# so the validation metric needs the same flat shape - confirm.
y_train = _as_float_tensor(y_train, flatten=True)
y_valid = _as_float_tensor(y_valid, flatten=True)


for batch in lib.iterate_minibatches(X_train, y_train, batch_size=512,
                                     shuffle=True, epochs=1000):
    metrics = trainer.train_on_batch(*batch, device=device)

    if trainer.step % report_frequency == 0:
        # Validate on the average of the recent checkpoints, not the raw weights.
        trainer.save_checkpoint()
        trainer.average_checkpoints(out_tag='avg')
        trainer.load_checkpoint(tag='avg')
        mse = trainer.evaluate_mse(
            X_valid, y_valid, device=device, batch_size=16384)

        if mse < best_mse:
            best_mse = mse
            best_step_mse = trainer.step
            trainer.save_checkpoint(tag='best_mse')

        rmse = np.sqrt(mse)
        rmse_history.append(rmse)

        # Restore the latest (non-averaged) weights before training continues.
        trainer.load_checkpoint()  # last
        trainer.remove_old_temp_checkpoints()

        print("Val Error: %0.5f" % (rmse))

    # Early stopping: give up once the best validation score is too old.
    if trainer.step > best_step_mse + early_stopping_rounds:
        print('BREAK. There is no improvment for {} steps'.format(early_stopping_rounds))
        print("Best step: ", best_step_mse)
        # Report RMSE so the figure is comparable with the "Val Error" lines above.
        print("Best Error: %0.5f" % (np.sqrt(best_mse)))
        break

In [None]:
# Going through np.asarray (instead of calling .to_numpy()) keeps this cell
# re-runnable: on a second run the names already hold tensors.
X_test = torch.as_tensor(np.asarray(X_test), dtype=torch.float32)
# The target frame has shape (n, 1) while the model predicts shape (n,).
# Flatten it so the test error is computed elementwise rather than over a
# broadcast (n, n) difference.
y_test = torch.as_tensor(np.asarray(y_test), dtype=torch.float32).reshape(-1)

In [None]:
# Restore the checkpoint with the best validation MSE and score it on the test split.
trainer.load_checkpoint(tag='best_mse')
mse = trainer.evaluate_mse(X_test, y_test, device=device)
test_rmse = np.sqrt(mse)
print('Best step: ', trainer.step)
print("Test RMSE: %0.5f" % test_rmse)