# Baseline model

In [None]:
import os
# When the notebook is launched from inside the `notebooks` directory, move
# one level up so that relative paths used later in this notebook
# (e.g. 'data', 'models') resolve against the parent directory.
# `os.path.basename` is used instead of splitting on '/' so the check also
# works where the path separator is not '/'.
if os.path.basename(os.getcwd()) == 'notebooks':
    os.chdir(os.pardir)

import matplotlib.pyplot as plt
import numpy as np
np.random.seed(0)
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

Rationale for choosing gradient-boosted trees (XGBoost) as the baseline model for this tabular regression task:
1. Shwartz-Ziv, R. and Armon, A. Tabular data: Deep learning is not all you need. arXiv:2106.03253, 2021
2. Grinsztajn, L., Oyallon, E., and Varoquaux, G. Why do tree-based models still outperform deep learning on tabular data? arXiv:2207.08815, 2022

## Data

In [None]:
# Directory holding the processed datasets (relative to the working directory).
PATH_PROC_DATA = os.path.join('data', 'processed')

# Column names: model inputs and the regression target.
features = ['d [mm]', 'f [GHz]', 'psPDtot_1 [W/m2]', 'psPDtot_4 [W/m2]']
target = 'pDeltaT * 100 [°C]'

# Synthetic data is used for fitting, the original ("clean") data for testing.
syn_data = pd.read_csv(os.path.join(PATH_PROC_DATA, 'pDeltaT_synthetic.csv'))
true_data = pd.read_csv(os.path.join(PATH_PROC_DATA, 'pDeltaT_clean.csv'))

X = syn_data[features]
y = syn_data[target]

# Train/validation split of the synthetic data (20 % held out for validation).
# No explicit random_state is passed, so the split is drawn from NumPy's
# global random state.
X_train, X_valid, y_train, y_valid = train_test_split(
    X.to_numpy(),
    y.to_numpy(),
    test_size=.2,
)

# Test set: the original data, never seen during fitting.
X_test = true_data[features].to_numpy()
y_test = true_data[target].to_numpy()

## Importing/training the regressor

In [None]:
# Restore a previously tuned regressor from disk if possible; otherwise run a
# cross-validated grid search on the synthetic training data and save the best
# model so that later runs can skip the search.
PATH_MODEL = os.path.join('models', '02_baseline_model.json')

try:
    print('Trying to restore the regressor...')
    model_opt = XGBRegressor()
    model_opt.load_model(PATH_MODEL)
    print('Restoring successful.')
except Exception as e:
    # Deliberately broad: any failure to load the stored model (e.g. the file
    # does not exist yet) falls back to training. The reason is printed so the
    # fallback is never silent.
    print(e)
    print('Training the regressor...')

    # Exhaustive grid: 4 * 4 * 5 * 5 * 6 * 4 = 9600 combinations, each
    # evaluated with 5-fold cross-validation.
    # NOTE(review): `scale_pos_weight` is documented by XGBoost for imbalanced
    # classification -- confirm it is meant to be tuned for this regressor.
    param_grid = {
        'max_depth': [2, 3, 5, 7],
        'max_leaves': [0, 1, 5, 10],
        'learning_rate': [0.01, 0.025, 0.05, 0.1, 0.5],
        'gamma': [0, 0.25, 0.5, 1.0, 1.5],
        'reg_lambda': [0, 1, 5, 10, 20, 50],
        'scale_pos_weight': [1, 3, 5, 10]
    }

    regressor = XGBRegressor(early_stopping_rounds=20)

    grid = GridSearchCV(
        estimator=regressor,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        verbose=0,
        n_jobs=-1,
        cv=5
    )

    # Early stopping is monitored on the validation split that was created
    # together with the training split (X_valid, y_valid).
    grid.fit(X_train, y_train,
             eval_set=[(X_valid, y_valid)],
             verbose=False)

    print('Finishing...')
    param_opt = grid.best_params_
    model_opt = grid.best_estimator_

    print('Saving...')
    # Make sure the target directory exists so the result of the search is
    # not lost to a missing-directory error at the very end.
    os.makedirs(os.path.dirname(PATH_MODEL), exist_ok=True)
    model_opt.save_model(PATH_MODEL)

## Evaluating the regressor

In [None]:
# Predict on the test set and compute per-sample absolute errors.
# Both targets and predictions are divided by 100 before subtracting;
# presumably this undoes the `* 100` scaling in the target column name.
y_pred = model_opt.predict(X_test)
y_resid = np.subtract(y_test / 100, y_pred / 100)
ae = np.absolute(y_resid)
mae = ae.mean()
# Bare expression: displayed as the cell output in the notebook.
mae

In [None]:
# Histogram (with KDE overlay) of the absolute errors, with the mean absolute
# error marked by a dashed vertical line.
fig, ax = plt.subplots(figsize=(4.5, 4))
ax = sns.histplot(x=ae, bins='fd', stat='density', kde=True, ax=ax)

# Span the marker over the current y-range of the axes.
y_low, y_high = ax.get_ybound()
ax.vlines(mae, y_low, y_high, ls='--', label='mean absolute error')

ax.set_xlabel('absolute error (°C)')
ax.set_ylabel('probability density')
ax.legend();

In [None]:
# Destination file for the per-sample absolute errors of this model.
PATH_ERROR_DATA = os.path.join('data', 'models')
error_data = os.path.join(PATH_ERROR_DATA, 'baseline.npy')

# Writing is opt-in: set `save` to True to dump `ae` in NumPy's .npy format.
save = False
if save:
    with open(error_data, mode='wb') as out_file:
        np.save(out_file, ae)