# Regression example: *Synthetic* 3 dataset
$$y = (5x_1-5x_2)\,\mathbb{1}_{\{x_5\le 0\}}+(5x_3-5x_4)\,\mathbb{1}_{\{x_5> 0\}}$$



## Libraries

In [1]:
import os, sys, time
import numpy as np
import pandas as pd

In [2]:
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split

In [3]:
import torch 
import torch.nn as nn
import torch.optim as optim 

In [4]:
from torch.optim.lr_scheduler import ReduceLROnPlateau
from skorch.callbacks import EarlyStopping,LRScheduler,Checkpoint, TrainEndCheckpoint, EpochScoring

In [5]:
from tabsra.skorch_tabsra import InputShapeSetterTabSRA,TabSRALinearRegressor

## Data

In [6]:
# Build the synthetic design matrix: five independent standard-normal
# features drawn from a fixed seed so the dataset is reproducible.
np.random.seed(42)
n_samples = 3 * 10000
# The generator is consumed left to right, so x1..x5 receive the same
# successive draws as five explicit calls to np.random.normal would.
x1, x2, x3, x4, x5 = (np.random.normal(0, 1, n_samples) for _ in range(5))
# Stack the 1-D feature vectors side by side as the columns of X.
X = np.column_stack((x1, x2, x3, x4, x5))

In [7]:
# Noise-free regression target: the sign of x5 selects which pair of
# features drives the response (x1, x2 when x5 <= 0; x3, x4 otherwise).
first_regime = x5 <= 0
second_regime = x5 > 0
odd = (5*x1 - 5*x2) * first_regime + (5*x3 - 5*x4) * second_regime

# Pass the target through a sigmoid and threshold at 0.5 to obtain a binary
# label; it is used below to stratify the train/test split.
y_c = 1 / (1 + np.exp(-odd))
y = np.where(y_c > 0.5, 1, 0)

# The model is trained on the continuous target `odd`, not on the binary y.
X_train_, X_test_, Y_train_, Y_test_ = train_test_split(
    X,
    odd,
    stratify=y,
    random_state=42,
)
n_features = 5
n_classes = 1
len(Y_train_), len(Y_test_)

(22500, 7500)

In [8]:
# Column labels for the five synthetic features, in the same order as the
# columns of X.
feature_names = [f"x{idx}" for idx in range(1, 6)]


## Model: TabSRALinear 

In [9]:
# Keyword arguments forwarded to TabSRALinearRegressor when it is built.
# The `module__` / `optimizer__` prefixes presumably follow the skorch
# convention of routing a value to the wrapped module / optimizer
# (TODO confirm against the tabsra package).
other_params = dict(
    module__encoder_bias=True,
    module__classifier_bias=False,
    module__n_head=1,
    module__dim_head=4,
    optimizer__lr=0.01,
    max_epochs=150,
    batch_size=512,
    optimizer__weight_decay=0,
    random_state=42,
)

# Track R^2 every epoch; higher is better, so the callbacks below that
# monitor it are configured for maximisation.
scoring = EpochScoring(scoring="r2", lower_is_better=False)
setter = InputShapeSetterTabSRA(regression=True)

# Stop after 20 epochs without an absolute improvement of at least 1e-5 in
# the monitored score, and reload the best weights seen.
early_stop = EarlyStopping(
    monitor=scoring.scoring,
    patience=20,
    load_best=True,
    lower_is_better=False,
    threshold=1e-5,
    threshold_mode="abs",
)

# Multiply the learning rate by 0.2 (never going below 2e-5) once the
# monitored score has not improved for 15 epochs.
# NOTE(review): `verbose` is deprecated for ReduceLROnPlateau in recent
# PyTorch releases; confirm it is still accepted by the pinned version.
lr_scheduler = LRScheduler(
    policy=ReduceLROnPlateau,
    patience=15,
    min_lr=2e-5,
    factor=0.2,
    verbose=1,
    mode="max",
    monitor=scoring.scoring,
)

call_b = [scoring, setter, early_stop, lr_scheduler]

In [10]:
%%time
TabSRA = TabSRALinearRegressor(**other_params,callbacks=call_b)
_ = TabSRA.fit(X_train_.astype(np.float32),Y_train_.reshape(-1,1).astype(np.float32))

Re-initializing module because the following parameters were re-set: module__dim_input, module__dim_output.
Re-initializing criterion.
Re-initializing optimizer.
  epoch      r2    train_loss    valid_loss      dur
-------  ------  ------------  ------------  -------
      1  [36m0.1471[0m       [32m47.6918[0m       [35m41.5221[0m  13.6775
      2  [36m0.3002[0m       [32m39.1473[0m       [35m34.0702[0m  0.2457
      3  [36m0.4785[0m       [32m30.9727[0m       [35m25.3871[0m  0.2001
      4  [36m0.5967[0m       [32m23.2003[0m       [35m19.6363[0m  0.2089
      5  [36m0.6703[0m       [32m18.7371[0m       [35m16.0495[0m  0.2016
      6  [36m0.7209[0m       [32m15.5881[0m       [35m13.5877[0m  0.2521
      7  [36m0.7612[0m       [32m13.3134[0m       [35m11.6240[0m  0.2149
      8  [36m0.7855[0m       [32m11.6417[0m       [35m10.4425[0m  0.1948
      9  [36m0.8088[0m       [32m10.4440[0m        [35m9.3066[0m  0.1913
     10  [36m0.8264

In [22]:
# Put the first element returned by get_weights() in a DataFrame whose
# columns are labelled with the feature names, then display it.
Coef = pd.DataFrame(TabSRA.get_weights()[0]).set_axis(feature_names, axis=1)
Coef

Unnamed: 0,x1,x2,x3,x4,x5
0,5.001876,-4.996838,5.169649,-5.016213,5.380935


In [25]:
# Predict on the held-out split (cast to float32, as for training).
test_inputs = X_test_.astype(np.float32)
pred_tabsra = TabSRA.predict(test_inputs)

# Report root-mean-squared error and R^2 against the true test targets.
test_mse = mean_squared_error(Y_test_, pred_tabsra)
rmse_tabsra = np.sqrt(test_mse)
r2_tabsra = r2_score(Y_test_, pred_tabsra)
print(f"--RMSE  = {rmse_tabsra} -- r2  = {r2_tabsra}")

--RMSE  = 0.3034210188095153 -- r2  = 0.9981866874934777


In [28]:
%%time
attributions_tabsra = pd.DataFrame(TabSRA.get_feature_attribution(X_test_.astype(np.float32)))
attributions_tabsra.columns = feature_names
attributions_tabsra['pred_proba'] = pred_tabsra
attributions_tabsra['label'] = Y_test_

CPU times: user 28.5 ms, sys: 0 ns, total: 28.5 ms
Wall time: 5.43 ms


In [29]:
# Preview the first five rows of the attribution table.
attributions_tabsra.head(n=5)

Unnamed: 0,x1,x2,x3,x4,x5,pred_proba,label
0,-7.768471e-06,0.027804,8.204901,-3.011704,0.009502,5.230494,5.207175
1,2.296751,1.677891,-4.941949e-08,-1.649862e-06,-0.000148,3.974492,3.974853
2,1.641793,-0.20404,-6.330068e-09,1.415404e-09,-0.000368,1.437385,1.437009
3,-4.454783,-9.539963,-5.557716e-05,-0.0009707033,-0.00396,-13.999731,-14.000517
4,-9.808094e-07,-0.013421,0.2677278,0.5371353,0.044899,0.836339,0.807119


In [31]:
%%time
attention_tabsra = pd.DataFrame(TabSRA.get_attention(X_test_.astype(np.float32))[0])
attention_tabsra.columns = feature_names
attention_tabsra['pred_proba'] = pred_tabsra
attention_tabsra['label'] = Y_test_

CPU times: user 33.2 ms, sys: 0 ns, total: 33.2 ms
Wall time: 7.1 ms


In [32]:
# Preview the first five rows of the attention table.
attention_tabsra.head(n=5)

Unnamed: 0,x1,x2,x3,x4,x5,pred_proba,label
0,2.521361e-06,0.006143,0.9666835,0.9999999,0.002266,5.230494,5.207175
1,0.9999952,1.0,2.758221e-08,2.59659e-07,1.6e-05,3.974492,3.974853
2,0.9999999,1.0,4.321437e-10,4.146849e-09,4.6e-05,1.437385,1.437009
3,0.9997753,0.999958,0.0008022116,0.0002500543,0.001725,-13.999731,-14.000517
4,2.277387e-07,0.011819,0.9529745,1.0,0.008292,0.836339,0.807119
