In [1]:
!pip install pytorch-tabnet
!cd /usr/local/python3.7/dist-packages/pytorch_tabnet && patch </content/float64.patch


import os
#os.environ['CUDA_LAUNCH_BLOCKING'] = '1'


!pip install optuna

import numpy as np
import pandas as pd
import torch
torch.autograd.set_detect_anomaly(False)
torch.set_default_tensor_type(torch.DoubleTensor)
torch.set_default_dtype(torch.float64)

import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

import optuna
from google.colab import output

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from pytorch_tabnet.tab_model import  TabNetRegressor

output.clear()

In [2]:
import importlib
import pytorch_tabnet
importlib.reload(pytorch_tabnet)

<module 'pytorch_tabnet' (namespace)>

In [3]:
class Objective(object):
    def __init__(self, model_name, X, y, params):
        self.model_name = model_name

        # Save the trainings data
        self.X = X
        self.y = y
        self.params = params

        
    def __call__(self, trial):
        # Define hyperparameters to optimize
        trial_params = self.model_name.define_trial_parameters(trial, self.params)
        print(trial_params)
        
        score = 0
        # Cross validate the chosen hyperparameters

        kf = KFold(self.params['nfold'], shuffle = False)
        for train, test in kf.split(self.X):
            X_train, y_train = self.X.iloc[train, :], self.y.iloc[train]
            X_val, y_val = self.X.iloc[test, :], self.y.iloc[test]
            
            model = self.model_name(trial_params)
            model.fit(X_train, y_train, X_val, y_val)
            score += mean_squared_error(y_val, model.predict(X_val),
                                        squared = self.params['squared_metrics'])

        score /= self.params['nfold']
        
        return score


def main(X, y, model_name, params, n_trials = 100):
    print("Start hyperparameter optimization")
    
    Sampler = optuna.samplers.TPESampler(seed = 777)
    study = optuna.create_study(sampler = Sampler)
    study.optimize(Objective(model_name, X, y, params), n_trials, show_progress_bar = True, n_jobs = 1)
    
    print("Best parameters:", study.best_trial.params)

    return study

In [4]:
class TabNet():

    def __init__(self, params):
        
        self.model = TabNetRegressor(**params, verbose = False, device_name = 'cpu')
        #if torch.cuda.is_available():
        #    self.model.to('cuda')
        
    def fit(self, X, y, X_val=None, y_val=None):
        X = X.to_numpy()
        y = y.to_numpy().reshape(-1, 1)
        
        if isinstance(X_val, pd.DataFrame):
            X_val, y_val = X_val.to_numpy(), y_val.to_numpy().reshape(-1, 1)
            
        self.model.fit(X, y, eval_set = [(X_val, y_val)], eval_name = ['eval'], max_epochs = 500, patience = 20)
        history = self.model.history
        return history['loss']

    def predict(self, X):
        X = X.to_numpy()

        return self.model.predict(X)
        
    @classmethod
    def define_trial_parameters(cls, trial, params):
        params_tunable = {}
        params_out = {}
        for i, val in params.items():
            if isinstance(val, list):
                params_tunable[f'{i}'] = val
            else:
                params_out[f'{i}'] = val
        
        if 'n_d' in params_tunable:
            params_out[f'n_d'] = trial.suggest_int('n_d', params['n_d'][0], params['n_d'][1], log = False)
        if 'n_steps' in params_tunable:
            params_out[f'n_steps'] = trial.suggest_int('n_steps', params['n_steps'][0], params['n_steps'][1], log = False)
        if 'gamma' in params_tunable:
            params_out[f'gamma'] = trial.suggest_float('gamma', params['gamma'][0], params['gamma'][1], log = False)
        if 'cat_emb_dim' in params_tunable:
            params_out[f'cat_emb_dim'] = trial.suggest_int('cat_emb_dim', params['cat_emb_dim'][0], params['cat_emb_dim'][1], log = False)
        if 'n_independent' in params_tunable:
            params_out[f'n_independent'] = trial.suggest_int('n_independent', params['n_independent'][0], params['n_independent'][1], log = False)
        if 'n_shared' in params_tunable:
            params_out[f'n_shared'] = trial.suggest_int('n_shared', params['n_shared'][0], params['n_shared'][1], log = False)
        if 'momentum' in params_tunable:
            params_out[f'momentum'] = trial.suggest_float('momentum', params['momentum'][0], params['momentum'][1], log = True)
        if 'mask_type' in params_tunable:
            params_out[f'mask_type'] = trial.suggest_categorical('mask_type', params['mask_type'])
        
        
        if 'nfold' in params_out:
            del params_out['nfold']
        if 'squared_metrics' in params_out:
            del params_out['squared_metrics']
        
        return params_out

In [5]:
np.random.seed(7)
X = np.random.randint(0, 11, size = (745, 50))
y = np.random.rand(745) * 175

In [6]:
X = pd.DataFrame(X)
y = pd.DataFrame(y)

torch.cuda.is_available()

True

In [10]:
import functools
def module_has_nan(m):
    ret = functools.reduce(
        lambda ret, module: ret or functools.reduce(
            lambda ret, parameter: (ret or parameter.isnan().any().cpu().numpy()).any(),
            module.parameters(), False
        ), m.modules(), False
    )
    print(ret)
    return ret

In [8]:
failed_params = {'n_d': 2, 'n_steps': 23, 'gamma': 1.3362582829842025, 'n_independent': 18, 'n_shared': 6, 'momentum': 0.0060940631018202435, 'mask_type': 'entmax'}

score = 0
# Cross validate the chosen hyperparameters

kf = KFold(5, shuffle = False)
for train, test in kf.split(X):
    X_train, y_train = X.iloc[train, :], y.iloc[train]
    X_val, y_val = X.iloc[test, :], y.iloc[test]
    
    model = TabNetRegressor(**failed_params, verbose = True, device_name = 'cuda')

    X = X_train.to_numpy().astype(np.float64)
    y = y_train.to_numpy().reshape(-1, 1).astype(np.float64)
        
    if isinstance(X_val, pd.DataFrame):
        X_val, y_val = X_val.to_numpy(), y_val.to_numpy().reshape(-1, 1)

    model.fit(X, y, eval_set = [(X_val, y_val)], eval_name = ['eval'], max_epochs = 500, patience = 20)
    module_has_nan(model.network)
    print()

    score += mean_squared_error(y_val, model.predict(X_val),
                                squared = False)

score /= 5

Device used : cuda
epoch 0  | loss: 10974.82711| eval_mse: 2044538699.63964|  0:00:01s
epoch 1  | loss: 11009.60241| eval_mse: 845740477.28573|  0:00:02s
epoch 2  | loss: 10947.17714| eval_mse: 227494150.28563|  0:00:03s
epoch 3  | loss: 11022.88373| eval_mse: 155486311.95783|  0:00:04s
epoch 4  | loss: 10993.46461| eval_mse: 79800478.68356|  0:00:06s
epoch 5  | loss: 10948.6001| eval_mse: 42369398.12749|  0:00:07s
epoch 6  | loss: 10978.69974| eval_mse: 24321578.65051|  0:00:08s
epoch 7  | loss: 11007.02591| eval_mse: 13017295.00968|  0:00:09s
epoch 8  | loss: 10988.90641| eval_mse: 23292570.28343|  0:00:11s
epoch 9  | loss: 10999.3067| eval_mse: 4963329.62921|  0:00:12s
epoch 10 | loss: 11067.60201| eval_mse: 2609405.74334|  0:00:13s
epoch 11 | loss: 10986.05107| eval_mse: 3169869.68824|  0:00:14s
epoch 12 | loss: 11020.58315| eval_mse: 2220864.70622|  0:00:15s
epoch 13 | loss: 10966.25843| eval_mse: 1314556.77798|  0:00:17s
epoch 14 | loss: 10946.65865| eval_mse: 1167119.30034|  0:0

TypeError: ignored

In [11]:
module_has_nan(model.network)

False


False

In [None]:
%debug

> [0;32m/usr/local/lib/python3.7/dist-packages/torch/nn/functional.py[0m(2283)[0;36mbatch_norm[0;34m()[0m
[0;32m   2281 [0;31m[0;34m[0m[0m
[0m[0;32m   2282 [0;31m    return torch.batch_norm(
[0m[0;32m-> 2283 [0;31m        [0minput[0m[0;34m,[0m [0mweight[0m[0;34m,[0m [0mbias[0m[0;34m,[0m [0mrunning_mean[0m[0;34m,[0m [0mrunning_var[0m[0;34m,[0m [0mtraining[0m[0;34m,[0m [0mmomentum[0m[0;34m,[0m [0meps[0m[0;34m,[0m [0mtorch[0m[0;34m.[0m[0mbackends[0m[0;34m.[0m[0mcudnn[0m[0;34m.[0m[0menabled[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m   2284 [0;31m    )
[0m[0;32m   2285 [0;31m[0;34m[0m[0m
[0m
tensor([[ 4.,  9.,  2.,  ..., 10.,  9.,  0.],
        [ 7.,  6.,  6.,  ...,  7.,  1.,  4.],
        [ 7.,  8.,  2.,  ...,  3.,  9.,  6.],
        ...,
        [ 5.,  4.,  6.,  ...,  3.,  1.,  8.],
        [ 4.,  1.,  8.,  ...,  4.,  7.,  9.],
        [ 8.,  2.,  6.,  ...,  8.,  6.,  6.]], dtype=torch.float32)
Parameter containing:
tenso

In [None]:
TabNet_params = {
    'n_d' : [2, 10],
    'n_steps' : [1, 25],
    'gamma' : [1., 2.],
    'n_independent' : [1, 20],
    'n_shared' : [1, 20],
    'momentum' : [1e-3, 0.4],
    'mask_type' : ['sparsemax', 'entmax'],
    'nfold' : 5,
    'squared_metrics' : False
    }

model_name = TabNet

TabNet_res = main(X = X, y = y, model_name = model_name, params = TabNet_params, n_trials = 50)

[32m[I 2022-04-20 06:44:45,324][0m A new study created in memory with name: no-name-76acffe7-550a-4925-ba9d-5fc664087939[0m


Start hyperparameter optimization


  self._init_valid()


  0%|          | 0/50 [00:00<?, ?it/s]

{'n_d': 3, 'n_steps': 8, 'gamma': 1.062036414714562, 'n_independent': 10, 'n_shared': 17, 'momentum': 0.2582866324854284, 'mask_type': 'entmax'}

Early stopping occurred at epoch 173 with best_epoch = 153 and best_eval_mse = 2520.85343
Best weights from best epoch are automatically used!

Early stopping occurred at epoch 248 with best_epoch = 228 and best_eval_mse = 2525.8012
Best weights from best epoch are automatically used!

Early stopping occurred at epoch 181 with best_epoch = 161 and best_eval_mse = 2318.17918
Best weights from best epoch are automatically used!

Early stopping occurred at epoch 191 with best_epoch = 171 and best_eval_mse = 2434.95432
Best weights from best epoch are automatically used!

Early stopping occurred at epoch 180 with best_epoch = 160 and best_eval_mse = 2375.5705
Best weights from best epoch are automatically used!
[32m[I 2022-04-20 06:56:23,327][0m Trial 0 finished with value: 49.33960101210168 and parameters: {'n_d': 3, 'n_steps': 8, 'gamma': 1.0

RuntimeError: ignored

In [None]:
%debug

> [0;32m/usr/local/lib/python3.7/dist-packages/pytorch_tabnet/tab_network.py[0m(738)[0;36mforward[0;34m()[0m
[0;32m    736 [0;31m[0;34m[0m[0m
[0m[0;32m    737 [0;31m    [0;32mdef[0m [0mforward[0m[0;34m([0m[0mself[0m[0;34m,[0m [0mx[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 738 [0;31m        [0mscale[0m [0;34m=[0m [0mtorch[0m[0;34m.[0m[0msqrt[0m[0;34m([0m[0mtorch[0m[0;34m.[0m[0mFloatTensor[0m[0;34m([0m[0;34m[[0m[0;36m0.5[0m[0;34m][0m[0;34m)[0m[0;34m.[0m[0mto[0m[0;34m([0m[0mx[0m[0;34m.[0m[0mdevice[0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    739 [0;31m        [0;32mif[0m [0mself[0m[0;34m.[0m[0mfirst[0m[0;34m:[0m  [0;31m# the first layer of the block has no scale multiplication[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    740 [0;31m            [0mx[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0mglu_layers[0m[0;34m[[0m[0;36m0[0m[0;34m][0m[0;34m([0m[0mx[0m


sys.settrace() should not be used when the debugger is being used.
This may cause the debugger to stop working correctly.
If this is needed, please check: 
http://pydev.blogspot.com/2007/06/why-cant-pydev-debugger-work-with.html
to see how to restore the debug tracing back correctly.
Call Location:
  File "/usr/lib/python3.7/bdb.py", line 357, in set_quit
    sys.settrace(None)



In [14]:
X.astype(np.float64)

array([[ 8.,  8.,  0., ...,  2.,  2.,  2.],
       [ 0.,  9.,  7., ...,  7.,  0.,  3.],
       [ 2.,  3.,  4., ...,  9.,  9.,  4.],
       ...,
       [ 7.,  2.,  8., ...,  4.,  9.,  8.],
       [ 8.,  0.,  1., ...,  3., 10.,  6.],
       [ 5.,  0.,  0., ...,  5.,  6.,  5.]])

In [13]:
!cd /usr/local && pwd

/usr/local
