In [None]:
!pip install pytorch-tabnet

In [1]:
from pytorch_tabnet.tab_model import TabNetClassifier, TabNetRegressor
import pickle 
import numpy as np
from sklearn.model_selection import train_test_split
import os
os.environ["CUDA_VISIBLE_DEVICES"]=""

x, y = pickle.load(open("data/train.pkl", "rb"))
test_name, x_test = pickle.load(open("data/test.pkl", "rb"))
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=42)



In [None]:
clf = TabNetClassifier(device_name='cpu')
clf.fit(x_train, y_train, x_val, y_val)

In [2]:
import torch
import torch.nn as nn
from models.layers import *

def convlayer(c_in,c_out,ks=3,padding='same',bias=True,stride=1,
              bn_init=False,zero_bn=False,bn_before=True,
              act_fn='relu', **kwargs):
    '''conv layer (padding="same") + bn + act'''
    if ks % 2 == 1 and padding == 'same': padding = ks // 2
    layers = [ConvSP1d(c_in,c_out, ks, bias=bias, stride=stride) if padding == 'same' else \
    nn.Conv1d(c_in,c_out, ks, stride=stride, padding=padding, bias=bias)]
    bn = GBN(c_out)
    if bn_init: nn.init.constant_(bn.weight, 0. if zero_bn else 1.)
    if bn_before: layers.append(bn)
    if act_fn: layers.append(get_act_layer(act_fn, **kwargs))
    if not bn_before: layers.append(bn)
    return nn.Sequential(*layers)

  warn("IPython.utils.traitlets has moved to a top-level traitlets package.")


In [13]:
import torch
from torch.nn import Linear, BatchNorm1d, ReLU
import numpy as np
from pytorch_tabnet import sparsemax


def initialize_non_glu(module, input_dim, output_dim):
    gain_value = np.sqrt((input_dim+output_dim)/np.sqrt(4*input_dim))
    torch.nn.init.xavier_normal_(module.weight, gain=gain_value)
    # torch.nn.init.zeros_(module.bias)
    return


def initialize_glu(module, input_dim, output_dim):
    gain_value = np.sqrt((input_dim+output_dim)/np.sqrt(input_dim))
    torch.nn.init.xavier_normal_(module.weight, gain=gain_value)
    # torch.nn.init.zeros_(module.bias)
    return


class GBN(torch.nn.Module):
    """
        Ghost Batch Normalization
        https://arxiv.org/abs/1705.08741
    """

    def __init__(self, input_dim, virtual_batch_size=128, momentum=0.01):
        super(GBN, self).__init__()

        self.input_dim = input_dim
        self.virtual_batch_size = virtual_batch_size
        self.bn = BatchNorm1d(self.input_dim, momentum=momentum)

    def forward(self, x):
        chunks = x.chunk(int(np.ceil(x.shape[0] / self.virtual_batch_size)), 0)
        res = [self.bn(x_) for x_ in chunks]

        return torch.cat(res, dim=0)


class TabNetNoEmbeddings(torch.nn.Module):
    def __init__(self, input_dim, output_dim,
                 n_d=8, n_a=8,
                 n_steps=3, gamma=1.3,
                 n_independent=2, n_shared=2, epsilon=1e-15,
                 virtual_batch_size=128, momentum=0.02):
        """
        Defines main part of the TabNet network without the embedding layers.

        Parameters
        ----------
        - input_dim : int
            Number of features
        - output_dim : int
            Dimension of network output
            examples : one for regression, 2 for binary classification etc...
        - n_d : int
            Dimension of the prediction  layer (usually between 4 and 64)
        - n_a : int
            Dimension of the attention  layer (usually between 4 and 64)
        - n_steps: int
            Number of sucessive steps in the newtork (usually betwenn 3 and 10)
        - gamma : float
            Float above 1, scaling factor for attention updates (usually betwenn 1.0 to 2.0)
        - momentum : float
            Float value between 0 and 1 which will be used for momentum in all batch norm
        - n_independent : int
            Number of independent GLU layer in each GLU block (default 2)
        - n_shared : int
            Number of independent GLU layer in each GLU block (default 2)
        - epsilon: float
            Avoid log(0), this should be kept very low
        """
        super(TabNetNoEmbeddings, self).__init__()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.n_d = n_d
        self.n_a = n_a
        self.n_steps = n_steps
        self.gamma = gamma
        self.epsilon = epsilon
        self.n_independent = n_independent
        self.n_shared = n_shared
        self.virtual_batch_size = virtual_batch_size

        if self.n_shared > 0:
            shared_feat_transform = torch.nn.ModuleList()
            for i in range(self.n_shared):
                if i == 0:
                    shared_feat_transform.append(ConvSP1d(self.input_dim,
                                                        2*(n_d + n_a),1,
                                                        bias=False))
                else:
                    shared_feat_transform.append(ConvSP1d(n_d + n_a, 2*(n_d + n_a),1, bias=False))

        else:
            shared_feat_transform = None

        self.initial_splitter = FeatTransformer(self.input_dim, n_d+n_a, shared_feat_transform,
                                                n_glu_independent=self.n_independent,
                                                virtual_batch_size=self.virtual_batch_size,
                                                momentum=momentum)

        self.feat_transformers = torch.nn.ModuleList()
        self.att_transformers = torch.nn.ModuleList()

        for step in range(n_steps):
            transformer = FeatTransformer(self.input_dim, n_d+n_a, shared_feat_transform,
                                          n_glu_independent=self.n_independent,
                                          virtual_batch_size=self.virtual_batch_size,
                                          momentum=momentum)
            attention = AttentiveTransformer(n_a, self.input_dim,
                                             virtual_batch_size=self.virtual_batch_size,
                                             momentum=momentum)
            self.feat_transformers.append(transformer)
            self.att_transformers.append(attention)
        self.gap = nn.AdaptiveAvgPool1d(1)
        self.flatten = Flatten()
        self.final_mapping = Linear(2 * (n_d ** n_a), output_dim, bias=False)
        initialize_non_glu(self.final_mapping, 2 * (n_d ** n_a), output_dim)

    def forward(self, x):
        res = 0

        prior = torch.ones(x.shape).to(x.device)
        M_explain = torch.zeros(x.shape).to(x.device)
        M_loss = 0
        att = self.initial_splitter(x)[:, self.n_d:]
        masks = {}
        for step in range(self.n_steps):
            M = self.att_transformers[step](prior, att)
            masks[step] = M
            M_loss += torch.mean(torch.sum(torch.mul(M, torch.log(M+self.epsilon)),
                                           dim=1)) / (self.n_steps)
            # update prior
            prior = torch.mul(self.gamma - M, prior)
            # output
            masked_x = torch.mul(M, x)
            out = self.feat_transformers[step](masked_x)
            d = ReLU()(out[:, :self.n_d])
            res = torch.add(res, d)
            # explain
            step_importance = torch.sum(d, dim=1)
            M_explain += torch.mul(M, step_importance.unsqueeze(dim=1))
            # update attention
            att = out[:, self.n_d:]
        
        print(res.shape)
        res = self.gap(res)
        print(res.shape)
        res = self.flatten(res)
        print(res.shape)
        res = self.final_mapping(res)
        return res

class AttentiveTransformer(torch.nn.Module):
    def __init__(self, input_dim, output_dim, virtual_batch_size=128, momentum=0.02):
        """
        Initialize an attention transformer.

        Parameters
        ----------
        - input_dim : int
            Input size
        - output_dim : int
            Outpu_size
        - momentum : float
            Float value between 0 and 1 which will be used for momentum in batch norm
        """
        super(AttentiveTransformer, self).__init__()
        self.fc = ConvSP1d(input_dim, output_dim,1, bias=False)
        initialize_non_glu(self.fc, input_dim, output_dim)
        self.bn = GBN(output_dim, virtual_batch_size=virtual_batch_size,
                      momentum=momentum)

        # Sparsemax
        self.sp_max = sparsemax.Sparsemax(dim=-1)
        # Entmax
        # self.sp_max = sparsemax.Entmax15(dim=-1)

    def forward(self, priors, processed_feat):
        x = self.fc(processed_feat)
        x = self.bn(x)
        x = torch.mul(x, priors)
        x = self.sp_max(x)
        return x


class FeatTransformer(torch.nn.Module):
    def __init__(self, input_dim, output_dim, shared_layers, n_glu_independent,
                 virtual_batch_size=128, momentum=0.02):
        super(FeatTransformer, self).__init__()
        """
        Initialize a feature transformer.

        Parameters
        ----------
        - input_dim : int
            Input size
        - output_dim : int
            Outpu_size
        - n_glu_independant
        - shared_blocks : torch.nn.ModuleList
            The shared block that should be common to every step
        - momentum : float
            Float value between 0 and 1 which will be used for momentum in batch norm
        """

        params = {
            'n_glu': n_glu_independent,
            'virtual_batch_size': virtual_batch_size,
            'momentum': momentum
        }

        if shared_layers is None:
            # no shared layers
            self.shared = torch.nn.Identity()
            is_first = True
        else:
            self.shared = GLU_Block(input_dim, output_dim,
                                    first=True,
                                    shared_layers=shared_layers,
                                    n_glu=len(shared_layers),
                                    virtual_batch_size=virtual_batch_size,
                                    momentum=momentum)
            is_first = False

        if n_glu_independent == 0:
            # no independent layers
            self.specifics = torch.nn.Identity()
        else:
            spec_input_dim = input_dim if is_first else output_dim
            self.specifics = GLU_Block(spec_input_dim, output_dim,
                                       first=is_first,
                                       **params)

    def forward(self, x):
        print("Feat Transform Input", x.shape)
        print(self.shared)
        x = self.shared(x)
        print("Feat Transform Shared",x.shape)
        x = self.specifics(x)
        return x


class GLU_Block(torch.nn.Module):
    """
        Independant GLU block, specific to each step
    """

    def __init__(self, input_dim, output_dim, n_glu=2, first=False, shared_layers=None,
                 virtual_batch_size=128, momentum=0.02):
        super(GLU_Block, self).__init__()
        self.first = first
        self.shared_layers = shared_layers
        self.n_glu = n_glu
        self.glu_layers = torch.nn.ModuleList()

        params = {
            'virtual_batch_size': virtual_batch_size,
            'momentum': momentum
        }

        fc = shared_layers[0] if shared_layers else None
        self.glu_layers.append(GLU_Layer(input_dim, output_dim,
                                         fc=fc,
                                         **params))
        for glu_id in range(1, self.n_glu):
            fc = shared_layers[glu_id] if shared_layers else None
            self.glu_layers.append(GLU_Layer(output_dim, output_dim,
                                             fc=fc,
                                             **params))

    def forward(self, x):
        scale = torch.sqrt(torch.FloatTensor([0.5]).to(x.device))
        if self.first:  # the first layer of the block has no scale multiplication
            x = self.glu_layers[0](x)
            layers_left = range(1, self.n_glu)
        else:
            layers_left = range(self.n_glu)

        for glu_id in layers_left:
            x = torch.add(x, self.glu_layers[glu_id](x))
            x = x*scale
        return x


class GLU_Layer(torch.nn.Module):
    def __init__(self, input_dim, output_dim, fc=None,
                 virtual_batch_size=128, momentum=0.02):
        super(GLU_Layer, self).__init__()

        self.output_dim = output_dim
        if fc:
            self.fc = fc
        else:
            self.fc = ConvSP1d(input_dim, 2*output_dim,1, bias=False)
        initialize_glu(self.fc, input_dim, 2*output_dim)

        self.bn = GBN(2*output_dim, virtual_batch_size=virtual_batch_size,
                      momentum=momentum)

    def forward(self, x):
        x = self.fc(x)
        x = self.bn(x)
        out = torch.mul(x[:, :self.output_dim], torch.sigmoid(x[:, self.output_dim:]))
        return out

In [4]:
from timeseries import *
from models import *
import pickle 
import numpy as np
from sklearn.model_selection import train_test_split
from fastai.distributed import *

In [5]:
defaults.device = torch.device('cpu')
fastai.torch_core.defaults.device = torch.device('cpu')
    
scale_type = 'normalize'
scale_by_channel = False
scale_by_sample  = True 
scale_range = (-1, 1)
bs=128
data = (ItemLists(Path("data"), TSList(x_train),TSList(x_val))
        .label_from_lists(y_train, y_val)
        .databunch(bs=bs, val_bs=bs * 2, device=torch.device("cpu"))
        .scale(scale_type=scale_type, scale_by_channel=scale_by_channel, 
             scale_by_sample=scale_by_sample,scale_range=scale_range)
     )
data

TSDataBunch;

Train: LabelList (512 items)
x: TSList
TimeSeries(ch=12, seq_len=4096),TimeSeries(ch=12, seq_len=4096),TimeSeries(ch=12, seq_len=4096),TimeSeries(ch=12, seq_len=4096),TimeSeries(ch=12, seq_len=4096)
y: CategoryList
1,2,2,2,1
Path: .;

Valid: LabelList (128 items)
x: TSList
TimeSeries(ch=12, seq_len=4096),TimeSeries(ch=12, seq_len=4096),TimeSeries(ch=12, seq_len=4096),TimeSeries(ch=12, seq_len=4096),TimeSeries(ch=12, seq_len=4096)
y: CategoryList
1,2,2,2,2
Path: .;

Test: None

In [6]:
defaults.device

device(type='cpu')

In [14]:
model = TabNetNoEmbeddings(data.features, data.c, n_d=4, n_a=4).to(defaults.device)
kappa = KappaScore()
learn = Learner(data, model, metrics=[accuracy, kappa])

In [15]:
model

TabNetNoEmbeddings(
  (initial_splitter): FeatTransformer(
    (shared): GLU_Block(
      (shared_layers): ModuleList(
        (0): ConvSP1d(
          (conv): Conv1d(12, 16, kernel_size=(1,), stride=(1,), bias=False)
        )
        (1): ConvSP1d(
          (conv): Conv1d(8, 16, kernel_size=(1,), stride=(1,), bias=False)
        )
      )
      (glu_layers): ModuleList(
        (0): GLU_Layer(
          (fc): ConvSP1d(
            (conv): Conv1d(12, 16, kernel_size=(1,), stride=(1,), bias=False)
          )
          (bn): GBN(
            (bn): BatchNorm1d(16, eps=1e-05, momentum=0.02, affine=True, track_running_stats=True)
          )
        )
        (1): GLU_Layer(
          (fc): ConvSP1d(
            (conv): Conv1d(8, 16, kernel_size=(1,), stride=(1,), bias=False)
          )
          (bn): GBN(
            (bn): BatchNorm1d(16, eps=1e-05, momentum=0.02, affine=True, track_running_stats=True)
          )
        )
      )
    )
    (specifics): GLU_Block(
      (glu_layers):

In [16]:
learn.fit_one_cycle(100)

epoch,train_loss,valid_loss,accuracy,kappa_score,time


Feat Transform Input torch.Size([128, 12, 4096])
GLU_Block(
  (shared_layers): ModuleList(
    (0): ConvSP1d(
      (conv): Conv1d(12, 16, kernel_size=(1,), stride=(1,), bias=False)
    )
    (1): ConvSP1d(
      (conv): Conv1d(8, 16, kernel_size=(1,), stride=(1,), bias=False)
    )
  )
  (glu_layers): ModuleList(
    (0): GLU_Layer(
      (fc): ConvSP1d(
        (conv): Conv1d(12, 16, kernel_size=(1,), stride=(1,), bias=False)
      )
      (bn): GBN(
        (bn): BatchNorm1d(16, eps=1e-05, momentum=0.02, affine=True, track_running_stats=True)
      )
    )
    (1): GLU_Layer(
      (fc): ConvSP1d(
        (conv): Conv1d(8, 16, kernel_size=(1,), stride=(1,), bias=False)
      )
      (bn): GBN(
        (bn): BatchNorm1d(16, eps=1e-05, momentum=0.02, affine=True, track_running_stats=True)
      )
    )
  )
)
Feat Transform Shared torch.Size([128, 8, 4096])
Feat Transform Input torch.Size([128, 12, 4096])
GLU_Block(
  (shared_layers): ModuleList(
    (0): ConvSP1d(
      (conv): Conv1d

RuntimeError: size mismatch, m1: [128 x 4], m2: [512 x 2] at /opt/conda/conda-bld/pytorch_1573049306803/work/aten/src/TH/generic/THTensorMath.cpp:197

In [17]:
128*4

512