## Install Deps
- transformers
- einops
- pytorch-lightning
- neptune
- gpytorch



In [None]:
# Install the third-party packages this notebook imports.
# NOTE(review): versions are unpinned; the later `pytorch_lightning.metrics` imports
# presumably need an older pytorch-lightning release - confirm and pin.
!pip install transformers pytorch-lightning neptune-client neptune-contrib gpytorch einops

In [None]:
!pip uninstall torchtext



In [None]:
# Mount Google Drive; the dataset paths used by get_data live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Configuration

In [None]:
# --- Run-level settings -----------------------------------------------------
PRETRAINED_MODEL = 'bert-base-uncased'  # Hugging Face checkpoint name
SAMPLE = 10000
CONTENT_FRAC = 0.20
BATCH_SIZE = 512
WARMUP = 2000
MAX_CYCLES = 10
MAX_EPOCHS = 75
LEARNING_RATE = 0.005

# --- Experiment switches ----------------------------------------------------
SIGMOID_THRESHOLD = 0.5
IS_BERT = False
IS_MASK = True
USE_CASE = 'MachineLog'  # 'Covid19' | 'MachineLog' | 'MachineLog_no_numerical'
EXP_TYPE = 'COL'  # 'KEY' | 'KEY+COL' | 'COL'

# --- Keyword arguments forwarded to TransformerDIClassifier ------------------
CORE_TRANSFORMER_PARAMS = {
    'num_layers': 12,
    'embedding_size': 128,
    'layer_norm_epsilon': 0.00001,
    'scale': 0.01,
    'resid_pdrop': 0.1,
    'attn_pdrop': 0.1,
    'num_attention_heads': 8,
    'embd_pdrop': 0.1,
    'num_actions': 3,
    'common_conv_dim': 128,
}

# Import Data

In [None]:
def get_data(data_type=USE_CASE,exp_type=EXP_TYPE):
  """Load the train and test records for one use case / experiment type.

  Both files are read as JSON Lines: every non-blank line is decoded as one
  JSON document.

  Args:
    data_type: 'MachineLog', 'MachineLog_no_numerical' or 'Covid19'.
    exp_type: 'KEY' or 'COL'. No files are registered for any other value
      (including 'KEY+COL').

  Returns:
    Tuple ``(train_records, test_records)`` of lists of decoded JSON objects.

  Raises:
    ValueError: if no dataset files are registered for the requested
      ``data_type`` / ``exp_type`` combination.
  """
  import json
  import os

  # data_type -> (base directory, {exp_type: (train file, test file)})
  # NOTE(review): only MachineLog_no_numerical uses "key_label" (not "key_index")
  # in its KEY file names - confirm this matches the files on Drive.
  registry = {
    'MachineLog': (
      '/content/drive/MyDrive/Research/dataset/key_index_based_dataset/cell_based_dataset/machine_log_data',
      {
        'KEY': ('training_machine_log_data_with_key_index_prediction.json',
                'testing_machine_log_data_with_key_index_prediction.json'),
        'COL': ('training_machine_log_data_with_column_label_prediction.json',
                'testing_machine_log_data_with_column_label_prediction.json'),
      },
    ),
    'MachineLog_no_numerical': (
      '/content/drive/MyDrive/Research/dataset/key_index_based_dataset/cell_based_dataset/machine_log_data/no_numerical_value',
      {
        'KEY': ('training_machine_log_data_with_key_label_prediction.json',
                'testing_machine_log_data_with_key_label_prediction.json'),
        'COL': ('training_machine_log_data_with_column_label_prediction.json',
                'testing_machine_log_data_with_column_label_prediction.json'),
      },
    ),
    'Covid19': (
      '/content/drive/MyDrive/Research/dataset/key_index_based_dataset/cell_based_dataset/covid-19_data',
      {
        'KEY': ('training_covid-19_data_with_key_index_prediction.json',
                'testing_covid-19_data_with_key_index_prediction.json'),
        'COL': ('training_covid-19_data_with_column_label_prediction.json',
                'testing_covid-19_data_with_column_label_prediction.json'),
      },
    ),
  }

  base_pth, files_by_exp = registry.get(data_type, (None, {}))
  if exp_type not in files_by_exp:
    # ValueError is still an Exception, so existing `except Exception` callers keep working.
    raise ValueError(
      f"Cannot Load dataset: no files registered for data_type={data_type!r}, exp_type={exp_type!r}"
    )
  train_pth, test_pth = files_by_exp[exp_type]

  def _read_json_lines(path):
    # Skip blank lines (e.g. a trailing newline) instead of crashing in json.loads.
    with open(path,'r',encoding='utf-8') as f:
      return [json.loads(line) for line in f if line.strip()]

  train_data_json = _read_json_lines(os.path.join(base_pth,train_pth))
  test_data_json = _read_json_lines(os.path.join(base_pth,test_pth))
  return train_data_json,test_data_json

In [None]:
# Load the train/test records for the configured use case and experiment type.
train_data_json,test_data_json = get_data(data_type=USE_CASE,exp_type=EXP_TYPE)

In [None]:


# Number of records in the test split.
NUM_TEST = len(test_data_json)
NUM_TEST

17000

In [None]:
# Flatten the `label_index` lists of all training records into one list;
# the number of distinct values becomes the size of the label space.
dats = [label for record in train_data_json for label in record['label_index']]
NUM_LABELS = len(set(dats))

In [None]:
# Occurrence count per label; keys are inserted in order of first appearance in `dats`.
fd = {}
for label in dats:
  fd[label] = fd.get(label, 0) + 1


In [None]:
# Display the per-label occurrence counts.
fd

In [None]:
# Display the number of distinct labels found in the training data.
NUM_LABELS

21

In [None]:
# Inverse-frequency label weights, normalised so they sum to 1.
# NOTE(review): the weights follow the insertion order of `fd` (first appearance
# in the training data), not ascending label id - confirm that any consumer
# indexing WEIGHTS by class id expects this order.
total = sum(fd.values())
wt_init = [1/(fd[f]/total) for f in fd]
wt_sum = sum(wt_init)  # hoisted: previously recomputed for every element
WEIGHTS = [w/wt_sum for w in wt_init]



# Data Loader

In [None]:
from torch.utils.data import (
    random_split,
    DataLoader,
    RandomSampler,
    Subset,
    TensorDataset
)
import torch
import torch.nn.functional as F
from transformers import (
    BertTokenizer
)
from pytorch_lightning import Trainer, seed_everything
seed_everything(42)


class DocumentDataPreprocessor():
    """Tokenises text with a BERT-style tokenizer and splits datasets.

    Attributes:
        tokenizer: the tokenizer passed to the constructor; must provide
            ``encode_plus``.
    """
    CLASS_TOKEN = '[CLS]'
    SEP_TOKEN = '[EOS]'
    SPECIAL_TOKENS = []

    def __init__(self, tokenizer:BertTokenizer, column_split_order=None):
        """
        Args:
            tokenizer: tokenizer used by ``get_tokenized_text``.
            column_split_order: accepted for backward compatibility; not used.
                Defaults to ``None`` rather than a shared mutable list.
        """
        self.tokenizer = tokenizer

    def get_tokenized_text(self,content_text,max_length=1024,pad_to_max_length=True):
        """Encode one text into input ids and an attention mask.

        Args:
            content_text: the text to encode.
            max_length: sequences are truncated to this many tokens.
            pad_to_max_length: when True (default) the output is padded to
                ``max_length``; when False no padding is added.

        Returns:
            ``(input_ids, attention_mask)`` exactly as returned by the
            tokenizer with ``return_tensors='pt'``.
        """
        encoded_dict = self.tokenizer.encode_plus(
                    content_text,                      # Sentence to encode.
                    add_special_tokens = True,         # Add '[CLS]' and '[SEP]'
                    max_length = max_length,           # Truncation (and padding) length.
                    # Honour the flag; it used to be ignored and padding was always applied.
                    padding = 'max_length' if pad_to_max_length else False,
                    truncation=True,
                    return_attention_mask=True,
                    return_tensors = 'pt',             # Return pytorch tensors.
            )
        # The attention mask simply differentiates padding from non-padding.
        return encoded_dict['input_ids'],encoded_dict['attention_mask']

    @staticmethod
    def split_dataset(dataset,train_percent=0.9):
        """Randomly split ``dataset`` into a training and a validation part.

        Args:
            dataset: any object accepted by ``torch.utils.data.random_split``.
            train_percent: fraction (0..1) of samples that go to the training part.

        Returns:
            ``(train_dataset, val_dataset)``.

        Raises:
            ValueError: if ``train_percent`` is outside ``[0, 1]``.
        """
        if train_percent > 1:
            raise ValueError('Training Percentage cannot be > 1')
        if train_percent < 0:
            raise ValueError('Training Percentage cannot be < 0')
        train_size = int(train_percent * len(dataset))
        val_size = len(dataset) - train_size

        # Divide the dataset by randomly selecting samples.
        train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
        return train_dataset,val_dataset

In [None]:
def create_training_set(data_json,tokenizer,exp_type=EXP_TYPE):
  """Tokenise records and bundle them with their labels in a TensorDataset.

  Each record's ``'value'`` is serialised with ``json.dumps`` and tokenised to
  70 tokens.

  Args:
    data_json: iterable of records with ``'value'`` and ``'label_index'`` keys.
    tokenizer: tokenizer handed to ``DocumentDataPreprocessor``.
    exp_type: 'KEY' keeps the first entry of ``label_index`` as a scalar label;
      'COL' builds one vector of length ``NUM_LABELS`` per record.

  Returns:
    ``TensorDataset(input_ids, attention_mask, labels)``.

  Raises:
    ValueError: if ``exp_type`` is neither 'KEY' nor 'COL'.
  """
  import json
  # Fail early: any other value used to leave `labels` as an empty list and
  # crash obscurely inside TensorDataset.
  if exp_type not in ('KEY', 'COL'):
    raise ValueError(f"Unsupported exp_type {exp_type!r}; expected 'KEY' or 'COL'")

  proc = DocumentDataPreprocessor(tokenizer)
  input_ids,att_mask,labels = [],[],[]
  for d in data_json:
    json_value = json.dumps(d['value'])
    i,m = proc.get_tokenized_text(json_value,max_length=70)
    input_ids.append(i)
    att_mask.append(m)
    if exp_type == 'KEY':
      labels.append(d['label_index'][0])
    else:
      # Label ids are shifted by -1 before one-hot encoding, then the one-hot rows are summed.
      # NOTE(review): a label id repeated inside one record yields a count > 1 - confirm intended.
      hot_tensor= F.one_hot(torch.LongTensor([r-1 for r in d['label_index']]),num_classes=NUM_LABELS).sum(dim=0)
      labels.append(hot_tensor)

  input_ids = torch.cat(input_ids,dim=0)
  att_mask = torch.cat(att_mask,dim=0)
  if exp_type == 'KEY':
    labels = torch.Tensor(labels)
  else:
    labels = torch.stack(labels,dim=0)
  return TensorDataset(input_ids,att_mask,labels)

In [None]:
# Load the tokenizer from the same checkpoint name the model code uses
# (PRETRAINED_MODEL), so token ids and the embedding table stay in sync.
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL, do_lower_case=True)#prajjwal1/bert-tiny


In [None]:
# Tokenise the training records, then hold out part of them for validation
# (split_dataset defaults to train_percent=0.9).
train_dataset = create_training_set(train_data_json,tokenizer)
train_dataset,val_dataset = DocumentDataPreprocessor.split_dataset(train_dataset)

In [None]:
# Tokenise the test records with the same tokenizer and default exp_type.
test_dataset =  create_training_set(test_data_json,tokenizer)

In [None]:
# Batch the training data (reshuffled) and the test data (fixed order).
train_loader = DataLoader(train_dataset,batch_size=BATCH_SIZE,shuffle=True)
test_loader = DataLoader(test_dataset,batch_size=BATCH_SIZE)

In [None]:
# NOTE(review): the validation loader is shuffled; validation usually does not need it - confirm intended.
val_loader = DataLoader(val_dataset,batch_size=BATCH_SIZE,shuffle=True)

# Model Description

In [None]:
from transformers import BertModel,AdamW,get_cosine_with_hard_restarts_schedule_with_warmup,AutoModel

In [None]:
import torch.nn as nn
import einops
import torch.nn.functional as F

In [None]:
class PredictionHeadTransform(nn.Module):
    """Linear -> GELU -> LayerNorm transform that keeps the feature width.

    Args:
        hidden_size: width of the last dimension of the input and output.
        layer_norm_eps: epsilon passed to the LayerNorm.
    """
    def __init__(self, hidden_size=768,layer_norm_eps=0.00001):
        super().__init__()
        self.dense = nn.Linear(hidden_size, hidden_size)
        self.transform_act_fn = torch.nn.GELU()
        self.LayerNorm = nn.LayerNorm(hidden_size, eps=layer_norm_eps)

    def forward(self, hidden_states):
        """Apply the three stages in order and return the normalised result."""
        activated = self.transform_act_fn(self.dense(hidden_states))
        return self.LayerNorm(activated)


class PredictionHead(nn.Module):
    """Single linear layer mapping a hidden vector to ``num_preds`` scores.

    Args:
        hidden_size: width of the incoming hidden vector.
        layer_norm_eps: accepted for signature compatibility; not used here.
        num_preds: number of output scores (defaults to ``NUM_LABELS``).
    """
    def __init__(self, hidden_size=768,layer_norm_eps=0.00001,num_preds=NUM_LABELS):
        super().__init__()
        self.decoder = nn.Linear(hidden_size, num_preds)

    def forward(self, hidden_states):
        """Return the raw (un-normalised) scores."""
        return self.decoder(hidden_states)

class BertDataIntegrationClassifier(torch.nn.Module):
  """Pretrained 'bert-base-uncased' encoder followed by a linear prediction head."""
  def __init__(self):
    super().__init__()
    self.model = BertModel.from_pretrained('bert-base-uncased')
    bert_cfg = self.model.config
    self.pred_head = PredictionHead(hidden_size=bert_cfg.hidden_size,layer_norm_eps=bert_cfg.layer_norm_eps)
    # Kept as an attribute; forward() returns raw scores and does not apply it.
    self.sfmx = nn.Softmax(dim=1)

  def forward(self,input,att_mask=None):
    """Run BERT and score its pooled output with the prediction head.

    Args:
      input: token ids passed to the BERT model.
      att_mask: optional attention mask forwarded to BERT.

    Returns:
      Raw scores from the prediction head (no softmax applied).
    """
    pooled = self.model(input,attention_mask=att_mask).pooler_output
    return self.pred_head(pooled)


In [None]:
# model = BertDataIntegrationClassifier()
import pytorch_lightning as pl
import pytorch_lightning.metrics.functional.accuracy as get_accuracy
from pytorch_lightning.metrics.functional import f1 as f1_score

### Vanilla Self Attention
- Plain self-attention only: no masking, no cross-attention, etc.  
- Attention is defined by
  - $$ Q, K, V: \text{the query, key and value projections of the input sequence}$$
  - $$\text{attention} = \text{softmax}\left(Q K^T \cdot \text{scale}\right) V$$ 

In [None]:
class SimpleSelfAttention(nn.Module):
  '''
  Vanilla multi-head self attention on a sequence. No masking etc.

  Args:
    hidden_size: feature width of the input; must be divisible by num_heads.
    dropout: dropout applied after the output projection.
    num_heads: number of attention heads.
    scale: exponent; the scores are multiplied by ``hidden_size ** -scale``.
    mlp_dim: accepted for signature compatibility; not used.
  '''
  def __init__(self,hidden_size,dropout=0.1,num_heads=4,scale=0.2,mlp_dim=3072):
    super().__init__()
    self.kqv_layer = nn.Linear(hidden_size,3*hidden_size)
    self.num_heads = num_heads
    self.ff_layer = nn.Sequential(
          nn.Linear(hidden_size, hidden_size),
          nn.Dropout(dropout)
    )

    self.scale = hidden_size ** -scale

  def _split_heads(self,x):
    # (b, s, h*d) -> (b, h, s, d)
    b, s, _ = x.shape
    return x.reshape(b, s, self.num_heads, -1).transpose(1, 2)

  def forward(self,sequence_embedding):
    '''
    Args:
      sequence_embedding: tensor of shape (batch, seq_len, hidden_size).

    Returns:
      Tensor of the same shape as the input.
    '''
    k,q,v = map(self._split_heads,self.kqv_layer(sequence_embedding).chunk(3, dim = -1))
    # scores[b, h, s, n] = <k_s, q_n>: axis -2 indexes keys, axis -1 indexes queries.
    scaled_dot_product = torch.einsum('bhsd,bhnd->bhsn',k,q) * self.scale
    # Normalise over the key axis so that, for every query, the weights used to
    # mix the values sum to 1. (Normalising over dim=-1 would normalise over
    # queries while the values are summed over keys.)
    attn_weights = F.softmax(scaled_dot_product,dim=-2)
    value_weighted_sum = torch.einsum('bhsn,bhsd->bhnd',attn_weights,v)
    # (b, h, n, d) -> (b, n, h*d)
    b, h, n, d = value_weighted_sum.shape
    merged_heads = value_weighted_sum.transpose(1, 2).reshape(b, n, h * d)
    return self.ff_layer(merged_heads)

In [None]:
import math
class Conv1D(nn.Module):
    """
    Linear projection over the last axis with the weight stored as (nx, nf),
    i.e. transposed relative to ``nn.Linear`` (the layout Radford et al. used
    for OpenAI GPT / GPT-2).
    Args:
        nf (:obj:`int`): The number of output features.
        nx (:obj:`int`): The number of input features.
    """

    def __init__(self, nf, nx):
        super().__init__()
        self.nf = nf
        # Weight drawn from N(0, 0.02^2); bias starts at zero.
        self.weight = nn.Parameter(nn.init.normal_(torch.empty(nx, nf), std=0.02))
        self.bias = nn.Parameter(torch.zeros(nf))

    def forward(self, x):
        """Project the last axis of ``x`` from nx to nf features."""
        leading_dims = x.shape[:-1]
        flat = x.view(-1, x.shape[-1])          # collapse all leading axes
        projected = torch.addmm(self.bias, flat, self.weight)
        return projected.view(*leading_dims, self.nf)

In [None]:
class MLP(nn.Module):
    """Position-wise feed-forward block: expand, GELU, project back, dropout.

    Args:
        n_state: width of the inner (expanded) representation.
        embedding_size: width of the input and output.
        resid_pdrop: dropout probability applied to the output.
    """
    def __init__(self, n_state,embedding_size=256,resid_pdrop=0.1):
        super().__init__()
        self.c_fc = Conv1D(n_state, embedding_size)    # embedding_size -> n_state
        self.c_proj = Conv1D(embedding_size, n_state)  # n_state -> embedding_size
        self.act = nn.GELU()
        self.dropout = nn.Dropout(resid_pdrop)

    def forward(self, x):
        """Return the feed-forward transform of ``x`` (same shape as ``x``)."""
        expanded = self.act(self.c_fc(x))
        return self.dropout(self.c_proj(expanded))

In [None]:
##  https://github.com/lucidrains/vit-pytorch/blob/main/vit_pytorch/vit_pytorch.py
class Residual(nn.Module):
    """Wrap a callable ``fn`` so that calling the module returns ``fn(x) + x``."""
    def __init__(self, fn):
        super().__init__()
        self.fn = fn

    def forward(self, x, **kwargs):
        """Apply ``fn`` (forwarding any keyword arguments) and add the input back."""
        transformed = self.fn(x, **kwargs)
        return transformed + x

In [None]:
## Code Refactored From https://github.com/huggingface/transformers/blob/master/src/transformers/modeling_gpt2.py 

class Block(nn.Module):
    """One transformer layer: LayerNorm -> self-attention -> LayerNorm -> MLP.

    Both sub-layers are wrapped in ``Residual``, so each residual adds back the
    layer-normalised tensor that was fed into the sub-layer.

    Args:
        embedding_size: feature width of the hidden states.
        layer_norm_epsilon: epsilon for both LayerNorms.
        scale: exponent forwarded to ``SimpleSelfAttention``.
        resid_pdrop: dropout probability inside the MLP.
        attn_pdrop: dropout probability inside the attention.
        num_attention_heads: number of attention heads.
    """
    def __init__(self,
                 embedding_size=256,
                 layer_norm_epsilon=0.00001,
                 scale=0.2,
                 resid_pdrop=0.1,
                 attn_pdrop=0.1,
                 num_attention_heads = 8):
        super().__init__()
        inner_dim = 4 * embedding_size
        self.ln_1 = nn.LayerNorm(embedding_size, eps=layer_norm_epsilon)
        self.attn = Residual(
            SimpleSelfAttention(
                embedding_size,
                num_heads=num_attention_heads,
                scale=scale,
                dropout=attn_pdrop,
                mlp_dim=inner_dim,
            )
        )
        self.ln_2 = nn.LayerNorm(embedding_size, eps=layer_norm_epsilon)
        self.mlp = Residual(MLP(inner_dim,embedding_size=embedding_size,resid_pdrop=resid_pdrop))

    def forward(self, hidden_states):
        """Return the layer output; same shape as ``hidden_states``."""
        attended = self.attn(self.ln_1(hidden_states))
        return self.mlp(self.ln_2(attended))

### Cross-Modal-Attention

- Inspiration From https://arxiv.org/abs/1906.00295

$$ Y_\alpha = CM_{\beta\to\alpha}(X_\alpha,X_\beta)$$
$$ Y_\alpha = softmax(\frac{Q_\alpha \times K^T_\beta}{\sqrt{d_k}})V_\beta$$


In [None]:
class CrossModalAttention(nn.Module):
  '''
  Multi-head cross attention: queries come from seq_x, keys/values from seq_y,
  i.e. Y_x = softmax(Q_x K_y^T * scale) V_y.

  Args:
    hidden_size: feature width of both sequences; must be divisible by num_heads.
    dropout: dropout applied after the output projection.
    num_heads: number of attention heads.
    scale: exponent; the scores are multiplied by ``hidden_size ** -scale``.
  '''
  def __init__(self,hidden_size,dropout=0.1,num_heads=4,scale=0.2):
    super().__init__()
    self.kv_layer = nn.Linear(hidden_size,2*hidden_size)
    self.q_layer = nn.Linear(hidden_size,hidden_size)
    self.num_heads = num_heads
    self.ff_layer = nn.Sequential(
          nn.Linear(hidden_size, hidden_size),
          nn.Dropout(dropout)
    )
    self.scale = hidden_size**-scale

  def _split_heads(self,x):
    # (b, s, h*d) -> (b, h, s, d)
    b, s, _ = x.shape
    return x.reshape(b, s, self.num_heads, -1).transpose(1, 2)

  def forward(self,seq_x,seq_y,mask=True):
    '''
    r(seq_x|seq_y)
      seq_x: b t d  (batch, length of seq_x, hidden dims)
      seq_y: b s d  (batch, length of seq_y, hidden dims)
      *d must be the same for both sequences.
      mask: accepted for signature compatibility; not used by this class.

    Returns:
      Tensor of shape (b, t, d): one output per position of seq_x.
    '''
    k,v = map(self._split_heads,self.kv_layer(seq_y).chunk(2, dim = -1))
    q = self._split_heads(self.q_layer(seq_x))
    # scores[b, h, s, n] = <k_s, q_n>: axis -2 indexes seq_y (keys), axis -1 seq_x (queries).
    scaled_dot_product = torch.einsum('bhsd,bhnd->bhsn',k,q) * self.scale
    # Normalise over the key axis so each query's weights over seq_y sum to 1,
    # as in softmax(Q K^T) V. (dim=-1 would normalise over the queries.)
    attn_weights = F.softmax(scaled_dot_product,dim=-2)
    value_weighted_sum = torch.einsum('bhsn,bhsd->bhnd',attn_weights,v)
    # (b, h, n, d) -> (b, n, h*d)
    b, h, n, d = value_weighted_sum.shape
    merged_heads = value_weighted_sum.transpose(1, 2).reshape(b, n, h * d)
    return self.ff_layer(merged_heads)

In [None]:
class CrossModalAttentionWithMask(nn.Module):
  '''
  Multi-head cross attention between seq_x (query projection) and seq_y
  (key/value projection) with an optional additive attention mask.
  Scores are multiplied by ``hidden_size ** -scale``.
  '''
  def __init__(self,hidden_size,dropout=0.1,num_heads=4,scale=0.2):
    super().__init__()
    self.kv_layer = nn.Linear(hidden_size,2*hidden_size)
    self.q_layer = nn.Linear(hidden_size,hidden_size)
    self.num_heads = num_heads
    self.ff_layer = nn.Sequential(
          nn.Linear(hidden_size, hidden_size),
          nn.Dropout(dropout)
    )
    self.scale = hidden_size**-scale

  def forward(self,seq_x,seq_y,mask=None):
    '''
    r(seq_x|seq_y)
    r(action|text)
    r(text|action)
      action: b t d 
          - b: batchsize
          - t: trajectory length 
          - d : hidden dims
      text : b s d
          - b: batchsize
          - s : length of text
          - d: hidden dims
      *d will be same in text and traj
      mask: optional 2-D or 3-D mask (ones = keep, zeros = ignore); it is
            converted to an additive mask and added to the scores.
    Returns a tensor with one output row per position of seq_x.
    '''
    kv = self.kv_layer(seq_y).chunk(2, dim = -1)
    q = einops.rearrange(self.q_layer(seq_x),'b s (h d) -> b h s d',h=self.num_heads)
    k,v = map(lambda x:einops.rearrange(x,'b s (h d) -> b h s d',h=self.num_heads),kv)
    # scores[b, h, s, n] = <k_s, q_n>: axis -2 indexes seq_y, axis -1 indexes seq_x.
    scaled_dot_product = torch.einsum('bhsd,bhnd->bhsn',k,q) * self.scale
    if mask is not None:
      # A 2-D mask becomes (b, 1, 1, L) and therefore broadcasts along the last
      # (seq_x) axis of the scores.
      mask_vals = self.get_extended_attention_mask(mask,seq_x.size(),device=seq_x.device)
      scaled_dot_product+=mask_vals
    # NOTE(review): softmax normalises over the last (seq_x / query) axis while the
    # values below are summed over the seq_y axis, so the weights mixing the values
    # do not sum to 1 per query. Standard attention would normalise over the key
    # axis and mask key positions - confirm what callers pass as `mask` before changing.
    weighted_sum = F.softmax(scaled_dot_product,dim=-1)
    value_weighted_sum = torch.einsum('bhsn,bhsd->bhnd',weighted_sum,v)
    reweighted_sequence_embedding = einops.rearrange(value_weighted_sum,'b h s d -> b s (h d)',h=self.num_heads)
    return self.ff_layer(reweighted_sequence_embedding)

  @staticmethod
  def get_extended_attention_mask(attention_mask, input_shape, device=torch.device('cpu'),is_decoder=False):
      """ Adapted from Hugging Face transformers:
      https://github.com/huggingface/transformers/blob/443f67e887a030d8254eba126e5f2cdb8b70eb63/src/transformers/modeling_utils.py
      Makes broadcastable attention and causal masks so that future and masked tokens are ignored.
      Arguments:
          attention_mask (:obj:`torch.Tensor`):
              Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
          input_shape (:obj:`Tuple[int]`):
              The shape of the input to the model; unpacked as
              (batch_size, seq_length, d) when ``is_decoder`` is True.
          device: (:obj:`torch.device`):
              The device of the input to the model.
          is_decoder (:obj:`bool`):
              When True and the mask is 2-D, a causal mask is combined with it.
      Returns:
          :obj:`torch.Tensor` The additive mask: 0.0 where the input mask is 1
          and -10000.0 where it is 0.
      Raises:
          ValueError: if ``attention_mask`` is neither 2-D nor 3-D.
      """
      # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
      # ourselves in which case we just need to make it broadcastable to all heads.
      if attention_mask.dim() == 3:
          extended_attention_mask = attention_mask[:, None, :, :]
      elif attention_mask.dim() == 2:
          # Provided a padding mask of dimensions [batch_size, seq_length]
          # - if the model is a decoder, apply a causal mask in addition to the padding mask
          # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
          if is_decoder:
              batch_size, seq_length, d = input_shape
              seq_ids = torch.arange(seq_length, device=device)
              causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None]
              # in case past_key_values are used we need to add a prefix ones mask to the causal mask
              # causal and attention masks must have same type with pytorch version < 1.3
              causal_mask = causal_mask.to(attention_mask.dtype)

              if causal_mask.shape[1] < attention_mask.shape[1]:
                  prefix_seq_len = attention_mask.shape[1] - causal_mask.shape[1]
                  causal_mask = torch.cat(
                      [
                          torch.ones(
                              (batch_size, seq_length, prefix_seq_len), device=device, dtype=causal_mask.dtype
                          ),
                          causal_mask,
                      ],
                      axis=-1,
                  )

              extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
          else:
              extended_attention_mask = attention_mask[:, None, None, :]
      else:
          raise ValueError(
              "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format(
                  input_shape, attention_mask.shape
              )
          )

      # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
      # masked positions, this operation will create a tensor which is 0.0 for
      # positions we want to attend and -10000.0 for masked positions.
      # Since we are adding it to the raw scores before the softmax, this is
      # effectively the same as removing these entirely.
      # extended_attention_mask = extended_attention_mask.to(dtype=self.dtype)  # fp16 compatibility
      extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
      return extended_attention_mask


In [None]:
class MultiModalAttentionBlock(nn.Module):
    """One cross-attention layer: seq_x attends to seq_y, followed by an MLP.

    The same LayerNorm (``ln_1``) is applied to both sequences before attention.
    With the default ``scale=False`` the score multiplier is ``hidden_size ** 0 == 1``.

    Args:
        embedding_size: feature width of both sequences.
        layer_norm_epsilon: epsilon for both LayerNorms.
        scale: exponent forwarded to ``CrossModalAttentionWithMask``.
        resid_pdrop: dropout probability inside the MLP.
        attn_pdrop: dropout probability inside the attention.
        num_attention_heads: number of attention heads.
    """
    def __init__(self,
                 embedding_size=256,
                 layer_norm_epsilon=0.00001,
                 scale=False,
                 resid_pdrop=0.1,
                 attn_pdrop=0.1,
                 num_attention_heads = 8):
        super().__init__()
        inner_dim = 4 * embedding_size
        self.ln_1 = nn.LayerNorm(embedding_size, eps=layer_norm_epsilon)
        self.attn = CrossModalAttentionWithMask(
            embedding_size,
            num_heads=num_attention_heads,
            scale=scale,
            dropout=attn_pdrop,
        )
        self.ln_2 = nn.LayerNorm(embedding_size, eps=layer_norm_epsilon)
        self.mlp = Residual(MLP(inner_dim,embedding_size=embedding_size,resid_pdrop=resid_pdrop))

    def forward(self, seq_x, seq_y, mask=None):
        """Return the updated seq_x representation (same shape as ``seq_x``)."""
        normed_x = self.ln_1(seq_x)
        normed_y = self.ln_1(seq_y)
        # Residual connection around the attention uses the un-normalised seq_x.
        attended = seq_x + self.attn(normed_x, normed_y, mask=mask)
        return self.mlp(self.ln_2(attended))

In [None]:
#  https://github.com/yaohungt/Multimodal-Transformer/blob/master/modules/position_embedding.py
# Code adapted from the fairseq repo.
import math
def make_positions(tensor, padding_idx, left_pad):
    """Replace non-padding symbols with their position numbers.
    Position numbers begin at padding_idx+1.
    Padding symbols are ignored, but it is necessary to specify whether padding
    is added on the left side (left_pad=True) or right side (left_pad=False).

    Args:
        tensor: 2-D tensor; entries equal to ``padding_idx`` are treated as padding.
        padding_idx: value that marks padding.
        left_pad: truthy when padding is on the left.

    Returns:
        A long tensor shaped like ``tensor`` in which non-padding entries hold
        their position number and padding entries keep their original value.
    """
    max_pos = padding_idx + 1 + tensor.size(1)
    device = tensor.get_device()
    # A range buffer is cached per device as an attribute on this function.
    buf_name = f'range_buf_{device}'
    if not hasattr(make_positions, buf_name):
        setattr(make_positions, buf_name, tensor.new())
    # Keep the cached buffer's type in sync with the incoming tensor.
    setattr(make_positions, buf_name, getattr(make_positions, buf_name).type_as(tensor))
    # Refill the buffer with padding_idx+1 .. max_pos-1 when it is too small.
    if getattr(make_positions, buf_name).numel() < max_pos:
        torch.arange(padding_idx + 1, max_pos, out=getattr(make_positions, buf_name))
    mask = tensor.ne(padding_idx)
    positions = getattr(make_positions, buf_name)[:tensor.size(1)].expand_as(tensor)
    if left_pad:
        # Shift positions left by the number of padding symbols in each row.
        positions = positions - mask.size(1) + mask.long().sum(dim=1).unsqueeze(1)
    # Work on a copy so the caller's tensor is not modified.
    new_tensor = tensor.clone()
    return new_tensor.masked_scatter_(mask, positions[mask]).long()


class SinusoidalPositionalEmbedding(nn.Module):
    """This module produces sinusoidal positional embeddings of any length.
    Padding symbols are ignored, but it is necessary to specify whether padding
    is added on the left side (left_pad=True) or right side (left_pad=False).

    Args:
        embedding_dim: width of each positional embedding.
        padding_idx: value that marks padding; its embedding row is all zeros.
        left_pad: truthy when padding is on the left.
        init_size: accepted for signature compatibility; not used.
    """

    def __init__(self, embedding_dim, padding_idx=0, left_pad=0, init_size=128):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.padding_idx = padding_idx
        self.left_pad = left_pad
        self.weights = dict()   # device --> actual weight; due to nn.DataParallel :-(
        # Only the dtype/device of this buffer is used (see forward). Zero-initialised
        # so checkpoints do not contain uninitialised memory.
        self.register_buffer('_float_tensor', torch.zeros(1, dtype=torch.float))

    @staticmethod
    def get_embedding(num_embeddings, embedding_dim, padding_idx=None):
        """Build sinusoidal embeddings.
        This matches the implementation in tensor2tensor, but differs slightly
        from the description in Section 3.5 of "Attention Is All You Need".

        Returns:
            Float tensor of shape (num_embeddings, embedding_dim): sines in the
            first half of the columns, cosines in the second half, plus one
            zero column when ``embedding_dim`` is odd.
        """
        half_dim = embedding_dim // 2
        # With half_dim == 1 the only frequency is exp(0) == 1 whatever the
        # divisor, so clamp it to avoid a ZeroDivisionError for embedding_dim 2 or 3.
        emb = math.log(10000) / max(half_dim - 1, 1)
        emb = torch.exp(torch.arange(half_dim, dtype=torch.float) * -emb)
        emb = torch.arange(num_embeddings, dtype=torch.float).unsqueeze(1) * emb.unsqueeze(0)
        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1)
        if embedding_dim % 2 == 1:
            # zero pad
            emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
        if padding_idx is not None:
            emb[padding_idx, :] = 0
        return emb

    def forward(self, input):
        """Input is expected to be of size [bsz x seqlen].

        Returns:
            Detached tensor of shape [bsz x seqlen x embedding_dim].
        """
        bsz, seq_len = input.size()
        max_pos = self.padding_idx + 1 + seq_len
        device = input.get_device()
        if device not in self.weights or max_pos > self.weights[device].size(0):
            # recompute/expand embeddings if needed
            self.weights[device] = SinusoidalPositionalEmbedding.get_embedding(
                max_pos,
                self.embedding_dim,
                self.padding_idx,
            )
        # Follow the module's dtype/device via the registered buffer.
        self.weights[device] = self.weights[device].type_as(self._float_tensor)
        positions = make_positions(input, self.padding_idx, self.left_pad)
        return self.weights[device].index_select(0, positions.view(-1)).view(bsz, seq_len, -1).detach()

    def max_positions(self):
        """Maximum number of supported positions."""
        return int(1e5)  # an arbitrary large number

In [None]:
class MultiModalTransformer(nn.Module):
  """Stack of cross-attention layers in which seq_x repeatedly attends to seq_y.

  Both inputs are scaled by sqrt(embedding_size), given sinusoidal position
  embeddings and passed through dropout before entering the stack.

  Args:
    num_layers: number of ``MultiModalAttentionBlock`` layers.
    embed_dropout: dropout probability applied to the embedded inputs.
    embedding_size, layer_norm_epsilon, scale, resid_pdrop, attn_pdrop,
    num_attention_heads: forwarded to every ``MultiModalAttentionBlock``.
  """
  def __init__(self,
              num_layers = 4,
              embed_dropout =0.1 ,
              embedding_size=256,
              layer_norm_epsilon=0.00001,
              scale=0.2,
              resid_pdrop=0.1,
              attn_pdrop=0.1,
              num_attention_heads = 8):
    super().__init__()
    self.layers = nn.ModuleList([])
    self.embed_dropout=embed_dropout
    self.embed_scale = math.sqrt(embedding_size)
    self.embed_positions = SinusoidalPositionalEmbedding(embedding_size)

    block_kwargs = dict(
        embedding_size=embedding_size,
        layer_norm_epsilon=layer_norm_epsilon,
        scale=scale,
        resid_pdrop=resid_pdrop,
        attn_pdrop=attn_pdrop,
        num_attention_heads=num_attention_heads,
    )
    for _ in range(num_layers):
        self.layers.append(MultiModalAttentionBlock(**block_kwargs))

  def _embed(self, seq):
    # Scale, add position embeddings (positions are derived from feature 0 of
    # the scaled sequence), then apply dropout.
    seq = self.embed_scale * seq
    if self.embed_positions is not None:
        seq = seq + self.embed_positions(seq[:, :, 0])
    return F.dropout(seq, p=self.embed_dropout, training=self.training)

  def forward(self, seq_x,seq_y,mask=None):
    """Return the final seq_x representation after all layers.

    ``mask`` is forwarded unchanged to every layer.
    """
    hidden = self._embed(seq_x)
    context = self._embed(seq_y)
    for layer in self.layers:
        hidden = layer(hidden,context,mask=mask)
    return hidden

In [None]:
class VanillaTransformer(nn.Module):
  '''
  Stack of self-attention ``Block`` layers. Sinusoidal position embeddings are
  added inside this module, so callers pass plain token embeddings.

  Args:
    num_layers: number of ``Block`` layers.
    embed_dropout: dropout probability applied to the embedded input.
    embedding_size, layer_norm_epsilon, scale, resid_pdrop, attn_pdrop,
    num_attention_heads: forwarded to every ``Block``.
  '''
  def __init__(self,
              num_layers = 4,
              embed_dropout =0.1 ,
              embedding_size=256,
              layer_norm_epsilon=0.00001,
              scale=0.2,
              resid_pdrop=0.1,
              attn_pdrop=0.1,
              num_attention_heads = 8):
    super().__init__()
    self.layers = nn.ModuleList([])
    self.embed_dropout=embed_dropout
    self.embed_scale = math.sqrt(embedding_size)
    self.embed_positions = SinusoidalPositionalEmbedding(embedding_size)

    block_kwargs = dict(
        embedding_size=embedding_size,
        layer_norm_epsilon=layer_norm_epsilon,
        scale=scale,
        resid_pdrop=resid_pdrop,
        attn_pdrop=attn_pdrop,
        num_attention_heads=num_attention_heads,
    )
    for _ in range(num_layers):
        self.layers.append(Block(**block_kwargs))

  def forward(self, x, mask = None):
    '''
    Args:
      x: tensor of shape (b, len, d).
      mask: accepted but not used by this implementation.

    Returns:
      Tensor of the same shape as ``x``.
    '''
    scaled = self.embed_scale * x
    if self.embed_positions is not None:
        # Positions are derived from feature 0 of the scaled input.
        scaled = scaled + self.embed_positions(scaled[:, :, 0])
    hidden = F.dropout(scaled, p=self.embed_dropout, training=self.training)

    for layer in self.layers:
        hidden = layer(hidden)
    return hidden

## Ground Up Transformer with Bert Tokenized Embedding. 

## Transformer model based on pretrained BERT 

In [None]:
class BertEmbedTransformer(nn.Module):
  """Transformer encoder trained from scratch on frozen pretrained word embeddings.

  Token ids are looked up in a frozen copy of the word-embedding table of
  ``PRETRAINED_MODEL``, projected to ``common_conv_dim`` with a 1x1
  convolution, prefixed with a learned CLS vector and encoded by a
  ``VanillaTransformer``. The CLS output goes through a small residual MLP.

  Args:
    num_layers, embedding_size, layer_norm_epsilon, scale, resid_pdrop,
    attn_pdrop, num_attention_heads: forwarded to ``VanillaTransformer``.
    embd_pdrop: dropout probability inside the final residual MLP.
    num_actions: accepted for signature compatibility; not used.
    common_conv_dim: width the word embeddings are projected to; must equal
      ``embedding_size`` because the projected sequence is fed to the transformer.

  Raises:
    ValueError: if ``embedding_size != common_conv_dim``.
  """
  def __init__(self,
              num_layers=8,
              embedding_size=128,
              layer_norm_epsilon=0.00001,
              scale=0.01,
              resid_pdrop=0.1,
              attn_pdrop=0.1,
              num_attention_heads = 8,
              embd_pdrop=0.1,
              num_actions=3,
              common_conv_dim=64):
    super().__init__()
    # Fail fast: a mismatch can only crash later, inside the transformer's
    # position-embedding add / LayerNorm.
    if embedding_size != common_conv_dim:
      raise ValueError(
        f"embedding_size ({embedding_size}) must equal common_conv_dim ({common_conv_dim})"
      )
    data_input = dict(
        embedding_size=embedding_size,
        num_layers =num_layers,
        layer_norm_epsilon =layer_norm_epsilon,
        scale =scale,
        resid_pdrop =resid_pdrop,
        attn_pdrop =attn_pdrop,
        num_attention_heads =num_attention_heads,
    )
    self.transformer = VanillaTransformer(**data_input)
    # Only the word-embedding table of the pretrained model is kept.
    bert_model = AutoModel.from_pretrained(PRETRAINED_MODEL)
    bert_emb = bert_model.embeddings.word_embeddings
    text_embedding_dim = bert_emb.embedding_dim
    num_emb = bert_emb.num_embeddings
    self.text_embeddings = nn.Embedding(num_embeddings=num_emb,embedding_dim=text_embedding_dim)
    self.text_embeddings.load_state_dict(bert_emb.state_dict())
    self.text_embeddings.weight.requires_grad = False  # embeddings stay frozen
    self.text_conv = nn.Conv1d(text_embedding_dim,common_conv_dim,kernel_size=1,padding=0,bias=False)
    self.text_cls_token = nn.Parameter(torch.randn(1, 1, common_conv_dim))
    self.to_cls = nn.Identity()
    self.final_layer = nn.Sequential(
        nn.Linear(common_conv_dim, common_conv_dim),
        nn.ReLU(),
        nn.Dropout(embd_pdrop),
        nn.Linear(common_conv_dim, common_conv_dim)
    )

  def forward(self,inputs,att_mask=None):
    """Encode a batch of token ids.

    Args:
      inputs: token ids of shape (batch, seq_len).
      att_mask: optional mask of shape (batch, seq_len); it is extended by one
        column for the CLS position and handed to the transformer.

    Returns:
      Tensor of shape (batch, common_conv_dim): the CLS representation.
    """
    text_tensor = self.text_embeddings(inputs)
    # Conv1d expects (batch, channels, length); permute back afterwards.
    text_tensor = self.text_conv(text_tensor.transpose(1,2)).permute(0,2,1)

    # Prepend the CLS token and finally extract it instead of the last token.
    b = text_tensor.size(0)
    text_cls_token = einops.repeat(self.text_cls_token,'() n d -> b n d', b = b)
    text_tensor = torch.cat((text_cls_token,text_tensor),dim=1)

    # One extra "attend" entry for the prepended CLS token. Built with the
    # mask's own dtype/device so the concatenation does not mix float and
    # integer tensors.
    if att_mask is not None:
      cls_mask = torch.ones((b, 1), dtype=att_mask.dtype, device=att_mask.device)
      att_mask = torch.cat((cls_mask,att_mask),dim=1)

    text_j_tensor = self.transformer(text_tensor,mask=att_mask)
    l_txt = self.to_cls(text_j_tensor[:,0])

    # A residual block
    return self.final_layer(l_txt) + l_txt
    

class TransformerDIClassifier(torch.nn.Module):
  """``BertEmbedTransformer`` encoder followed by a linear ``PredictionHead``.

  All constructor arguments are forwarded to ``BertEmbedTransformer``;
  ``common_conv_dim`` also sets the input width of the prediction head.
  """
  def __init__(self,
              num_layers=8,
              embedding_size=64,
              layer_norm_epsilon=0.00001,
              scale=0.01,
              resid_pdrop=0.1,
              attn_pdrop=0.1,
              num_attention_heads = 8,
              embd_pdrop=0.1,
              num_actions=3,
              common_conv_dim=64):
    super().__init__()
    encoder_kwargs = dict(
        num_layers=num_layers,
        embedding_size=embedding_size,
        layer_norm_epsilon=layer_norm_epsilon,
        scale=scale,
        resid_pdrop=resid_pdrop,
        attn_pdrop=attn_pdrop,
        num_attention_heads=num_attention_heads,
        embd_pdrop=embd_pdrop,
        num_actions=num_actions,
        common_conv_dim=common_conv_dim,
    )
    self.model = BertEmbedTransformer(**encoder_kwargs)
    self.pred_head = PredictionHead(hidden_size=common_conv_dim,layer_norm_eps=layer_norm_epsilon)

  def forward(self,input,att_mask=None):
    """Return the raw prediction-head scores for a batch of token ids."""
    encoded = self.model(input,att_mask=att_mask)
    return self.pred_head(encoded)


## Lightning Module


## Module for single-label prediction (e.g. key index prediction)

In [None]:
# NOTE: these module-level lists are never populated in this cell. They are kept only so
# that any other cell referencing them keeps working; `test_step` writes tensors directly.
logit_list = []
label_list = []
class DataIntegrationClassifier(pl.LightningModule):
  '''
  Lightning module for single-label classification (e.g. key index prediction).

  Wraps either `BertDataIntegrationClassifier` or `TransformerDIClassifier` and trains it
  with cross-entropy. Metrics are pushed directly to `self.logger` on every step.

  Args:
    with_mask: when True, the attention mask of each batch is passed to the model.
    is_bert: when True, use `BertDataIntegrationClassifier` instead of the vanilla backbone.
  '''
  def __init__(self,with_mask=False,is_bert=IS_BERT):
    super().__init__()
    self.with_mask = with_mask
    if is_bert:
      print("Using BERT Model")
      self.model = BertDataIntegrationClassifier()
    else:
      print("Using Vanilla Backbone Model")
      self.model = TransformerDIClassifier(**CORE_TRANSFORMER_PARAMS)
    self.cross_entropy = nn.CrossEntropyLoss()
    # Softmax is used for reporting/saving probabilities only, never for the loss.
    self.sfmx = nn.Softmax(dim=1)

  def forward(self,inputs,att_mask=None):
    '''
    inputs : (input_ids)
        - input_ids: b k :
          - b: batchsize
          - k: sequence_length
    att_mask : b s : binary tensor, or None to run without a mask.

    Returns the wrapped model's raw (un-normalised) output.
    '''
    return self.model(inputs,att_mask=att_mask)

  def _shared_step(self,batch,stage):
    '''
    Common body of the train/val/test steps.

    Computes the loss on raw logits, derives accuracy and F1 from the softmax
    probabilities and logs `<stage>_f1`, `<stage>_accuracy`, `<stage>_loss` and `epoch`.

    Returns:
      (loss, accuracy, probabilities, zero-based labels)
    '''
    inps,mask,labels = batch
    # Shift labels down by one (presumably the dataset encodes classes starting at 1 - confirm).
    labels = (labels-1).long().to(self.device)
    raw_logits = self(inps,att_mask=mask if self.with_mask else None)
    # nn.CrossEntropyLoss applies log-softmax itself, so it must receive raw logits;
    # feeding it softmax output (as before) squashes the loss and its gradients.
    loss = self.cross_entropy(raw_logits,labels)
    probs = self.sfmx(raw_logits)
    # get_accuracy is defined elsewhere in the notebook; it still receives the softmax output.
    accuracy = get_accuracy(probs,labels)
    f1 = f1_score(
        torch.argmax(probs.detach(),dim=1),labels,num_classes=NUM_LABELS
    )
    self.logger.log_metrics({
        f'{stage}_f1' : f1.detach().cpu().numpy(),
        f'{stage}_accuracy':accuracy.detach().cpu().numpy(),
        f'{stage}_loss':loss.detach().cpu().numpy(),
        'epoch': self.current_epoch,
    })
    return loss,accuracy,probs,labels

  def training_step(self,batch,batch_nb):
    '''Run one optimisation step and return the loss for Lightning to back-propagate.'''
    loss,_,_,_ = self._shared_step(batch,'train')
    return {'loss':loss}

  def validation_step(self,batch,batch_nb):
    '''Evaluate one validation batch; the loss is returned under both `loss` and `val_loss`.'''
    loss,_,_,_ = self._shared_step(batch,'val')
    return {'loss':loss,'val_loss':loss}

  def test_step(self,batch,batch_nb):
    '''
    Evaluate one test batch and dump its probabilities and labels to disk.

    `logits.txt` receives the softmax probabilities (one row per sample) and `label.txt`
    the zero-based labels. Files are opened in write mode, so each batch overwrites the
    previous one, mirroring the column classifier's behaviour.
    '''
    import numpy as np  # local import so this cell does not depend on a later cell's import
    loss,accuracy,probs,labels = self._shared_step(batch,'test')
    np.savetxt('logits.txt',probs.detach().cpu().numpy())
    np.savetxt('label.txt',labels.detach().cpu().numpy(),fmt='%d')
    return {'loss':loss,'test_accuracy':accuracy}

  def configure_optimizers(self):
    '''
    AdamW with a cosine-with-hard-restarts schedule and linear warm-up.

    The schedule length is expressed in minibatch steps: MAX_EPOCHS * NUM_TRAIN_SAMPLES / BATCH_SIZE.
    '''
    optimizer =  AdamW(self.parameters(), lr=LEARNING_RATE, eps=1e-12, betas=(0.9,0.999))
    num_minibatch_steps = NUM_TRAIN_SAMPLES/BATCH_SIZE
    t_total = MAX_EPOCHS * num_minibatch_steps
    lr_scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(
        optimizer, WARMUP, t_total, num_cycles=MAX_CYCLES
    )
    # NOTE(review): a bare scheduler is stepped once per *epoch* by Lightning's default,
    # while WARMUP/t_total are counted in minibatch steps. If per-step scheduling was
    # intended, return {'scheduler': lr_scheduler, 'interval': 'step'} instead. Left
    # unchanged here because it would change the effective learning rate - confirm.
    return [optimizer] ,[lr_scheduler]



## Module for multi-label prediction (e.g. column prediction)

In [None]:
import numpy as np

class DataIntegrationColumnClassifier(pl.LightningModule):
  '''
  Lightning module for multi-label classification (e.g. column prediction).

  The wrapped model's raw outputs are trained with `BCEWithLogitsLoss`; a label is
  predicted as positive when its sigmoid exceeds `SIGMOID_THRESHOLD`.

  Args:
    with_mask: when True, the attention mask of each batch is passed to the model.
    is_bert: when True, use `BertDataIntegrationClassifier` instead of the vanilla backbone.
  '''
  def __init__(self,with_mask=False,is_bert=IS_BERT):
    super().__init__()
    self.with_mask = with_mask
    if is_bert:
      print("Using BERT Model")
      self.model = BertDataIntegrationClassifier()
    else:
      print("Using Vanilla Backbone Model")
      self.model = TransformerDIClassifier(**CORE_TRANSFORMER_PARAMS)

    self.sfmx = nn.Softmax(dim=1)  # not used by this class; kept so the attribute stays available
    self.sigmoid = nn.Sigmoid()
    self.loss = nn.BCEWithLogitsLoss()
    self.sigmoid_threshold = SIGMOID_THRESHOLD

  def forward(self,inputs,att_mask=None):
    '''
    inputs : (input_ids)
        - input_ids: b k :
          - b: batchsize
          - k: sequence_length
    att_mask : b s : binary tensor, or None to run without a mask.

    Returns the wrapped model's raw logits.
    '''
    return self.model(inputs,att_mask=att_mask)

  def get_multi_label_exact_match_accuracy(self,y_pred,y_true):
    '''
    Fraction of samples whose thresholded prediction matches `y_true` on every label.

    The comparison is numeric and vectorised. The previous implementation compared the
    `str()` of numpy rows, which never matches when the two dtypes differ
    (e.g. "[1 0]" vs "[1. 0.]") and depends on numpy's print truncation settings.

    Returns a 0-dim float tensor on the CPU.
    '''
    pred_vals = (self.sigmoid(y_pred) > self.sigmoid_threshold).long()
    matches = torch.eq(pred_vals,y_true.to(pred_vals.device))
    if matches.dim() > 1:
      # A sample counts as correct only if all of its labels agree.
      matches = matches.flatten(start_dim=1).all(dim=1)
    return matches.float().mean().cpu()

  def _shared_step(self,batch,stage):
    '''
    Common body of the train/val/test steps: forward pass, loss, exact-match accuracy
    and logging of `<stage>_accuracy`, `<stage>_loss` and `epoch`.

    Returns:
      (loss, accuracy, logits, labels)
    '''
    inps,mask,labels = batch
    logits = self(inps,att_mask=mask if self.with_mask else None)
    loss = self.loss(logits,labels.type_as(logits))
    accuracy = self.get_multi_label_exact_match_accuracy(logits,labels)
    self.logger.log_metrics({
        f'{stage}_accuracy':accuracy.detach().cpu().numpy(),
        f'{stage}_loss':loss.detach().cpu().numpy(),
        'epoch': self.current_epoch,
    })
    return loss,accuracy,logits,labels

  def training_step(self,batch,batch_nb):
    '''Run one optimisation step and return the loss for Lightning to back-propagate.'''
    loss,_,_,_ = self._shared_step(batch,'train')
    return {'loss':loss}

  def validation_step(self,batch,batch_nb):
    '''Evaluate one validation batch; the loss is returned under both `loss` and `val_loss`.'''
    loss,_,_,_ = self._shared_step(batch,'val')
    return {'loss':loss,'val_loss':loss}

  def test_step(self,batch,batch_nb):
    '''
    Evaluate one test batch and save its raw logits and labels with `np.savetxt`.

    `logits.txt` / `label.txt` are overwritten on every batch. The full-array print and
    the global `np.set_printoptions` call were dropped: the same data is in the files,
    and the accuracy no longer depends on numpy's print settings.
    '''
    loss,accuracy,logits,labels = self._shared_step(batch,'test')
    np.savetxt('logits.txt',logits.detach().cpu().numpy())
    np.savetxt('label.txt',labels.detach().cpu().numpy())
    return {'loss':loss,'test_accuracy':accuracy}

  def configure_optimizers(self):
    '''
    AdamW with a cosine-with-hard-restarts schedule and linear warm-up.

    The schedule length is expressed in minibatch steps: MAX_EPOCHS * NUM_TRAIN_SAMPLES / BATCH_SIZE.
    '''
    optimizer =  AdamW(self.parameters(), lr=LEARNING_RATE, eps=1e-12, betas=(0.9,0.999))
    num_minibatch_steps = NUM_TRAIN_SAMPLES/BATCH_SIZE
    t_total = MAX_EPOCHS * num_minibatch_steps
    lr_scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(
        optimizer, WARMUP, t_total, num_cycles=MAX_CYCLES
    )
    # NOTE(review): a bare scheduler is stepped once per *epoch* by Lightning's default,
    # while WARMUP/t_total are counted in minibatch steps. If per-step scheduling was
    # intended, return {'scheduler': lr_scheduler, 'interval': 'step'} instead. Left
    # unchanged here because it would change the effective learning rate - confirm.
    return [optimizer] ,[lr_scheduler]



## Multi-head transformer (for running several prediction tasks at once; currently abandoned)

In [None]:
class ClassifierHead(nn.Module):
  '''Single-output binary head: a linear projection followed by a sigmoid, scored with BCE.'''
  def __init__(self,hidden):
    super().__init__()
    self.class_value = nn.Linear(hidden,1)
    self.sigmoid = nn.Sigmoid()
    self.loss = nn.BCELoss()

  def forward(self,features):
    '''Project `features` to one value per sample and squash it with a sigmoid.'''
    projected = self.class_value(features)
    return self.sigmoid(projected)

  def get_label_and_loss(self,features,labels):
    '''
    Return `(probabilities, bce_loss)` for `features`.

    `labels` is cast to float before being handed to `nn.BCELoss`.
    '''
    probabilities = self(features)
    targets = labels.float()
    return probabilities, self.loss(probabilities,targets)

class SoftmaxClassifierHead(nn.Module):
  '''
  Two-class head: a linear layer producing two logits per sample.

  `forward` returns softmax probabilities; `get_label_and_loss` returns hard
  predictions together with the cross-entropy loss.
  '''
  def __init__(self,hidden):
    super().__init__()
    self.class_value = nn.Linear(hidden,2)
    self.sigmoid = nn.Sigmoid()  # not used by this class; kept so the attribute stays available
    self.sfmx = nn.Softmax(dim=1)
    self.loss = nn.CrossEntropyLoss(reduction='mean')

  def get_label_and_loss(self,features,labels):
    '''
    Args:
      features: input to the linear layer, one row per sample.
      labels: class indices with a trailing singleton dimension (it is squeezed on dim 1).

    Returns:
      (predicted class per sample with shape (b, 1), mean cross-entropy loss)
    '''
    raw_logits = self.class_value(features)
    # nn.CrossEntropyLoss applies log-softmax internally, so it must be given raw logits.
    # Passing softmax output (as before) flattens the loss and weakens the gradients.
    loss = self.loss(raw_logits,labels.squeeze(1).long())
    # argmax over logits equals argmax over their softmax, so predictions are unchanged.
    predicted = torch.argmax(raw_logits,dim=1).unsqueeze(1)
    return predicted, loss

  def forward(self,features):
    '''Return the softmax class probabilities for `features`.'''
    return self.sfmx(self.class_value(features))

class MultiHeadedBinaryClassifier(nn.Module):
  '''
  A bank of `num_labels` independent `SoftmaxClassifierHead`s sharing one feature vector.

  Heads are registered as attributes named `class_head_<idx>` (via `setattr`), which is
  also the prefix their parameters get in the state dict.
  '''
  def __init__(self,hidden,num_labels=NUM_LABELS):
    super().__init__()
    self.init_multi_head_label(hidden,num_labels)
    self.num_labels = num_labels

  @staticmethod
  def get_head_name(idx):
    '''Attribute name under which head `idx` is stored.'''
    return f'class_head_{idx}'

  def get_class_head(self,idx):
    '''Return the head module for label `idx`.'''
    head_name = self.get_head_name(idx)
    return getattr(self,head_name)

  def get_head_predictions(self,features,idx):
    '''Run head `idx` on `features` and return its output.'''
    head = self.get_class_head(idx)
    return head(features)

  def init_multi_head_label(self,hidden,num_labels):
    '''Create and register one `SoftmaxClassifierHead(hidden)` per label.'''
    for i in range(num_labels):
      setattr(self,self.get_head_name(i),SoftmaxClassifierHead(hidden))

  def label_head_preds(self,features):
    '''Concatenate every head's output along dim 1.'''
    pred_ops = [self.get_head_predictions(features,i) for i in range(self.num_labels)]
    return torch.cat(pred_ops,dim=1)

  def forward(self,features):
    return self.label_head_preds(features)

  def get_labels_and_losses(self,features,labels):
    '''
    Compute per-head predictions and a single training loss.

    `labels` is split into single-column slices along dim 1; slice `i` is scored by head `i`.

    Returns:
      (loss, preds): `loss` is the mean of the per-head losses, `preds` the per-head
      predictions concatenated along dim 1.

    The previous version built an aggregate but returned `loss`, i.e. only the *last*
    head's loss, so no other head received gradient. The mean (rather than the sum) is
    returned so the value stays on the same scale as a single head's loss.
    '''
    label_tups = torch.split(labels,1,dim=1)
    losses,pred_vals = [],[]
    for idx,lab in enumerate(label_tups):
      head = self.get_class_head(idx)
      head_preds,head_loss = head.get_label_and_loss(features,lab)
      losses.append(head_loss)
      pred_vals.append(head_preds)
    mean_loss = torch.stack(losses).mean()
    preds = torch.cat(pred_vals,dim=1)
    return mean_loss,preds


class DataIntegrationColumnMultiHeadClassifier(pl.LightningModule):
  '''
  Lightning module that pairs a `BertEmbedTransformer` backbone with a
  `MultiHeadedBinaryClassifier` (one head per label).

  Args:
    with_mask: when True, the attention mask of each batch is passed to the backbone.
  '''
  def __init__(self,with_mask=False):
    super().__init__()
    self.with_mask = with_mask

    print("Using Vanilla Backbone Model")
    self.model = BertEmbedTransformer(**CORE_TRANSFORMER_PARAMS)

    self.multihead_class = MultiHeadedBinaryClassifier(CORE_TRANSFORMER_PARAMS['common_conv_dim'])
    self.sfmx = nn.Softmax(dim=1)  # not used by this class; kept so the attribute stays available
    self.sigmoid_threshold = SIGMOID_THRESHOLD

  def forward(self,inputs,att_mask=None):
    '''
    inputs : (input_ids)
        - input_ids: b k :
          - b: batchsize
          - k: sequence_length
    att_mask : b s : binary tensor, or None to run without a mask.

    Returns the backbone features (the classification heads are applied in the step methods).
    '''
    return self.model(inputs,att_mask=att_mask)

  def get_multi_label_exact_match_accuracy(self,y_pred,y_true):
    '''
    Fraction of samples whose predicted labels all equal `y_true`.

    `y_pred` is cast to long and compared numerically. The previous implementation
    compared the `str()` of numpy rows, which is dtype-sensitive and depends on numpy's
    print truncation settings.

    Returns a 0-dim float tensor on the CPU.
    '''
    pred_vals = y_pred.long()
    matches = torch.eq(pred_vals,y_true.to(pred_vals.device))
    if matches.dim() > 1:
      # A sample counts as correct only if all of its labels agree.
      matches = matches.flatten(start_dim=1).all(dim=1)
    return matches.float().mean().cpu()

  def _shared_step(self,batch,stage):
    '''
    Common body of the train/val/test steps: backbone forward pass, per-head loss and
    predictions, exact-match accuracy, and logging of `<stage>_accuracy`,
    `<stage>_loss` and `epoch`.

    Returns:
      (loss, accuracy)
    '''
    inps,mask,labels = batch
    features = self(inps,att_mask=mask if self.with_mask else None)
    loss, preds = self.multihead_class.get_labels_and_losses(features,labels)
    accuracy = self.get_multi_label_exact_match_accuracy(preds,labels)
    self.logger.log_metrics({
        f'{stage}_accuracy':accuracy.detach().cpu().numpy(),
        f'{stage}_loss':loss.detach().cpu().numpy(),
        'epoch': self.current_epoch,
    })
    return loss,accuracy

  def training_step(self,batch,batch_nb):
    '''Run one optimisation step and return the loss for Lightning to back-propagate.'''
    loss,_ = self._shared_step(batch,'train')
    return {'loss':loss}

  def validation_step(self,batch,batch_nb):
    '''Evaluate one validation batch; the loss is returned under both `loss` and `val_loss`.'''
    loss,_ = self._shared_step(batch,'val')
    return {'loss':loss,'val_loss':loss}

  def test_step(self,batch,batch_nb):
    '''Evaluate one test batch and return its loss and exact-match accuracy.'''
    loss,accuracy = self._shared_step(batch,'test')
    return {'loss':loss,'test_accuracy':accuracy}

  def configure_optimizers(self):
    '''
    AdamW with a cosine-with-hard-restarts schedule and linear warm-up.

    The schedule length is expressed in minibatch steps: MAX_EPOCHS * NUM_TRAIN_SAMPLES / BATCH_SIZE.
    '''
    optimizer =  AdamW(self.parameters(), lr=LEARNING_RATE, eps=1e-12, betas=(0.9,0.999))
    num_minibatch_steps = NUM_TRAIN_SAMPLES/BATCH_SIZE
    t_total = MAX_EPOCHS * num_minibatch_steps
    lr_scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(
        optimizer, WARMUP, t_total, num_cycles=MAX_CYCLES
    )
    # NOTE(review): a bare scheduler is stepped once per *epoch* by Lightning's default,
    # while WARMUP/t_total are counted in minibatch steps. If per-step scheduling was
    # intended, return {'scheduler': lr_scheduler, 'interval': 'step'} instead. Left
    # unchanged here because it would change the effective learning rate - confirm.
    return [optimizer] ,[lr_scheduler]



# Trainer


In [None]:
# Pick the Lightning module matching the experiment type:
#   'KEY' -> single-label classifier, 'COL' -> multi-label column classifier.
# Any other value is rejected up front.
if EXP_TYPE not in ('KEY','COL'):
  raise Exception("Wrong EXP_TYPE")
# (DataIntegrationColumnMultiHeadClassifier(with_mask=IS_MASK) is the abandoned alternative for 'COL'.)
model = (
    DataIntegrationClassifier if EXP_TYPE == 'KEY' else DataIntegrationColumnClassifier
)(with_mask=IS_MASK,is_bert=IS_BERT)

Using Vanilla Backbone Model


In [None]:
import os
from getpass import getpass

from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import NeptuneLogger
NOTE = '''
Denser Backbone for Multi-label classification
'''
PROJECT_NAME = 'Data-Integration-Transformer'
EXPERIMENT_NAME= f'Key-Implementation-{USE_CASE}-Multihead'
pl.trainer.seed_everything(42)

# Hyper-parameters recorded alongside the Neptune experiment.
logger_config_added_options = dict(
    SAMPLE = SAMPLE,
    BATCH_SIZE = BATCH_SIZE,
    LEARNING_RATE = LEARNING_RATE,
    WEIGHTS = WEIGHTS,
    USE_CASE = USE_CASE,
    NUM_LABELS = NUM_LABELS,
    IS_BERT = IS_BERT,
    IS_MASK=IS_MASK,
    EXP_TYPE=EXP_TYPE,
    CORE_TRANSFORMER_PARAMS=CORE_TRANSFORMER_PARAMS
)

# SECURITY: the API token must not live in the notebook. It is read from the
# NEPTUNE_API_TOKEN environment variable, falling back to an interactive prompt.
# The token that used to be hard-coded here has been exposed and should be revoked.
API_TOKEN = os.environ.get('NEPTUNE_API_TOKEN') or getpass('Neptune API token: ')
neptune_logger = NeptuneLogger(experiment_name=EXPERIMENT_NAME,
                             project_name=f'zwang578/{PROJECT_NAME}',
                             api_key = API_TOKEN,
                             params={**logger_config_added_options},
                             description=NOTE)

Global seed set to 42
NeptuneLogger will work in online mode


In [None]:
# configure_optimizers() reads this global to size the LR schedule, so it has to be
# defined before trainer.fit() is called.
NUM_TRAIN_SAMPLES = len(train_dataset)

# Keep the three best weight-only checkpoints, ranked by `val_loss`.
model_checkpoint = ModelCheckpoint(
    filename='model/checkpoints/{epoch:02d}-{val_loss:.2f}',
    monitor='val_loss',
    save_top_k=3,
    save_weights_only=True,
    period=1,
)

# Single-GPU trainer that reports to Neptune and uses the checkpoint callback above.
trainer = Trainer(
    max_epochs=MAX_EPOCHS,
    gpus=1,
    logger=neptune_logger,
    checkpoint_callback=model_checkpoint,
    progress_bar_refresh_rate=25,
    automatic_optimization=True,
)

trainer.fit(model, train_loader,val_dataloaders=val_loader)

In [None]:
# Evaluate on the held-out set. The loader is deliberately NOT shuffled: shuffling adds
# nothing at evaluation time and makes the rows that test_step saves to
# logits.txt / label.txt impossible to line up with test_dataset.
trainer.test(model, test_dataloaders=DataLoader(test_dataset,batch_size=2048,shuffle=False))

In [None]:
def get_test_sest_acc(exp_id):
  '''
  Fetch the logged test metrics of a Neptune experiment.

  Looks up the experiments matching `exp_id` in the `zwang578/<PROJECT_NAME>` project,
  takes the last one returned and reads its numeric channel values: `test_accuracy`
  always, plus `test_f1` when EXP_TYPE is 'KEY'.

  Returns whatever `get_numeric_channels_values` returns for those channels.
  '''
  import neptune
  project = neptune.init(f'zwang578/{PROJECT_NAME}', api_token=API_TOKEN)
  matching_experiments = project.get_experiments(id=exp_id)
  experiment = matching_experiments[-1]
  # Only the single-label ('KEY') run logs an F1 channel.
  channels = ('test_accuracy','test_f1') if EXP_TYPE == 'KEY' else ('test_accuracy',)
  return experiment.get_numeric_channels_values(*channels)


In [None]:
# Pull the per-batch test metrics back from Neptune and store their mean and variance
# as experiment properties. (acc_df is presumably a pandas DataFrame with one column
# per channel - confirm against the Neptune client version in use.)
acc_df = get_test_sest_acc(neptune_logger.experiment.id)
neptune_logger.experiment.set_property('test_set_accuracy_mean', acc_df['test_accuracy'].mean())
neptune_logger.experiment.set_property('test_set_accuracy_variance', acc_df['test_accuracy'].var())
# F1 is only fetched for EXP_TYPE == 'KEY', so guard on the column being present.
if 'test_f1' in acc_df.columns:
  neptune_logger.experiment.set_property('test_set_f1_mean', acc_df['test_f1'].mean())
  neptune_logger.experiment.set_property('test_set_f1_var', acc_df['test_f1'].var())

In [None]:

# Display the Neptune experiment id of this run as the cell output.
neptune_logger.experiment.id