In [None]:
# Mount Google Drive so that the project directory under MyDrive is reachable
# from the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

# Put the project root on the import path; later cells import from `scripts.*`.
import sys
sys.path.append('/content/drive/MyDrive/research/Infinite/')

# Install the third-party packages this notebook depends on.
# NOTE(review): versions are unpinned, so a fresh runtime may pull releases
# whose APIs differ from the ones this notebook was written against.
! pip install transformers
! pip install seqeval
! pip install pytorch-lightning
! pip install pytorch-metric-learning

In [5]:
import torch 
import torch.nn as nn 
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import pytorch_lightning as pl
from pytorch_lightning import seed_everything, loggers as pl_loggers
from pytorch_lightning.callbacks import ModelCheckpoint
from transformers import  DistilBertModel,DistilBertTokenizerFast

from pytorch_metric_learning import miners, losses

import pandas as pd
import numpy as np
import random
from functools import reduce


from scripts.dataset import *
from scripts.utils import *

In [6]:
# model parameter

# Fix the global random seed through pytorch-lightning before anything
# stochastic (augmentation, shuffling, weight init) runs.
# NOTE(review): config['misc']['fix_seed'] is False, yet the seed is set
# unconditionally here -- confirm which of the two is intended.
seed_everything(42, workers=True)

# Single nested dict holding every tunable of the experiment. Sections:
#   'mc'   - model / tokenizer checkpoints
#   'tc'   - optimisation and dataloader settings
#   'dc'   - dataset locations and label-space sizes
#   'misc' - hardware, logging and checkpoint locations
config = {

# model / tokenizer checkpoints (Hugging Face hub identifiers)
'mc' : {
    'model_name' : 'distilbert-base-multilingual-cased',
    'tokenizer_name' : 'distilbert-base-multilingual-cased'
},

# training parameters
'tc' : {
    'lr' : 0.00003,
    'epoch' : 40,
    'batch_size' : 15,
    'weight_decay' : 0.003,
    'shuffle_data' : True,
    'num_worker' : 8
},

# data params
# NOTE(review): 'train_dir' points at a file called test_EN.tsv inside the
# train split directory -- confirm this is the intended training file.
'dc' : {
    'train_dir' : '/content/drive/MyDrive/research/Infinite/data/multiATIS/split/train/clean/test_EN.tsv',
    'val_dir' : '/content/drive/MyDrive/research/Infinite/data/multiATIS/split/valid/clean/val.tsv',
    'intent_num' : 18,
    'slots_num' : 159,
    'max_len' : 56
},

# misc
'misc' : {
    'fix_seed' : False,
    'gpus' : -1,
    'param_dir' : '/content/drive/MyDrive/research/Infinite/bin/clean/ICL/',
    # The comma after this entry was missing, which made the whole literal a
    # SyntaxError (adjacent string literals followed by a stray ':').
    'log_dir' : '/content/drive/MyDrive/research/Infinite/logs/Infinite_sentCL/',
    'precision' : 16,
}
}

In [7]:
# TensorBoard logger writing its event files under the configured log directory.
tb_logger = pl_loggers.TensorBoardLogger(config['misc']['log_dir'])


# Checkpointing: keep a checkpoint for every epoch (save_top_k=-1) and tag each
# file with the validation contrastive loss. The filename template must use a
# metric that is actually logged; the only validation metric logged by the
# LightningModule in this notebook is 'valid_CL' (the template previously
# referenced 'val_loss', which is never logged).
checkpoint_callback = ModelCheckpoint(
    monitor='valid_CL',
    dirpath=config['misc']['param_dir'],
    filename='INF_sentCL-{epoch:02d}-{valid_CL:.2f}',
    save_top_k=-1
)

NameError: name 'basepath' is not defined

In [None]:

# Read the background-noise phrase file; every line of the file is one phrase.
with open('./data/BG_Noise_Phrase.txt') as f:
    content = list(f)

# Strip surrounding whitespace (including the trailing newline) from each phrase.
phrase = list(map(str.strip, content))


def mergelists(l1, l2, prob):
    """Randomly interleave two token lists while keeping each list's own order.

    At every step the next token is taken from ``l1`` with probability
    ``prob`` and from ``l2`` otherwise, until one list is exhausted; the
    remainder of the other list is then appended unchanged.

    Args:
        l1: Sequence of string tokens (e.g. the background-noise phrase).
        l2: Sequence of string tokens (e.g. the original sentence).
        prob: Probability in [0, 1] of drawing the next token from ``l1``.

    Returns:
        The merged tokens joined into one string. Every token is followed by a
        single space, so a non-empty result ends with a trailing space (kept
        for compatibility with the previous output format).

    Note:
        The inputs are not modified. The earlier implementation consumed the
        caller's lists with ``pop(0)``, which was both destructive and
        quadratic in the list length.
    """
    first, second = list(l1), list(l2)
    i = j = 0
    merged = []

    # Interleave while both sources still have tokens left.
    while i < len(first) and j < len(second):
        if random.random() < prob:
            merged.append(first[i])
            i += 1
        else:
            merged.append(second[j])
            j += 1

    # At most one of these slices is non-empty.
    merged.extend(first[i:])
    merged.extend(second[j:])

    return ''.join(token + ' ' for token in merged)

def BG_Noise(samples, prob):
    """Inject a random background-noise phrase into every sentence.

    For each sentence one phrase is drawn from the module-level ``phrase``
    list and its tokens are interleaved with the sentence tokens through
    ``mergelists`` (``prob`` is forwarded as the probability of picking the
    next token from the noise phrase).

    Args:
        samples: Iterable of sentences (strings).
        prob: Mixing probability forwarded to ``mergelists``.

    Returns:
        Tuple ``(noisy_texts, source_ids)``; ``source_ids[k]`` is the position
        in ``samples`` of the sentence that produced ``noisy_texts[k]``.
    """
    noisy_texts, source_ids = [], []

    for position, sentence in enumerate(samples):
        noise_tokens = random.sample(phrase, 1)[0].split(' ')
        sentence_tokens = sentence.split(' ')

        noisy_texts.append(mergelists(noise_tokens, sentence_tokens, prob))
        source_ids.append(position)

    return noisy_texts, source_ids

def get_phrase_length(text):
    """Split ``text`` on single spaces and return the resulting token list.

    Despite the name, the token list itself is returned, not its length;
    callers take ``len()`` of the result.
    """
    tokens = text.split(" ")
    return tokens

def carrier_aug(samples, tau):
    """Create augmented sentences by randomly deleting tokens.

    Sentences are split on single spaces. Sentences with at most two tokens
    are returned unchanged. For 3-5 tokens, half of the tokens (rounded down)
    are removed; for longer sentences ``int(tau * n_tokens)`` tokens are
    removed. Deleted positions are drawn uniformly without replacement.

    Args:
        samples: Iterable of sentences (strings).
        tau: Fraction of tokens to delete for sentences longer than 5 tokens.

    Returns:
        Tuple ``(aug_text, aug_id)``; ``aug_id[k]`` is the position in
        ``samples`` of the sentence that produced ``aug_text[k]``.

    Raises:
        ValueError: If the number of tokens to delete is negative or exceeds
            the sentence length (raised by ``random.sample``).
    """
    aug_text = []
    aug_id = []

    for idx, text in enumerate(samples):
        tokens = text.split(' ')
        n_tokens = len(tokens)

        if n_tokens > 2:
            del_count = n_tokens // 2 if n_tokens <= 5 else int(tau * n_tokens)
            # A set gives O(1) membership tests while filtering below.
            del_index = set(random.sample(range(n_tokens), del_count))
            text = ' '.join(tok for pos, tok in enumerate(tokens) if pos not in del_index)

        aug_text.append(text)
        # Always record the integer sample index. The short-sentence branch
        # used to append the builtin `id` function here, corrupting the labels.
        aug_id.append(idx)

    return aug_text, aug_id

def contrastiveSampleGeneration(samples):
    """Build the contrastive batch: six augmented views per sentence plus the originals.

    Token-deletion augmentation (``carrier_aug``) is applied with three
    deletion ratios, then background-noise augmentation (``BG_Noise``) with
    three mixing probabilities. The original sentences are appended last.

    Args:
        samples: List of sentences (strings).

    Returns:
        Tuple ``(texts, labels)``. ``labels[k]`` is the index in ``samples``
        of the sentence that ``texts[k]`` was derived from, so all views of
        one sentence share a label.
    """
    # Order matters for the random stream: all deletion passes run first.
    schedule = [(carrier_aug, ratio) for ratio in [0.2,0.4,0.6]]
    schedule += [(BG_Noise, ratio) for ratio in [0.25,0.50,0.75]]

    aug_sample, aug_label = [], []
    for augment, strength in schedule:
        texts, source_ids = augment(samples, strength)
        aug_sample.extend(texts)
        aug_label.extend(source_ids)

    original_ids = list(range(len(samples)))
    return aug_sample + samples, aug_label + original_ids
    

In [8]:
class sentCLDataset(Dataset):
    """Sentence-only dataset backed by a tab-separated file with a ``TEXT`` column."""

    def __init__(self, file_dir):
        """Load the TSV file at ``file_dir`` into a DataFrame."""
        self.data = pd.read_csv(file_dir, sep="\t")

    def __len__(self):
        """Number of rows in the loaded file."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``{"text": ...}`` for row ``index``.

        The text is cast to ``str``, periods and apostrophes are removed, and
        any run of whitespace is collapsed into a single space.
        """
        cleaned = str(self.data.TEXT[index])
        for unwanted in (".", "'"):
            cleaned = cleaned.replace(unwanted, "")

        return {"text": " ".join(cleaned.split())}


class sentCLDataset_pl(pl.LightningDataModule):
    """Lightning data module serving ``sentCLDataset`` train/validation splits.

    Args:
        train_dir: Path of the training TSV file.
        val_dir: Path of the validation TSV file.
        batch_size: Number of sentences per batch.
        num_worker: Number of DataLoader worker processes.
    """

    def __init__(
        self, train_dir, val_dir, batch_size, num_worker
    ):

        super().__init__()
        self.train_dir = train_dir
        self.val_dir = val_dir
        self.batch_size = batch_size
        self.num_worker = num_worker

    def setup(self, stage=None):
        """Instantiate both datasets; ``stage`` (a string or ``None``) is ignored."""
        # Use the sentence-only dataset defined in this notebook. The previous
        # code referenced `contraNLUDataset`, which this notebook never defines.
        self.train = sentCLDataset(self.train_dir)

        self.val = sentCLDataset(self.val_dir)

    def train_dataloader(self):
        """Shuffled loader over the training split."""
        return DataLoader(
            self.train, batch_size=self.batch_size, shuffle=True, num_workers=self.num_worker
        )

    def val_dataloader(self):
        """Loader over the validation split (not shuffled, so epochs see a stable order)."""
        return DataLoader(
            self.val, batch_size=self.batch_size, shuffle=False, num_workers=self.num_worker
        )

In [None]:
class sentCLModel(nn.Module):
    """DistilBERT sentence encoder trained with an NT-Xent contrastive loss.

    Args:
        cfg: Experiment configuration dict; ``cfg["mc"]["model_name"]`` and
            ``cfg["mc"]["tokenizer_name"]`` select the pretrained checkpoints.
    """

    def __init__(self, cfg):

        super(sentCLModel, self).__init__()

        self.encoder = DistilBertModel.from_pretrained(
            cfg["mc"]["model_name"], return_dict=True, output_hidden_states=True
        )

        # Kept on the module so callers can tokenize with the matching vocabulary.
        self.tokenizer = DistilBertTokenizerFast.from_pretrained(cfg["mc"]["tokenizer_name"])
        self.cfg = cfg

        self.intent_loss = losses.NTXentLoss()

    def forward(self, input_ids, attention_mask, intent_target, slots_target=None):
        """Encode a tokenized batch and return its contrastive loss.

        The parameters follow the way this module is invoked by its
        LightningModule wrapper (token ids, attention mask, labels, slots).
        The earlier body tokenized inside ``forward`` but referred to the
        undefined names ``text`` and ``self.max_len`` and could never run.

        Args:
            input_ids: Long tensor of token ids, one row per sentence.
            attention_mask: Tensor marking real (1) vs. padding (0) tokens,
                same shape as ``input_ids``.
            intent_target: One integer label per row (tensor or sequence);
                rows sharing a label are treated as positives by the loss.
            slots_target: Unused; accepted only for call compatibility.

        Returns:
            Scalar tensor with the NT-Xent loss over the first-token embeddings.
        """
        encoded_output = self.encoder(input_ids=input_ids, attention_mask=attention_mask)

        # intent data flow: first-token ([CLS]) vector of the last hidden layer
        intent_hidden = encoded_output[0][:, 0]

        # The loss needs tensor labels on the same device as the embeddings.
        labels = torch.as_tensor(intent_target, device=intent_hidden.device)

        # accumulating intent contrastive loss
        intent_loss = self.intent_loss(intent_hidden, labels)

        return intent_loss

In [19]:
class TrainerSentCL(pl.LightningModule):
    """LightningModule driving sentence-level contrastive training of ``sentCLModel``.

    Each step turns the raw sentences of a batch into several augmented views
    (``contrastiveSampleGeneration``), tokenizes all of them and minimises the
    contrastive loss returned by the wrapped model.

    Args:
        cfg: Experiment configuration dict (``'mc'``, ``'tc'``, ``'dc'`` sections).
    """

    def __init__(self, cfg):
        super().__init__()

        self.model = sentCLModel(cfg)
        self.cfg = cfg

    def forward(self, input_ids, attention_mask, intent_target, slots_target=None):
        """Return the contrastive loss for an already tokenized batch.

        ``slots_target`` is optional because the contrastive objective does not
        use it and the step methods call this with three arguments.
        """
        return self.model(input_ids, attention_mask, intent_target, slots_target)

    def tokenizer(self, samples):
        """Tokenize every sentence in ``samples`` to fixed-length tensors.

        Uses the tokenizer owned by the wrapped model and pads/truncates to
        ``cfg['dc']['max_len']``. The earlier version relied on an undefined
        global ``tokenizer`` and only encoded the first 14 samples.

        Returns:
            Tuple ``(token_ids, mask)`` of long tensors, one row per sample.
        """
        encoded = self.model.tokenizer(
            list(samples),
            add_special_tokens=True,
            return_token_type_ids=False,
            truncation=True,
            max_length=self.cfg['dc']['max_len'],
            padding="max_length",
            return_tensors="pt",
        )

        return encoded["input_ids"], encoded["attention_mask"]

    def _contrastive_step(self, batch, log_name):
        """Shared train/validation logic: augment, tokenize, compute and log the loss."""
        samples = list(batch['text'])

        contrast_samples, labels = contrastiveSampleGeneration(samples)

        token_ids, mask = self.tokenizer(contrast_samples)

        # Tokenization happens on the CPU; move everything to the module's device.
        token_ids = token_ids.to(self.device)
        mask = mask.to(self.device)
        labels = torch.tensor(labels, dtype=torch.long, device=self.device)

        out = self(token_ids, mask, labels)

        self.log(log_name, out, on_step=False, on_epoch=True, logger=True)

        return out

    def training_step(self, batch, batch_idx):
        """Compute the contrastive loss on a training batch and log it as ``train_CL``."""
        return self._contrastive_step(batch, 'train_CL')

    def validation_step(self, batch, batch_idx):
        """Compute the contrastive loss on a validation batch and log it as ``valid_CL``."""
        return self._contrastive_step(batch, 'valid_CL')

    def configure_optimizers(self):
        """AdamW over all parameters, using the hyper-parameters stored on this module."""
        # Read both values from self.cfg rather than the notebook-level global.
        return torch.optim.AdamW(
            self.parameters(),
            lr=self.cfg['tc']['lr'],
            weight_decay=self.cfg['tc']['weight_decay'],
        )

In [20]:
# Build the LightningModule and the data module. The data module class defined
# in this notebook is `sentCLDataset_pl`; its constructor takes exactly these
# four arguments (train path, validation path, batch size, worker count).
model = TrainerSentCL(config)
contraDL = sentCLDataset_pl(config['dc']['train_dir'], config['dc']['val_dir'],config['tc']['batch_size'],config['tc']['num_worker'])

In [None]:
# model training
# Assemble the Lightning trainer from the experiment configuration, then fit.
trainer = pl.Trainer(
    gpus=config['misc']['gpus'],
    deterministic=True,
    logger=tb_logger,
    callbacks=[checkpoint_callback],
    precision=config['misc']['precision'],
    max_epochs=config['tc']['epoch'],
)

trainer.fit(model, contraDL)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
Using native 16bit precision.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name             | Type             | Params
------------------------------------------------------
0 | Infinite_encoder | Infinite_encoder | 134 M 
------------------------------------------------------
134 M     Trainable params
0         Non-trainable params
134 M     Total params
538.936   Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]