In [1]:
import os
import torch
import time
import torch.optim as optim
import torch
import torch.nn as nn 
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer , BertTokenizer
from torch.utils.data import Dataset, DataLoader
from TorchCRF import CRF
import pandas as pd
from torch import cuda
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter

In [2]:
writer = SummaryWriter('baseline_1')

In [13]:
class jointBert(nn.Module):

    def __init__(self, baseModel , num_intent , num_slots, intent_dropout=0.0,slots_dropout=0.0 ):

        super(jointBert,self).__init__()
        #self.args = args
        self.encoder = DistilBertForSequenceClassification.from_pretrained(baseModel,num_labels=num_intent,return_dict=True,output_hidden_states=True)

        #self.intent_classifier = nn.Linear(768,num_intent)
        #self.dropout_intent = nn.Dropout(intent_dropout)

        self.slot_classifier = nn.Linear(768, num_slots)
        self.dropout_slots = nn.Dropout(slots_dropout)

        self.crf = CRF(num_slots)

        self.intent_loss = nn.CrossEntropyLoss()
        
        self.joint_loss_coef = 1.0

    
    def forward(self, input_ids, attention_mask, intent_target, slot_target):

        encoded_output = self.encoder(input_ids, attention_mask)
        sequence_rep = encoded_output[1][6]
        slots_logits = self.slot_classifier(self.dropout_slots(sequence_rep))
        
        intent_logits = encoded_output[0]
        
        joint_loss = 0
        
        # accumulating intent classification loss 
        intent_loss = self.intent_loss(intent_logits, intent_target)
        
        # accumulating slot prediction loss
        slot_loss = -1 * self.joint_loss_coef * self.crf(slots_logits, slot_target, mask=attention_mask.byte())
        #print(slot_loss.size())
        slot_loss = torch.mean(slot_loss)
        joint_loss = slot_loss + intent_loss

        return joint_loss


In [14]:
class nlu_dataset(Dataset):
    def __init__(self, file_dir, tokenizer, max_len):
        
        self.data = pd.read_csv(file_dir, sep='\t')
        self.tokenizer = BertTokenizer.from_pretrained(tokenizer)
        self.max_len = max_len
    def __getitem__(self, index):
        
        text = str(self.data.utterance[index])
        text = " ".join(text.split())
        
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )
        
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'intent': self.data.intent[index],
            'slot' : self.data.slot_labels[index],
            'intent_target': torch.tensor(self.data.intent_ID[index], dtype=torch.long),
            'slot_target' : self.data.slots_ID[index]
        } 
    
    def __len__(self):
        return len(self.data)

In [15]:
model = jointBert(baseModel='distilbert-base-multilingual-cased',num_intent=17,num_slots=160)

Some weights of the model checkpoint at distilbert-base-multilingual-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['pre_classifier.weight', 'pre_cla

In [16]:
trainDS, valDS =  nlu_dataset('./data/splits/multi-train.tsv', 'distilbert-base-multilingual-cased',46), nlu_dataset('./data/splits/multi-dev.tsv','distilbert-base-multilingual-cased',46)
trainDL = DataLoader(trainDS,batch_size=32,shuffle=True)
valDL = DataLoader(valDS,batch_size=32,shuffle=True)

In [17]:
for params in model.encoder.parameters():
    params.requires_grad = False

device = 'cpu'

In [18]:
def process_label(labels, max_len):
    slot_target = []
        
    for sLabel in labels:
        slots = [int(L) for L in sLabel.split()]
        slots += [159]*(max_len - len(slots))
        slot_target.append(slots)
        
    slot_target = torch.LongTensor(slot_target)
    return slot_target.to(device, dtype = torch.long)
    

In [19]:
optimizer = optim.Adam(model.parameters(), lr=0.001)

# training loop
for _ in range(1):

    epoch_loss = 0.0
    model.train()
    # training loop

    start_train = time.time()
    for idx,batch in enumerate(trainDL,0):

        ids = batch['ids'].to(device, dtype = torch.long)
        mask = batch['mask'].to(device, dtype = torch.long)
        intent_target = batch['intent_target'].to(device, dtype = torch.long)
        slot_target = process_label(batch['slot_target'],46)
        
        # zero the parameter gradients
        optimizer.zero_grad()
        loss = model(ids,mask,intent_target,slot_target)
        loss.backward()
        optimizer.step()

        #epoch_loss += loss.detach()
        print(loss.detach())
    
    end_train = time.time()
    writer.add_scalar('Loss/train', epoch_loss, _)
    print("Epoch: {epoch_no} train_loss: {loss} time elapsed: {time}".format(epoch_no = idx , loss = epoch_loss , time = end_train - start_train))

    # validation loop
    #best_eval_lo
    if _% args.check_val_every_n_epoch == 0:

        model.eval()
        eval_loss = 0.0 
        start_val = time.time()
        for idx,batch in enumerate(valDL,0):

            ids = batch['ids'].to(args.device, dtype = torch.long)
            mask = batch['mask'].to(args.device, dtype = torch.long)
            intent_target = batch['intent_target'].to(args.device, dtype = torch.long)
            slot_target = batch['slot_target'].to(args.device, dtype = torch.long)

            loss = model(ids,mask,intent_target,slot_target)

            eval_loss += loss.detach()

        
        end_val = time.time()
        writer.add_scalar('Loss/val', eval_loss, _ / args.check_val_every_n_epoch)
        print("Epoch: {epoch_no} train_loss: {loss} time elapsed: {time}".format(epoch_no = _ / args.check_val_every_n_epoch , loss = eval_loss , time = end_val - start_val))

writer.close()

torch.Size([32])


RuntimeError: grad can be implicitly created only for scalar outputs