In [1]:
import os
import torch
import time
import torch.optim as optim
import torch
import torch.nn as nn 
from transformers import DistilBertModel ,DistilBertTokenizer , BertTokenizer
from torch.utils.data import Dataset, DataLoader
from torch import cuda
from pytorch_metric_learning import losses
from Model.jointBert import jointBert
from dataset import nlu_dataset
from torch.utils.tensorboard import SummaryWriter
from scripts.utils import *
import argparse

In [2]:
# ---------------------------------------------------------------------------
# Run configuration: every value a reader might want to tune lives here.
# ---------------------------------------------------------------------------

# Device the model and every batch tensor are moved to.
device = 'cpu'

# Checkpoint id handed to DistilBertModel.from_pretrained (via Infinite) and
# the tokenizer id handed to nlu_dataset.
model_name = 'distilbert-base-multilingual-cased'
tokenizer_name = 'distilbert-base-multilingual-cased'

# TSV splits handed to nlu_dataset.
train_dir = './data/splits/multi-train.tsv'
val_dir = './data/splits/multi-dev.tsv'

# Third argument of nlu_dataset -- presumably the max token length; confirm.
max_len = 46
# Batch size used by both DataLoaders (64 was tried previously).
batch_size = 128

# Optimisation settings.
freeze_encoder = False   # when True, the freeze cell disables gradients
lr = 1e-3                # Adam learning rate
epoch = 10               # epoch count consumed by the training loop

In [18]:
class Infinite(nn.Module):
    """DistilBERT encoder followed by a square linear projection.

    The module returns one embedding vector per input sequence, taken from
    the first token position of the encoder's last hidden state and passed
    through ``pre_classifier``.

    Args:
        model: Name or path accepted by ``DistilBertModel.from_pretrained``.
    """

    def __init__(self,model):

        super(Infinite,self).__init__()

        self.encoder = DistilBertModel.from_pretrained(model,return_dict=True,output_hidden_states=True)
        # Read the hidden width from the loaded checkpoint instead of
        # hard-coding 768, so checkpoints with another width also work.
        # For 'distilbert-base-*' this is still 768.
        hidden_size = self.encoder.config.dim
        self.pre_classifier = torch.nn.Linear(hidden_size, hidden_size)


    def forward(self, input_ids, attention_mask):
        """Encode a batch and return the projected first-token embedding.

        Args:
            input_ids: Token id tensor forwarded to the encoder.
            attention_mask: Attention mask tensor forwarded to the encoder.

        Returns:
            Tensor produced by ``pre_classifier`` from position 0 of the
            encoder's last hidden state (one row per batch element).
        """
        # Keyword arguments keep the call robust to signature ordering.
        encoded_output = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        # Element 0 of the encoder output is the last hidden state; [:, 0]
        # selects the first token position of every sequence.
        first_token_state = encoded_output[0][:, 0]
        hidden = self.pre_classifier(first_token_state)

        return hidden


In [19]:
# Build the embedding model from the configured checkpoint, then move its
# weights to the configured device (the cell displays the module repr).
model = Infinite(model_name)
model.to(device)

Infinite(
  (encoder): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in_featur

In [5]:
# Datasets built by nlu_dataset from the train and dev TSV splits.
trainDS = nlu_dataset(train_dir, tokenizer_name, max_len)
valDS = nlu_dataset(val_dir, tokenizer_name, max_len)

trainDL = DataLoader(trainDS, batch_size=batch_size, shuffle=True, num_workers=1)
# The validation loader must wrap valDS. It previously wrapped trainDS, so
# any validation pass would have been scored on the training data.
valDL = DataLoader(valDS, batch_size=batch_size, shuffle=True, num_workers=1)

In [20]:
if freeze_encoder:
    # Freeze only the pretrained DistilBERT encoder. The projection head
    # (model.pre_classifier) has to stay trainable: if every parameter is
    # frozen, nothing requires grad and loss.backward() in the training
    # loop raises.
    for param in model.encoder.parameters():
        param.requires_grad = False

In [21]:
# Adam over all model parameters, with L2 weight decay of 1e-3.
optimizer = optim.Adam(
    params=model.parameters(),
    lr=lr,
    weight_decay=1e-3,
)

In [22]:
# Triplet margin loss from pytorch_metric_learning, constructed with the
# library's default settings. The training loop calls it as
# loss_func(embedding, intent_target).
loss_func = losses.TripletMarginLoss()

In [23]:
# training loop
print('*'*10  + 'Training loop started' + '*'*10)

# TensorBoard writer for the per-epoch loss. No earlier cell creates one, so
# it is instantiated here to keep the cell runnable on a fresh kernel.
writer = SummaryWriter()

# range(1, epoch + 1) runs exactly `epoch` epochs, numbered from 1
# (range(1, epoch) stopped one epoch short).
for epoch_no in range(1, epoch + 1):

    model.train()
    running_loss = 0.0
    num_batch = 0
    start_train = time.time()

    for batch in trainDL:
        ids = batch['ids'].to(device, dtype = torch.long)
        mask = batch['mask'].to(device, dtype = torch.long)
        intent_target = batch['intent_target'].to(device, dtype = torch.long)

        # zero the parameter gradients
        optimizer.zero_grad()
        embedding = model(ids,mask)
        loss = loss_func(embedding,intent_target)
        loss.backward()
        optimizer.step()

        # .item() yields a plain Python float, so the running total is not a
        # tensor and the log line prints a number rather than `tensor(...)`.
        running_loss += loss.item()
        num_batch += 1

    # Guard against an empty loader to avoid dividing by zero.
    epoch_loss = running_loss / max(num_batch, 1)
    end_train = time.time()
    writer.add_scalar('Loss/train', epoch_loss, epoch_no)
    print("Train Epoch: {epoch_no} train_loss: {loss} time elapsed: {time}".format(epoch_no = epoch_no , loss = epoch_loss , time = end_train - start_train))

**********Training loop started**********


KeyboardInterrupt: 