In [None]:
pip install transformers



In [None]:
#ALL LIBRARIES USED
import transformers
from transformers import AutoConfig, AutoModel
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import XLMRobertaConfig, XLMRobertaModel,XLMRobertaTokenizer,XLMRobertaForSequenceClassification,XLMRobertaForCausalLM,GPT2ForSequenceClassification
from transformers import (AutoTokenizer, PreTrainedTokenizerFast,
                          AutoModelForQuestionAnswering, TrainingArguments,
                          Trainer, default_data_collator, DataCollatorWithPadding)
from torch.utils.data import DataLoader
import datetime
from torch.utils.tensorboard import SummaryWriter

Mount drive for data

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import sys
import os
import shutil
import warnings
import pandas as pd

folder = "drive/MyDrive/ColabNotebooks/VarDial/data_train" #@param {type:"string"}
!ln -Ts "$folder" /content/data_train> /dev/null

# Add the assignment folder to Python path
if '/content/data_train' not in sys.path:
  sys.path.insert(0, '/content/data_train')

In [3]:
import sys
import os
import shutil
import warnings
import pandas as pd

folder = "drive/MyDrive/ColabNotebooks/VarDial/data_test" #@param {type:"string"}
!ln -Ts "$folder" /content/data_test> /dev/null

# Add the assignment folder to Python path
if '/content/data_test' not in sys.path:
  sys.path.insert(0, '/content/data_test')

## Importing Data

Import data and organize it in a dictionary

In [4]:
rootfolder='/content/'
directory = os.fsencode(rootfolder+'data/')
def dataframe(data,folder):
  rootfolder='/content/'
  directory = os.fsencode(rootfolder+'data'+'_'+folder)
  df = pd.DataFrame(columns=['language','premise', 'question', 'choice1', 'choice2', 'label'])
  for folder in os.listdir(directory):
    for file in os.listdir(os.fsdecode(os.path.join(directory, folder))):
      t_set= data+ '.jsonl'
      if file== t_set:
          file_path= os.path.join(os.fsdecode(os.path.join(directory, folder)),file)
          language= os.fsdecode(folder)
          jsonObj = pd.read_json(path_or_buf=file_path, lines=True)
          jsonObj['language']=language
          if 'label' in jsonObj.columns:
            jsonObj['label']=jsonObj['label'].replace(1, -1)
            jsonObj['label']=jsonObj['label'].replace(0, 1)
          #df= df.append(jsonObj)
          df= pd.concat([df, jsonObj])
  return df


Creation of train and validation set

In [5]:
df_train=dataframe('train','train')
df_val=dataframe('val','train')
df_test=dataframe('test','test')

In [13]:
len(df_test[df_test.label==-1])

0

In [14]:
len(df_test[df_test.question=='effect'])

750

In [15]:
len(df_test)

1500

In [12]:
df_val['choice1'].apply(len).mean()

27.30875

In [14]:
3200-1584

1616

Preparation of data to insert in model

In [None]:
special_token= {'cause':{'sl':'Ker','hr':'jer','sr':'jep','mk':'бидејќи','en':'because','hr-ckm':'zbog'},'effect':{'sl':'torej','hr':'tako','sr':'тако','mk':'така','en':'so','hr-ckm':'oda'}}

In [None]:
#test with only english
#df_train=df_train[df_train.language=='copa-en']
#df_val=df_val[df_val.language=='copa-en']

In [None]:
import numpy as np
#Method to create dataset to feed DataLoader
def dataset(df):
  data_ls= []
  for index, row in df.iterrows():
    premise= row['premise'][:-1]
    choice1= row['choice1'][:-1]
    choice2= row['choice2'][:-1]
    lang= row['language'][5:]
    connector=''
    question= row['question']
    match lang:
      case 'sl-cer':
        lang= 'sl'
      case 'sr-tor':
        lang= 'sr'
    connector= special_token[question][lang]

    premise= premise+' '+connector

    data_ls.append((premise,choice1,choice2,row['label']))

  return data_ls

Get ready dataloaders,device,etc


In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
data_train= dataset(df_train)
data_val= dataset(df_val)
training_loader = DataLoader(data_train, batch_size=16, shuffle=True)
validation_loader = DataLoader(data_val, batch_size=16, shuffle=False)

#Sequence Classification with MBERT and Ranking Loss


#### Model Creation

In [None]:
class MLP(nn.Module):
    def __init__(self, input_size, hidden_sizes, output_size, dropout=0.1):
        super(MLP, self).__init__()
        layers = []
        prev_size = input_size
        for hidden_size in hidden_sizes:
            layers.append(nn.Linear(prev_size, hidden_size))
            layers.append(nn.LayerNorm(hidden_size))
            layers.append(nn.ReLU())
            #layers.append(nn.Dropout(dropout))
            prev_size = hidden_size
        layers.append(nn.Linear(prev_size, output_size))
        self.mlp = nn.Sequential(*layers)

    def forward(self, x):
        return self.mlp(x)

In [None]:
#get max_length for padding
max_len_hyp1 = len(max(df_train['premise'], key = len))+ len(max(df_train['choice1'], key = len))+8
max_len_hyp2 = len(max(df_train['premise'], key = len))+ len(max(df_train['choice2'], key = len))+8
max_len_hyp3 = len(max(df_val['premise'], key = len))+ len(max(df_val['choice1'], key = len))+8
max_len_hyp4 = len(max(df_val['premise'], key = len))+ len(max(df_val['choice2'], key = len))+8
MAX_LEN= max(max_len_hyp1,max_len_hyp2,max_len_hyp3,max_len_hyp4)
MAX_LEN

161

In [None]:
#USING MBERT AND MARGIN LOSS
class COPAX_MARGIN(nn.Module):
  def __init__(self,hidden_sizes, output_size, dropout=0.1,model_string='bert-base-cased'):
      super(COPAX_MARGIN,self).__init__()
      self.tokenizer= AutoTokenizer.from_pretrained(model_string)
      self.model = AutoModel.from_pretrained(model_string)
      self.MlP = MLP(768, hidden_sizes=hidden_sizes,output_size=1, dropout=dropout)

  def forward(self,premise,choice1, choice2):
    hyp1= self.tokenizer(premise,choice1,add_special_tokens=True,max_length=MAX_LEN,padding='max_length',truncation=True, return_tensors='pt',return_attention_mask=True)
    hyp2= self.tokenizer(premise,choice2,add_special_tokens=True,max_length=MAX_LEN,padding='max_length',truncation=True, return_tensors='pt',return_attention_mask=True)
    hyp1= hyp1.to(device)
    hyp2= hyp2.to(device)
    #output1 = self.model(input_ids=hyp1['input_ids'], attention_mask=hyp1['attention_mask'],token_type_ids=hyp1['token_type_ids'])
    #output2 = self.model(input_ids=hyp2['input_ids'], attention_mask=hyp2['attention_mask'],token_type_ids=hyp2['token_type_ids'])
    output1 = self.model(input_ids=hyp1['input_ids'], attention_mask=hyp1['attention_mask'])
    output2 = self.model(input_ids=hyp2['input_ids'], attention_mask=hyp2['attention_mask'])

    last_hidden_states1 = output1[0]
    last_hidden_states2 = output2[0]

    ## Output of CLS token - considered to represent the hidden state of entire sentence
    cls1 = last_hidden_states1[:, 0,:]  # (bs, dim)
    cls2 = last_hidden_states2[:, 0,:]  # (bs, dim)

    ## Send the hidden state of CLS token thru Linear, Relu and dropout layers
    logits1 = self.MlP(cls1)
    logits2= self.MlP(cls2)

    return logits1,logits2

In [None]:
#USING MBERT AND CROSS ENTROPY
class COPAX_ENTROPY(nn.Module):
  def __init__(self,hidden_sizes, output_size, dropout=0.1,model_string='bert-base-cased'):
      super(COPAX_ENTROPY,self).__init__()
      self.tokenizer= AutoTokenizer.from_pretrained(model_string)
      self.model = AutoModel.from_pretrained(model_string)
      self.MlP = MLP(2*1024, hidden_sizes=hidden_sizes,output_size=2, dropout=dropout)

  def forward(self,premise,choice1, choice2):
    hyp1= self.tokenizer(premise,choice1,add_special_tokens=True,max_length=MAX_LEN,padding='max_length',truncation=True, return_tensors='pt',return_attention_mask=True)
    hyp2= self.tokenizer(premise,choice2,add_special_tokens=True,max_length=MAX_LEN,padding='max_length',truncation=True, return_tensors='pt',return_attention_mask=True)
    hyp1= hyp1.to(device)
    hyp2= hyp2.to(device)
    #output1 = self.model(input_ids=hyp1['input_ids'], attention_mask=hyp1['attention_mask'],token_type_ids=hyp1['token_type_ids'])
    #output2 = self.model(input_ids=hyp2['input_ids'], attention_mask=hyp2['attention_mask'],token_type_ids=hyp2['token_type_ids'])
    output1 = self.model(input_ids=hyp1['input_ids'], attention_mask=hyp1['attention_mask'])
    output2 = self.model(input_ids=hyp2['input_ids'], attention_mask=hyp2['attention_mask'])

    last_hidden_states1 = output1[0]
    last_hidden_states2 = output2[0]

    ## Output of CLS token - considered to represent the hidden state of entire sentence
    cls1 = last_hidden_states1[:, 0,:]  # (bs, dim)
    cls2 = last_hidden_states2[:, 0,:]  # (bs, dim)

    #concatenate cls tokens to feed to MLP for
    concat= torch.cat((cls1,cls2),dim=-1)
    logits = self.MlP(concat)

    return logits

In [None]:
#USING GPT-2
class COPAX_GPT_MARGIN(nn.Module):
  def __init__(self):
      super(COPAX_GPT_MARGIN,self).__init__()
      self.tokenizer = AutoTokenizer.from_pretrained("ai-forever/mGPT")
      self.model = GPT2ForSequenceClassification.from_pretrained("ai-forever/mGPT")


  def forward(self,premise,choice1, choice2):
    hyp1= self.tokenizer(premise,choice1,add_special_tokens=True,max_length=MAX_LEN,padding='max_length',truncation=True, return_tensors='pt',return_attention_mask=True)
    hyp2= self.tokenizer(premise,choice2,add_special_tokens=True,max_length=MAX_LEN,padding='max_length',truncation=True, return_tensors='pt',return_attention_mask=True)
    hyp1= hyp1.to(device)
    hyp2= hyp2.to(device)
    output1 = self.model(input_ids=hyp1['input_ids'], attention_mask=hyp1['attention_mask'])
    output2 = self.model(input_ids=hyp2['input_ids'], attention_mask=hyp2['attention_mask'])

    logits1 = output1.logits
    logits2 = output2.logits

    return logits1, logits2



### Methods for training with margin loss

In [None]:
def train_one_epoch_margin(epoch_index,model,loss_fn):
    running_loss = 0.
    last_loss = 0.

    for i, data in enumerate(training_loader):
        premise,choice1,choice2,label= data
        label=label.to(device)

        # Zero your gradients for every batch!
        optimizer.zero_grad()

        # Margin loss
        F1,F2 = model(premise,choice1,choice2)
        F1=F1.squeeze(1)
        F2=F2.squeeze(1)

        # Compute the loss and its gradients
        loss = loss_fn(F1,F2, label)
        loss.backward()

        # Adjust learning weights
        optimizer.step()

        # Gather data and report
        running_loss += loss.item()

    last_loss = running_loss / len(training_loader)
    print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch_index , 15, last_loss))

    return last_loss

In [None]:
from datetime import datetime
def train_margin(epochs,model,loss_fn,scheduler):
  epoch_number = 1
  best_vloss = 1_000_000.
  training_losses= []
  validation_losses = []
  for epoch in range(epochs):
      timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
      print('EPOCH {}:'.format(epoch_number ))

      model.train(True)

      avg_loss = train_one_epoch_margin(epoch_number,model,loss_fn)
      training_losses.append(avg_loss)

      running_vloss = 0.0

      model.eval()

      # Disable gradient computation and reduce memory consumption.
      with torch.no_grad():
          correct = 0
          total = 0
          for i, vdata in enumerate(validation_loader):
              premise,choice1,choice2,vlabel= vdata
              vlabel=vlabel.to(device)

              vF1,vF2 = model(premise, choice1,choice2)
              vF1=vF1.squeeze(1)
              vF2=vF2.squeeze(1)

              #Retrieve original labels
              or_label=(vlabel-1)/(-2)
              concat= torch.column_stack((vF1, vF2))

              vloss = loss_fn(vF1,vF2, vlabel)
              predictions = torch.argmax(concat, dim=1)
              total += vlabel.size(0)
              correct += (predictions == or_label).sum().item()

              running_vloss += vloss.item()

      scheduler.step()
      avg_vloss = running_vloss / len(validation_loader)
      validation_losses.append(avg_vloss)
      print('LOSS train {} valid {} test'.format(avg_loss, avg_vloss))
      print('Valid Accuracy of the model: {} %'.format(100 * correct / total))

      # Track best performance, and save the model's state
      if avg_vloss < best_vloss:
          best_vloss = avg_vloss
          model_path = 'model_{}_{}'.format(timestamp, epoch_number)
          torch.save(model.state_dict(), model_path)

      epoch_number += 1

  return training_losses,validation_losses

Method training with cross entropy

In [None]:
def train_one_epoch_entropy(epoch_index,model,loss_fn):
    running_loss = 0.
    last_loss = 0.
    for i, data in enumerate(training_loader):
        premise,choice1,choice2,label= data

        # Zero your gradients for every batch!
        optimizer.zero_grad()

        #BCE loss
        pred= model(premise,choice1, choice2)
        pred= pred.squeeze(1)

        or_label= (label-1)/(-2)
        or_label=or_label.to(device,torch.long)


        # Compute the loss and its gradients
        loss = loss_fn(pred, or_label)
        loss.backward()

        # Adjust learning weights
        optimizer.step()

        # Gather data and report
        running_loss += loss.item()


    last_loss = running_loss / len(training_loader)
    print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch_index, 15, last_loss))

    return last_loss

In [None]:
from datetime import datetime
def train_entropy(epochs,model,loss_fn,scheduler):
  best_vloss = 1_000_000.
  epoch_number = 1
  training_losses= []
  validation_losses = []
  for epoch in range(epochs):
      timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
      print('EPOCH {}:'.format(epoch_number ))

      model.train(True)
      avg_loss = train_one_epoch_entropy(epoch_number,model,loss_fn)
      training_losses.append(avg_loss)

      running_vloss = 0.0

      model.eval()

      # Disable gradient computation and reduce memory consumption.
      with torch.no_grad():
          correct = 0
          total = 0
          for i, vdata in enumerate(validation_loader):
              premise,choice1,choice2,vlabel= vdata

              or_label= (vlabel-1)/(-2)
              or_label= or_label.to(device,torch.long)
              out= model( premise,choice1,choice2)

              out= out.squeeze(1)

              vloss = loss_fn(out, or_label)

              softmax= nn.Softmax(dim=-1)
              predictions = (softmax(out))
              predictions = torch.argmax(predictions, dim=1)
              total += vlabel.size(0)
              correct += (predictions == or_label).sum().item()

              running_vloss += vloss.item()
      if epoch % 5 == 0:
        scheduler.step()
      avg_vloss = running_vloss / len(validation_loader)
      validation_losses.append(avg_vloss)
      print('LOSS train {} valid {} test'.format(avg_loss, avg_vloss))
      print('Valid Accuracy of the model: {} %'.format(100 * correct / total))

      # Track best performance, and save the model's state
      if avg_vloss < best_vloss:
          best_vloss = avg_vloss
          model_path = 'model_{}_{}'.format(timestamp, epoch_number)
          torch.save(model.state_dict(), model_path)

      epoch_number += 1
  return training_losses,validation_losses

Initialize model,optimizer and loss


In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
language='copa-sl-cer'
data_test = dataset(df_val[df_val.language==language])
test_loader = DataLoader(data_test, batch_size=len(data_test), shuffle=False)
#TRAINING WITH MBERT AND MARGIN LOSS
hidden_sizes=[512,128]
model= COPAX_MARGIN(hidden_sizes=hidden_sizes,output_size=1,dropout=0.1,model_string='xlm-roberta-base').to(device)
model.load_state_dict(torch.load('model_xlm_margin'))
outputs=[]
model=model.to(device)
with torch.no_grad():
        correct = 0
        total = 0
        total_losss =0
        model.eval()
        for i, vdata in enumerate(test_loader):
            premise,choice1,choice2,vlabel= vdata
            vF1,vF2 = model(premise, choice1,choice2)
            vF1=vF1.squeeze(1)
            vF2=vF2.squeeze(1)
            concat= torch.column_stack((vF1, vF2))
            out= model( premise,choice1,choice2)
            #out= out.squeeze(1)
            #softmax= nn.Softmax(dim=-1)
            #predictions = (softmax(out))
            output= torch.argmax(concat,dim=1)
            total += vlabel.size(0)
            or_label=(vlabel-1)/(-2)
            or_label= or_label.to(device,torch.long)
            correct += (output == or_label).sum().item()

print('Accuracy for language:',language,correct/total)


Accuracy for language: copa-sl-cer 0.47


In [None]:
#TRAINING WITH XML-ROBERTA base AND MARGIN LOSS
hidden_sizes=[512,128]
model= COPAX_MARGIN(hidden_sizes=hidden_sizes,output_size=1,dropout=0.1,model_string='xlm-roberta-base').to(device)
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-6)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)
loss_fn=nn.MarginRankingLoss(margin=0.2,reduction='sum')
train_losses, val_losses=train_margin(15,model,loss_fn,scheduler)

EPOCH 1:
Epoch [1/15], Loss: 3.2123
LOSS train 3.2122645890712738 valid 3.2012841796875 test
Valid Accuracy of the model: 49.5 %
EPOCH 2:
Epoch [2/15], Loss: 3.1854
LOSS train 3.185355373620987 valid 3.200496459007263 test
Valid Accuracy of the model: 50.75 %
EPOCH 3:
Epoch [3/15], Loss: 3.1882
LOSS train 3.188234063386917 valid 3.1993448066711427 test
Valid Accuracy of the model: 52.5 %
EPOCH 4:
Epoch [4/15], Loss: 3.1846
LOSS train 3.184556345939636 valid 3.1988496160507203 test
Valid Accuracy of the model: 53.25 %
EPOCH 5:
Epoch [5/15], Loss: 3.1844
LOSS train 3.1843526482582094 valid 3.1966691541671755 test
Valid Accuracy of the model: 55.125 %
EPOCH 6:
Epoch [6/15], Loss: 3.1563
LOSS train 3.1562745809555053 valid 3.1955680894851684 test
Valid Accuracy of the model: 53.5 %
EPOCH 7:
Epoch [7/15], Loss: 3.1741
LOSS train 3.17410263299942 valid 3.1941587686538697 test
Valid Accuracy of the model: 53.75 %
EPOCH 8:
Epoch [8/15], Loss: 3.1906
LOSS train 3.1906214451789854 valid 3.189560

KeyboardInterrupt: 

In [None]:
#TRAINING WITH XML-ROBERTA LARGE AND ENTROPY LOSS
hidden_sizes=[512,128]
model= COPAX_ENTROPY(hidden_sizes=hidden_sizes,output_size=2,dropout=0.3,model_string='xlm-roberta-large').to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-7,weight_decay=0.1)
loss_fn = nn.CrossEntropyLoss(reduction='sum')
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)
train_losses, val_losses=train_entropy(15,model,loss_fn,scheduler)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

EPOCH 1:
Epoch [1/15], Loss: 11.1203
LOSS train 11.120329518318176 valid 11.110210208892822 test
Valid Accuracy of the model: 45.875 %
EPOCH 2:
Epoch [2/15], Loss: 11.0980
LOSS train 11.097984623908996 valid 11.110904769897461 test
Valid Accuracy of the model: 46.125 %
EPOCH 3:
Epoch [3/15], Loss: 11.0876
LOSS train 11.087617254257202 valid 11.112919292449952 test
Valid Accuracy of the model: 45.75 %
EPOCH 4:
Epoch [4/15], Loss: 11.1025
LOSS train 11.102519063949584 valid 11.125137367248534 test
Valid Accuracy of the model: 45.25 %
EPOCH 5:


KeyboardInterrupt: 

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
language='copa-sr-tor'
data_test = dataset(df_val[df_val.language==language])
test_loader = DataLoader(data_test, batch_size=len(data_test), shuffle=False)
#TRAINING WITH MBERT AND MARGIN LOSS
hidden_sizes=[512,128]
model = COPAX_ENTROPY(hidden_sizes=hidden_sizes,output_size=2,dropout=0.3,model_string='xlm-roberta-large').to(device)
model.load_state_dict(torch.load('model_large_entropy'))
outputs=[]
model=model.to(device)
with torch.no_grad():
        correct = 0
        total = 0
        total_losss =0
        model.eval()
        for i, vdata in enumerate(test_loader):
            premise,choice1,choice2,vlabel= vdata
            #vF1,vF2 = model(premise, choice1,choice2)
            #vF1=vF1.squeeze(1)
            #vF2=vF2.squeeze(1)
            #concat= torch.column_stack((vF1, vF2))
            out= model( premise,choice1,choice2)
            out= out.squeeze(1)
            softmax= nn.Softmax(dim=-1)
            predictions = (softmax(out))
            output= torch.argmax(predictions,dim=1)
            total += vlabel.size(0)
            or_label=(vlabel-1)/(-2)
            or_label= or_label.to(device,torch.long)
            correct += (output == or_label).sum().item()

print('Accuracy for language:',language,correct/total)


Accuracy for language: copa-sr-tor 0.47


In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
language='copa-sr-tor'
data_test = dataset(df_val[df_val.language==language])
test_loader = DataLoader(data_test, batch_size=len(data_test), shuffle=False)
#TRAINING WITH MBERT AND MARGIN LOSS
hidden_sizes=[512,128]
model = COPAX_ENTROPY(hidden_sizes=hidden_sizes,output_size=2,dropout=0.3,model_string='xlm-roberta-large').to(device)
model.load_state_dict(torch.load('model_large_entropy'))
outputs=[]
model=model.to(device)
with torch.no_grad():
        correct = 0
        total = 0
        total_losss =0
        model.eval()
        for i, vdata in enumerate(test_loader):
            premise,choice1,choice2,vlabel= vdata
            #vF1,vF2 = model(premise, choice1,choice2)
            #vF1=vF1.squeeze(1)
            #vF2=vF2.squeeze(1)
            #concat= torch.column_stack((vF1, vF2))
            out= model( premise,choice1,choice2)
            out= out.squeeze(1)
            softmax= nn.Softmax(dim=-1)
            predictions = (softmax(out))
            output= torch.argmax(predictions,dim=1)
            total += vlabel.size(0)
            or_label=(vlabel-1)/(-2)
            or_label= or_label.to(device,torch.long)
            correct += (output == or_label).sum().item()

print('Accuracy for language:',language,correct/total)


In [None]:
#TRAINING WITH XML-ROBERTA LARGE AND MARGIN LOSS
hidden_sizes=[512,128]
model= COPAX_MARGIN(hidden_sizes=hidden_sizes,output_size=1,dropout=0.2,model_string='xlm-roberta-large').to(device)
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-6,weight_decay=0.1)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.8)
loss_fn=nn.MarginRankingLoss(margin=0.3,reduction='sum')
train_losses, val_losses=train_margin(15,model,loss_fn,scheduler)

In [None]:
#TRAINING WITH MBERT AND MARGIN LOSS
hidden_sizes=[512,128]
model= COPAX_MARGIN(hidden_sizes=hidden_sizes,output_size=1,dropout=0.2,model_string='google-bert/bert-base-multilingual-cased').to(device)
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-6,weight_decay=0.1)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.8)
loss_fn=nn.MarginRankingLoss(margin=0.3,reduction='sum')
train_losses, val_losses=train_margin(10,model,loss_fn,scheduler)

In [None]:
from matplotlib.pylab import plt
from numpy import arange


# Generate a sequence of integers to represent the epoch numbers
epochs = range(10)

# Plot and label the training and validation loss values
plt.plot(epochs, train_losses, label='Training Loss')
plt.plot(epochs, val_losses, label='Validation Loss')

# Add in a title and axes labels
plt.title('Losses for MBERT with Margin Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend(loc='best')

# Display the plot
plt.show()

In [None]:
#TRAINING WITH MBERT AND ENTROPY LOSS
hidden_sizes=[512,128]
model= COPAX_ENTROPY(hidden_sizes=hidden_sizes,output_size=2,dropout=0.3,model_string='bert-base-multilingual-cased').to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-7,weight_decay=0.1)
loss_fn = nn.CrossEntropyLoss(reduction='sum')
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)
train_losses, val_losses=train_entropy(10,model,loss_fn,scheduler)

In [None]:
from matplotlib.pylab import plt
from numpy import arange


# Generate a sequence of integers to represent the epoch numbers
epochs = range(15)

# Plot and label the training and validation loss values
plt.plot(epochs, train_losses, label='Training Loss')
plt.plot(epochs, val_losses, label='Validation Loss')

# Add in a title and axes labels
plt.title('Losses for MBERT with Entropy Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend(loc='best')

# Display the plot
plt.show()

In [None]:
from matplotlib.pylab import plt
from numpy import arange


# Generate a sequence of integers to represent the epoch numbers
epochs = range(15)

# Plot and label the training and validation loss values
plt.plot(epochs, train_losses, label='Training Loss')
plt.plot(epochs, val_losses, label='Validation Loss')

# Add in a title and axes labels
plt.title('Losses for MBERT with Entropy Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend(loc='best')

# Display the plot
plt.show()

In [None]:
#TRAINING WITH XML-ROBERTA base AND ENTROPY LOSS
hidden_sizes=[512,256,128]
model= COPAX_ENTROPY(hidden_sizes=hidden_sizes,output_size=2,dropout=0.2,model_string='xlm-roberta-base').to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-7,weight_decay=0.1)
loss_fn = nn.CrossEntropyLoss(reduction='sum')
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)
train_losses, val_losses=train_entropy(15,model,loss_fn,scheduler)

In [None]:
from matplotlib.pylab import plt
from numpy import arange


# Generate a sequence of integers to represent the epoch numbers
epochs = range(15)

# Plot and label the training and validation loss values
plt.plot(epochs, train_losses, label='Training Loss')
plt.plot(epochs, val_losses, label='Validation Loss')

# Add in a title and axes labels
plt.title('Losses for MBERT with Entropy Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend(loc='best')

# Display the plot
plt.show()

In [None]:
from matplotlib.pylab import plt
from numpy import arange


# Generate a sequence of integers to represent the epoch numbers
epochs = range(15)

# Plot and label the training and validation loss values
plt.plot(epochs, train_losses, label='Training Loss')
plt.plot(epochs, val_losses, label='Validation Loss')

# Add in a title and axes labels
plt.title('Losses for MBERT with Entropy Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend(loc='best')

# Display the plot
plt.show()

In [None]:
from matplotlib.pylab import plt
from numpy import arange


# Generate a sequence of integers to represent the epoch numbers
epochs = range(15)

# Plot and label the training and validation loss values
plt.plot(epochs, train_losses, label='Training Loss')
plt.plot(epochs, val_losses, label='Validation Loss')

# Add in a title and axes labels
plt.title('Losses for MBERT with Entropy Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend(loc='best')

# Display the plot
plt.show()

In [None]:
torch.save(model.state_dict(), '/content/data/model.pt')

## Testing


In [None]:
df_test[df_test.language=='copa-hr-ckm']

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
data_test = dataset(df_test[df_test.language=='copa-sr-tor'])
test_loader = DataLoader(data_test, batch_size=len(data_test), shuffle=False)
model = COPAX()
model.load_state_dict(torch.load('drive/MyDrive/Colab Notebooks/VarDial/model.pt'))
outputs=[]
model=model.to(device)
with torch.no_grad():
        correct = 0
        total = 0
        total_losss =0
        model.eval()
        for hyp1, hyp2, _ in test_loader:
            F1,F2 = model(hyp1, hyp2)
            emb= torch.cat((F1,F2),dim=1)
            output= torch.argmax(emb,dim=1)
            outputs+=output.tolist()



In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
language='copa-en'
data_test = dataset(df_val[df_val.language=='copa-en'])
test_loader = DataLoader(data_test, batch_size=len(data_test), shuffle=False)
#TRAINING WITH MBERT AND MARGIN LOSS
hidden_sizes=[512,128]
model = COPAX_MARGIN(hidden_sizes=hidden_sizes,output_size=1,dropout=0.2,model_string='xlm-roberta-large').to(device)
model.load_state_dict(torch.load('model_large_margin'))
outputs=[]
model=model.to(device)
with torch.no_grad():
        correct = 0
        total = 0
        total_losss =0
        model.eval()
        for i, vdata in enumerate(test_loader):
            premise,choice1,choice2,vlabel= vdata
            vF1,vF2 = model(premise, choice1,choice2)
            vF1=vF1.squeeze(1)
            vF2=vF2.squeeze(1)
            concat= torch.column_stack((vF1, vF2))
            output= torch.argmax(concat,dim=1)
            total += vlabel.size(0)
            or_label=(vlabel-1)/(-2)
            correct += (output == or_label).sum().item()

print('Accuracy for language:',language,correct/total)


In [None]:
h=[1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0]

In [None]:
len(h)