# Setup

In [1]:
# Fetch the SemEval 2022 Task 2 data repository and the AStitchInLanguageModels repository.
# NOTE(review): none of the clones or installs below are pinned to a commit or version.
!git clone https://github.com/H-TayyarMadabushi/SemEval_2022_Task2-idiomaticity.git
!git clone https://github.com/H-TayyarMadabushi/AStitchInLanguageModels.git
# Install transformers from a source checkout in editable mode.
!git clone https://github.com/huggingface/transformers.git
%cd transformers/
!pip install --editable .
# Go back to /content/ so the relative data paths used later resolve from there.
%cd /content/ 
!pip install datasets

Cloning into 'SemEval_2022_Task2-idiomaticity'...
remote: Enumerating objects: 92, done.[K
remote: Counting objects: 100% (92/92), done.[K
remote: Compressing objects: 100% (75/75), done.[K
remote: Total 92 (delta 31), reused 67 (delta 15), pack-reused 0[K
Unpacking objects: 100% (92/92), done.
Cloning into 'AStitchInLanguageModels'...
remote: Enumerating objects: 1030, done.[K
remote: Counting objects: 100% (1030/1030), done.[K
remote: Compressing objects: 100% (772/772), done.[K
remote: Total 1030 (delta 382), reused 803 (delta 202), pack-reused 0[K
Receiving objects: 100% (1030/1030), 79.86 MiB | 32.80 MiB/s, done.
Resolving deltas: 100% (382/382), done.
Cloning into 'transformers'...
remote: Enumerating objects: 91488, done.[K
remote: Counting objects: 100% (5/5), done.[K
remote: Compressing objects: 100% (5/5), done.[K
remote: Total 91488 (delta 0), reused 1 (delta 0), pack-reused 91483[K
Receiving objects: 100% (91488/91488), 75.67 MiB | 27.88 MiB/s, done.
Resolving d

# Imports

In [2]:
import site
site.main()
import os
import csv
import os
import sys
import random
import pickle
import logging
import tqdm
from pathlib import Path

from typing          import Optional
from dataclasses     import dataclass, field

import numpy as np
from sklearn.metrics import f1_score, accuracy_score

from datasets        import load_dataset, load_metric

import transformers
from transformers import (
    AutoConfig,
    AutoModel,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    EvalPrediction,
    HfArgumentParser,
    PretrainedConfig,
    Trainer,
    TrainingArguments,
    default_data_collator,
    set_seed,
)
from transformers.utils         import check_min_version
from transformers.trainer_utils import get_last_checkpoint, is_main_process

from keras.preprocessing.sequence import pad_sequences

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler


# Helper Functions

In [3]:
def load_csv(path, delimiter=','):
  """Read a delimited text file and split it into a header row and data rows.

  Args:
    path: Location of the UTF-8 encoded file to read.
    delimiter: Field separator handed to ``csv.reader``.

  Returns:
    A ``(header, data)`` tuple. ``header`` is the first row as a list of
    strings (``None`` when the file contains no rows); ``data`` is the list
    of all remaining rows, each a list of strings.
  """
  # newline='' leaves newline handling to the csv module, so quoted fields
  # that contain line breaks are read back unchanged.
  with open(path, encoding='utf-8', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=delimiter)
    header = next(reader, None)
    data = list(reader)
  return header, data

class Node:
  """Plain record that keeps a sentence together with its label."""

  def __init__(self, sentence, label):
    # Both values are stored exactly as given; no conversion is applied.
    self.sentence, self.label = sentence, label

def create_idiom_dict_train(directory, file_name):
    """Group the rows of a training CSV by their multi-word expression.

    Args:
        directory: Folder that contains the CSV file.
        file_name: Name of the CSV file; rows are read through ``load_csv``
            and must provide 'Label', 'Target' and 'MWE' columns.

    Returns:
        Dict mapping each 'MWE' value to a list of ``Node`` objects, one per
        row in file order, holding that row's 'Target' sentence and 'Label'.
    """
    columns, rows = load_csv(os.path.join(directory, file_name))
    nodes_by_idiom = {}
    for row in rows:
        row_label = row[columns.index('Label')]
        target = row[columns.index('Target')]
        mwe = row[columns.index('MWE')]
        # setdefault creates the list the first time an idiom is seen.
        nodes_by_idiom.setdefault(mwe, []).append(Node(target, row_label))
    return nodes_by_idiom
d1 = create_idiom_dict_train('SemEval_2022_Task2-idiomaticity/SubTaskA/Data/', 'train_zero_shot.csv')
d2 = create_idiom_dict_train('SemEval_2022_Task2-idiomaticity/SubTaskA/Data/', 'train_one_shot.csv')
# Fold the one-shot examples into d1 so it holds every training sentence per idiom.
for key, value in d2.items():
  # extend, not append: each d1 value must stay a flat list of Node objects,
  # because later code iterates over it and reads .sentence / .label.
  # Extending a fresh list also keeps d1 from sharing list objects with d2.
  d1.setdefault(key, []).extend(value)

In [16]:
def _get_train_data(directory, file_name, include_context, include_idiom):
    """Build sentence-pair training rows from a training CSV.

    Each CSV row is paired with every other sentence recorded for the same
    MWE in the module-level ``d1`` dict.

    Args:
        directory: Folder that contains the CSV file.
        file_name: CSV with the columns
            ['DataID', 'Language', 'MWE', 'Setting', 'Previous', 'Target', 'Next', 'Label'].
        include_context: If true, the first sentence of each pair is the
            'Previous', 'Target' and 'Next' columns joined by single spaces;
            otherwise it is the 'Target' column alone.
        include_idiom: If true, the MWE is emitted after each sentence.

    Returns:
        A list whose first element is the header row and whose remaining
        elements are data rows. Without ``include_idiom`` a row is
        ``[label1, label2, sentence1, sentence2]``; with it a row is
        ``[label1, label2, sentence1, mwe, sentence2, mwe]``.

    Raises:
        KeyError: If a row's MWE is not a key of ``d1``.
    """
    header, data = load_csv(os.path.join(directory, file_name))

    if include_idiom:
        out_header = ['label1', 'label2', 'sentence1', 'sentence2', 'sentence3', 'sentence4']
    else:
        out_header = ['label1', 'label2', 'sentence1', 'sentence3']

    out_data = list()
    if not data:
        # Nothing to pair; also avoids touching a missing header.
        return [out_header] + out_data

    # Column positions do not change between rows, so resolve them once.
    label_col = header.index('Label')
    target_col = header.index('Target')
    mwe_col = header.index('MWE')
    if include_context:
        previous_col = header.index('Previous')
        next_col = header.index('Next')

    for elem1 in data:
        label = elem1[label_col]
        target = elem1[target_col]
        idiom = elem1[mwe_col]
        sentence1 = target
        if include_context:
            sentence1 = ' '.join([elem1[previous_col], target, elem1[next_col]])
        for elem2 in d1[idiom]:
            # Never pair a sentence with itself. d1 stores bare 'Target'
            # sentences, so the comparison must use the bare target; the
            # context-joined sentence1 would not match it.
            if elem2.sentence == target:
                continue
            if include_idiom:
                this_row = [label, elem2.label, sentence1, idiom, elem2.sentence, idiom]
            else:
                this_row = [label, elem2.label, sentence1, elem2.sentence]
            assert len(out_header) == len(this_row)
            out_data.append(this_row)

    return [out_header] + out_data

def _get_dev_data(directory, input_file_name, gold_file_name, include_context, include_idiom):
    """Build evaluation rows that pair each dev sentence with known examples.

    Every dev row is matched with the first (and, when ``include_idiom`` is
    set, also the second) sentence stored for the same MWE in the
    module-level ``d1`` dict. If only one sentence is stored, it is used for
    both reference slots.

    Args:
        directory: Folder that contains both CSV files.
        input_file_name: CSV with the columns
            ['ID', 'Language', 'MWE', 'Previous', 'Target', 'Next'].
        gold_file_name: CSV with the columns
            ['ID', 'DataID', 'Language', 'Label'], row-aligned with the input.
        include_context: If true, the dev sentence is the 'Previous',
            'Target' and 'Next' columns joined by single spaces.
        include_idiom: If true, rows carry two references and the MWE follows
            every sentence.

    Returns:
        A list whose first element is the header row, followed by one data
        row per dev example.
    """
    input_headers, input_data = load_csv(os.path.join(directory, input_file_name))
    gold_header, gold_data = load_csv(os.path.join(directory, gold_file_name))
    assert len(input_data) == len(gold_data)

    if include_idiom:
        out_header = ['label1', 'label2', 'label3', 'sentence1', 'sentence2', 'sentence3', 'sentence4', 'sentence5', 'sentence6']
    else:
        out_header = ['label1', 'label2', 'sentence1', 'sentence3']

    out_data = []
    for elem, gold_row in zip(input_data, gold_data):
        # Input and gold files must describe the same example on each line.
        assert elem[input_headers.index('ID')] == gold_row[gold_header.index('ID')]
        label = gold_row[gold_header.index('Label')]

        sentence1 = elem[input_headers.index('Target')]
        if include_context:
            sentence1 = ' '.join(
                elem[input_headers.index(column)] for column in ('Previous', 'Target', 'Next')
            )

        idiom = elem[input_headers.index('MWE')]
        other_nodes = d1[idiom]
        first = other_nodes[0]
        if include_idiom:
            # Reuse the only stored example when there is no second one.
            second = first if len(other_nodes) == 1 else other_nodes[1]
            this_row = [
                label, first.label, second.label,
                sentence1, idiom,
                first.sentence, idiom,
                second.sentence, idiom,
            ]
        else:
            this_row = [label, first.label, sentence1, first.sentence]

        assert len(out_header) == len(this_row)
        out_data.append(this_row)
    return [out_header] + out_data

In [5]:
def preprocess(input, tokenizer):
    """Tokenize a batch of training rows into two (sentence, idiom) encodings.

    Args:
        input: Sequence of rows laid out as
            ``[label1, label2, sentence1, idiom1, sentence2, idiom2]``.
            Both labels must be convertible with ``int``.
        tokenizer: Callable that accepts a list of text pairs together with
            ``padding``, ``truncation`` and ``return_tensors`` keyword
            arguments and returns a mapping with 'input_ids' and
            'attention_mask' entries.

    Returns:
        ``(input_ids1, attention_mask1, labels1,
        input_ids2, attention_mask2, labels2)`` where the label entries are
        tensors built from the integer labels.
    """
    label1, label2 = [], []
    input1, input2 = [], []
    for row in input:
        label1.append(int(row[0]))
        label2.append(int(row[1]))
        input1.append((row[2], row[3]))
        input2.append((row[4], row[5]))

    # truncation=True asks the tokenizer to cut over-long pairs down to its
    # maximum length instead of producing sequences the encoder cannot take.
    encoded_input1 = tokenizer(input1, padding=True, truncation=True, return_tensors="pt")
    encoded_input2 = tokenizer(input2, padding=True, truncation=True, return_tensors="pt")

    return (
        encoded_input1['input_ids'],
        encoded_input1['attention_mask'],
        torch.tensor(label1),
        encoded_input2['input_ids'],
        encoded_input2['attention_mask'],
        torch.tensor(label2),
    )

def preprocess_dev(input, tokenizer):
    """Tokenize a batch of evaluation rows into three (sentence, idiom) encodings.

    Args:
        input: Sequence of rows laid out as
            ``[label1, label2, label3, sentence1, idiom1, sentence2, idiom2,
            sentence3, idiom3]``. All labels must be convertible with ``int``.
        tokenizer: Callable that accepts a list of text pairs together with
            ``padding``, ``truncation`` and ``return_tensors`` keyword
            arguments and returns a mapping with 'input_ids' and
            'attention_mask' entries.

    Returns:
        A 9-tuple ``(input_ids1, attention_mask1, labels1, input_ids2,
        attention_mask2, labels2, input_ids3, attention_mask3, labels3)``.
    """
    n_slots = 3
    labels = [[] for _ in range(n_slots)]
    pairs = [[] for _ in range(n_slots)]
    for row in input:
        for slot in range(n_slots):
            labels[slot].append(int(row[slot]))
            # Sentence/idiom pairs start at column 3 and occupy two columns each.
            pairs[slot].append((row[3 + 2 * slot], row[4 + 2 * slot]))

    outputs = []
    for slot in range(n_slots):
        # truncation=True asks the tokenizer to cut over-long pairs down to
        # its maximum length instead of producing sequences the encoder
        # cannot take.
        encoded = tokenizer(pairs[slot], padding=True, truncation=True, return_tensors="pt")
        outputs.extend([
            encoded['input_ids'],
            encoded['attention_mask'],
            torch.tensor(labels[slot]),
        ])
    return tuple(outputs)

In [6]:
def shuffle_data(data):
    """Return a new list holding the elements of ``data`` in random order.

    ``data`` itself is not modified. The permutation is drawn from the global
    ``random`` module state by shuffling the index positions.
    """
    order = list(range(len(data)))
    random.shuffle(order)
    return [data[position] for position in order]

In [32]:
def train(model, train_data, tokenizer, lr, n_epoch, batch_size):
    """Train the pairwise model to predict whether two sentences share a label.

    Uses the module-level ``device`` to place each batch, ``shuffle_data`` to
    order the rows and ``preprocess`` to tokenize them. The mean batch loss is
    printed after every epoch.

    Args:
        model: Module called as ``model(input_ids1, attn_mask1, input_ids2,
            attn_mask2)``; its output is compared against a column of targets.
        train_data: Sequence of data rows (no header row) in the layout
            expected by ``preprocess``.
        tokenizer: Tokenizer forwarded to ``preprocess``.
        lr: Learning rate for the Adam optimizer.
        n_epoch: Number of passes over ``train_data``.
        batch_size: Number of rows per optimisation step.
    """
    print("Start Training!")
    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    for epoch in range(n_epoch):
        torch.cuda.empty_cache()
        print(f"\nEpoch {epoch}")
        # Draw a fresh order every epoch so batches are not identical across
        # epochs.
        shuffled_train_data = shuffle_data(train_data)
        total_loss = 0.0
        total_steps = 0
        for step in tqdm.notebook.tqdm(range(0, len(shuffled_train_data), batch_size), leave=False):
            batch = preprocess(shuffled_train_data[step:(step + batch_size)], tokenizer)
            batch = tuple(t.to(device) for t in batch)
            model.zero_grad()
            logits = model(batch[0], batch[1], batch[3], batch[4])
            # Target is 1 when both sentences carry the same label, else 0.
            target = (batch[2] == batch[5]).to(logits.dtype)
            loss = nn.functional.mse_loss(logits, target.reshape(-1, 1))
            loss.backward()
            # Clip before stepping so one batch cannot blow up the update.
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            total_loss += loss.item()
            total_steps += 1
        if total_steps == 0:
            # No batches were produced (empty train_data): avoid dividing by zero.
            print("Train loss on epoch {}: n/a (no training data)\n".format(epoch))
        else:
            print("Train loss on epoch {}: {}\n".format(epoch, total_loss / total_steps))

def eval(model, eval_data, tokenizer, batch_size):
    """Evaluate the pairwise model on dev rows and print the accuracy.

    Each dev sentence is compared with two labelled reference sentences. One
    comparison is selected, and the prediction is that reference's label when
    the model output is at least 0.5, or the flipped label otherwise.

    Uses the module-level ``device`` to place each batch and
    ``preprocess_dev`` to tokenize the rows.

    Args:
        model: Module called as ``model(input_ids1, attn_mask1, input_ids2,
            attn_mask2)``.
        eval_data: Sequence of data rows (no header row) in the layout
            expected by ``preprocess_dev``.
        tokenizer: Tokenizer forwarded to ``preprocess_dev``.
        batch_size: Number of rows scored per forward pass.
    """
    model.eval()
    num_correct = 0
    num_examples = 0
    for step in tqdm.notebook.tqdm(range(0, len(eval_data), batch_size), leave=False):
        batch = preprocess_dev(eval_data[step:(step + batch_size)], tokenizer)
        batch = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            logits1 = model(batch[0], batch[1], batch[3], batch[4]) #B, 1
            logits2 = model(batch[0], batch[1], batch[6], batch[7]) #B, 1
        logits = torch.cat((logits1, logits2), dim=1) #B, 2
        # NOTE(review): abs(logits - 1) is largest for the reference with the
        # score furthest from 1. If the intent was to pick the comparison the
        # model is most confident about, abs(logits - 0.5) would be needed --
        # confirm before changing, since it alters the reported accuracy.
        shifted_logits = torch.abs(logits - 1)
        max_arg = torch.argmax(shifted_logits, dim=1, keepdim=True) #B, 1 - values 0 or 1
        chosen_logit = logits.gather(1, max_arg).squeeze(1) #B
        reference_labels = torch.stack((batch[5], batch[8]), dim=1) #B, 2
        chosen_label = reference_labels.gather(1, max_arg).squeeze(1) #B
        # Below 0.5 the model says "different label": toggle 0 <-> 1.
        batch_predictions = torch.where(chosen_logit < 0.5, 1 - chosen_label, chosen_label)
        num_correct += (batch_predictions == batch[2]).sum().item()
        num_examples += batch[2].numel()
    if num_examples == 0:
        # Nothing was scored (empty eval_data): avoid dividing by zero.
        print("\nAccuracy: n/a (no evaluation data)")
        return
    print("\nAccuracy: %s" % (float(num_correct) / float(num_examples)))

# Data Loader

In [17]:
# Pairwise training rows from both training files: bare target sentence, idiom included.
train_zero_data = _get_train_data(
    'SemEval_2022_Task2-idiomaticity/SubTaskA/Data/',
    'train_zero_shot.csv',
    include_context=False,
    include_idiom=True,
)
train_one_data = _get_train_data(
    'SemEval_2022_Task2-idiomaticity/SubTaskA/Data/',
    'train_one_shot.csv',
    include_context=False,
    include_idiom=True,
)

assert train_zero_data[0] == train_one_data[0] ## Headers
# Keep one header row only: one-shot rows (with header) followed by zero-shot data rows.
train_data = train_one_data + train_zero_data[1:]

# Evaluation rows built from the dev inputs and their gold labels.
dev_data = _get_dev_data(
    'SemEval_2022_Task2-idiomaticity/SubTaskA/Data/',
    'dev.csv',
    'dev_gold.csv',
    include_context=False,
    include_idiom=True,
)

# Config

In [18]:
# Hyper-parameters and model choice used by the cells below.
batch_size = 32  # rows per batch; passed to both train() and eval()
learning_rate = 0.0001  # passed to train(), which hands it to the Adam optimizer
num_epoch = 2  # number of passes train() makes over the training rows
dropout_rate = 0.25  # dropout probability handed to RelationNetwork
hf_model = 'distilbert-base-uncased'  # checkpoint name used for both the tokenizer and the base model

# Model

In [21]:
class RelationNetwork(nn.Module):
    """Scores how related two tokenized inputs are with a shared encoder.

    Both inputs go through the same pretrained encoder. The hidden state at
    position 0 of each is concatenated and passed through a three-layer
    feed-forward head that ends in a sigmoid, giving one value in (0, 1) per
    pair.

    Args:
        hf_model: Name or path handed to ``AutoModel.from_pretrained``.
        dropout_rate: Dropout probability applied after each of the first two
            feed-forward layers.
    """

    def __init__(self, hf_model, dropout_rate=0.25):
        super().__init__()
        self.base_model = AutoModel.from_pretrained(hf_model)
        # Size the head from the encoder's configuration rather than assuming
        # a 768-wide model, so other checkpoints can be plugged in.
        hidden_size = self.base_model.config.hidden_size
        self.dropout = nn.Dropout(dropout_rate)
        self.feedforward_1 = nn.Linear(hidden_size * 2, 300)
        self.non_lin_1 = nn.PReLU()
        self.feedforward_2 = nn.Linear(300, 300)
        self.non_lin_2 = nn.PReLU()
        self.feedforward_3 = nn.Linear(300, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids1, attn_mask1, input_ids2, attn_mask2):
        """Return one sigmoid score per pair, shaped (batch, 1)."""
        # Position 0 of the last hidden state is used as the summary vector of
        # each input.
        outputs1 = self.base_model(input_ids1, attention_mask=attn_mask1).last_hidden_state[:, 0]
        outputs2 = self.base_model(input_ids2, attention_mask=attn_mask2).last_hidden_state[:, 0]
        concatenated_output = torch.cat((outputs1, outputs2), dim=1)
        f1 = self.dropout(self.non_lin_1(self.feedforward_1(concatenated_output)))
        f2 = self.dropout(self.non_lin_2(self.feedforward_2(f1)))
        return self.sigmoid(self.feedforward_3(f2))

In [27]:
tokenizer = AutoTokenizer.from_pretrained(hf_model)
model = RelationNetwork(hf_model, dropout_rate)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.empty_cache()

# Place the model with .to(device) rather than .cuda() so the cell also runs
# on CPU-only runtimes, where device falls back to "cpu".
model.to(device)
if n_gpu > 1:
    # Spread batches over all visible GPUs.
    model = torch.nn.DataParallel(model)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Train

In [28]:
# train_data[0] is the header row, so only the data rows are passed on.
train(model, train_data[1:], tokenizer, learning_rate, num_epoch, batch_size)

Start Training!

Epoch 0


  0%|          | 0/2694 [00:00<?, ?it/s]

Train loss on epoch 0: 0.044287449634244413


Epoch 1


  0%|          | 0/2694 [00:00<?, ?it/s]

Train loss on epoch 1: 0.050679691485157656



# Eval

In [33]:
# dev_data[0] is the header row, so only the data rows are evaluated.
eval(model, dev_data[1:], tokenizer, batch_size)

  0%|          | 0/24 [00:00<?, ?it/s]


Accuracy: 0.7591339648173207
