In [None]:
import torch
# Run on the first CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

In [None]:
import pandas as pd
# Read the train / dev / unlabeled-test CSVs from Drive. The files are read with
# `header = None`, so the resulting columns are labelled 0, 1, 2, ...
train_df, val_df, test_df = [
    pd.read_csv(csv_path, header = None)
    for csv_path in (
        '/content/drive/My Drive/hw4/pnli_train.csv',
        '/content/drive/My Drive/hw4/pnli_dev.csv',
        '/content/drive/My Drive/hw4/pnli_test_unlabeled.csv',
    )
]

In [None]:
import nlpaug
import transformers
import sentencepiece
import nlpaug.augmenter.word as naw

In [None]:
# Pull the three columns of the training frame out as plain Python lists:
# column 0 -> x, column 1 -> y, column 2 -> label (coerced to int).
# Reading whole columns avoids building one Series per row with `iloc` and keeps
# helper names such as `i` / `data` out of the notebook namespace.
x = train_df[0].tolist()
y = train_df[1].tolist()
label = [int(value) for value in train_df[2].tolist()]

In [None]:
# Paraphrase both text lists by translating English -> Arabic -> English with the
# Helsinki-NLP opus-mt models. `x` is augmented first, then `y`.
back_translation_aug = naw.BackTranslationAug(
    from_model_name = "Helsinki-NLP/opus-mt-en-ar",
    to_model_name = "Helsinki-NLP/opus-mt-ar-en",
    device = device,
)
x_aug, y_aug = (back_translation_aug.augment(texts) for texts in (x, y))

In [None]:
# Wrap each of the three parallel lists (augmented x, augmented y, labels) in its own DataFrame.
x_aug_df, y_aug_df, label_df = (
    pd.DataFrame(values) for values in (x_aug, y_aug, label)
)

In [None]:
# Put the two augmented text frames side by side, then append the labels as the last column.
df = pd.concat((x_aug_df, y_aug_df), axis = "columns")
aug_df = pd.concat((df, label_df), axis = "columns")
# `to_csv` is called with its default options here. After saving, the file is edited
# by hand so that its layout matches the training CSV.
aug_df.to_csv("/content/drive/My Drive/hw4/back_translated_augmented_data_ar.csv")

In [None]:
import pandas as pd
# Reload the frames used for fine-tuning; this rebinds `train_df` and `test_df`.
# The dev split is not loaded separately in this cell (presumably
# `pnli_val_train.csv` already combines train and dev rows -- TODO confirm).
train_df, test_df = [
    pd.read_csv(csv_path, header = None)
    for csv_path in (
        '/content/drive/My Drive/hw4/pnli_val_train.csv',
        '/content/drive/My Drive/hw4/pnli_test_unlabeled.csv',
    )
]

In [None]:
# Display the training frame as the cell output to check what was loaded.
train_df

In [None]:
# Display the unlabeled test frame as the cell output to check what was loaded.
test_df

In [None]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, TensorDataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW

class DataDeBerta(Dataset):
    """Tokenise text pairs for the DeBERTa NLI cross-encoder and expose DataLoaders.

    Both frames are expected to have integer column labels (as produced by
    ``pd.read_csv(..., header=None)``): column 0 and column 1 hold the two texts
    of each pair, and the training frame additionally holds an integer-convertible
    label in column 2.

    Each pair is encoded as ``[CLS] first [SEP] second [SEP]``. No truncation is
    applied, so a batch is as long as the longest pair in the frame.

    The tokenised data is stored as ``TensorDataset`` objects in ``train_data``
    and ``test_data``; use ``get_data_loaders`` to iterate over them.
    """

    def __init__(self, train_df, test_df):
        """Store the frames, load the tokenizer and tokenise both frames.

        Args:
            train_df: labelled frame (columns 0, 1 = texts, column 2 = label).
            test_df: unlabeled frame (columns 0, 1 = texts).
        """
        self.train_df = train_df
        self.test_df = test_df
        self.train_data = None
        self.test_data = None
        self.tokenizer = AutoTokenizer.from_pretrained('cross-encoder/nli-deberta-v3-large', do_lower_case = True)
        self.init_data()

    def init_data(self):
        """(Re)build ``train_data`` and ``test_data`` from the stored frames."""
        self.train_data = self.load_data(self.train_df)
        self.test_data = self.load_test_data(self.test_df)

    def _encode_pairs(self, first_texts, second_texts):
        """Encode parallel lists of texts into padded id / mask / segment tensors.

        Returns:
            Tuple ``(token_ids, mask_ids, seg_ids)`` of 2-D tensors with one row
            per pair, right-padded to the longest pair.
        """
        cls_id = self.tokenizer.cls_token_id
        sep_id = self.tokenizer.sep_token_id

        token_ids, mask_ids, seg_ids = [], [], []
        for first, second in zip(first_texts, second_texts):
            first_ids = self.tokenizer.encode(first, add_special_tokens = False)
            second_ids = self.tokenizer.encode(second, add_special_tokens = False)
            pair_ids = [cls_id] + first_ids + [sep_id] + second_ids + [sep_id]

            token_ids.append(torch.tensor(pair_ids))
            # Segment 0 covers [CLS] + first text + [SEP]; segment 1 covers second text + [SEP].
            seg_ids.append(torch.tensor([0] * (len(first_ids) + 2) + [1] * (len(second_ids) + 1)))
            # Real tokens get mask 1; padding added below gets mask 0.
            mask_ids.append(torch.tensor([1] * len(pair_ids)))

        # Pad token ids with the tokenizer's pad id when it defines one (0 otherwise).
        pad_id = getattr(self.tokenizer, 'pad_token_id', None)
        if pad_id is None:
            pad_id = 0

        token_ids = pad_sequence(token_ids, batch_first = True, padding_value = pad_id)
        mask_ids = pad_sequence(mask_ids, batch_first = True)
        seg_ids = pad_sequence(seg_ids, batch_first = True)
        return token_ids, mask_ids, seg_ids

    def load_data(self, df):
        """Tokenise a labelled frame.

        Args:
            df: frame with texts in columns 0 and 1 and labels in column 2.

        Returns:
            ``TensorDataset(token_ids, mask_ids, seg_ids, y)``. The dataset
            length is printed as a quick sanity check.
        """
        token_ids, mask_ids, seg_ids = self._encode_pairs(df[0].to_list(), df[1].to_list())
        y = torch.tensor([int(label) for label in df[2].to_list()])
        dataset = TensorDataset(token_ids, mask_ids, seg_ids, y)
        print(len(dataset))
        return dataset

    def load_test_data(self, df):
        """Tokenise an unlabeled frame.

        Args:
            df: frame with texts in columns 0 and 1 (no label column is read).

        Returns:
            ``TensorDataset(token_ids, mask_ids, seg_ids)``.
        """
        token_ids, mask_ids, seg_ids = self._encode_pairs(df[0].to_list(), df[1].to_list())
        return TensorDataset(token_ids, mask_ids, seg_ids)

    def get_data_loaders(self, batch_size = 32):
        """Return ``(train_loader, test_loader)``.

        The training loader is shuffled; the test loader keeps the frame order so
        predictions line up with the rows of the test frame.
        """
        train_loader = DataLoader(self.train_data, batch_size = batch_size, shuffle = True)
        test_loader = DataLoader(self.test_data, batch_size = batch_size)
        return train_loader, test_loader

In [None]:
# Tokenise both frames once, then wrap them in DataLoaders with 16 pairs per batch.
train_dataset = DataDeBerta(train_df, test_df)
train_loader, test_loader = train_dataset.get_data_loaders(batch_size = 16)
# Number of batches in each loader, one count per line.
print(len(train_loader), len(test_loader), sep = '\n')

In [None]:
# Load the NLI cross-encoder checkpoint with a 2-label classification head.
# `ignore_mismatched_sizes = True` is passed because the requested head size may
# differ from the one stored in the checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(
    'cross-encoder/nli-deberta-v3-large',
    num_labels = 2,
    ignore_mismatched_sizes = True,
)
# Move the weights to the selected device (as the last expression, this also displays the model).
model.to(device)

In [None]:
# Split the model's parameters into two optimizer groups by name:
#   * parameters whose name contains none of the `no_decay` substrings -> decay 0.01
#   * parameters whose name contains one of them                        -> decay 0.005
# The per-group option must be spelled `weight_decay`: that is the key the optimizer
# reads. Under the previous spelling (`weight_decay_rate`) the values were carried
# along but never applied, so every group silently used the optimizer's default.
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.005}
]

In [None]:
# Build the optimizer from `optimizer_grouped_parameters`. It carries all of the
# hyperparameter information the training loop needs: the learning rate (2e-5),
# the `correct_bias = False` flag, and the per-group options.
optimizer = AdamW(optimizer_grouped_parameters, lr = 2e-5, correct_bias = False)

In [None]:
def multi_acc(y_pred, y_test):
  """Return the fraction of rows whose highest-scoring class equals the target.

  `y_pred` holds one row of class scores per example (classes along dim 1) and
  `y_test` the target class indices. The result is a 0-dim float tensor.
  """
  predicted_classes = torch.argmax(torch.log_softmax(y_pred, dim=1), dim=1)
  num_correct = torch.eq(predicted_classes, y_test).sum().float()
  batch_size = float(y_test.size(0))
  return num_correct / batch_size

In [None]:
class PreconditionInference:
  """Fine-tune a 2-class sequence-classification model and predict labels with it.

  The model is called as
  ``model(input_ids, token_type_ids=..., attention_mask=..., labels=...)`` and its
  output must expose ``.loss`` (when labels are given) and ``.logits``.
  Batches are moved to whatever device the model's parameters live on.
  """

  def __init__(self, model, train_loadder, test_loader, optimizer, epochs = 3):
    """Store the collaborators.

    Args:
      model: the classifier to train / run.
      train_loadder: iterable of ``(token_ids, mask_ids, segment_ids, labels)``
        batches (the parameter keeps its original spelling so keyword callers
        continue to work).
      test_loader: iterable of ``(token_ids, mask_ids, segment_ids)`` batches.
      optimizer: optimizer already bound to the model's parameters.
      epochs: number of passes over the training loader.
    """
    self.epochs = epochs
    self.model = model
    self.optimizer = optimizer
    self.test_loader = test_loader
    self.train_loader = train_loadder

  def _model_device(self):
    """Return the device of the model's first parameter (CPU if it has none)."""
    first_param = next(iter(self.model.parameters()), None)
    if first_param is None:
      return torch.device("cpu")
    return first_param.device

  def multi_acc(self, y_pred, y_test):
    """Return the fraction of rows of `y_pred` whose arg-max equals `y_test` (0-dim tensor)."""
    # argmax over raw scores picks the same class as argmax over log-softmax scores.
    return (y_pred.argmax(dim=1) == y_test).sum().float() / float(y_test.size(0))

  def train(self):
    """Run `epochs` passes over the training loader, printing mean loss/accuracy per epoch.

    The model is switched to train mode at the start of every epoch and left in
    eval mode at the end of every epoch.
    """
    device = self._model_device()
    # Guard the per-epoch averages against an empty loader.
    num_batches = max(len(self.train_loader), 1)

    for epoch in range(self.epochs):
      self.model.train()
      total_train_loss, total_train_acc = 0.0, 0.0

      for pair_token_ids, mask_ids, segment_ids, y in self.train_loader:
        self.optimizer.zero_grad()
        pair_token_ids = pair_token_ids.to(device)
        mask_ids = mask_ids.to(device)
        segment_ids = segment_ids.to(device)
        labels = y.to(device)

        outputs = self.model(pair_token_ids, token_type_ids = segment_ids, attention_mask = mask_ids, labels = labels)
        # Read the fields by name instead of relying on the order of `.values()`.
        loss, logits = outputs.loss, outputs.logits
        acc = self.multi_acc(logits.detach(), labels)

        loss.backward()
        self.optimizer.step()

        total_train_loss += loss.item()
        total_train_acc += acc.item()

      train_acc = total_train_acc / num_batches
      train_loss = total_train_loss / num_batches
      self.model.eval()
      print(f'Epoch {epoch+1}: train_loss: {train_loss:.4f} train_acc: {train_acc:.4f}')

  def predict(self):
    """Predict a label for every example of the test loader, in loader order.

    The model is put into eval mode first (and left there), so the result does
    not depend on whether `train` was called beforehand.

    Returns:
      list of Python ints: 0 when the class-0 logit is strictly greater than the
      class-1 logit, otherwise 1.
    """
    device = self._model_device()
    self.model.eval()

    labels = []
    with torch.no_grad():
      for pair_token_ids, mask_ids, seg_ids in self.test_loader:
        pair_token_ids = pair_token_ids.to(device)
        mask_ids = mask_ids.to(device)
        seg_ids = seg_ids.to(device)
        logits = self.model(pair_token_ids, token_type_ids = seg_ids, attention_mask = mask_ids).logits
        # Decide the whole batch in one tensor op; ties resolve to label 1.
        batch_labels = (~(logits[:, 0] > logits[:, 1])).long()
        labels.extend(batch_labels.tolist())
    return labels


In [None]:
# Bundle the model, both loaders and the optimizer; `epochs` is left at its default.
preconditionInference = PreconditionInference(model, train_loader, test_loader, optimizer)

In [None]:
# Fine-tune the model; one summary line (loss / accuracy) is printed per epoch.
preconditionInference.train()

In [None]:
# Predict a 0/1 label for every example in the test loader.
predictions = preconditionInference.predict()

In [None]:
# Write one predicted label per line. A generator is used instead of a `for x in ...`
# loop so that the notebook-global `x` (the text list built earlier) is not rebound.
with open('/content/drive/My Drive/hw4/upload_predictions.txt', 'w', encoding = 'utf-8') as fp:
    fp.writelines(str(pred) + '\n' for pred in predictions)