# Approach C Training: Fine-tuned DeBERTa for NLI

## Imports

In [1]:
!pip install transformers[torch]
!pip install datasets

Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-0.29.3-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.6/297.6 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->transformers[torch])
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->transformers[torch])
  Using cached nvidia_cublas_cu

In [2]:
from datasets import Dataset,load_metric
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DefaultDataCollator, EvalPrediction, set_seed
import torch
import numpy as np
import pandas as pd
import os
import json
import random
import nltk
from nltk.corpus import wordnet
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [3]:
# Fetch the WordNet corpus; augment_data looks up synonyms through nltk's `wordnet`.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [4]:
# Mount Google Drive at /content/drive; the paths in the Constants cell point into it.
# NOTE(review): those paths are relative ('./drive/...'), so they presumably assume the
# working directory is /content — confirm.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Constants

In [5]:
# ---- Locations on the mounted Drive -------------------------------------
_NLU_ROOT = './drive/MyDrive/Colab/NLU_CW'
_TUNING_ROOT = _NLU_ROOT + '/tuning-iterations'
_DATA_EXPERIMENT_ROOT = _NLU_ROOT + '/data-experiment'

# Train / validation CSV files
TRAIN_DIR = _NLU_ROOT + '/train.csv'
VAL_DIR = _NLU_ROOT + '/dev.csv'

# One result folder per hyperparameter-tuning iteration
ITERATION_1_PATH, ITERATION_2_PATH, ITERATION_3_PATH = (
    f'{_TUNING_ROOT}/{iteration}' for iteration in (1, 2, 3)
)

# One result folder per training-set variant
FULL_DATA_TRAIN_PATH = _DATA_EXPERIMENT_ROOT + '/full_data'
TEN_P_DATA_TRAIN_PATH = _DATA_EXPERIMENT_ROOT + '/10p'
AUGMENTED_DATA_PATH = _DATA_EXPERIMENT_ROOT + '/augmented'

# ---- Model checkpoint and dataset column names --------------------------
BASE_MODEL = 'microsoft/deberta-v3-base'
PREMISE, HYPOTHESIS, LABEL = 'premise', 'hypothesis', 'label'

# ---- Tunable values ------------------------------------------------------
MAX_INPUT_LENGTH = 300
SEED = 42

# ---- Data augmentation: fraction of words deleted / replaced by a synonym
PERCENT_WORD_DEL = 0.1
PERCENT_WORD_SYNONYM = 0.1

## Base DeBERTa Model

In [6]:
# Load the pretrained checkpoint named by BASE_MODEL together with its tokenizer.
# (DeBERTa = Decoding-enhanced BERT with Disentangled Attention)
model_name = BASE_MODEL
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_name),
    AutoModelForSequenceClassification.from_pretrained(model_name),
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Move the model to `device` (CUDA when available, otherwise CPU — see the imports cell).
model.to(device)

DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine

In [None]:
# Smoke test: push one premise/hypothesis pair through the freshly loaded model.
premise = "The cat jumped over the lazy dog."
hypothesis = "The dog was lazy."

# Named `encoded` rather than `input` so the Python builtin is not shadowed.
encoded = tokenizer(premise, hypothesis, truncation=True, return_tensors="pt").to(device)

# Forward the complete encoding (not only input_ids) and skip gradient tracking,
# since this is inference only.
with torch.no_grad():
    output = model(**encoded)

probabilities = torch.softmax(output["logits"][0], -1).tolist()
# Index 0 / index 1, matching the dataset's label column (0 = contradiction, 1 = entailment).
label_names = ["contradiction", "entailment"]
prediction = {name: round(float(prob) * 100, 1) for prob, name in zip(probabilities, label_names)}

print(prediction)

## Data

### Helper Functions

In [8]:
def create_input_sequence(sample):
  """Tokenize premise/hypothesis pairs for `Dataset.map`.

  Accepts either a single example (column values are scalars) or a batch as
  passed by `map(..., batched=True)` (column values are lists).

  The previous version called `str()` on the whole column value, so in batched
  mode the tokenizer received the list's repr (e.g. "['some premise']") instead
  of the sentence itself. Each element is now converted individually.
  NOTE(review): models fine-tuned on the old encoding saw the bracket/quote
  wrapped text; re-train (or re-evaluate) after adopting this fix.

  Args:
    sample: mapping with "premise", "hypothesis" and "label" entries.

  Returns:
    The tokenizer encoding, extended with:
      "labels"         - the label(s) copied from `sample`
      "input_sentence" - the decoded input_ids, for inspection
  """
  premises = sample["premise"]
  hypotheses = sample["hypothesis"]
  labels = sample["label"]

  is_batch = isinstance(premises, (list, tuple))
  if is_batch:
    # str() per element keeps non-string cells (e.g. missing values) tokenizable.
    premises = [str(premise) for premise in premises]
    hypotheses = [str(hypothesis) for hypothesis in hypotheses]
    labels = list(labels)
  else:
    premises = str(premises)
    hypotheses = str(hypotheses)

  # Every example is truncated/padded to exactly 512 tokens. Overflowing tokens are
  # not returned, so one input row always yields exactly one output row.
  encoded_sequence = tokenizer(premises, hypotheses, truncation=True, padding='max_length', max_length=512)
  encoded_sequence['labels'] = labels

  # Decode the input_ids back to text
  if is_batch:
    encoded_sequence["input_sentence"] = tokenizer.batch_decode(encoded_sequence["input_ids"])
  else:
    encoded_sequence["input_sentence"] = tokenizer.decode(encoded_sequence["input_ids"])

  return encoded_sequence

In [9]:
def compute_metrics(p: EvalPrediction):
  """Compute accuracy and macro-averaged precision / recall / F1 for the Trainer.

  The metrics are computed directly with NumPy, so nothing is downloaded or
  re-loaded on each evaluation pass.

  Args:
    p: object exposing `predictions` (logits, or a tuple whose first element
       is the logits) and `label_ids` (reference class ids).

  Returns:
    dict with float values under "accuracy", "precision", "recall" and "f1".
    A class that is never predicted contributes precision 0, and a class that
    never occurs in the references contributes recall 0.
  """
  # Extracting the logits from the EvalPrediction object
  logits = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions

  # Predicted class = index of the largest logit
  preds = np.argmax(logits, axis=1)
  refs = np.asarray(p.label_ids)

  if refs.size == 0:
    # Nothing to score; avoid NaN from averaging empty arrays.
    return {"accuracy": 0.0, "precision": 0.0, "recall": 0.0, "f1": 0.0}

  precisions, recalls, f1_scores = [], [], []
  # Macro average over every class seen in either the references or the predictions.
  for cls in np.union1d(refs, preds):
    true_positives = np.sum((preds == cls) & (refs == cls))
    predicted_count = np.sum(preds == cls)
    actual_count = np.sum(refs == cls)

    precision = true_positives / predicted_count if predicted_count else 0.0
    recall = true_positives / actual_count if actual_count else 0.0
    denominator = precision + recall
    f1 = 2 * precision * recall / denominator if denominator else 0.0

    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)

  return {
    "accuracy": float(np.mean(preds == refs)),
    "precision": float(np.mean(precisions)),
    "recall": float(np.mean(recalls)),
    "f1": float(np.mean(f1_scores)),
  }

In [10]:
def _drop_random_words(sentence, rng):
  """Delete ~PERCENT_WORD_DEL of the words (at least one); None if fewer than 2 words."""
  words = sentence.split(' ')
  if len(words) < 2:
    return None

  drop_count = int(len(words) * PERCENT_WORD_DEL)
  if drop_count == 0 or drop_count > len(words):
    drop_count = 1
  dropped = set(rng.sample(range(len(words)), drop_count))
  return " ".join(word for index, word in enumerate(words) if index not in dropped)


def _drop_random_span(sentence, rng):
  """Delete two adjacent words at a random position; None if fewer than 3 words."""
  words = sentence.split(' ')
  if len(words) < 3:
    return None

  start = rng.randrange(len(words) - 1)
  del words[start:start + 2]
  return " ".join(words)


def _swap_random_spans(sentence, rng):
  """Swap two non-overlapping two-word spans; None if fewer than 5 words."""
  words = sentence.split()
  if len(words) < 5:
    return None

  # Valid start pairs are enumerated up front instead of re-drawing until a pair
  # fits: for a 5-word sentence a first span starting at 1 has no valid partner,
  # so the former retry loop could spin forever.
  last_start = len(words) - 3
  candidates = [
    (first, second)
    for first in range(1, last_start + 1)
    for second in range(0, last_start + 1)
    if abs(first - second) >= 2
  ]
  first, second = rng.choice(candidates)
  words[first:first + 2], words[second:second + 2] = \
    words[second:second + 2], words[first:first + 2]
  return ' '.join(words)


def _replace_with_synonyms(sentence, rng):
  """Replace ~PERCENT_WORD_SYNONYM of the words (at least one) with a WordNet synonym.

  Words without any synonym are left unchanged, so the result can equal the input.
  """
  words = sentence.split(' ')
  replace_count = min(len(words), max(1, int(len(words) * PERCENT_WORD_SYNONYM)))

  synonym_mapping = {}
  for word in rng.sample(words, replace_count):
    synonyms = {lemma.name() for synset in wordnet.synsets(word) for lemma in synset.lemmas()}
    synonyms.discard(word)
    if not synonyms:
      continue
    # sorted(): set iteration order is not stable across runs, which would defeat `seed`.
    # Multi-word lemma names use '_' as separator; turn them back into spaces.
    synonym_mapping[word] = rng.choice(sorted(synonyms)).replace('_', ' ')

  return ' '.join(synonym_mapping.get(word, word) for word in words)


def _mismatched_pair(pool, rng):
  """Draw two different sentences from `pool`; None if it holds fewer than two distinct ones."""
  first = rng.choice(pool)
  others = [sentence for sentence in pool if sentence != first]
  if not others:
    return None
  return first, rng.choice(others)


def augment_data(samples, seed=None):
  """Build an augmented premise/hypothesis/label DataFrame from a few samples.

  For every sample, its premise and its hypothesis are each paired with four
  perturbed copies of themselves (word deletion, span deletion, span reordering,
  synonym replacement) and labelled 1. An equal number of label-0 rows is then
  created by pairing two *different* randomly chosen premises, or two different
  randomly chosen hypotheses. Finally the original samples are appended unchanged.

  Args:
    samples: iterable of mappings with PREMISE, HYPOTHESIS and LABEL keys.
    seed: optional int; when given, a private `random.Random(seed)` is used so the
      result is reproducible. When None the global `random` module is used.

  Returns:
    pandas.DataFrame with columns [PREMISE, HYPOTHESIS, LABEL].
  """
  rng = random if seed is None else random.Random(seed)
  samples = list(samples)  # iterated several times below
  augmented_samples = []

  # DATA AUGMENTATION FOR POSITIVE SAMPLES
  transforms = (_drop_random_words, _drop_random_span, _swap_random_spans, _replace_with_synonyms)
  for sample in samples:
    for transform in transforms:
      for original in (sample[PREMISE], sample[HYPOTHESIS]):
        augmented = transform(original, rng)
        if augmented is None:  # sentence too short for this transform
          continue
        augmented_samples.append({PREMISE: original, HYPOTHESIS: augmented, LABEL: 1})

  # DATA AUGMENTATION FOR NEGATIVE SAMPLES
  positives = len(augmented_samples)
  premises = [sample[PREMISE] for sample in samples]
  hypotheses = [sample[HYPOTHESIS] for sample in samples]

  for _ in range(positives // 2):
    for pool in (premises, hypotheses):
      # A sentence paired with itself would be an entailment mislabelled as 0,
      # so only pairs of different sentences are emitted.
      pair = _mismatched_pair(pool, rng)
      if pair is None:
        continue
      augmented_samples.append({PREMISE: pair[0], HYPOTHESIS: pair[1], LABEL: 0})

  # Keep the original samples as they are
  for sample in samples:
    augmented_samples.append(
      {PREMISE: sample[PREMISE], HYPOTHESIS: sample[HYPOTHESIS], LABEL: sample[LABEL]}
    )

  return pd.DataFrame(augmented_samples, columns=[PREMISE, HYPOTHESIS, LABEL])

In [11]:
def shuffle_df(old_df: pd.DataFrame, cycles: int = 1) -> pd.DataFrame:
    """Return a reproducibly shuffled copy of `old_df` with a fresh 0..n-1 index.

    Args:
        old_df: frame to shuffle; it is not modified.
        cycles: number of consecutive shuffles to apply. 0 returns the rows in
            their original order (index reset) instead of None.

    Returns:
        A new DataFrame holding the same rows in shuffled order.
    """
    # Private generator with a fixed seed: the result is reproducible without
    # reseeding NumPy's global RNG as a side effect. The first shuffle is the same
    # permutation the global RNG seeded with 42 would produce.
    rng = np.random.RandomState(42)
    new_df = old_df.reset_index(drop=True)
    for _ in range(cycles):
        # Shuffle the rows of the DataFrame
        new_df = new_df.sample(frac=1, random_state=rng).reset_index(drop=True)
    # Returned after the loop so that every requested cycle is actually applied.
    return new_df

In [12]:
def read_json_files(directory):
    """Flatten every result JSON file in `directory` into per-epoch rows.

    Args:
        directory: folder containing the `*.json` files written by the training cells.

    Returns:
        list of rows `[setup, epoch, train_loss, val_loss, val_accuracy,
        val_precision, val_recall, val_f1score]`, one per entry of each file's
        "model_details" list. `setup` is a string summarising the file's
        hyperparameters. Files are read in sorted name order.
    """
    # os.listdir returns names in arbitrary order; sorting makes the row order
    # deterministic, which matters when result tables are later combined row by row.
    json_files = sorted(file for file in os.listdir(directory) if file.endswith('.json'))

    data = []
    for file in json_files:
        with open(os.path.join(directory, file), 'r', encoding='utf-8') as f:
            json_data = json.load(f)

        model_setup = f'wr:{json_data["warmup_ratio"]};efl:{json_data["efl_preprocessing"]};tc:{json_data["train_count"]};lr:{json_data["learn_rate"]};wd:{json_data["weight_decay"]}'
        data.extend(
            [model_setup,
             details['epoch'],
             details['train_loss'],
             details['val_loss'],
             details['val_accuracy'],
             details['val_precision'],
             details['val_recall'],
             details['val_f1score']]
            for details in json_data['model_details']
        )
    return data

### Training Dataset

In [13]:
# Full training split read from TRAIN_DIR.
train_df = pd.read_csv(TRAIN_DIR)
# Reduced training set: the first 2694 rows of the file (no shuffling at this point).
train_limited_df = train_df.head(2694)
train_limited_df.head()

Unnamed: 0,premise,hypothesis,label
0,"However, Fort Charles was rebuilt as a militar...",Fort Charles was rebuilt as an amusement park ...,0
1,Buchanan's The Democrats and Republicans have...,THe parties will never be similar.,0
2,In order to review an acquisition that is usin...,The auditor only reviews the acquisition itsel...,0
3,Three young people sit outside and engage with...,There is a tin can and string telephone.,0
4,The lucrative tin mines of Kuala Lumpur in the...,The Chinese labor was seen as less costly and ...,1


### Validation Dataset

In [14]:
# Validation split read from VAL_DIR; used as the evaluation set of every training run.
val_df = pd.read_csv(VAL_DIR)
val_df.head()

Unnamed: 0,premise,hypothesis,label
0,Mon Dieu!,This person is speaking English.,0
1,"He really shook up my whole mindset, Broker sa...","His mindset never changed, Broker said.",0
2,Patients were asked to place themselves on a r...,Most patients rated themselves as a 5 on the s...,1
3,I managed to pick-pocket someone next to the s...,I stole someone's wallet near the concession s...,1
4,Forty comments were received and considered pr...,The decisions regarding the issuance of the fi...,1


### Few-shot Dataset

In [None]:
# Split the training data by class ...
entailments_df = train_df.loc[train_df[LABEL] == 1]
contradictions_df = train_df.loc[train_df[LABEL] == 0]

# ... and draw four examples of each with a fixed random state.
entailments_rows, contradictions_rows = (
    class_df.sample(n=4, random_state=42)
    for class_df in (entailments_df, contradictions_df)
)

# Few-shot set: the entailment rows followed by the contradiction rows.
fewshot_df = pd.concat([entailments_rows, contradictions_rows])
fewshot_df

Unnamed: 0,premise,hypothesis,label
18151,The original decoration has long since disappe...,The original decoration was hundreds of years ...,1
5930,then is it reasonable or is it or could you as...,The other eleven or five could convince them i...,1
15901,Hugging the slopes of Mont Blanc (Monte Bianco...,Mont Blanc's Courmayeur was established in the...,1
24362,"Two days later, she quoted Dole in another Th...",She quoted Dole two days later but nobody like...,1
1699,"You know, the champion kite-golfer from San Pr...",There are no golfers that come from San Prego.,0
21458,"But he was older, perhaps a decade older than ...","He was much younger, about 5 years younger tha...",0
751,oh wow well we're only about two hours from th...,We are a few minutes away.,0
6990,The community is built on an escarpment and is...,The community is built on an escarpment and is...,0


### Augmented Dataset

In [None]:
# augment_data takes one {column: value} dict per few-shot row, in row order.
fewshot_samples = fewshot_df.to_dict(orient='records')
augmented_df = augment_data(fewshot_samples)
augmented_df

Unnamed: 0,premise,hypothesis,label
0,The original decoration has long since disappe...,The decoration has long since disappeared.,1
1,The original decoration was hundreds of years ...,The original decoration was hundreds years old...,1
2,The original decoration has long since disappe...,The has long since disappeared.,1
3,The original decoration was hundreds of years ...,decoration was hundreds of years old before it...,1
4,The original decoration has long since disappe...,The has long original decoration since disappe...,1
...,...,...,...
131,"Two days later, she quoted Dole in another Th...",She quoted Dole two days later but nobody like...,1
132,"You know, the champion kite-golfer from San Pr...",There are no golfers that come from San Prego.,0
133,"But he was older, perhaps a decade older than ...","He was much younger, about 5 years younger tha...",0
134,oh wow well we're only about two hours from th...,We are a few minutes away.,0


### Prepare Data for Model Fine-tuning

In [None]:
# Shuffle both splits, then wrap them as Hugging Face datasets.
train_shuffle_df, val_shuffle_df = shuffle_df(train_limited_df), shuffle_df(val_df)
train, val = Dataset.from_pandas(train_shuffle_df), Dataset.from_pandas(val_shuffle_df)

# Tokenize the train and validation datasets with create_input_sequence, which
# encodes each pair, attaches the label and adds the decoded input sentence.
# The raw "label" and "premise" columns are dropped from the mapped datasets.
map_options = dict(batched=True, batch_size=1, remove_columns=["label", "premise"])
train_dataset = train.map(create_input_sequence, **map_options)
val_dataset = val.map(create_input_sequence, **map_options)

Map:   0%|          | 0/2694 [00:00<?, ? examples/s]

Map:   0%|          | 0/6737 [00:00<?, ? examples/s]

## Model Training

### Gridspace of Hyperparameters

In [None]:
# Candidate values for the grid search; the tuning cell tries every
# learning-rate / weight-decay combination.
gridspace = dict(
    lr_rate=[2e-4, 2e-5, 2e-6],
    w_decay=[6e-1, 6e-2, 6e-3],
)

### Hyperparameter Tuning (9 setups; 3 iterations; 6 epochs)

In [None]:
EPOCHS = 6
WARMUP_RATIO = 0.1
# 'ITER' is replaced by the setup number (1-9) when a run's results are saved.
# NOTE(review): this folder differs from the ITERATION_*_PATH folders the results are
# later read from — presumably the files were moved by hand; confirm.
RESULT = "drive/MyDrive/Colab/NLU_CW/transformer-approach/2/ITER.json"

ITER = 1
# Resume support: setups numbered below SKIP_TO are skipped. With SKIP_TO = 9 only
# the last of the nine setups is trained; set it to 1 to run the whole grid.
SKIP_TO = 9
for LR_RATE in gridspace['lr_rate']:
  for W_DECAY in gridspace['w_decay']:
    if(ITER < SKIP_TO):
      ITER += 1
      continue

    # Start every setup from a fresh copy of the pretrained checkpoint
    model_name = "microsoft/deberta-v3-base"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    model.to(device)

    training_args = TrainingArguments(
        "test",
        num_train_epochs=EPOCHS,              # total number of training epochs
        evaluation_strategy="epoch",
        logging_strategy="epoch",
        learning_rate=LR_RATE,
        per_device_train_batch_size=16,   # batch size per device during training
        per_device_eval_batch_size=16,    # batch size for evaluation
        warmup_ratio=WARMUP_RATIO,        # fraction of the training steps used for learning-rate warmup
        weight_decay=W_DECAY,               # strength of weight decay
        fp16=torch.cuda.is_available()    # mixed precision needs a GPU; fall back to fp32 on CPU
    )
    data_collator = DefaultDataCollator()

    trainer = Trainer(
        model,
        training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    # Commence the model training
    trainer.train()

    # Pair each evaluation entry of the log with the training loss logged just
    # before it (both are logged once per epoch).
    model_info = []
    curr_epoch = 1

    train_loss = 0
    for data in trainer.state.log_history:
      if 'loss' in data:
        train_loss = data['loss']

      elif 'eval_loss' in data:
        model_info.append({
          'epoch': curr_epoch,
          'train_loss': train_loss,
          'val_loss': data['eval_loss'],
          'val_accuracy': data['eval_accuracy'],
          'val_precision': data['eval_precision'],
          'val_recall': data['eval_recall'],
          'val_f1score': data['eval_f1'],
        })
        curr_epoch += 1

    # Build the record before opening the file, so a failure here cannot leave a
    # truncated result file behind.
    run_summary = {
      'warmup_ratio': WARMUP_RATIO,
      'efl_preprocessing': False,
      'train_count': len(train_dataset),   # actual number of training rows used
      'learn_rate': LR_RATE,
      'weight_decay': W_DECAY,
      'model_details': model_info}

    with open(RESULT.replace('ITER', str(ITER)), 'w', encoding='utf-8') as f:
      json.dump(run_summary, f, ensure_ascii=False, indent=4)
    ITER += 1



## Model Evaluation Using Validation Data

### Gridspace Results

In [None]:
# Load the per-epoch result rows of each of the three tuning iterations.
iteration_1 = read_json_files(ITERATION_1_PATH)
iteration_2 = read_json_files(ITERATION_2_PATH)
iteration_3 = read_json_files(ITERATION_3_PATH)

### Iteration 1

In [None]:
# Iteration 1 results as a table: one row per (setup, epoch).
df1 = pd.DataFrame(iteration_1, columns=['model', 'epoch', 'train_loss', 'val_loss', 'val_accuracy', 'val_precision', 'val_recall', 'val_f1score'])
df1

Unnamed: 0,model,epoch,train_loss,val_loss,val_accuracy,val_precision,val_recall,val_f1score
0,wr:0.1;efl:False;tc:2694;lr:0.0002;wd:0.6,1,0.6078,0.551336,0.721835,0.72174,0.72098,0.721145
1,wr:0.1;efl:False;tc:2694;lr:0.0002;wd:0.6,2,0.6264,0.711628,0.51655,0.633206,0.500317,0.341435
2,wr:0.1;efl:False;tc:2694;lr:0.0002;wd:0.6,3,0.6981,0.692962,0.530355,0.557541,0.516671,0.419037
3,wr:0.1;efl:False;tc:2694;lr:0.0002;wd:0.6,4,0.6961,0.6974,0.483746,0.241873,0.5,0.32603
4,wr:0.1;efl:False;tc:2694;lr:0.0002;wd:0.6,5,0.6939,0.693428,0.516254,0.508129,0.50001,0.340755
5,wr:0.1;efl:False;tc:2694;lr:0.0002;wd:0.6,6,0.695,0.69241,0.516254,0.508129,0.50001,0.340755
6,wr:0.1;efl:False;tc:2694;lr:0.0002;wd:0.06,1,0.6139,0.559217,0.719905,0.736293,0.715257,0.711944
7,wr:0.1;efl:False;tc:2694;lr:0.0002;wd:0.06,2,0.5868,0.683996,0.677304,0.749319,0.668201,0.644975
8,wr:0.1;efl:False;tc:2694;lr:0.0002;wd:0.06,3,0.4582,0.676233,0.709069,0.740691,0.714664,0.702475
9,wr:0.1;efl:False;tc:2694;lr:0.0002;wd:0.06,4,0.3214,0.526967,0.798575,0.798677,0.797913,0.798142


### Iteration 2

In [None]:
# Iteration 2 results as a table: one row per (setup, epoch).
df2 = pd.DataFrame(iteration_2, columns=['model', 'epoch', 'train_loss', 'val_loss', 'val_accuracy', 'val_precision', 'val_recall', 'val_f1score'])
df2

Unnamed: 0,model,epoch,train_loss,val_loss,val_accuracy,val_precision,val_recall,val_f1score
0,wr:0.1;efl:False;tc:2694;lr:0.0002;wd:0.6,1,0.6852,0.692654,0.516254,0.258127,0.5,0.34048
1,wr:0.1;efl:False;tc:2694;lr:0.0002;wd:0.6,2,0.7016,0.693512,0.516254,0.258127,0.5,0.34048
2,wr:0.1;efl:False;tc:2694;lr:0.0002;wd:0.6,3,0.697,0.693782,0.483746,0.241873,0.5,0.32603
3,wr:0.1;efl:False;tc:2694;lr:0.0002;wd:0.6,4,0.695,0.693965,0.483746,0.241873,0.5,0.32603
4,wr:0.1;efl:False;tc:2694;lr:0.0002;wd:0.6,5,0.6939,0.693766,0.516254,0.258127,0.5,0.34048
5,wr:0.1;efl:False;tc:2694;lr:0.0002;wd:0.6,6,0.6934,0.692618,0.516254,0.258127,0.5,0.34048
6,wr:0.1;efl:False;tc:2694;lr:0.0002;wd:0.06,1,0.7003,0.7066,0.483746,0.241873,0.5,0.32603
7,wr:0.1;efl:False;tc:2694;lr:0.0002;wd:0.06,2,0.6994,0.693097,0.516254,0.258127,0.5,0.34048
8,wr:0.1;efl:False;tc:2694;lr:0.0002;wd:0.06,3,0.6984,0.694517,0.483746,0.241873,0.5,0.32603
9,wr:0.1;efl:False;tc:2694;lr:0.0002;wd:0.06,4,0.6963,0.694985,0.483746,0.241873,0.5,0.32603


### Iteration 3

In [None]:
# Iteration 3 results as a table: one row per (setup, epoch).
df3 = pd.DataFrame(iteration_3, columns=['model', 'epoch', 'train_loss', 'val_loss', 'val_accuracy', 'val_precision', 'val_recall', 'val_f1score'])
df3

Unnamed: 0,model,epoch,train_loss,val_loss,val_accuracy,val_precision,val_recall,val_f1score
0,wr:0.1;efl:False;tc:2694;lr:0.0002;wd:0.6,1,0.6728,0.647289,0.633071,0.696639,0.623264,0.590639
1,wr:0.1;efl:False;tc:2694;lr:0.0002;wd:0.6,2,0.6812,0.702108,0.516254,0.258127,0.5,0.34048
2,wr:0.1;efl:False;tc:2694;lr:0.0002;wd:0.6,3,0.6983,0.6922,0.544753,0.548411,0.54731,0.54306
3,wr:0.1;efl:False;tc:2694;lr:0.0002;wd:0.6,4,0.6935,0.657244,0.621196,0.63119,0.615714,0.607216
4,wr:0.1;efl:False;tc:2694;lr:0.0002;wd:0.6,5,0.6503,0.655432,0.63411,0.645592,0.628685,0.620804
5,wr:0.1;efl:False;tc:2694;lr:0.0002;wd:0.6,6,0.6297,0.652386,0.644204,0.647366,0.646006,0.643781
6,wr:0.1;efl:False;tc:2694;lr:0.0002;wd:0.06,1,0.7003,0.706594,0.483746,0.241873,0.5,0.32603
7,wr:0.1;efl:False;tc:2694;lr:0.0002;wd:0.06,2,0.6994,0.693095,0.516254,0.258127,0.5,0.34048
8,wr:0.1;efl:False;tc:2694;lr:0.0002;wd:0.06,3,0.6984,0.694518,0.483746,0.241873,0.5,0.32603
9,wr:0.1;efl:False;tc:2694;lr:0.0002;wd:0.06,4,0.6963,0.694988,0.483746,0.241873,0.5,0.32603


### Iteration Averaging

In [None]:
# Average every (setup, epoch) row over the three tuning iterations.
# Rows are matched on their `model` / `epoch` values rather than on row position,
# so the averages stay correct even if the iterations were loaded in a different
# file order or one of them is missing a setup.
iteration_dfs = [df1, df2, df3]
all_runs_df = pd.concat(iteration_dfs, ignore_index=True)
grouped_runs = all_runs_df.groupby(['model', 'epoch'], sort=False)

# Flag (setup, epoch) combinations that are not present in every iteration,
# since their "average" covers fewer runs than the others.
runs_per_row = grouped_runs.size()
incomplete_rows = runs_per_row[runs_per_row != len(iteration_dfs)]
if len(incomplete_rows) > 0:
    print(f"WARNING: {len(incomplete_rows)} (setup, epoch) rows are not present in all {len(iteration_dfs)} iterations")

# sort=False keeps the rows in order of first appearance (i.e. df1's order).
average_df = grouped_runs.mean().reset_index()
average_df

Unnamed: 0,model,epoch,train_loss,val_loss,val_accuracy,val_precision,val_recall,val_f1score
0,wr:0.1;efl:False;tc:2694;lr:0.0002;wd:0.6,1.0,0.655267,0.630426,0.62372,0.558835,0.614748,0.550755
1,wr:0.1;efl:False;tc:2694;lr:0.0002;wd:0.6,2.0,0.669733,0.702416,0.516352,0.383153,0.500106,0.340798
2,wr:0.1;efl:False;tc:2694;lr:0.0002;wd:0.6,3.0,0.6978,0.692981,0.519618,0.449275,0.521327,0.429376
3,wr:0.1;efl:False;tc:2694;lr:0.0002;wd:0.6,4.0,0.694867,0.68287,0.529563,0.371645,0.538571,0.419759
4,wr:0.1;efl:False;tc:2694;lr:0.0002;wd:0.6,5.0,0.679367,0.680875,0.555539,0.470616,0.542898,0.434013
5,wr:0.1;efl:False;tc:2694;lr:0.0002;wd:0.6,6.0,0.6727,0.679138,0.558904,0.471207,0.548672,0.441672
6,wr:0.1;efl:False;tc:2694;lr:0.0002;wd:0.06,1.0,0.6715,0.65747,0.562466,0.40668,0.571752,0.454668
7,wr:0.1;efl:False;tc:2694;lr:0.0002;wd:0.06,2.0,0.661867,0.690063,0.569937,0.421857,0.556067,0.441978
8,wr:0.1;efl:False;tc:2694;lr:0.0002;wd:0.06,3.0,0.618333,0.688423,0.558854,0.408146,0.571555,0.451512
9,wr:0.1;efl:False;tc:2694;lr:0.0002;wd:0.06,4.0,0.571333,0.63898,0.588689,0.427475,0.599304,0.483401


### Selected Hyperparameters

In [None]:
# Select the (setup, epoch) row of the averaged table with the lowest validation loss.
# Recorded result: wr:0.1;efl:False;tc:2694;lr:2e-05;wd:0.006
min_index = average_df['val_loss'].idxmin()
min_row = average_df.loc[min_index]
print(min_row)

model            wr:0.1;efl:False;tc:2694;lr:2e-05;wd:0.006
epoch                                                   2.0
train_loss                                           0.3157
val_loss                                            0.34655
val_accuracy                                       0.876058
val_precision                                      0.876832
val_recall                                         0.875384
val_f1score                                        0.875737
Name: 31, dtype: object


### Model Training w/ Different Datasets (3 datasets; 20 epochs)

In [None]:
# Training sets compared in the data-size experiment; one model is trained per entry.
train_dfs = [
    train_df,           # 26,944 train samples
    train_limited_df,   # 2694 train samples
    augmented_df,       # 136 train samples
]

In [None]:
SELECTED_LR_RATE = 2e-05
SELECTED_W_DECAY = 0.006
EPOCHS = 20
WARMUP_RATIO = 0.1
# 'COUNT' is replaced by the number of training rows when a run's results are saved.
# NOTE(review): the results are later read from the FULL_DATA_TRAIN_PATH /
# TEN_P_DATA_TRAIN_PATH / AUGMENTED_DATA_PATH folders, not from this one —
# presumably the files were moved by hand; confirm.
RESULT = "./drive/MyDrive/Colab/NLU_CW/transformer-approach/data-experiment/COUNT.json"

# The validation set is the same for every run, so it is tokenized once.
val = Dataset.from_pandas(val_df)
val_dataset = val.map(create_input_sequence,batched=True,batch_size=1,remove_columns=["label","premise"])

for SELECTED_TRAIN_DATA in train_dfs:

  # Re-seed per run so each run is reproducible on its own, independent of how
  # many runs came before it in the loop.
  set_seed(SEED)

  train = Dataset.from_pandas(SELECTED_TRAIN_DATA)
  train_dataset = train.map(create_input_sequence,batched=True,batch_size=1,remove_columns=["label","premise"])

  # Start every run from a fresh copy of the pretrained checkpoint
  model_name = "microsoft/deberta-v3-base"
  tokenizer = AutoTokenizer.from_pretrained(model_name)
  model = AutoModelForSequenceClassification.from_pretrained(model_name)
  model.to(device)

  training_args = TrainingArguments(
      "test",
      num_train_epochs=EPOCHS,
      evaluation_strategy="epoch",
      logging_strategy="epoch",
      learning_rate=SELECTED_LR_RATE,
      per_device_train_batch_size=16,
      per_device_eval_batch_size=16,
      warmup_ratio=WARMUP_RATIO,         # fraction of the training steps used for learning-rate warmup
      weight_decay=SELECTED_W_DECAY,
      fp16=torch.cuda.is_available()     # mixed precision needs a GPU; fall back to fp32 on CPU
  )
  data_collator = DefaultDataCollator()

  trainer = Trainer(
      model,
      training_args,
      train_dataset=train_dataset,
      eval_dataset=val_dataset,
      data_collator=data_collator,
      tokenizer=tokenizer,
      compute_metrics=compute_metrics
  )

  # Commence the model training
  trainer.train()

  # Pair each evaluation entry of the log with the training loss logged just
  # before it (both are logged once per epoch).
  model_info = []
  curr_epoch = 1

  train_loss = 0
  for data in trainer.state.log_history:
    if 'loss' in data:
      train_loss = data['loss']

    elif 'eval_loss' in data:
      model_info.append({
        'epoch': curr_epoch,
        'train_loss': train_loss,
        'val_loss': data['eval_loss'],
        'val_accuracy': data['eval_accuracy'],
        'val_precision': data['eval_precision'],
        'val_recall': data['eval_recall'],
        'val_f1score': data['eval_f1'],
      })
      curr_epoch += 1

  # Build the record before opening the file, so a failure here cannot leave a
  # truncated result file behind.
  run_summary = {
    'warmup_ratio': WARMUP_RATIO,
    # Identity check: `==` between DataFrames compares element-wise and does not
    # produce a JSON-serialisable bool.
    'efl_preprocessing': SELECTED_TRAIN_DATA is augmented_df,
    'train_count': len(SELECTED_TRAIN_DATA),
    # Record the values actually used for training, not the loop variables left
    # over from the tuning cell.
    'learn_rate': SELECTED_LR_RATE,
    'weight_decay': SELECTED_W_DECAY,
    'model_details': model_info}

  with open(RESULT.replace('COUNT', str(len(SELECTED_TRAIN_DATA))), 'w', encoding='utf-8') as f:
    json.dump(run_summary, f, ensure_ascii=False, indent=4)

In [34]:
# Load the per-epoch result rows of the three data-size experiment runs.
full_data = read_json_files(FULL_DATA_TRAIN_PATH)
ten_percent = read_json_files(TEN_P_DATA_TRAIN_PATH)
augmented = read_json_files(AUGMENTED_DATA_PATH)

In [31]:
# Results of the run trained on the full training set, one row per epoch.
full_data_df = pd.DataFrame(full_data, columns=['model', 'epoch', 'train_loss', 'val_loss', 'val_accuracy', 'val_precision', 'val_recall', 'val_f1score'])
full_data_df

Unnamed: 0,model,epoch,train_loss,val_loss,val_accuracy,val_precision,val_recall,val_f1score
0,wr:0.1;efl:False;tc:26944;lr:2e-05;wd:0.06,1,0.4475,0.264229,0.907822,0.909338,0.906889,0.907509
1,wr:0.1;efl:False;tc:26944;lr:2e-05;wd:0.06,2,0.2262,0.297636,0.91465,0.916657,0.913599,0.914321
2,wr:0.1;efl:False;tc:26944;lr:2e-05;wd:0.06,3,0.1562,0.404853,0.91376,0.913672,0.913664,0.913668
3,wr:0.1;efl:False;tc:26944;lr:2e-05;wd:0.06,4,0.0942,0.455234,0.91376,0.913787,0.914224,0.913739
4,wr:0.1;efl:False;tc:26944;lr:2e-05;wd:0.06,5,0.0629,0.471519,0.913611,0.913904,0.913192,0.913454
5,wr:0.1;efl:False;tc:26944;lr:2e-05;wd:0.06,6,0.0454,0.508698,0.912424,0.912906,0.913181,0.91242
6,wr:0.1;efl:False;tc:26944;lr:2e-05;wd:0.06,7,0.0362,0.50201,0.913463,0.91334,0.913734,0.913423
7,wr:0.1;efl:False;tc:26944;lr:2e-05;wd:0.06,8,0.0264,0.643191,0.915393,0.915234,0.915535,0.915338
8,wr:0.1;efl:False;tc:26944;lr:2e-05;wd:0.06,9,0.0221,0.707064,0.913611,0.913841,0.91323,0.913463
9,wr:0.1;efl:False;tc:26944;lr:2e-05;wd:0.06,10,0.0168,0.7159,0.914354,0.914224,0.914335,0.914275


In [37]:
# Epoch with the lowest validation loss for the full-data run.
min_index = full_data_df['val_loss'].idxmin()
min_row = full_data_df.loc[min_index]
print(min_row)

model            wr:0.1;efl:False;tc:26944;lr:2e-05;wd:0.06
epoch                                                     1
train_loss                                           0.4475
val_loss                                           0.264229
val_accuracy                                       0.907822
val_precision                                      0.909338
val_recall                                         0.906889
val_f1score                                        0.907509
Name: 0, dtype: object


In [35]:
# Results of the run trained on the reduced (2694-row) training set, one row per epoch.
ten_percent_df = pd.DataFrame(ten_percent, columns=['model', 'epoch', 'train_loss', 'val_loss', 'val_accuracy', 'val_precision', 'val_recall', 'val_f1score'])
ten_percent_df

Unnamed: 0,model,epoch,train_loss,val_loss,val_accuracy,val_precision,val_recall,val_f1score
0,wr:0.1;efl:False;tc:2694;lr:2e-05;wd:0.06,1,0.693,0.691521,0.516254,0.258127,0.5,0.34048
1,wr:0.1;efl:False;tc:2694;lr:2e-05;wd:0.06,2,0.5001,0.365259,0.854386,0.865798,0.851503,0.852387
2,wr:0.1;efl:False;tc:2694;lr:2e-05;wd:0.06,3,0.2722,0.446698,0.876058,0.879997,0.874415,0.87528
3,wr:0.1;efl:False;tc:2694;lr:2e-05;wd:0.06,4,0.1471,0.528877,0.873534,0.875223,0.874724,0.873526
4,wr:0.1;efl:False;tc:2694;lr:2e-05;wd:0.06,5,0.0766,0.741926,0.872198,0.880356,0.869836,0.870883
5,wr:0.1;efl:False;tc:2694;lr:2e-05;wd:0.06,6,0.0504,0.711563,0.884964,0.88485,0.884827,0.884839
6,wr:0.1;efl:False;tc:2694;lr:2e-05;wd:0.06,7,0.018,0.845821,0.879323,0.879612,0.879944,0.879312
7,wr:0.1;efl:False;tc:2694;lr:2e-05;wd:0.06,8,0.0136,0.836514,0.88437,0.88424,0.884262,0.884251
8,wr:0.1;efl:False;tc:2694;lr:2e-05;wd:0.06,9,0.0216,0.844112,0.883034,0.88422,0.882128,0.882659
9,wr:0.1;efl:False;tc:2694;lr:2e-05;wd:0.06,10,0.0062,0.858398,0.884815,0.884704,0.884674,0.884689


In [38]:
# Epoch with the lowest validation loss for the reduced-data run.
min_index = ten_percent_df['val_loss'].idxmin()
min_row = ten_percent_df.loc[min_index]
print(min_row)

model            wr:0.1;efl:False;tc:2694;lr:2e-05;wd:0.06
epoch                                                    2
train_loss                                          0.5001
val_loss                                          0.365259
val_accuracy                                      0.854386
val_precision                                     0.865798
val_recall                                        0.851503
val_f1score                                       0.852387
Name: 1, dtype: object


In [36]:
# Results of the run trained on the augmented few-shot data, one row per epoch.
# NOTE(review): this rebinds `augmented_df`, which earlier in the notebook held the
# augmented *training samples* placed in `train_dfs`. Re-running those earlier cells
# after this one would pick up this results table instead; consider a distinct name
# (e.g. `augmented_results_df`) here and in the cell that reads it.
augmented_df = pd.DataFrame(augmented, columns=['model', 'epoch', 'train_loss', 'val_loss', 'val_accuracy', 'val_precision', 'val_recall', 'val_f1score'])
augmented_df

Unnamed: 0,model,epoch,train_loss,val_loss,val_accuracy,val_precision,val_recall,val_f1score
0,wr:0.1;efl:True;tc:136;lr:2e-05;wd:0.06,1,0.6973,0.69351,0.516254,0.258127,0.5,0.34048
1,wr:0.1;efl:True;tc:136;lr:2e-05;wd:0.06,2,0.6813,0.693112,0.516254,0.258127,0.5,0.34048
2,wr:0.1;efl:True;tc:136;lr:2e-05;wd:0.06,3,0.5824,0.731604,0.481223,0.481786,0.48179,0.481221
3,wr:0.1;efl:True;tc:136;lr:2e-05;wd:0.06,4,0.3622,0.995607,0.499332,0.511894,0.508294,0.463282
4,wr:0.1;efl:True;tc:136;lr:2e-05;wd:0.06,5,0.268,1.31814,0.481075,0.473388,0.475966,0.465135
5,wr:0.1;efl:True;tc:136;lr:2e-05;wd:0.06,6,0.193,1.774103,0.496066,0.504207,0.503363,0.473254
6,wr:0.1;efl:True;tc:136;lr:2e-05;wd:0.06,7,0.2377,1.984645,0.48909,0.493966,0.494674,0.476395
7,wr:0.1;efl:True;tc:136;lr:2e-05;wd:0.06,8,0.2125,1.982922,0.482856,0.484816,0.485091,0.4814
8,wr:0.1;efl:True;tc:136;lr:2e-05;wd:0.06,9,0.2013,2.103464,0.491168,0.497442,0.497875,0.472081
9,wr:0.1;efl:True;tc:136;lr:2e-05;wd:0.06,10,0.1667,2.039477,0.487754,0.492612,0.493545,0.473933


In [39]:
# Epoch with the lowest validation loss for the augmented-data run.
min_index = augmented_df['val_loss'].idxmin()
min_row = augmented_df.loc[min_index]
print(min_row)

model            wr:0.1;efl:True;tc:136;lr:2e-05;wd:0.06
epoch                                                  2
train_loss                                        0.6813
val_loss                                        0.693112
val_accuracy                                    0.516254
val_precision                                   0.258127
val_recall                                           0.5
val_f1score                                      0.34048
Name: 1, dtype: object
