<a href="https://colab.research.google.com/github/aliang9/nlpfa23/blob/main/bart_eng.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [None]:
!nvidia-smi

Tue Dec 12 02:21:19 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   40C    P0    27W / 300W |      0MiB / 16384MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
!pip install -q transformers
!pip install -q pytorch_lightning
!pip install -q logger

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m776.9/776.9 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m806.1/806.1 kB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for logger (setup.py) ... [?25l[?25hdone


In [None]:
import argparse
import glob
import os
import json
import time
import logging
import random
import re
from itertools import chain
from string import punctuation

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl


from transformers import (
    AdamW,
    AutoTokenizer,
    BartTokenizer,
    BartForConditionalGeneration,
    BartForQuestionAnswering,
    get_linear_schedule_with_warmup
)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
def set_seed(seed):
  """Seed every random number generator this notebook relies on.

  Seeds Python's ``random`` module, NumPy's global RNG and PyTorch; when CUDA
  is available, every CUDA device is seeded as well.
  """
  for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(seed)
  if not torch.cuda.is_available():
    return
  torch.cuda.manual_seed_all(seed)


set_seed(42)

In [None]:
import csv
from dataclasses import dataclass

from enum import Enum
from typing import List, Optional
from transformers import PreTrainedTokenizer

In [None]:
# below code is adapted from https://github.com/huggingface/transformers/blob/master/examples/multiple-choice/utils_multiple_choice.py

@dataclass(frozen=True)
class InputExample:
    """
    A single training/test example for multiple choice
    Args:
        example_id: Unique id for the example.
        context: string. The untokenized text shared by all options
        (the joke setup in this notebook's CSV files).
        endings: list of str. The multiple choice options (candidate punchlines).
        label: (Optional) string. The label of the example. This should be
        specified for train and dev examples, but not for test examples.
    """

    example_id: str
    context: str
    endings: List[str]
    label: Optional[str]

class Split(Enum):
    """Names of the train / dev / test dataset splits."""
    train = "train"
    dev = "dev"
    test = "test"

class DataProcessor:
    """Abstract interface for loaders that turn multiple choice data files into examples."""

    def get_train_examples(self, data_dir):
        """Return the `InputExample`s of the train set found in `data_dir`."""
        raise NotImplementedError

    def get_dev_examples(self, data_dir):
        """Return the `InputExample`s of the dev set found in `data_dir`."""
        raise NotImplementedError

    def get_test_examples(self, data_dir):
        """Return the `InputExample`s of the test set found in `data_dir`."""
        raise NotImplementedError

    def get_labels(self):
        """Return the list of labels used by this data set."""
        raise NotImplementedError

class EngProcessor(DataProcessor):
    """Processor for the English Joke data set.

    Every split is read from a CSV file inside ``data_dir``. The first row is
    the header; each following row is read as
    ``id, context, ending1, ending2, ending3, ending4, label``.
    """

    def get_train_examples(self, data_dir):
        """See base class."""
        return self._load_split(data_dir, "train.csv", "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        return self._load_split(data_dir, "val.csv", "dev")

    def get_test_examples(self, data_dir):
        """See base class."""
        return self._load_split(data_dir, "test.csv", "test")

    def get_tiny_examples(self, data_dir):
        """Gets a collection of `InputExample`s from ``tiny.csv``."""
        return self._load_split(data_dir, "tiny.csv", "tiny")

    def get_labels(self):
        """See base class."""
        return ["0", "1", "2", "3"]

    def _load_split(self, data_dir, file_name, split_name):
        """Log which split is being read, then parse ``data_dir/file_name`` into examples."""
        logger.info("LOOKING AT {} {}".format(data_dir, split_name))
        return self._create_examples(self._read_csv(os.path.join(data_dir, file_name)), split_name)

    def _read_csv(self, input_file):
        """Read a UTF-8 CSV file and return its rows as lists of strings."""
        # newline="" hands newline handling to the csv module, so quoted fields
        # that contain line breaks are parsed as one field.
        with open(input_file, "r", encoding="utf-8", newline="") as f:
            return list(csv.reader(f))

    def _create_examples(self, lines: List[List[str]], type: str):
        """Creates examples for the training and dev sets.

        Args:
            lines: CSV rows including the header row.
            type: name of the split being built; only ``"train"`` triggers the
                header check.

        Returns:
            A list with one `InputExample` per non-empty data row.

        Raises:
            ValueError: for the train split, when the file is empty or the last
                header column is not ``label``.
        """
        has_label_header = bool(lines) and bool(lines[0]) and lines[0][-1] == "label"
        if type == "train" and not has_label_header:
            raise ValueError("For training, the input file must contain a label column.")
        return [
            InputExample(
                example_id=line[0],
                context=line[1],
                endings=[line[2], line[3], line[4], line[5]],
                label=line[6],
            )
            for line in lines[1:]  # we skip the line with the column names
            if line  # csv.reader yields [] for blank lines
        ]

In [None]:
class EngDataset(Dataset):
  """Tokenized multiple-choice joke examples for a sequence-to-sequence model.

  Every example becomes the source text
  ``"context: <context>  options: 1: <e1> 2: <e2> 3: <e3> 4: <e4>"`` and a
  target text holding the 1-based number of the correct option.

  Args:
    tokenizer: object exposing ``batch_encode_plus`` (a Hugging Face tokenizer
      in this notebook).
    data_dir: directory that holds the split CSV files.
    type_path: one of ``'train'``, ``'val'``, ``'test'`` or ``'tiny'``.
    max_len: length the source sequences are padded/truncated to.

  Raises:
    ValueError: if ``type_path`` is not one of the supported split names.
  """

  # Length the target sequences are padded/truncated to.
  # NOTE(review): presumably BOS + one label token + EOS for the BART tokenizer — confirm.
  TARGET_MAX_LEN = 3

  # Maps a split name to the EngProcessor method that loads it.
  _SPLIT_LOADERS = {
      'train': 'get_train_examples',
      'val': 'get_dev_examples',
      'test': 'get_test_examples',
      'tiny': 'get_tiny_examples',
  }

  def __init__(self, tokenizer, data_dir, type_path,  max_len=512):
    self.data_dir = data_dir
    self.type_path = type_path
    self.max_len = max_len
    self.tokenizer = tokenizer
    self.inputs = []
    self.targets = []

    self.proc = EngProcessor()

    self._build()

  def __getitem__(self, index):
    """Return the id and attention-mask tensors of example ``index``."""
    source, target = self.inputs[index], self.targets[index]
    # Each encoding comes from a single-element batch, so only the leading
    # batch dimension is dropped; a bare squeeze() would also collapse a
    # length-1 sequence into a 0-d tensor.
    return {
        "source_ids": source["input_ids"].squeeze(0),
        "source_mask": source["attention_mask"].squeeze(0),
        "target_ids": target["input_ids"].squeeze(0),
        "target_mask": target["attention_mask"].squeeze(0),
    }

  def __len__(self):
    return len(self.inputs)

  def _build(self):
    """Load the examples of ``self.type_path`` and tokenize each of them."""
    if self.type_path not in self._SPLIT_LOADERS:
      raise ValueError(
          "Unknown type_path %r; expected one of %s" % (self.type_path, sorted(self._SPLIT_LOADERS))
      )
    load_examples = getattr(self.proc, self._SPLIT_LOADERS[self.type_path])
    for example in load_examples(self.data_dir):
      self._create_features(example)

  def _create_features(self, example):
    """Tokenize one example and append it to ``self.inputs`` / ``self.targets``."""
    options = " ".join('%s: %s' % (number, option) for number, option in zip('1234', example.endings))
    source_text = "context: %s  options: %s" % (example.context, options)
    # Labels are 0-based in the data; the target text is the 1-based option number.
    target_text = str(int(example.label) + 1)

    # tokenize inputs
    tokenized_inputs = self.tokenizer.batch_encode_plus(
        [source_text], max_length=self.max_len, padding='max_length', return_tensors="pt", truncation=True
    )
    # tokenize targets
    tokenized_targets = self.tokenizer.batch_encode_plus(
        [target_text], max_length=self.TARGET_MAX_LEN, padding='max_length', return_tensors="pt", truncation=True,
    )

    self.inputs.append(tokenized_inputs)
    self.targets.append(tokenized_targets)

In [None]:
# Notebook-level tokenizer, used by the dataset sanity checks and the evaluation cells.
tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
# Alternative checkpoint (currently disabled):
# tokenizer = BartTokenizer.from_pretrained("valhalla/bart-large-finetuned-squadv1")

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

# Fine-Tuning Infrastructure

In [None]:
class BARTFineTuner(pl.LightningModule):
  """LightningModule that fine-tunes BART to generate the number of the correct option.

  ``hparams`` is a namespace-like object. This class reads
  ``model_name_or_path``, ``tokenizer_name_or_path``, ``weight_decay``,
  ``learning_rate``, ``adam_epsilon``, ``warmup_steps``, ``train_batch_size``
  and ``eval_batch_size`` from it; the whole object is also handed to
  ``get_dataset`` to build the train and validation datasets.

  Logged metrics: ``train_loss`` (per step), ``avg_train_loss`` and
  ``val_loss`` (per epoch).
  """

  def __init__(self, hparams):
    super(BARTFineTuner, self).__init__()
    self.save_hyperparameters(hparams) # self.hparams = hparams

    self.model = BartForConditionalGeneration.from_pretrained(hparams.model_name_or_path)
    self.tokenizer = BartTokenizer.from_pretrained(hparams.tokenizer_name_or_path)

    # Per-batch losses of the running epoch, averaged in the *_epoch_end hooks.
    self.training_step_outputs = []
    self.validation_step_outputs = []

  def is_logger(self):
    """Tell `LoggingCallback` whether metrics should be logged (always True)."""
    return True

  def forward(self, input_ids, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, lm_labels=None):
    """Run the wrapped BART model; ``lm_labels`` is passed on as ``labels``."""
    return self.model(
        input_ids,
        attention_mask=attention_mask,
        decoder_input_ids=decoder_input_ids,
        decoder_attention_mask=decoder_attention_mask,
        labels=lm_labels,
    )

  def _step(self, batch):
    """Compute the loss for one batch produced by `EngDataset`."""
    target_ids = batch["target_ids"]
    # Padding positions become -100 in a copy of the targets, so the batch's
    # own target_ids are left untouched for any later use.
    # NOTE(review): -100 is presumably the label value the loss ignores — confirm.
    lm_labels = target_ids.masked_fill(target_ids == self.tokenizer.pad_token_id, -100)

    outputs = self(
        input_ids=batch["source_ids"],
        attention_mask=batch["source_mask"],
        lm_labels=lm_labels,
        decoder_attention_mask=batch['target_mask']
    )
    return outputs[0]

  def training_step(self, batch, batch_idx):
    loss = self._step(batch)
    # Store a detached copy: keeping the live loss would hold every batch's
    # autograd graph in memory until the end of the epoch.
    self.training_step_outputs.append(loss.detach())

    # self.log is what feeds trainer.callback_metrics and the loggers; a "log"
    # key in the returned dict is not picked up.
    self.log("train_loss", loss)
    return {"loss": loss}

  def on_train_epoch_end(self):
    if not self.training_step_outputs:
      return
    avg_train_loss = torch.stack(self.training_step_outputs).mean()
    self.training_step_outputs.clear()

    logger.info("average: {}".format(avg_train_loss))
    self.log("avg_train_loss", avg_train_loss, prog_bar=True)

  def validation_step(self, batch, batch_idx):
    loss = self._step(batch)
    self.validation_step_outputs.append(loss.detach())
    return {"val_loss": loss}

  def on_validation_epoch_end(self):
    if not self.validation_step_outputs:
      return
    avg_loss = torch.stack(self.validation_step_outputs).mean()
    self.validation_step_outputs.clear() # free memory

    logger.info("average: {}".format(avg_loss))
    # Logged under "val_loss" so callbacks that monitor that key can find it.
    self.log("val_loss", avg_loss, prog_bar=True)

  def configure_optimizers(self):
    "Prepare optimizer and schedule (linear warmup and decay)"

    # Name fragments of parameters that are excluded from weight decay.
    # NOTE(review): the Hugging Face BART layer norms are named `*_layer_norm`
    # and `layernorm_embedding`, which "LayerNorm.weight" alone does not match.
    no_decay = ["bias", "LayerNorm.weight", "layer_norm.weight", "layernorm_embedding.weight"]
    decay_params, no_decay_params = [], []
    for name, param in self.model.named_parameters():
      if any(fragment in name for fragment in no_decay):
        no_decay_params.append(param)
      else:
        decay_params.append(param)

    optimizer_grouped_parameters = [
        {"params": decay_params, "weight_decay": self.hparams.weight_decay},
        {"params": no_decay_params, "weight_decay": 0.0},
    ]
    optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)
    self.opt = optimizer

    # The scheduler is returned to Lightning so that it is actually stepped
    # (once per optimizer step). estimated_stepping_batches is the trainer's
    # own count of optimizer steps for the configured run.
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=self.hparams.warmup_steps,
        num_training_steps=self.trainer.estimated_stepping_batches,
    )
    self.lr_scheduler = scheduler
    return {
        "optimizer": optimizer,
        "lr_scheduler": {"scheduler": scheduler, "interval": "step"},
    }

  def train_dataloader(self):
    train_dataset = get_dataset(tokenizer=self.tokenizer, type_path="train", args=self.hparams)
    return DataLoader(train_dataset, batch_size=self.hparams.train_batch_size, drop_last=True, shuffle=True, num_workers=4)

  def val_dataloader(self):
    val_dataset = get_dataset(tokenizer=self.tokenizer, type_path="val", args=self.hparams)
    return DataLoader(val_dataset, batch_size=self.hparams.eval_batch_size, num_workers=4)

In [None]:
# Create a logs folder if it doesn't exist
log_folder = "logs"
os.makedirs(log_folder, exist_ok=True)

# Configure the logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Re-running this cell must not stack up handlers (every extra handler would
# emit each message once more), so drop the ones left by a previous run.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

# The root logger configured at the end of this cell writes to the same file;
# without this, every message from `logger` would land in the file twice.
logger.propagate = False

# Create a file handler that writes logs to a file
log_file = os.path.join(log_folder, "logfile.log")
file_handler = logging.FileHandler(log_file)
file_handler.setLevel(logging.INFO)

# Create a console handler for printing logs to the console
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO)

# Create a formatter and set it for both handlers
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
file_handler.setFormatter(formatter)
console_handler.setFormatter(formatter)

# Add the handlers to the logger
logger.addHandler(file_handler)
logger.addHandler(console_handler)

# Log a message
logger.info("HIII")

# Root logger: records from other loggers (DEBUG and above) go to the log file.
logging.basicConfig(filename='logs/logfile.log',
                    level=logging.DEBUG,
                    force=True, # Resets any previous configuration
                    )

class LoggingCallback(pl.Callback):
    """Callback that reports `trainer.callback_metrics` through the notebook's `logger`."""

    @staticmethod
    def _metric_lines(metrics):
        """Yield one "<key> = <value>" line (newline-terminated) per metric.

        Keys are visited in sorted order; "log" and "progress_bar" are skipped.
        """
        for name in sorted(metrics):
            if name in ("log", "progress_bar"):
                continue
            yield "{} = {}\n".format(name, metrics[name])

    def on_train_epoch_end(self, trainer, pl_module):
        """Log the raw metrics dict and then every metric line."""
        logger.info("***** Train results *****")
        logger.info("metrics {}".format(trainer.callback_metrics))
        if not pl_module.is_logger():
            return
        for line in self._metric_lines(trainer.callback_metrics):
            logger.info(line)

    def on_validation_epoch_end(self, trainer, pl_module):
        """Log every metric line."""
        logger.info("***** Validation results *****")
        if not pl_module.is_logger():
            return
        for line in self._metric_lines(trainer.callback_metrics):
            logger.info(line)

    def on_test_end(self, trainer, pl_module):
        """Log every metric line and write the same lines to test_results.txt in the output dir."""
        logger.info("***** Test results *****")

        if not pl_module.is_logger():
            return

        metrics = trainer.callback_metrics
        results_path = os.path.join(pl_module.hparams.output_dir, "test_results.txt")
        with open(results_path, "w") as writer:
            for line in self._metric_lines(metrics):
                logger.info(line)
                writer.write(line)

2023-12-12 02:22:06 - INFO - HIII
INFO:__main__:HIII


In [None]:
# Paths and hyper-parameters shared by the dataset, model and trainer cells.
# NOTE(review): the saved cell outputs further down show num_train_epochs=5,
# so they were not produced with the value below.
args_dict = {
    "data_dir": "",  # path for data files
    "output_dir": "",  # path to save the checkpoints
    "model_name_or_path": "facebook/bart-base",  # alternatives: 'facebook/bart-large', 't5-base'
    "tokenizer_name_or_path": "facebook/bart-base",  # alternatives: 'facebook/bart-large', 't5-base'
    "max_seq_length": 512,
    "learning_rate": 5e-5,
    "weight_decay": 0.01,
    "adam_epsilon": 1e-8,
    "warmup_steps": 0,
    "train_batch_size": 8,
    "eval_batch_size": 8,
    "num_train_epochs": 4,
    "num_val_epochs": 0,
    "gradient_accumulation_steps": 16,
    "n_gpu": 1,
    "early_stop_callback": True,
    "fp_16": False,  # if you want to enable 16-bit training then install apex and set this to true
    "opt_level": "O1",  # you can find out more on optimisation levels here https://nvidia.github.io/apex/amp.html#opt-levels-and-properties
    "max_grad_norm": 1.0,  # if you enable 16-bit training then set this to a sensible value, 0.5 is a good default
    "seed": 42,
}

# Data Cleaning

In [None]:
# Load the raw jokes; "title" is renamed to "setup" and "selftext" to "punchline".
df = pd.read_csv("eng_joke_data/updated-reddit-jokes.csv")
df = df.rename(columns={"title": "setup", "selftext": "punchline"})
# Flatten line breaks so every joke stays on a single line.
df = df.replace("\n", " ", regex=True)

# Drop jokes whose setup or punchline contains "[Removed]". na=False makes
# missing text count as "no match"; without it the mask holds NaN, which `~`
# cannot negate.
removed_pattern = r'\[Removed\]'
is_removed = (
    df['setup'].str.contains(removed_pattern, case=False, regex=True, na=False)
    | df['punchline'].str.contains(removed_pattern, case=False, regex=True, na=False)
)
df = df[~is_removed]
print(len(df))
df.head()

7215


Unnamed: 0,punchline,setup,score
0,"‘Really’ I said ‘No, April fooaarrrrglegargl...",My wife offered me a blowjob today.,4181
1,Now he’s in a pickle.,A man who lived by the sea grew a cucumber so ...,1239
2,“You herd me.”,“I love my job!” exclaimed the farmer. “All yo...,34617
3,She said: “in a mirror” This really happene...,I was doing a pretend job interview with my 6 ...,1914
4,An NSFW tag,How do you grab the attention of a pervert?,38402


In [None]:
import random
from sklearn.model_selection import train_test_split

print(len(df))
# Number the rows before dropping anything, so row_id reflects the position in
# the frame at this point (ids of rows dropped below are simply absent).
df['row_id'] = df.reset_index().index
df = df.dropna()
print(len(df))
# Split the DataFrame into train, validation, and test sets (60% / 20% / 20%)
train_df, test_val_df = train_test_split(df, test_size=.4, random_state=42)
val_df, test_df = train_test_split(test_val_df, test_size=.5, random_state=42)
# tiny is a 1% sample of the training split
_, tiny_df = train_test_split(train_df, test_size=0.01, random_state=42)

from random import randint
def generate_csv(df, directory, output_dir='eng_joke_data', random_state=None):
    """Write a four-option multiple-choice CSV built from the jokes in `df`.

    For every row the real punchline is placed in option ``label + 1``, where
    ``label`` cycles 0, 1, 2, 3 over the rows, and the three other options are
    punchlines sampled from the rest of `df`. Punchlines whose text equals the
    row's own punchline are never used as distractors, so exactly one option is
    correct.

    Args:
        df: frame with ``row_id``, ``setup`` and ``punchline`` columns.
        directory: base name of the output file (``<directory>.csv``).
        output_dir: folder the file is written to; created if missing.
        random_state: optional seed for the distractor sampling. ``None`` uses
            NumPy's global random state.

    Raises:
        ValueError: if a row has no punchline in `df` that differs from its own.
    """
    columns = ['row_id', 'setup', 'punchline1', 'punchline2', 'punchline3', 'punchline4', 'label']
    punchline_names = ['punchline1', 'punchline2', 'punchline3', 'punchline4']
    n_distractors = len(punchline_names) - 1

    # One shared generator, so consecutive rows draw different samples.
    rng = None if random_state is None else np.random.RandomState(random_state)

    punchlines = df['punchline']
    records = []
    for count, (_, row) in enumerate(df.iterrows()):
        label = count % 4

        candidates = punchlines[punchlines != row['punchline']]
        if candidates.empty:
            raise ValueError("generate_csv needs at least two distinct punchlines to build distractors")
        distractors = candidates.sample(
            n=n_distractors,
            replace=len(candidates) < n_distractors,  # tiny frames: allow repeats
            random_state=rng,
        ).to_list()

        # Insert the real punchline at position `label` among the distractors.
        options = distractors[:label] + [row['punchline']] + distractors[label:]

        record = {'row_id': row['row_id'], 'setup': row['setup'], 'label': label}
        record.update(zip(punchline_names, options))
        records.append(record)

    # Build the frame once instead of concatenating row by row.
    new_df = pd.DataFrame(records, columns=columns)

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    new_df.to_csv(os.path.join(output_dir, f'{directory}.csv'), index=False)


# Write one multiple-choice CSV per split.
for split_name, split_df in (('train', train_df), ('val', val_df), ('test', test_df), ('tiny', tiny_df)):
    generate_csv(split_df, split_name)

7215
7215


In [None]:
# Smoke test: build the dataset from the tiny split and show how many examples it holds.
dataset = EngDataset(tokenizer, data_dir='eng_joke_data', type_path='tiny')
len(dataset)

2023-12-12 03:13:59 - INFO - LOOKING AT eng_joke_data tiny


44

In [None]:
# Decode a few examples back to text to eyeball the source/target formatting.
for i in range(5, 10):
  data = dataset[i]
  print(tokenizer.decode(data['source_ids']))
  print(tokenizer.decode(data['target_ids']))

<s>context: I joined a gym and said to the trainer, “I want to impress beautiful girls, which machine should I use?”  options: 1: So please don't vote, her strap-on is huge and it really scares me. 2: He said, “Try the ATM outside” 3:  Are you having a crisis? 4: To them love means nothing.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>

In [None]:
# Count how often each option number is the target, to check the labels are balanced.
counts = {1: 0, 2: 0, 3: 0, 4: 0}
for i in range(len(dataset)):
  data = dataset[i]
  # Decode without special tokens rather than indexing a fixed character
  # position, which only works while the leading special token is 3 characters long.
  label_text = tokenizer.decode(data['target_ids'], skip_special_tokens=True).strip()
  counts[int(label_text)] += 1
print(counts)

{1: 11, 2: 11, 3: 11, 4: 11}


# Training

In [None]:
!mkdir -p eng_joke_bart

In [None]:
# Point the run at the joke data and the checkpoint directory, then expose the
# settings as attributes (args.data_dir, ...) through an argparse.Namespace.
args_dict.update({'data_dir': 'eng_joke_data', 'output_dir': 'eng_joke_bart'})
args = argparse.Namespace(**args_dict)
print(args_dict)

{'data_dir': 'eng_joke_data', 'output_dir': 'eng_joke_bart', 'model_name_or_path': 'facebook/bart-base', 'tokenizer_name_or_path': 'facebook/bart-base', 'max_seq_length': 512, 'learning_rate': 5e-05, 'weight_decay': 0.01, 'adam_epsilon': 1e-08, 'warmup_steps': 0, 'train_batch_size': 8, 'eval_batch_size': 8, 'num_train_epochs': 5, 'num_val_epochs': 0, 'gradient_accumulation_steps': 16, 'n_gpu': 1, 'early_stop_callback': True, 'fp_16': False, 'opt_level': 'O1', 'max_grad_norm': 1.0, 'seed': 42}


In [None]:
# Legacy checkpoint configuration, kept for reference:
# checkpoint_callback = pl.callbacks.ModelCheckpoint(
#     dirpath=args.output_dir, prefix="checkpoint", monitor="val_loss", mode="min", save_top_k=5
# )
# Keeps the 5 checkpoints with the lowest monitored "val_loss" under args.output_dir.
# NOTE(review): this callback is not part of `callbacks` in train_params below,
# so a Trainer built from train_params does not use it.
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    dirpath=args.output_dir, monitor="val_loss", mode="min", save_top_k=5
)

# Legacy Trainer arguments, kept for reference:
# train_params = dict(
#     accumulate_grad_batches=args.gradient_accumulation_steps,
#     gpus=args.n_gpu,
#     max_epochs=args.num_train_epochs,
#     early_stop_callback=False,
#     precision= 16 if args.fp_16 else 32,
#     amp_level=args.opt_level,
#     gradient_clip_val=args.max_grad_norm,
#     checkpoint_callback=checkpoint_callback,
#     callbacks=[LoggingCallback()],
# )

# Keyword arguments for pl.Trainer; the commented-out entries are legacy
# arguments that are no longer passed.
train_params = dict(
    accumulate_grad_batches=args.gradient_accumulation_steps,
    accelerator='cuda',
    max_epochs=args.num_train_epochs,
    # early_stop_callback=False,
    precision=16 if args.fp_16 else 32,
    # amp_level=args.opt_level,
    gradient_clip_val=args.max_grad_norm,
    # checkpoint_callback=checkpoint_callback,
    callbacks=[LoggingCallback()],
)

In [None]:
def get_dataset(tokenizer, type_path, args):
  """Build the `EngDataset` of one split from the settings stored on `args`.

  `args.data_dir` supplies the data directory and `args.max_seq_length` the
  source sequence length.
  """
  dataset_kwargs = dict(
      tokenizer=tokenizer,
      data_dir=args.data_dir,
      type_path=type_path,
      max_len=args.max_seq_length,
  )
  return EngDataset(**dataset_kwargs)

In [None]:
print(args)
# Instantiate the LightningModule from the hyper-parameters and a Trainer from train_params.
model = BARTFineTuner(hparams=args)
trainer = pl.Trainer(**train_params)

Namespace(data_dir='eng_joke_data', output_dir='eng_joke_bart', model_name_or_path='facebook/bart-base', tokenizer_name_or_path='facebook/bart-base', max_seq_length=512, learning_rate=5e-05, weight_decay=0.01, adam_epsilon=1e-08, warmup_steps=0, train_batch_size=8, eval_batch_size=8, num_train_epochs=5, num_val_epochs=0, gradient_accumulation_steps=16, n_gpu=1, early_stop_callback=True, fp_16=False, opt_level='O1', max_grad_norm=1.0, seed=42)


In [None]:
trainer.fit(model)

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

2023-12-12 02:49:02 - INFO - LOOKING AT eng_joke_data dev
2023-12-12 02:49:04 - INFO - ***** Validation results *****
2023-12-12 02:49:04 - INFO - average: 7.992802619934082
2023-12-12 02:49:04 - INFO - LOOKING AT eng_joke_data train


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

2023-12-12 02:50:31 - INFO - ***** Validation results *****
2023-12-12 02:50:31 - INFO - average: 0.4599340558052063
2023-12-12 02:50:31 - INFO - ***** Train results *****
2023-12-12 02:50:31 - INFO - metrics {}
2023-12-12 02:50:31 - INFO - average: 0.7396703958511353


Validation: |          | 0/? [00:00<?, ?it/s]

2023-12-12 02:52:01 - INFO - ***** Validation results *****
2023-12-12 02:52:01 - INFO - average: 0.2320708930492401
2023-12-12 02:52:01 - INFO - ***** Train results *****
2023-12-12 02:52:01 - INFO - metrics {}
2023-12-12 02:52:01 - INFO - average: 0.4123914837837219


Validation: |          | 0/? [00:00<?, ?it/s]

2023-12-12 02:53:31 - INFO - ***** Validation results *****
2023-12-12 02:53:31 - INFO - average: 0.16746507585048676
2023-12-12 02:53:31 - INFO - ***** Train results *****
2023-12-12 02:53:31 - INFO - metrics {}
2023-12-12 02:53:31 - INFO - average: 0.2529967427253723


Validation: |          | 0/? [00:00<?, ?it/s]

2023-12-12 02:55:01 - INFO - ***** Validation results *****
2023-12-12 02:55:01 - INFO - average: 0.14686591923236847
2023-12-12 02:55:01 - INFO - ***** Train results *****
2023-12-12 02:55:01 - INFO - metrics {}
2023-12-12 02:55:01 - INFO - average: 0.17505908012390137


Validation: |          | 0/? [00:00<?, ?it/s]

2023-12-12 02:56:32 - INFO - ***** Validation results *****
2023-12-12 02:56:32 - INFO - average: 0.15000216662883759
2023-12-12 02:56:32 - INFO - ***** Train results *****
2023-12-12 02:56:32 - INFO - metrics {}
2023-12-12 02:56:32 - INFO - average: 0.11323516070842743


# Evaluation

In [None]:
import textwrap
from tqdm.auto import tqdm
from sklearn import metrics

In [None]:
# Tokenize the test split and batch it for generation.
# NOTE: this rebinds `dataset`, which earlier held the tiny split.
dataset =  EngDataset(tokenizer, data_dir='eng_joke_data', type_path='test')
loader = DataLoader(dataset, batch_size=32, num_workers=4)

2023-12-12 03:14:39 - INFO - LOOKING AT eng_joke_data test


In [None]:
# Generate a prediction for every test example; predictions and gold targets
# are collected as decoded strings with special tokens removed.
model.model.eval()
model.model.cuda()
outputs = []
targets = []
for batch in tqdm(loader):
  outs = model.model.generate(input_ids=batch['source_ids'].cuda(),
                              attention_mask=batch['source_mask'].cuda(),
                              use_cache=True,
                              max_length=4)
  dec = [tokenizer.decode(ids, skip_special_tokens=True) for ids in outs]
  target = [tokenizer.decode(ids, skip_special_tokens=True) for ids in batch["target_ids"]]
  outputs.extend(dec)
  targets.extend(target)

  0%|          | 0/46 [00:00<?, ?it/s]

In [None]:
# Spot-check the predictions at indices 1, 1001, 2001, ... and keep a copy of all of them.
for spot_check in outputs[1::1000]:
  print(spot_check)
parsed_outs = list(outputs)

2
1


In [None]:
print(len(targets))
print(targets[0])
print(parsed_outs[0])
# Accuracy over the decoded target strings and the decoded predictions.
metrics.accuracy_score(targets, parsed_outs)

1443
1
1


0.8662508662508662

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def calculate_metrics(y_true, y_pred):
    """Return ``(accuracy, precision, recall, f1)`` for the given labels.

    Precision, recall and F1 are computed with ``average='weighted'``.
    `y_true` and `y_pred` are passed straight to the scikit-learn scorers.
    """
    accuracy = accuracy_score(y_true, y_pred)
    precision, recall, f1 = (
        score_fn(y_true, y_pred, average='weighted')
        for score_fn in (precision_score, recall_score, f1_score)
    )
    return accuracy, precision, recall, f1

accuracy, precision, recall, f1 = calculate_metrics(targets, parsed_outs)

# Print one "<name>: <value>" line per metric.
for metric_name, metric_value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
):
    print(metric_name, metric_value)

Accuracy: 0.8662508662508662
Precision: 0.8663393149052386
Recall: 0.8662508662508662
F1 Score: 0.8662430118554454


In [None]:
# List misclassified test examples: source text, gold label, predicted label.
# Listing stops after the first mismatch whose index is above 100.
print(len(targets))
for i, (gold, predicted) in enumerate(zip(targets, parsed_outs)):
  if gold == predicted:
    continue
  # Special tokens are skipped so the padding does not flood the output.
  print(tokenizer.decode(dataset[i]['source_ids'], skip_special_tokens=True))
  print(gold)
  print(predicted)
  if i > 100: break

1443
<s>context: Why do sumo wrestlers shave their legs?  options: 1: So people don't confuse them with feminists.  2: One has standards 3: So he rounded them up. 4: It's a riot</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad