In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [23]:
MODEL = 't5-base'
BATCH_SIZE = 16
EPOCHS = 10
OUT_DIR = 't5_base_distractors_v2/continual'
MAX_SOURCE_LENGTH = 256
MAX_TARGET_LENGTH = 128
LEARNING_RATE = 1e-5

In [5]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# tokenizer = T5Tokenizer.from_pretrained(f"google-t5/{MODEL}")
# model = T5ForConditionalGeneration.from_pretrained(f"google-t5/{MODEL}")

# for continual training after a checkpoint
tokenizer = T5Tokenizer.from_pretrained(f"t5_base_distractors_v2/checkpoint-3500")
model = T5ForConditionalGeneration.from_pretrained(f"t5_base_distractors_v2/checkpoint-3500")

model.to('cuda')

  from .autonotebook import tqdm as notebook_tqdm
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

Get Dataset

In [6]:
miscel_data = pd.read_csv('Datasets/science_questions/miscellaneous.csv')
miscel_data.shape

(6000, 8)

In [7]:
miscel_data.head(5)

Unnamed: 0,prompt,A,B,C,D,E,answer,wikipedia_excerpt
0,Who was responsible for the reorganisation of ...,Territorial brigades,First line divisions,Training Reserve,Second line divisions,British home army,C,British home army in the First World War: The ...
1,What film earned Rakshit Shetty the Karnataka ...,Rakshit Shetty did not win the Karnataka State...,Nam Areal Ondina,Ulidavaru Kandanthe,The information is not provided in the Wikiped...,Simple Agi Ondh Love Story,C,Rakshit Shetty: Rakshit made his acting debut ...
2,What is the population of Maklavan?,"Maklavan has a population of 5,000 individuals...","Maklavan has a population of 1,500 individuals...","Maklavan has a population of 2,800 individuals...","Maklavan has a population of 3,800 individuals...","Maklavan has a population of 2,170 individuals...",E,"Maklavan: Maklavan (, also Romanized as Mākalā..."
3,What was the stud fee for Empire Maker at Gain...,"$90,000","$120,000","$85,000","$100,000","$75,000",D,"Empire Maker: In September 2015, it was announ..."
4,What books has Brian J. Bowe published for Ens...,"Books about The Ramones, The Clash, and Judas ...","Books about The Ramones, The MC5, and Was (Not...","Books about The Ramones, The Clash, and The MC5.","Books about The Clash, The Stooges, and Judas ...","Books about The Stooges, The MC5, and Was (Not...",A,Brian J. Bowe: He co-edited the 2007 anthology...


In [8]:
def move_correct_ans_to_A(row):
    if row['answer'].strip() == "A":
        return row
    ans = row[row['answer']]
    row[row['answer']] = row['A']
    row['A'] = ans
    return row

miscel_data = miscel_data.apply(move_correct_ans_to_A, axis=1)

In [9]:
miscel_data.drop(labels=['answer'], axis=1, inplace=True)

In [10]:
prefix = "generate 4 distinct distractors:"
def preprocess_data(dataset, tokenizer):
    prompts = [f"{prefix} context: {context}, question: {question}, answer: {answer}" for context, question, answer in zip(dataset['wikipedia_excerpt'], dataset['prompt'], dataset['A'])]
    distractors = [f"distractor 1: {dis1}, distractor 2: {dis2}, distractor 3: {dis3}, distractor 4: {dis4}" for dis1, dis2, dis3, dis4 in zip(dataset['B'], dataset['C'], dataset['D'], dataset['E'])]

    inputs = tokenizer(
        text=prompts,
        max_length=MAX_SOURCE_LENGTH,
        padding='max_length',
        truncation=True, 
        return_tensors='pt'
    )
    labels = tokenizer(
        text_target=distractors,
        max_length=MAX_TARGET_LENGTH,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    return {
        'input_ids': inputs['input_ids'],
        'attention_mask': inputs['attention_mask'],
        'labels': labels['input_ids'],
        'decoder_attention_mask': labels['attention_mask']
    }

In [11]:
tokenized_data = preprocess_data(miscel_data, tokenizer)

In [12]:
import random

def split_dict(data, test_size=0.2):
    data_len = len(data['labels'])
    test_indices = []
    test_num = int(data_len * test_size)
    while True:
        rand_num = random.randrange(0, data_len)
        if rand_num not in test_indices:
            test_indices.append(rand_num)
        if len(test_indices) == test_num:
            break
        
    train_indices = [i for i in np.arange(data_len) if i not in test_indices]
    train_data = {}
    test_data = {}
    for key in data.keys():
        train_data[key] = data[key][train_indices]
        test_data[key] = data[key][test_indices] 

    return train_data, test_data

In [13]:
train_data, test_data = split_dict(tokenized_data, 0.2)

In [14]:
import datasets
train_set = datasets.Dataset.from_dict(train_data)
test_set = datasets.Dataset.from_dict(test_data)

Set up Training

In [24]:
from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    output_dir=OUT_DIR,
    learning_rate=LEARNING_RATE,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    warmup_steps=500,
    weight_decay=0.01,
    evaluation_strategy='steps',
    save_steps=500,
    eval_steps=500,
    load_best_model_at_end=True,
    save_total_limit=2,
)

In [25]:
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    train_dataset=train_set,
    eval_dataset=test_set
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [26]:
history = trainer.train()

Step,Training Loss,Validation Loss
500,0.6044,0.524672
1000,0.5855,0.522902
1500,0.5788,0.520941
2000,0.568,0.520442


KeyboardInterrupt: 

Inference

In [1]:
MAX_SOURCE_LENGTH = 256
MAX_TARGET_LENGTH = 128

In [3]:
# run this cell only once
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_path = "output_train_all_with_additional_reverse_dataset1/checkpoint-45000"
if 'tokenizer' not in locals(): # prevent accidental re-run of cell
    tokenizer = AutoTokenizer.from_pretrained(model_path)

if 'model' not in locals(): # prevent accidental re-run of cell
    model = AutoModelForSequenceClassification.from_pretrained(model_path).to(torch.device('cuda'))

In [3]:
prefix = "generate 3 distinct distractors:"
context = """Multiple choice questions (MCQs) are a popular \
questioning format to assess reading comprehension (RC). Compared to written answers, MCQs \
allow for quick and automatic evaluation, and consistent scoring. Given a passage, question, and a \
set of plausible answers, the student needs to select \
the single correct answer. The main challenge that \
the student faces in this form of questions is the \
relatedness of the plausible answers (distractors) to \
each other, and the semantic context consistency \
of the plausible answers to the question and context """

question = "What is the purpose of multiple choice quesions (MCQs)?"
answer = "To assess reading comprehension"
inputs = tokenizer(text=f"{prefix} context: {context}, question: {question}, answer: {answer}", 
                   max_length=MAX_SOURCE_LENGTH,
                    padding='max_length',
                    truncation=True,
                    return_tensors='pt').to('cuda')
    
output_sequences = model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_length=MAX_TARGET_LENGTH
)

tokenizer.batch_decode(output_sequences, skip_special_tokens=True)

["distractor 1: To assess a student's ability to read, distractor 2: To assess a student's ability to write, distractor 3: To assess a student's ability to read"]