In [1]:
!pip install datasets
!pip install evaluate
!pip install transformers==4.28.0
from datasets import load_dataset
from transformers import AutoConfig , AutoTokenizer , AutoModelForMultipleChoice, TrainingArguments, Trainer, DefaultDataCollator
from evaluate import load
import numpy as np

Collecting datasets
  Downloading datasets-2.14.1-py3-none-any.whl (492 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m492.4/492.4 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.14.0 (from datasets)
  Downloading huggingface_hub-0.16.4-py3-none-a

In [2]:
config = AutoConfig.from_pretrained("bert-base-uncased")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
accuracy = load("accuracy")

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [3]:
riddleSense_train = load_dataset('riddle_sense', split='train').shuffle()
riddleSense_val = load_dataset('riddle_sense', split='validation')

Downloading builder script:   0%|          | 0.00/5.31k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.97k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/6.11k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/375k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/414k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/3510 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1021 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1184 [00:00<?, ? examples/s]

In [4]:
label_mapping_dict = {'A':0, 'B':1, "C":2, "D":3, "E":4}

In [5]:
def preprocess_function(examples):
    question_headers = examples["question"]
    second_sentences = [
        [f"{examples['choices'][i]['text'][j]}" for j in range(5)] for i, header in enumerate(question_headers)
    ]

    second_sentences = sum(second_sentences, [])

    tokenized_examples = tokenizer(second_sentences, truncation=True)
    return {k: [v[i : i + 5] for i in range(0, len(v), 5)] for k, v in tokenized_examples.items()}

In [6]:
preprocessed_train = riddleSense_train.map(preprocess_function, batched=True)
preprocessed_val = riddleSense_val.map(preprocess_function, batched=True)

Map:   0%|          | 0/3510 [00:00<?, ? examples/s]

Map:   0%|          | 0/1021 [00:00<?, ? examples/s]

In [7]:
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import torch


@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that will dynamically pad the inputs for multiple choice received.
    """

    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        label_name = 'answerKey'
        labels = [feature.pop(label_name) for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])
        flattened_features = [
            [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
        ]
        flattened_features = sum(flattened_features, [])

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        new_labels = []
        for label in labels:
          new_labels.append(label_mapping_dict[label])
        batch["labels"] = torch.tensor(new_labels, dtype=torch.int64)
        return batch

In [11]:
training_args = TrainingArguments("riddle_sense_check", save_strategy="no", label_names=['answerKey'])

In [None]:
model = AutoModelForMultipleChoice.from_pretrained("bert-base-uncased", config=config)
trainer = Trainer(model=model,
                  args=training_args,
                  train_dataset=preprocessed_train,
                  eval_dataset=preprocessed_val,
                  tokenizer=tokenizer,
                  compute_metrics=None,
                  data_collator = DataCollatorForMultipleChoice(tokenizer))

In [None]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,1.001
1000,0.5329


TrainOutput(global_step=1317, training_loss=0.65401978952557, metrics={'train_runtime': 188.7738, 'train_samples_per_second': 55.781, 'train_steps_per_second': 6.977, 'total_flos': 432551719047180.0, 'train_loss': 0.65401978952557, 'epoch': 3.0})

In [None]:
predictions = trainer.predict(preprocessed_val)

In [8]:
def mapping_func(n):
  return label_mapping_dict[n]

In [None]:
preds = np.argmax(np.array(predictions.predictions[1]), axis=1)
labels = np.array([i for i in map(mapping_func, preprocessed_val['answerKey'])])

In [None]:
(labels == preds).sum() / len(labels)

0.4054848188050931

Create Unique answers dataset and check model performance on validation set using this dataset

In [9]:
listOfIndicesToSelect = list()
listOfIndicesToRemove = list()
allWordSet = set()
for index,sample in enumerate(preprocessed_train):
  shouldAddFlag = 1
  for word in sample["choices"]["text"]:
    if word in allWordSet:
      shouldAddFlag = 0
      break
    else:
      allWordSet.add(word)
  if shouldAddFlag:
    listOfIndicesToSelect.append(index)
  else:
    listOfIndicesToRemove.append(index)

newDataset = preprocessed_train.select(listOfIndicesToSelect)

In [None]:
training_args = TrainingArguments("riddle_sense_check", save_strategy="no", label_names=['answerKey'])

In [28]:
model = AutoModelForMultipleChoice.from_pretrained("bert-base-uncased", config=config)
trainer = Trainer(model=model,
                  args=training_args,
                  train_dataset=newDataset,
                  eval_dataset=preprocessed_val,
                  tokenizer=tokenizer,
                  compute_metrics=None,
                  data_collator = DataCollatorForMultipleChoice(tokenizer))
trainer.train()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMultipleChoice: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForMultipleChoice from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMultipleChoice from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMultipleChoice were not initialized from the model checkpoint at bert-base-uncased and are newly

Step,Training Loss


TrainOutput(global_step=264, training_loss=0.7611724391128077, metrics={'train_runtime': 25.5577, 'train_samples_per_second': 82.402, 'train_steps_per_second': 10.33, 'total_flos': 43649321060040.0, 'train_loss': 0.7611724391128077, 'epoch': 3.0})

In [29]:
predictions = trainer.predict(preprocessed_val)

In [30]:
preds = np.argmax(np.array(predictions.predictions[1]), axis=1)
labels = np.array([i for i in map(mapping_func, preprocessed_val['answerKey'])])

In [None]:
(labels == preds).sum() / len(labels)