In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-1.18.4-py3-none-any.whl (312 kB)
[?25l[K     |█                               | 10 kB 33.3 MB/s eta 0:00:01[K     |██                              | 20 kB 35.4 MB/s eta 0:00:01[K     |███▏                            | 30 kB 20.8 MB/s eta 0:00:01[K     |████▏                           | 40 kB 12.9 MB/s eta 0:00:01[K     |█████▎                          | 51 kB 11.2 MB/s eta 0:00:01[K     |██████▎                         | 61 kB 13.2 MB/s eta 0:00:01[K     |███████▍                        | 71 kB 12.6 MB/s eta 0:00:01[K     |████████▍                       | 81 kB 12.8 MB/s eta 0:00:01[K     |█████████▌                      | 92 kB 14.1 MB/s eta 0:00:01[K     |██████████▌                     | 102 kB 12.1 MB/s eta 0:00:01[K     |███████████▌                    | 112 kB 12.1 MB/s eta 0:00:01[K     |████████████▋                   | 122 kB 12.1 MB/s eta 0:00:01[K     |█████████████▋                  | 133 kB 12.1 MB/s et

In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 13.6 MB/s 
Collecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 68.6 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 83.1 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 90.7 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully uninstalled PyYAML-3.13
Successfully installed pyyaml-6.0 sacremoses-0.0.47 tokenizers-0.11.6 t

In [None]:
from datasets import load_dataset, load_metric, ClassLabel
from transformers import AutoTokenizer
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import transformers
import random
import pandas as pd

from pprint import pprint

import torch
import math
import time
import sys
import json
import numpy as np

In [None]:
# Option letters of the raw four-choice questions; the data collator maps a
# letter label to its position in this list.
ending_names = ['A', 'B', 'C', 'D']
# Checkpoint name shared by the tokenizer and the multiple-choice model.
model_chkpt = "bert-base-uncased"
tokenizer  = AutoTokenizer.from_pretrained(model_chkpt, use_fast=True)
model = AutoModelForMultipleChoice.from_pretrained(model_chkpt)

def compute_metrics(eval_predictions):
    """Compute accuracy for a `(predictions, label_ids)` pair.

    Args:
        eval_predictions: two-item iterable of per-choice scores (one row per
            example) and the integer gold choice index of each example.

    Returns:
        ``{"accuracy": value}`` where value is a Python float: the fraction of
        rows whose highest-scoring choice equals the gold index.
    """
    scores, gold = eval_predictions
    chosen = np.argmax(scores, axis=1)
    hits = (chosen == gold).astype(np.float32)
    return {"accuracy": hits.mean().item()}

@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that will dynamically pad the inputs for multiple choice received.

    Each incoming feature holds one list per tokenizer field with one entry per
    answer choice, plus a letter label under ``"label"`` or ``"labels"``. The
    collator flattens the choices, pads them together with ``tokenizer.pad`` and
    reshapes every field back to ``(batch_size, num_choices, -1)``.

    The input feature dicts are left untouched, so the same ``features`` list
    can be collated more than once (e.g. when a notebook cell is re-run).
    """

    # Tokenizer whose `pad` method is used to pad the flattened choices.
    tokenizer: PreTrainedTokenizerBase
    # Forwarded unchanged to `tokenizer.pad`.
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        """Collate a list of per-example feature dicts into one padded batch.

        Args:
            features: list of dicts; every value except the label is indexable
                by choice position.

        Returns:
            dict of tensors shaped ``(batch_size, num_choices, -1)`` plus
            ``"labels"``, an int64 tensor of gold choice indices.

        Raises:
            ValueError: if a label letter is not present in ``ending_names``.
        """
        label_name = "label" if "label" in features[0].keys() else "labels"
        # Read the label instead of popping it: popping mutated the caller's
        # dicts and made a second call on the same list fail with KeyError.
        labels = [ending_names.index(feature[label_name]) for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])
        # One flat entry per (example, choice), built in a single pass rather
        # than by repeated list concatenation.
        flattened_features = [
            {k: v[i] for k, v in feature.items() if k != label_name}
            for feature in features
            for i in range(num_choices)
        ]

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Un-flatten
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        # Add back labels
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch


def choices(example):
    """Spread the answer options of one example into letter-named fields.

    Every entry of ``example['question.choices']`` contributes
    ``example[entry['label']] = entry['text']``; the list itself is then
    removed. ``'question.stem'`` is intentionally kept. The dict is modified in
    place and also returned.
    """
    example.update({option['label']: option['text'] for option in example['question.choices']})
    example.pop('question.choices', None)
    return example

def show_random_elements(dataset, num_examples=10):
    """Pretty-print the HTML table of `num_examples` distinct random rows.

    Row indices are drawn with `random.randint` and redrawn on duplicates.
    Columns whose feature type is a `ClassLabel` are shown by class name
    instead of integer id.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    picks = []
    while len(picks) < num_examples:
        candidate = random.randint(0, len(dataset)-1)
        if candidate not in picks:
            picks.append(candidate)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if not isinstance(typ, ClassLabel):
            continue
        df[column] = df[column].transform(lambda idx: typ.names[idx])
    html_table = df.to_html()
    pprint(html_table)
    
def show_one(example):
    """Print the context fact, the four stem+option sentences and the gold letter."""
    print(f"Context: {example['fact1']}")
    for letter in ('A', 'B', 'C', 'D'):
        print(f"  {letter} - {example['question.stem']} {example[letter]}")
    print(f"\nGround truth: option {example['label']}")
    
def preprocess_function(examples):
    """Tokenize a batch of examples as (fact, stem + option) sentence pairs.

    The number of options is taken from the module-level `ending_names`, so the
    function stays correct if that list changes length.

    Args:
        examples: batch dict with list-valued "fact1", "question.stem" and one
            column per letter in `ending_names`.

    Returns:
        dict mapping each tokenizer output field to a list with one entry per
        example, each entry holding one token list per option.
    """
    num_choices = len(ending_names)
    # Repeat each first sentence once per option of the second sentence.
    first_sentences = [context for context in examples["fact1"] for _ in range(num_choices)]
    # Pair every question stem with each of its options, example by example.
    question_headers = examples["question.stem"]
    second_sentences = [
        f"{header} {examples[end][i]}"
        for i, header in enumerate(question_headers)
        for end in ending_names
    ]

    # Tokenize
    tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True)
    # Un-flatten: regroup consecutive runs of `num_choices` entries per example.
    return {
        k: [v[i:i + num_choices] for i in range(0, len(v), num_choices)]
        for k, v in tokenized_examples.items()
    }

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMultipleChoice: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForMultipleChoice from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMultipleChoice from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMultipleChoice were not initialized from the model checkpoint at bert-base-uncased and are newly

In [None]:
# facts == 1 keeps each record's 'fact1' text (outputs suffixed "_e");
# facts == 0 blanks it (outputs suffixed "_d").
facts = 1

input_files = ['train_complete.jsonl','test_complete.jsonl','dev_complete.jsonl']
if facts == 0:
    output_files = ['train_complete_d.jsonl','test_complete_d.jsonl','dev_complete_d.jsonl']
else:
    output_files = ['train_complete_e.jsonl','test_complete_e.jsonl','dev_complete_e.jsonl']

# Copy every split record by record. Both files are managed by `with`, so they
# are closed even if a record fails to parse. The per-record debug print of
# 'fact1' was dropped: it produced thousands of output lines per run.
for in_name, out_name in zip(input_files, output_files):
    with open(in_name, encoding='utf-8') as json_file, open(out_name, 'wt', encoding='utf-8') as fout:
        for json_str in json_file:
            if not json_str.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            result = json.loads(json_str)
            if facts == 0:
                result['fact1'] = ''
            fout.write('%s\n' % json.dumps(result))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
a thermometer is used to measure temperature
a compass is used to navigate seas
An example of conservation is avoiding waste
if a habitat is removed then that habitat is destroyed
moving changes position
contact between rocks over long periods of time causes rocks to smooth
refracting sunlight causes light to split into different colors
precipitation is when water falls from the sky
a chloroplast contains chlorophyll
the Earth rotates on its axis on its axis
an animal requires nutrients for survival
eagles eat rabbits
cool temperatures cause animals to shiver
planting trees has a positive impact on an ecosystem
decomposition increases the amount of nutrients in the soil
reproduction is when an organism passes genetic information from itself to its offspring
as the amount of food an animal eats increases , the weight of that animal will increase
matter is made of molecules
earthquakes cause rock layers to fold on top of ea

In [None]:
# Choose the JSONL files that match the `facts` setting, then load all three
# splits with the generic 'json' loader and show the first training record.
if facts == 0:
    split_files = {'train': 'train_complete_d.jsonl', 'validation': 'dev_complete_d.jsonl', 'test': 'test_complete_d.jsonl'}
else:
    split_files = {'train': 'train_complete_e.jsonl', 'validation': 'dev_complete_e.jsonl', 'test': 'test_complete_e.jsonl'}
openbookQA = load_dataset('json', data_files=split_files)
pprint(openbookQA['train'][0])

Using custom data configuration default-55178cfb0440cffa


Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-55178cfb0440cffa/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b...


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-55178cfb0440cffa/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

{'answerKey': 'D',
 'clarity': '2.00',
 'fact1': 'the sun is the source of energy for physical cycles on Earth',
 'humanScore': '1.00',
 'id': '7-980',
 'question': {'choices': [{'label': 'A', 'text': 'puppies learning new tricks'},
                          {'label': 'B',
                           'text': 'children growing up and getting old'},
                          {'label': 'C', 'text': 'flowers wilting in a vase'},
                          {'label': 'D',
                           'text': 'plants sprouting, blooming and wilting'}],
              'stem': 'The sun is responsible for'},
 'turkIdAnonymized': 'b356d338b7'}


In [None]:
# Flatten the nested 'question' field; the later cells read it back as the
# 'question.stem' and 'question.choices' columns.
flatten = openbookQA.flatten()

In [None]:
# Turn the list of choices into one column per option letter.
updated = flatten.map(choices)
# Downstream code (show_one, the data collator) looks the gold answer up as 'label'.
updated = updated.rename_column('answerKey', 'label')
pprint(updated['train'][0])

show_one(updated['train'][0])

# First five training rows in batch (dict-of-lists) form.
# NOTE(review): `examples` is not read by any later cell visible here.
examples = updated['train'][:5]

0ex [00:00, ?ex/s]

0ex [00:00, ?ex/s]

0ex [00:00, ?ex/s]

{'A': 'puppies learning new tricks',
 'B': 'children growing up and getting old',
 'C': 'flowers wilting in a vase',
 'D': 'plants sprouting, blooming and wilting',
 'clarity': '2.00',
 'fact1': 'the sun is the source of energy for physical cycles on Earth',
 'humanScore': '1.00',
 'id': '7-980',
 'label': 'D',
 'question.stem': 'The sun is responsible for',
 'turkIdAnonymized': 'b356d338b7'}
Context: the sun is the source of energy for physical cycles on Earth
  A - The sun is responsible for puppies learning new tricks
  B - The sun is responsible for children growing up and getting old
  C - The sun is responsible for flowers wilting in a vase
  D - The sun is responsible for plants sprouting, blooming and wilting

Ground truth: option D


In [None]:
# Number of training examples (displayed as the cell's output).
len(updated['train'])

4957

In [None]:
# Collect, per training example, the question stem, the four option texts (in
# A-D order) and, when facts are enabled, the supporting fact.
# Rows are iterated once each; the previous version looked every row up six
# times by index, which materialises the whole row on every access.
question_lst = []
ans_lst = []
fact_lst = []
for row in updated['train']:
  question_lst.append(row['question.stem'])
  ans_lst.append([row['A'], row['B'], row['C'], row['D']])
  if facts == 1:
    fact_lst.append(row['fact1'])

In [None]:
# Collect, per validation example, the question stem, the four option texts
# (in A-D order) and, when facts are enabled, the supporting fact.
# Rows are iterated once each; the previous version looked every row up six
# times by index, which materialises the whole row on every access.
question_lst_val = []
ans_lst_val = []
fact_lst_val = []
for row in updated['validation']:
  question_lst_val.append(row['question.stem'])
  ans_lst_val.append([row['A'], row['B'], row['C'], row['D']])
  if facts == 1:
    fact_lst_val.append(row['fact1'])

In [None]:
# Collect, per test example, the question stem, the four option texts (in A-D
# order) and, when facts are enabled, the supporting fact.
# Rows are iterated once each; the previous version looked every row up six
# times by index, which materialises the whole row on every access.
question_lst_test = []
ans_lst_test = []
fact_lst_test = []
for row in updated['test']:
  question_lst_test.append(row['question.stem'])
  ans_lst_test.append([row['A'], row['B'], row['C'], row['D']])
  if facts == 1:
    fact_lst_test.append(row['fact1'])

In [None]:
!wget http://www.cs.toronto.edu/~rkiros/models/dictionary.txt
!wget http://www.cs.toronto.edu/~rkiros/models/utable.npy
!wget http://www.cs.toronto.edu/~rkiros/models/btable.npy
!wget http://www.cs.toronto.edu/~rkiros/models/uni_skip.npz
!wget http://www.cs.toronto.edu/~rkiros/models/uni_skip.npz.pkl
!wget http://www.cs.toronto.edu/~rkiros/models/bi_skip.npz
!wget http://www.cs.toronto.edu/~rkiros/models/bi_skip.npz.pkl

--2022-03-08 22:38:48--  http://www.cs.toronto.edu/~rkiros/models/dictionary.txt
Resolving www.cs.toronto.edu (www.cs.toronto.edu)... 128.100.3.30
Connecting to www.cs.toronto.edu (www.cs.toronto.edu)|128.100.3.30|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7996547 (7.6M) [text/plain]
Saving to: ‘dictionary.txt’


2022-03-08 22:38:50 (8.51 MB/s) - ‘dictionary.txt’ saved [7996547/7996547]

--2022-03-08 22:38:50--  http://www.cs.toronto.edu/~rkiros/models/utable.npy
Resolving www.cs.toronto.edu (www.cs.toronto.edu)... 128.100.3.30
Connecting to www.cs.toronto.edu (www.cs.toronto.edu)|128.100.3.30|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2342138474 (2.2G)
Saving to: ‘utable.npy’


2022-03-08 22:40:10 (28.0 MB/s) - ‘utable.npy’ saved [2342138474/2342138474]

--2022-03-08 22:40:10--  http://www.cs.toronto.edu/~rkiros/models/btable.npy
Resolving www.cs.toronto.edu (www.cs.toronto.edu)... 128.100.3.30
Connecting to www.cs.toronto.ed

In [None]:
from skip_thoughts_master import skipthoughts

In [None]:
import nltk
# Fetch NLTK's 'punkt' tokenizer data.
# NOTE(review): presumably required by the skipthoughts encoder — confirm.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# You would need to download pre-trained models first
# (the wget cell earlier in this notebook fetches them).
model_skipthoughts = skipthoughts.load_model()

Loading model parameters...
Compiling encoders...




Loading tables...
Packing up...


In [None]:
# Sentence encoder built on the loaded skip-thoughts model; reused for
# questions, facts and answer options below.
encoder = skipthoughts.Encoder(model_skipthoughts)

In [None]:
# One vector per training question stem (indexed per example in the scoring cell).
encoded =  encoder.encode(question_lst)

5
7
2
9
8
6
12
10
38
25
41
20
42
14
11
15
17
4
1
3
44
16
13
31
39
23
33
18
24
19
21
22
35
30
28
48
29
36
27
37
34
32
46
50
61
60
26
76
63
49
40


In [None]:
# One vector per validation / test question stem.
encoded_val =  encoder.encode(question_lst_val)
encoded_test =  encoder.encode(question_lst_test)

21
11
12
15
8
6
14
10
13
2
7
4
1
39
5
9
20
44
3
18
17
29
26
16
19
28
23
30
24
31
42
45
38
43
33
22
25
51
62
40
10
2
34
7
11
9
20
6
16
14
8
17
3
29
19
12
4
5
27
1
33
13
66
15
28
18
25
23
48
22
31
26
53
37
36
24
21
38
43
50
59


In [None]:
# Fact vectors are only needed (and fact lists only filled) when facts are enabled.
if facts == 1:
  encoded_fact =  encoder.encode(fact_lst)
  encoded_fact_val =  encoder.encode(fact_lst_val)
  encoded_fact_test =  encoder.encode(fact_lst_test)

12
8
6
14
11
5
13
15
9
3
7
10
27
25
4
17
21
19
22
18
16
24
20
23
2
28
8
5
13
6
10
4
15
9
7
21
12
16
18
3
14
19
20
17
11
2
28
26
27
9
10
3
18
11
13
7
8
6
4
14
16
17
12
5
15
22
19
21
20


In [None]:
# Encode the four options of every training question, one question per
# encoder call, then stack the per-question results into a single array.
encoded_ans = [encoder.encode(option_texts) for option_texts in ans_lst]

encoded_ans_final = np.array(encoded_ans)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
4
6
7
3
2
2
3
1
2
2
6
8
7
10
1
5
6
7
4
1
3
1
2
1
2
6
4
8
6
5
4
1
1
3
4
5
1
2
4
2
3
2
3
3
3
10
5
6
7
1
2
1
2
3
2
3
9
7
4
5
4
5
6
1
4
2
1
5
3
2
6
5
1
2
5
8
10
1
8
10
12
2
1
2
3
6
4
3
2
1
2
1
2
1
4
5
1
3
1
3
2
3
1
5
8
7
6
2
3
3
4
7
6
2
1
1
1
2
1
1
2
1
3
1
2
1
2
5
5
4
2
1
2
2
4
3
2
2
3
2
6
8
1
1
3
2
3
2
2
2
3
3
3
2
1
2
3
4
5
3
3
2
1
1
2
3
1
2
1
2
6
4
5
3
2
3
3
2
1
2
1
3
2
3
2
1
2
2
1
4
7
6
2
3
2
3
2
2
3
1
2
5
13
8
7
3
2
5
4
3
1
1
1
2
2
3
3
3
2
4
5
2
3
2
3
4
1
1
2
1
2
1
6
7
13
8
2
1
4
7
5
1
1
2
2
2
3
1
2
4
1
1
6
7
1
2
2
4
2
4
5
1
2
1
1
5
6
4
4
3
2
7
6
5
2
2
3
1
3
2
1
2
5
2
3
2
2
1
1
4
4
12
5
6
4
6
9
8
3
1
2
2
1
2
3
3
2
4
15
6
12
10
2
1
4
5
3
2
1
2
2
3
3
2
2
3
2
1
3
5
4
7
4
1
2
2
1
3
2
3
1
2
8
5
7
10
1
2
3
9
4
7
7
8
6
5
4
3
1
1
2
3
1
4
4
4
1
1
2
3
9
8
2
4
1
1
2
3
2
1
3
2
1
3
4
5
3
2
1
1
2
3
2
1
2
1
2
2
1
2
3
2
6
11
7
6
5
1
5
4
1
2
1
1
2
1
11
12
4
5
11
9
10
6
3
1
4
4
14
1
2
2
1
3
3
1
3
4
6
3
1
2
1
2
1
2
2
1
2
3
4
2
1
2
1
13
16
1

In [None]:
# Encode the four options of every validation question, one question per
# encoder call, then stack the per-question results into a single array.
encoded_ans_val = [encoder.encode(option_texts) for option_texts in ans_lst_val]

encoded_ans_final_val = np.array(encoded_ans_val)

3
1
5
4
5
6
1
6
4
5
1
2
1
7
6
4
5
6
4
1
6
4
5
1
4
7
5
10
3
1
2
2
1
12
8
6
1
5
7
4
6
1
5
4
8
6
6
7
4
6
11
4
5
6
4
1
2
1
3
4
9
6
7
4
13
12
6
5
2
3
6
5
4
1
4
5
2
1
1
2
3
4
6
4
7
5
2
2
3
2
1
6
9
10
13
2
1
4
8
7
1
1
4
5
6
7
5
2
3
6
4
5
4
5
6
1
1
4
5
7
10
6
2
1
2
3
8
9
6
1
2
1
2
9
7
12
1
2
1
2
1
6
4
5
1
3
7
9
6
6
5
4
5
1
2
1
1
1
2
1
2
5
4
3
2
4
5
3
4
4
5
1
2
3
7
4
5
4
3
2
1
2
16
9
12
13
1
3
2
3
1
1
2
1
2
1
1
2
1
2
3
6
6
4
5
2
4
4
5
6
4
4
6
7
6
4
1
4
8
5
7
2
1
6
5
7
8
9
6
1
1
2
1
2
7
11
10
8
2
3
1
2
5
13
8
6
1
2
3
3
1
3
3
1
1
4
8
4
6
9
6
8
7
10
7
4
20
15
11
4
5
1
6
11
1
2
2
1
1
3
5
4
2
3
1
3
6
4
5
3
1
6
7
6
4
5
7
9
2
2
3
2
2
1
2
1
3
2
1
1
1
1
2
1
1
3
3
2
1
1
4
2
6
7
2
1
4
5
1
2
1
2
9
6
3
1
2
1
2
3
4
1
2
1
1
1
3
2
3
6
4
1
1
6
7
5
1
2
1
4
5
5
7
1
1
3
4
5
2
7
5
6
4
6
2
2
1
2
3
2
1
1
2
5
7
4
5
4
5
5
7
1
3
2
1
4
5
1
2
2
3
1
8
6
4
11
7
6
4
5
3
4
1
2
3
2
2
5
4
6
8
6
5
4
1
1
4
5
7
6
7
6
10
3
1
6
5
5
14
11
6
2
5
4
8
6
6
7
4
1
1
2
1
2
3
6
7
4
2
3
1
3
1
2
4
6
1
5
7
1
2
2
3
3
2
2
7
8
12
10
5
8
11
1
2
5
4

In [None]:
# Encode the four options of every test question, one question per encoder
# call, then stack the per-question results into a single array.
encoded_ans_test = [encoder.encode(option_texts) for option_texts in ans_lst_test]

encoded_ans_final_test = np.array(encoded_ans_test)

4
5
2
1
4
5
1
3
2
1
1
2
3
1
1
2
2
3
1
1
2
2
3
7
5
6
9
5
4
2
3
2
3
1
2
1
1
1
2
1
7
5
8
7
6
4
5
2
6
5
3
1
2
1
2
1
3
7
5
6
1
10
4
6
5
4
5
6
8
2
1
7
5
1
1
2
1
1
2
3
1
3
6
3
2
6
7
5
7
6
3
2
1
1
1
4
1
6
5
3
2
3
1
2
2
1
1
7
4
8
5
1
13
16
11
7
1
4
5
6
4
5
3
1
2
1
2
2
3
1
2
5
4
4
5
6
5
6
1
1
5
4
11
6
5
4
7
5
6
2
1
1
5
4
3
5
4
3
3
2
5
4
4
5
9
1
6
5
6
7
5
2
1
3
1
4
5
3
2
3
1
2
6
7
5
2
1
2
2
1
1
3
2
1
5
8
4
4
5
1
2
1
5
8
7
1
4
5
4
5
6
5
6
7
1
4
5
3
2
1
3
6
4
8
6
7
5
17
5
13
4
9
5
1
4
3
7
6
9
2
3
1
2
3
1
3
1
1
2
2
6
4
5
5
8
7
6
2
2
3
7
4
5
1
3
2
5
4
6
1
2
1
2
1
1
2
2
7
6
1
2
1
1
3
1
4
1
2
3
7
5
3
2
1
4
5
3
5
6
4
2
1
6
7
3
2
1
2
6
7
5
1
5
6
3
1
1
3
2
7
8
11
5
7
4
1
1
2
1
3
2
1
2
3
4
5
3
6
5
4
7
5
4
6
4
6
1
3
2
1
6
5
4
3
5
4
1
1
8
12
3
4
9
5
3
2
1
2
2
1
1
2
6
4
1
8
5
6
5
6
11
8
2
3
2
1
5
6
2
3
1
3
1
2
4
1
4
5
7
4
5
5
6
4
2
1
2
8
7
5
1
3
2
5
8
4
5
3
2
2
3
3
2
2
3
1
2
3
2
1
1
5
4
5
1
4
5
1
3
5
6
9
1
3
2
4
5
1
3
1
2
1
5
7
8
9
1
2
1
2
1
8
6
4
5
1
1
4
5
1
3
4
4
1
1
1
2
1
1
1
2
1
3
2
3
1
4
4
5
3
1
1
2
1
2


In [None]:
import math

def cosine_sim(vec1, vec2):
    """Return the cosine similarity of two equal-length 1-D vectors.

    Args:
        vec1, vec2: iterables of numbers (lists, NumPy arrays, ...).

    Returns:
        A Python float. If either vector has zero magnitude (including two
        empty vectors) the similarity is undefined and 0.0 is returned instead
        of dividing by zero.

    Raises:
        ValueError: if the inputs are not 1-D or differ in length. Previously a
            longer `vec2` was silently mis-scored and a shorter one raised
            IndexError.
    """
    # Accumulate in float64 regardless of the input dtype.
    a = np.asarray(list(vec1), dtype=np.float64)
    b = np.asarray(list(vec2), dtype=np.float64)
    if a.ndim != 1 or b.ndim != 1 or a.shape != b.shape:
        raise ValueError(
            "cosine_sim expects two 1-D vectors of equal length, got shapes %s and %s"
            % (a.shape, b.shape)
        )
    mag_1 = math.sqrt(float(np.dot(a, a)))
    mag_2 = math.sqrt(float(np.dot(b, b)))
    if mag_1 == 0.0 or mag_2 == 0.0:
        return 0.0
    return float(np.dot(a, b)) / (mag_1 * mag_2)

In [None]:
# For every training question, score each of the four options by cosine
# similarity against the question vector (plus the fact vector when facts are
# enabled) and record the letters of the lowest- and second-lowest-scoring
# options.
dct_idx_label = {0:'A',1:'B',2:'C',3:'D'}
lst_drop_label = []
lst_other_min = []
for j in range(len(encoded)):
  if facts == 0:
    query_vec = encoded[j]
  else:
    query_vec = np.add(encoded[j], encoded_fact[j])
  score_lst = [cosine_sim(query_vec, encoded_ans_final[j][q]) for q in range(4)]
  # Rank option *positions* rather than looking score values back up with
  # list.index(): with tied scores, index() returned the same position for both
  # the minimum and the runner-up. The sort is stable, so ties keep option order.
  ranked = sorted(range(len(score_lst)), key=score_lst.__getitem__)
  idx_min, idx_other_min = ranked[0], ranked[1]
  lst_drop_label.append(dct_idx_label[idx_min])
  lst_other_min.append(dct_idx_label[idx_other_min])

In [None]:
# For every validation question, score each of the four options by cosine
# similarity against the question vector (plus the fact vector when facts are
# enabled) and record the letters of the lowest- and second-lowest-scoring
# options. Uses `dct_idx_label` from the training scoring cell.
lst_drop_label_val = []
lst_other_min_val = []
for j in range(len(encoded_val)):
  if facts == 0:
    query_vec = encoded_val[j]
  else:
    query_vec = np.add(encoded_val[j], encoded_fact_val[j])
  score_lst = [cosine_sim(query_vec, encoded_ans_final_val[j][q]) for q in range(4)]
  # Rank option *positions* rather than looking score values back up with
  # list.index(): with tied scores, index() returned the same position for both
  # the minimum and the runner-up. The sort is stable, so ties keep option order.
  ranked = sorted(range(len(score_lst)), key=score_lst.__getitem__)
  idx_min, idx_other_min = ranked[0], ranked[1]
  lst_drop_label_val.append(dct_idx_label[idx_min])
  lst_other_min_val.append(dct_idx_label[idx_other_min])

In [None]:
# For every test question, score each of the four options by cosine similarity
# against the question vector (plus the fact vector when facts are enabled) and
# record the letters of the lowest- and second-lowest-scoring options.
# Uses `dct_idx_label` from the training scoring cell.
lst_drop_label_test = []
lst_other_min_test = []
for j in range(len(encoded_test)):
  if facts == 0:
    query_vec = encoded_test[j]
  else:
    query_vec = np.add(encoded_test[j], encoded_fact_test[j])
  score_lst = [cosine_sim(query_vec, encoded_ans_final_test[j][q]) for q in range(4)]
  # Rank option *positions* rather than looking score values back up with
  # list.index(): with tied scores, index() returned the same position for both
  # the minimum and the runner-up. The sort is stable, so ties keep option order.
  ranked = sorted(range(len(score_lst)), key=score_lst.__getitem__)
  idx_min, idx_other_min = ranked[0], ranked[1]
  lst_drop_label_test.append(dct_idx_label[idx_min])
  lst_other_min_test.append(dct_idx_label[idx_other_min])

In [None]:
# Working copy of the split mapping used by the pruning cells below.
# NOTE(review): presumably a shallow copy (same underlying datasets) — confirm.
updated_2 = updated.copy()

In [None]:
# Materialise the training split as a list of row dicts and remove one wrong
# option from each: the lowest-scoring option, unless that is the gold answer,
# in which case the second-lowest goes instead.
lst_train = list(updated_2['train'])
for row_pos, row in enumerate(lst_train):
  weakest = lst_drop_label[row_pos]
  doomed = lst_other_min[row_pos] if row['label'] == weakest else weakest
  del row[doomed]

In [None]:
# Column order of the final three-option records.
ini_list = ['id', 'fact1', 'humanScore', 'clarity', 'turkIdAnonymized', 'label', 'question.stem', 'A', 'B', 'C']
# The three options that survived pruning are renamed, in order, to A/B/C and
# the gold label is remapped to the new letter of the *same* option.
# The previous version only rewrote label 'D' to 'C', so e.g. a 'B' answer kept
# the label 'B' after option A was dropped and pointed at the wrong text. It
# also relied on the positional order of the dict values.
lst_train_final = []
for row in lst_train:
  remaining = [letter for letter in ('A', 'B', 'C', 'D') if letter in row]
  if len(remaining) != 3 or row['label'] not in remaining:
    raise ValueError(
        "Example %r: expected three remaining options including the gold label %r, found %r"
        % (row.get('id'), row['label'], remaining))
  old_for_new = dict(zip(('A', 'B', 'C'), remaining))  # new letter -> old letter
  final_dict = {key: row[old_for_new.get(key, key)] for key in ini_list}
  final_dict['label'] = 'ABC'[remaining.index(row['label'])]
  lst_train_final.append(final_dict)

In [None]:
# Materialise the validation split as a list of row dicts and remove one wrong
# option from each: the lowest-scoring option, unless that is the gold answer,
# in which case the second-lowest goes instead.
lst_val = list(updated_2['validation'])
for row_pos, row in enumerate(lst_val):
  weakest = lst_drop_label_val[row_pos]
  doomed = lst_other_min_val[row_pos] if row['label'] == weakest else weakest
  del row[doomed]

In [None]:
# The three options that survived pruning are renamed, in order, to A/B/C and
# the gold label is remapped to the new letter of the *same* option.
# The previous version only rewrote label 'D' to 'C', so e.g. a 'B' answer kept
# the label 'B' after option A was dropped and pointed at the wrong text. It
# also relied on the positional order of the dict values.
# Column order comes from `ini_list`, defined in the training relabel cell.
lst_val_final = []
for row in lst_val:
  remaining = [letter for letter in ('A', 'B', 'C', 'D') if letter in row]
  if len(remaining) != 3 or row['label'] not in remaining:
    raise ValueError(
        "Example %r: expected three remaining options including the gold label %r, found %r"
        % (row.get('id'), row['label'], remaining))
  old_for_new = dict(zip(('A', 'B', 'C'), remaining))  # new letter -> old letter
  final_dict = {key: row[old_for_new.get(key, key)] for key in ini_list}
  final_dict['label'] = 'ABC'[remaining.index(row['label'])]
  lst_val_final.append(final_dict)

In [None]:
# Materialise the test split as a list of row dicts and remove one wrong option
# from each: the lowest-scoring option, unless that is the gold answer, in
# which case the second-lowest goes instead.
lst_test = list(updated_2['test'])
for row_pos, row in enumerate(lst_test):
  weakest = lst_drop_label_test[row_pos]
  doomed = lst_other_min_test[row_pos] if row['label'] == weakest else weakest
  del row[doomed]

In [None]:
# The three options that survived pruning are renamed, in order, to A/B/C and
# the gold label is remapped to the new letter of the *same* option.
# The previous version only rewrote label 'D' to 'C', so e.g. a 'B' answer kept
# the label 'B' after option A was dropped and pointed at the wrong text. It
# also relied on the positional order of the dict values.
# Column order comes from `ini_list`, defined in the training relabel cell.
lst_test_final = []
for row in lst_test:
  remaining = [letter for letter in ('A', 'B', 'C', 'D') if letter in row]
  if len(remaining) != 3 or row['label'] not in remaining:
    raise ValueError(
        "Example %r: expected three remaining options including the gold label %r, found %r"
        % (row.get('id'), row['label'], remaining))
  old_for_new = dict(zip(('A', 'B', 'C'), remaining))  # new letter -> old letter
  final_dict = {key: row[old_for_new.get(key, key)] for key in ini_list}
  final_dict['label'] = 'ABC'[remaining.index(row['label'])]
  lst_test_final.append(final_dict)

In [None]:
# Build one DataFrame per split and save each as CSV in the working directory.
# The DataFrame index is written as the first CSV column.
df = pd.DataFrame(lst_train_final)
df_val = pd.DataFrame(lst_val_final)
df_test = pd.DataFrame(lst_test_final)
for split_frame, csv_name in ((df, 'train_ds.csv'), (df_val, 'val_ds.csv'), (df_test, 'test_ds.csv')):
    split_frame.to_csv(csv_name)

In [None]:
# Colab-only: hand the three CSVs to the browser for download.
from google.colab import files
files.download('train_ds.csv')
files.download('val_ds.csv')
files.download('test_ds.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Reload the splits from the CSVs. The paths are relative to the working
# directory, i.e. the same location the CSVs were written to, instead of the
# Colab-specific absolute '/content/' directory.
# index_col=[0] restores the saved index column; keep_default_na=False keeps
# empty strings and texts such as "NA" as strings rather than NaN.
df = pd.read_csv('train_ds.csv', index_col=[0], keep_default_na=False)
df_val = pd.read_csv('val_ds.csv', index_col=[0], keep_default_na=False)
df_test = pd.read_csv('test_ds.csv', index_col=[0], keep_default_na=False)

In [None]:
import pyarrow as pa
import pyarrow.dataset as ds
import pandas as pd
from datasets import Dataset

#df = pd.DataFrame(lst_train_final)
# NOTE(review): `dataset` is not read by any later cell visible here.
dataset = ds.dataset(pa.Table.from_pandas(df).to_batches())

### convert to Huggingface dataset
# Use the public constructor for pandas input instead of handing a raw Arrow
# table to `Dataset(...)`. With the default `preserve_index`, a non-range index
# is still stored as the `__index_level_0__` column, as before.
upd_train = Dataset.from_pandas(df)
upd_train

Dataset({
    features: ['id', 'fact1', 'humanScore', 'clarity', 'turkIdAnonymized', 'label', 'question.stem', 'A', 'B', 'C', '__index_level_0__'],
    num_rows: 4957
})

In [None]:
# df.to_csv('train_ds.csv')

In [None]:
#df_val = pd.DataFrame(lst_val_final)
# NOTE(review): `dataset` is not read by any later cell visible here.
dataset = ds.dataset(pa.Table.from_pandas(df_val).to_batches())

### convert to Huggingface dataset
# Use the public constructor for pandas input instead of handing a raw Arrow
# table to `Dataset(...)`. With the default `preserve_index`, a non-range index
# is still stored as the `__index_level_0__` column, as before.
upd_val = Dataset.from_pandas(df_val)
upd_val

Dataset({
    features: ['id', 'fact1', 'humanScore', 'clarity', 'turkIdAnonymized', 'label', 'question.stem', 'A', 'B', 'C', '__index_level_0__'],
    num_rows: 500
})

In [None]:
#df_val.to_csv('val_ds.csv')

In [None]:
#df_test = pd.DataFrame(lst_test_final)
# NOTE(review): `dataset` is not read by any later cell visible here.
dataset = ds.dataset(pa.Table.from_pandas(df_test).to_batches())

### convert to Huggingface dataset
# Use the public constructor for pandas input instead of handing a raw Arrow
# table to `Dataset(...)`. With the default `preserve_index`, a non-range index
# is still stored as the `__index_level_0__` column, as before.
upd_test = Dataset.from_pandas(df_test)
upd_test

Dataset({
    features: ['id', 'fact1', 'humanScore', 'clarity', 'turkIdAnonymized', 'label', 'question.stem', 'A', 'B', 'C', '__index_level_0__'],
    num_rows: 500
})

In [None]:
# df_test.to_csv('test_ds.csv')

In [None]:
# from google.colab import files
# files.download('train_ds.csv')
# files.download('val_ds.csv')
# files.download('test_ds.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# From here on every question has three options, so the letter list used by
# the data collator is redefined accordingly.
ending_names = ['A', 'B', 'C']
model_chkpt = "bert-base-uncased"
tokenizer  = AutoTokenizer.from_pretrained(model_chkpt, use_fast=True)
# Reloads the model from the checkpoint, replacing the earlier `model` object.
model = AutoModelForMultipleChoice.from_pretrained(model_chkpt)
def preprocess_function(examples):
    """Tokenize a batch of examples as (fact, stem + option) sentence pairs.

    The number of options is taken from the module-level `ending_names`, so the
    function stays correct if that list changes length.

    Args:
        examples: batch dict with list-valued "fact1", "question.stem" and one
            column per letter in `ending_names`.

    Returns:
        dict mapping each tokenizer output field to a list with one entry per
        example, each entry holding one token list per option.
    """
    num_choices = len(ending_names)
    # Repeat each first sentence once per option of the second sentence.
    first_sentences = [context for context in examples["fact1"] for _ in range(num_choices)]
    # Pair every question stem with each of its options, example by example.
    question_headers = examples["question.stem"]
    second_sentences = [
        f"{header} {examples[end][i]}"
        for i, header in enumerate(question_headers)
        for end in ending_names
    ]

    # Tokenize
    tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True)
    # Un-flatten: regroup consecutive runs of `num_choices` entries per example.
    return {
        k: [v[i:i + num_choices] for i in range(0, len(v), num_choices)]
        for k, v in tokenized_examples.items()
    }

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMultipleChoice: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMultipleChoice from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMultipleChoice from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMultipleChoice were not initialized from the model checkpoint at bert-base-uncased and are newly

In [None]:
# Smoke-test the preprocessing on the first five training rows.
features = preprocess_function(upd_train[:5])

In [None]:
# Number of examples, options per example, and token count of each option of
# the first example.
print(len(features["input_ids"]), len(features["input_ids"][0]), [len(x) for x in features["input_ids"][0]]) 

5 3 [25, 26, 30]


In [None]:
import datasets
# Bundle the three splits so they can be mapped with a single call.
dd = datasets.DatasetDict({"train":upd_train,"validation":upd_val,"test":upd_test})

In [None]:
def show_one(example):
    """Print the context fact, the three stem+option sentences and the gold letter."""
    print(f"Context: {example['fact1']}")
    for letter in ('A', 'B', 'C'):
        print(f"  {letter} - {example['question.stem']} {example[letter]}")
    print(f"\nGround truth: option {example['label']}")

In [None]:
idx = 3
# Show the decoded (fact, stem + option) pair of every option of example `idx`.
# The list used to be built and then discarded, because only the last
# expression of a cell is displayed.
pprint([tokenizer.decode(choice_ids) for choice_ids in features["input_ids"][idx]])
show_one(upd_train[idx])

# Tokenize all three splits in batches.
encoded_datasets = dd.map(preprocess_function, batched=True)

Context: a star is made of gases
  A - Stars are warm lights that float
  B - Stars are made out of nitrate
  C - Stars are great balls of gas burning billions of miles away

Ground truth: option C


  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
# Display the columns and row counts of the tokenized splits.
encoded_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'fact1', 'humanScore', 'clarity', 'turkIdAnonymized', 'label', 'question.stem', 'A', 'B', 'C', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4957
    })
    validation: Dataset({
        features: ['id', 'fact1', 'humanScore', 'clarity', 'turkIdAnonymized', 'label', 'question.stem', 'A', 'B', 'C', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'fact1', 'humanScore', 'clarity', 'turkIdAnonymized', 'label', 'question.stem', 'A', 'B', 'C', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 500
    })
})

In [None]:
# Fine-tuning configuration: evaluate after every epoch, three epochs.
batch_size = 16
model_name = model_chkpt.rsplit("/", 1)[-1]
args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-swag",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
)

In [None]:
def compute_metrics(eval_predictions):
    """Compute accuracy for a `(predictions, label_ids)` pair.

    Args:
        eval_predictions: two-item iterable of per-choice scores (one row per
            example) and the integer gold choice index of each example.

    Returns:
        ``{"accuracy": value}`` where value is a Python float: the fraction of
        rows whose highest-scoring choice equals the gold index.
    """
    scores, gold = eval_predictions
    chosen = np.argmax(scores, axis=1)
    hits = (chosen == gold).astype(np.float32)
    return {"accuracy": hits.mean().item()}

@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that will dynamically pad the inputs for multiple choice received.

    Each incoming feature holds one list per tokenizer field with one entry per
    answer choice, plus a letter label under ``"label"`` or ``"labels"``. The
    collator flattens the choices, pads them together with ``tokenizer.pad`` and
    reshapes every field back to ``(batch_size, num_choices, -1)``.

    The input feature dicts are left untouched, so the same ``features`` list
    can be collated more than once (e.g. when a notebook cell is re-run).
    """

    # Tokenizer whose `pad` method is used to pad the flattened choices.
    tokenizer: PreTrainedTokenizerBase
    # Forwarded unchanged to `tokenizer.pad`.
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        """Collate a list of per-example feature dicts into one padded batch.

        Args:
            features: list of dicts; every value except the label is indexable
                by choice position.

        Returns:
            dict of tensors shaped ``(batch_size, num_choices, -1)`` plus
            ``"labels"``, an int64 tensor of gold choice indices.

        Raises:
            ValueError: if a label letter is not present in ``ending_names``.
        """
        label_name = "label" if "label" in features[0].keys() else "labels"
        # Read the label instead of popping it: popping mutated the caller's
        # dicts and made a second call on the same list fail with KeyError.
        labels = [ending_names.index(feature[label_name]) for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])
        # One flat entry per (example, choice), built in a single pass rather
        # than by repeated list concatenation.
        flattened_features = [
            {k: v[i] for k, v in feature.items() if k != label_name}
            for feature in features
            for i in range(num_choices)
        ]

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Un-flatten
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        # Add back labels
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [None]:
# Take the first ten tokenized training rows and keep only the fields the
# collator sample below needs.
accepted_keys = ["input_ids", "attention_mask", "label"]
features = []
for row_pos in range(10):
    encoded_row = encoded_datasets["train"][row_pos]
    features.append({k: v for k, v in encoded_row.items() if k in accepted_keys})

In [None]:
# Collate the ten sample rows into one padded batch as a sanity check.
batch = DataCollatorForMultipleChoice(tokenizer)(features)

In [None]:
# Show the decoded options of sample row 8 next to the original record.
# The list used to be built and then discarded, because only the last
# expression of a cell is displayed.
pprint([tokenizer.decode(batch["input_ids"][8][i].tolist()) for i in range(3)])
show_one(upd_train[8])

Context: as a source of light becomes closer , that source will appear brighter
  A - As a car approaches you in the night the headlights become more intense
  B - As a car approaches you in the night the headlights recede into the dark
  C - As a car approaches you in the night the headlights remain at a constant

Ground truth: option A


In [None]:
# First CUDA device when available, otherwise CPU.
# NOTE(review): `device` is not read by any later cell visible here.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 

In [None]:
# Wire model, arguments, tokenized splits, collator and metric into a Trainer.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_datasets["train"],
    eval_dataset=encoded_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer),
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune (the validation split is evaluated per the training arguments),
# then evaluate once on the held-out test split and print its metrics.
trainer.train()
# Blank lines visually separate the training log from the test results.
print('\n\n\n\n')
print('test set:')
print('\n\n\n\n')
final_eval = trainer.evaluate(eval_dataset=encoded_datasets['test'])
print(final_eval)

The following columns in the training set  don't have a corresponding argument in `BertForMultipleChoice.forward` and have been ignored: id, __index_level_0__, B, turkIdAnonymized, question.stem, fact1, A, clarity, humanScore, C. If id, __index_level_0__, B, turkIdAnonymized, question.stem, fact1, A, clarity, humanScore, C are not expected by `BertForMultipleChoice.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 4957
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 930


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.976132,0.542


The following columns in the evaluation set  don't have a corresponding argument in `BertForMultipleChoice.forward` and have been ignored: id, __index_level_0__, B, turkIdAnonymized, question.stem, fact1, A, clarity, humanScore, C. If id, __index_level_0__, B, turkIdAnonymized, question.stem, fact1, A, clarity, humanScore, C are not expected by `BertForMultipleChoice.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 500
  Batch size = 8
