In [None]:
# Install the Hugging Face `datasets` package; it provides `load_dataset`, which this notebook imports below.
# NOTE(review): the version is unpinned — the recorded run resolved datasets-1.18.4; consider pinning for reproducibility.
!pip install datasets

Collecting datasets
  Downloading datasets-1.18.4-py3-none-any.whl (312 kB)
[K     |████████████████████████████████| 312 kB 4.9 MB/s 
Collecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 45.3 MB/s 
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 24.7 MB/s 
[?25hCollecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 4.7 MB/s 
[?25hCollecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting fsspec[http]>=2021.05.0
  Downloading fsspec-2022.2.0-py3-none-any.whl (134 kB)
[K     |████████████████████████████████| 134 kB 44.3 MB/s 
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1
  Downloading urlli

In [None]:
# Install the Hugging Face `transformers` package; it provides the tokenizer and model classes imported below.
# NOTE(review): the version is unpinned — the recorded run resolved transformers-4.17.0; consider pinning for reproducibility.
!pip install transformers

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 4.1 MB/s 
Collecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 25.8 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 37.2 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 38.7 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully uninstalled PyYAML-3.13
Successfully installed pyyaml-6.0 sacremoses-0.0.47 tokenizers-0.11.6 tr

In [None]:
from datasets import load_dataset, load_metric, ClassLabel
from transformers import AutoTokenizer
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import transformers
import random
import pandas as pd

from pprint import pprint

import torch
import math
import time
import sys
import json
import numpy as np

In [None]:
# Answer-option letters; the position of a letter in this list is used as its integer class index.
ending_names = ['A', 'B', 'C', 'D']
# Name of the pretrained checkpoint shared by the tokenizer and the multiple-choice model.
model_chkpt = "bert-base-uncased"
# Both objects are loaded from the same checkpoint.
tokenizer  = AutoTokenizer.from_pretrained(model_chkpt, use_fast=True)
model = AutoModelForMultipleChoice.from_pretrained(model_chkpt)

def compute_metrics(eval_predictions):
    """Return the accuracy of arg-max predictions against the reference labels.

    Args:
        eval_predictions: a pair ``(predictions, label_ids)``. The highest
            score along axis 1 of ``predictions`` is taken as the predicted
            class for each row.

    Returns:
        dict: ``{"accuracy": <float>}`` — the fraction of rows whose
        predicted class equals the corresponding entry of ``label_ids``.
    """
    scores, gold = eval_predictions
    chosen = np.argmax(scores, axis=1)
    hits = (chosen == gold).astype(np.float32)
    return {"accuracy": hits.mean().item()}

@dataclass
class DataCollatorForMultipleChoice:
    """
    Collate multiple-choice features into one padded tensor batch.

    Each incoming feature holds, per key, one entry per answer choice plus a
    letter label. The choices of all features are padded together with
    ``tokenizer.pad`` and then reshaped to ``(batch_size, num_choices, -1)``.
    """

    # Tokenizer whose ``pad`` method performs the padding.
    tokenizer: PreTrainedTokenizerBase
    # The three settings below are forwarded unchanged to ``tokenizer.pad``.
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        """Build the batch dict for ``features``.

        The label entry ("label", or "labels" when "label" is absent from the
        first feature) is popped from every feature dict, so the caller's
        dicts are modified. Letter labels are converted to their index in the
        module-level ``ending_names``.

        Returns:
            dict: the padded tensors reshaped per example and choice, plus a
            "labels" int64 tensor with one class index per feature.
        """
        label_key = "label" if "label" in features[0].keys() else "labels"
        targets = []
        for feature in features:
            targets.append(ending_names.index(feature.pop(label_key)))

        n_examples = len(features)
        n_choices = len(features[0]["input_ids"])

        # One dict per (example, choice) pair, example-major, so that the
        # reshape below can regroup the rows by example.
        per_choice = [
            {key: values[choice] for key, values in feature.items()}
            for feature in features
            for choice in range(n_choices)
        ]

        padded = self.tokenizer.pad(
            per_choice,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Regroup the flat rows into (example, choice, sequence).
        batch = {}
        for key, tensor in padded.items():
            batch[key] = tensor.view(n_examples, n_choices, -1)
        # Re-attach the labels that were popped above.
        batch["labels"] = torch.tensor(targets, dtype=torch.int64)
        return batch


def choices(example):
    """Promote each answer option to its own key on ``example``.

    Every entry of ``example['question.choices']`` is a mapping with a
    'label' and a 'text'; the text is stored on ``example`` under the label.
    The 'question.choices' entry is then removed. 'question.stem' is left in
    place.

    The mapping is modified in place and also returned.
    """
    for option in example['question.choices']:
        text = option['text']
        example[option['label']] = text
    example.pop('question.choices', None)
    return example

def show_random_elements(dataset, num_examples=10):
    """Print the HTML markup of a table of randomly chosen, distinct rows.

    Args:
        dataset: object supporting ``len()``, indexing with a list of row
            positions, and a ``features`` mapping of column name to type.
        num_examples: how many distinct rows to draw; must not exceed
            ``len(dataset)``.

    Columns whose feature type is a ``ClassLabel`` are shown with their
    class names instead of integer ids. Nothing is returned.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."

    # Draw row positions until enough distinct ones have been collected.
    chosen = []
    while len(chosen) < num_examples:
        candidate = random.randint(0, len(dataset)-1)
        if candidate not in chosen:
            chosen.append(candidate)

    table = pd.DataFrame(dataset[chosen])
    for column, feature_type in dataset.features.items():
        if not isinstance(feature_type, ClassLabel):
            continue
        table[column] = table[column].transform(lambda idx: feature_type.names[idx])
    pprint(table.to_html())
    
def show_one(example):
    """Print one example: its context, the four completed options, and the answer.

    ``example`` must provide 'fact1', 'question.stem', the option texts under
    'A'..'D', and 'label'. Each option line is the question stem followed by
    that option's text.
    """
    print(f"Context: {example['fact1']}")
    stem = example['question.stem']
    for letter in ('A', 'B', 'C', 'D'):
        print(f"  {letter} - {stem} {example[letter]}")
    print(f"\nGround truth: option {example['label']}")
    
def preprocess_function(examples):
    """Tokenize a batch of examples as (context, stem + option) sentence pairs.

    Args:
        examples: a batch in column form — ``examples["fact1"]`` and
            ``examples["question.stem"]`` are lists with one entry per
            example, and ``examples[letter]`` holds the option texts for
            every letter in the module-level ``ending_names``.

    Returns:
        dict: for every key produced by the module-level ``tokenizer``, a
        list with one entry per example, each entry being the list of
        per-option encodings (in ``ending_names`` order).

    The number of options is taken from ``ending_names`` rather than a
    hard-coded 4, so the repetition of the context and the regrouping of the
    tokenizer output always agree with the options actually paired.
    """
    num_choices = len(ending_names)
    question_headers = examples["question.stem"]

    # Repeat each context once per option; built flat directly (example-major)
    # instead of flattening nested lists with sum(..., []), which is quadratic.
    first_sentences = [
        context for context in examples["fact1"] for _ in range(num_choices)
    ]
    # Pair every question stem with each of its option texts, same ordering.
    second_sentences = [
        f"{header} {examples[end][i]}"
        for i, header in enumerate(question_headers)
        for end in ending_names
    ]

    # Tokenize all (context, completion) pairs in one call.
    tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True)

    # Regroup the flat encodings into one group of `num_choices` per example.
    return {
        k: [v[i:i + num_choices] for i in range(0, len(v), num_choices)]
        for k, v in tokenized_examples.items()
    }

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMultipleChoice: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMultipleChoice from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMultipleChoice from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMultipleChoice were not initialized from the model checkpoint at bert-base-uncased and are newly

In [None]:
# Switch read by later cells: 0 blanks the 'fact1' field of every record,
# any other value keeps the facts as they are in the input files.
facts = 0

input_files = ['train_complete.jsonl','test_complete.jsonl','dev_complete.jsonl']
# The output suffix records which variant was written ('_d' when facts are
# blanked, '_e' otherwise); the dataset-loading cell picks files by the same rule.
if facts == 0:
    output_files = ['train_complete_d.jsonl','test_complete_d.jsonl','dev_complete_d.jsonl']
else:
    output_files = ['train_complete_e.jsonl','test_complete_e.jsonl','dev_complete_e.jsonl']

for source_path, target_path in zip(input_files, output_files):
    # Parse one JSON object per line; blank lines (e.g. a trailing newline at
    # the end of the file) are skipped instead of crashing json.loads.
    with open(source_path, encoding='utf-8') as json_file:
        records = [json.loads(line) for line in json_file if line.strip()]

    if facts == 0:
        for record in records:
            record['fact1'] = ''

    # Records are re-serialised in both modes, one JSON object per line.
    # The per-record print of 'fact1' was dropped: it produced thousands of
    # lines of cell output. `with` closes the file even if a write fails.
    with open(target_path, 'wt', encoding='utf-8') as fout:
        for record in records:
            fout.write('%s\n' % json.dumps(record))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
a thermometer is used to measure temperature
a compass is used to navigate seas
An example of conservation is avoiding waste
if a habitat is removed then that habitat is destroyed
moving changes position
contact between rocks over long periods of time causes rocks to smooth
refracting sunlight causes light to split into different colors
precipitation is when water falls from the sky
a chloroplast contains chlorophyll
the Earth rotates on its axis on its axis
an animal requires nutrients for survival
eagles eat rabbits
cool temperatures cause animals to shiver
planting trees has a positive impact on an ecosystem
decomposition increases the amount of nutrients in the soil
reproduction is when an organism passes genetic information from itself to its offspring
as the amount of food an animal eats increases , the weight of that animal will increase
matter is made of molecules
earthquakes cause rock layers to fold on top of ea

In [None]:
# Split -> file mappings for the two variants written by the JSONL rewrite
# cell: '_d' files when facts == 0, '_e' files otherwise.
data_files_d = {'train': 'train_complete_d.jsonl', 'validation': 'dev_complete_d.jsonl', 'test': 'test_complete_d.jsonl'}
data_files_e = {'train': 'train_complete_e.jsonl', 'validation': 'dev_complete_e.jsonl', 'test': 'test_complete_e.jsonl'}

# Load the three splits with the generic 'json' loader.
openbookQA = load_dataset('json', data_files=data_files_d if facts == 0 else data_files_e)

# Show the first training record as a sanity check.
pprint(openbookQA['train'][0])

Using custom data configuration default-a87652a0b82786cf


Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-a87652a0b82786cf/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b...


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-a87652a0b82786cf/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

{'answerKey': 'D',
 'clarity': '2.00',
 'fact1': '',
 'humanScore': '1.00',
 'id': '7-980',
 'question': {'choices': [{'label': 'A', 'text': 'puppies learning new tricks'},
                          {'label': 'B',
                           'text': 'children growing up and getting old'},
                          {'label': 'C', 'text': 'flowers wilting in a vase'},
                          {'label': 'D',
                           'text': 'plants sprouting, blooming and wilting'}],
              'stem': 'The sun is responsible for'},
 'turkIdAnonymized': 'b356d338b7'}


In [None]:
# Flatten nested columns: the nested `question` field becomes the top-level
# 'question.stem' and 'question.choices' columns that later cells rely on.
flatten = openbookQA.flatten()

In [None]:
# NOTE(review): an equivalent `choices` is already defined in an earlier cell;
# this definition replaces it when the cell runs.
def choices(example):
    """Promote each answer option to its own key on ``example``.

    Every entry of ``example['question.choices']`` is a mapping with a
    'label' and a 'text'; the text is stored on ``example`` under the label.
    The 'question.choices' entry is then removed. 'question.stem' is left in
    place.

    The mapping is modified in place and also returned.
    """
    for option in example['question.choices']:
        text = option['text']
        example[option['label']] = text
    example.pop('question.choices', None)
    return example

In [None]:
# NOTE(review): an equivalent `show_one` is already defined in an earlier cell;
# this definition replaces it when the cell runs.
def show_one(example):
    """Print one example: its context, the four completed options, and the answer.

    ``example`` must provide 'fact1', 'question.stem', the option texts under
    'A'..'D', and 'label'. Each option line is the question stem followed by
    that option's text.
    """
    print(f"Context: {example['fact1']}")
    stem = example['question.stem']
    for letter in ('A', 'B', 'C', 'D'):
        print(f"  {letter} - {stem} {example[letter]}")
    print(f"\nGround truth: option {example['label']}")

In [None]:
# Spread the answer options into 'A'..'D' columns, then expose the answer
# letter under the name 'label' (it arrives as 'answerKey').
updated = flatten.map(choices).rename_column('answerKey', 'label')

# Inspect the first training record, raw and in readable form.
first_train_example = updated['train'][0]
pprint(first_train_example)

show_one(first_train_example)

# Column-form slice of the first five training rows.
examples = updated['train'][:5]

0ex [00:00, ?ex/s]

0ex [00:00, ?ex/s]

0ex [00:00, ?ex/s]

{'A': 'puppies learning new tricks',
 'B': 'children growing up and getting old',
 'C': 'flowers wilting in a vase',
 'D': 'plants sprouting, blooming and wilting',
 'clarity': '2.00',
 'fact1': '',
 'humanScore': '1.00',
 'id': '7-980',
 'label': 'D',
 'question.stem': 'The sun is responsible for',
 'turkIdAnonymized': 'b356d338b7'}
Context: 
  A - The sun is responsible for puppies learning new tricks
  B - The sun is responsible for children growing up and getting old
  C - The sun is responsible for flowers wilting in a vase
  D - The sun is responsible for plants sprouting, blooming and wilting

Ground truth: option D


In [None]:
# Number of rows in the training split after flattening and relabelling.
len(updated['train'])

4957

In [None]:
# Collect the training split into parallel lists, one entry per question:
# the stem, the four option texts in A-D order, and the answer letter.
question_lst = []
ans_lst = []
ans_key = []
fact_lst = []
# Iterate over the rows directly so each row is fetched once instead of being
# re-indexed for every field.
for row in updated['train']:
  question_lst.append(row['question.stem'])
  ans_lst.append([row['A'], row['B'], row['C'], row['D']])
  ans_key.append(row['label'])
  # Any non-zero `facts` means "with facts", matching the `facts == 0` / else
  # split used by the other cells; `fact_lst` stays empty when facts == 0.
  if facts != 0:
    fact_lst.append(row['fact1'])

In [None]:
# Collect the validation split into parallel lists, one entry per question:
# the stem, the four option texts in A-D order, and the answer letter.
question_lst_val = []
ans_lst_val = []
ans_key_val = []
fact_lst_val = []
# Iterate over the rows directly so each row is fetched once instead of being
# re-indexed for every field.
for row in updated['validation']:
  question_lst_val.append(row['question.stem'])
  ans_lst_val.append([row['A'], row['B'], row['C'], row['D']])
  ans_key_val.append(row['label'])
  # Any non-zero `facts` means "with facts", matching the `facts == 0` / else
  # split used by the other cells; `fact_lst_val` stays empty when facts == 0.
  if facts != 0:
    fact_lst_val.append(row['fact1'])

In [None]:
# Collect the test split into parallel lists, one entry per question:
# the stem, the four option texts in A-D order, and the answer letter.
question_lst_test = []
ans_lst_test = []
ans_key_test = []
fact_lst_test = []
# Iterate over the rows directly so each row is fetched once instead of being
# re-indexed for every field.
for row in updated['test']:
  question_lst_test.append(row['question.stem'])
  ans_lst_test.append([row['A'], row['B'], row['C'], row['D']])
  ans_key_test.append(row['label'])
  # Any non-zero `facts` means "with facts", matching the `facts == 0` / else
  # split used by the other cells; `fact_lst_test` stays empty when facts == 0.
  if facts != 0:
    fact_lst_test.append(row['fact1'])

In [None]:
# Assemble one DataFrame per split from the parallel lists collected earlier.
if facts == 0:
  # No facts were collected: zip question / options / key, then append an
  # empty-string 'Fact' column as the last column of each frame.
  qa_columns = ['Question', 'Answer', 'Answer Key']
  df = pd.DataFrame(list(zip(question_lst, ans_lst, ans_key)), columns=qa_columns)
  df_val = pd.DataFrame(list(zip(question_lst_val, ans_lst_val, ans_key_val)), columns=qa_columns)
  df_test = pd.DataFrame(list(zip(question_lst_test, ans_lst_test, ans_key_test)), columns=qa_columns)
  for frame in (df, df_val, df_test):
    frame['Fact'] = ""
else:
  # Facts are available: 'Fact' is placed directly after 'Question'.
  qfa_columns = ['Question', 'Fact', 'Answer', 'Answer Key']
  df = pd.DataFrame(list(zip(question_lst, fact_lst, ans_lst, ans_key)), columns=qfa_columns)
  df_val = pd.DataFrame(list(zip(question_lst_val, fact_lst_val, ans_lst_val, ans_key_val)), columns=qfa_columns)
  df_test = pd.DataFrame(list(zip(question_lst_test, fact_lst_test, ans_lst_test, ans_key_test)), columns=qfa_columns)

In [None]:
# Display the training-split frame as a visual check of the assembled columns.
df

Unnamed: 0,Question,Answer,Answer Key,Fact
0,The sun is responsible for,"[puppies learning new tricks, children growing...",D,
1,When standing miles away from Mount Rushmore,"[the mountains seem very close, the mountains ...",D,
2,When food is reduced in the stomach,"[the mind needs time to digest, take a second ...",C,
3,Stars are,"[warm lights that float, made out of nitrate, ...",C,
4,You can make a telescope with a,"[straw, Glass, Candle, mailing tube]",D,
...,...,...,...,...
4952,A bulldozer alters the area of,"[skyscrapers, the stock market, air, water]",A,
4953,An organism that can survive without the help ...,"[Brewer's yeast, air, sand, sugar]",A,
4954,The nimbleness of this animal is a key adaptio...,"[the praying mantis, the antelope, the butterf...",B,
4955,Birds will have different kinds of beaks depen...,"[organisms they hunt, computer, groceries, seven]",A,


In [None]:
# Write each split to its CSV file in the current working directory.
csv_targets = (
    (df, 'Train_qaOnly.csv'),
    (df_val, 'Val_qaOnly.csv'),
    (df_test, 'Test_qaOnly.csv'),
)
for frame, csv_name in csv_targets:
    frame.to_csv(csv_name)

In [None]:
# Colab-only helper; imported here rather than with the other imports.
from google.colab import files

# Trigger a browser download for each exported CSV.
# NOTE(review): the absolute '/content/' paths presumably assume the CSVs were
# written with '/content' as the working directory — confirm.
for csv_path in (
    '/content/Train_qaOnly.csv',
    '/content/Val_qaOnly.csv',
    '/content/Test_qaOnly.csv',
):
    files.download(csv_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>