In [1]:
import datasets

  from .autonotebook import tqdm as notebook_tqdm


# Loading from files

In [2]:
# Directory holding the saved dataset that the next cell loads.
# NOTE(review): the name `dir` shadows Python's built-in `dir()` for the rest of the notebook.
dir = "./data/bAbL"

In [3]:
# Load the on-disk dataset stored under `dir`.
babi_dataset = datasets.load_from_disk(dir)

In [4]:
# Inspect the first training example to see the nested `story` structure.
babi_dataset["train"][0]

{'story': {'answer': ['', '', 'office'],
  'id': ['1', '2', '3'],
  'supporting_ids': [[], [], ['1']],
  'text': ['The office is north of the kitchen.',
   'The garden is south of the kitchen.',
   'What is north of the kitchen?'],
  'type': [0, 0, 1]}}

In [5]:
# Flatten the nested `story` field into top-level `story.*` columns.
flattened_babi = babi_dataset.flatten()
flattened_babi

DatasetDict({
    train: Dataset({
        features: ['story.answer', 'story.id', 'story.supporting_ids', 'story.text', 'story.type'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['story.answer', 'story.id', 'story.supporting_ids', 'story.text', 'story.type'],
        num_rows: 1000
    })
})

In [6]:
# Same first example as before, now addressed through the flattened column names.
flattened_babi["train"][0]

{'story.answer': ['', '', 'office'],
 'story.id': ['1', '2', '3'],
 'story.supporting_ids': [[], [], ['1']],
 'story.text': ['The office is north of the kitchen.',
  'The garden is south of the kitchen.',
  'What is north of the kitchen?'],
 'story.type': [0, 0, 1]}

In [7]:
def get_question_and_facts(story):
    """Split one flattened story into its question, its facts, and its answer.

    The story is expected to hold two fact sentences followed by the question
    at positions 0, 1 and 2 of ``story['story.text']``, with the answer stored
    at position 2 of ``story['story.answer']``.

    Args:
        story: Mapping with the keys ``'story.text'`` and ``'story.answer'``.

    Returns:
        A dict with the keys ``'question'``, ``'sentences'`` (the two facts
        joined by a single space) and ``'answer'``.
    """
    texts = story['story.text']
    # Read the question first so a too-short story fails before anything is joined.
    question = texts[2]
    facts = ' '.join(texts[:2])
    return {
        'question': question,
        'sentences': facts,
        'answer': story['story.answer'][2],
    }

In [8]:
# `map` keeps the existing columns and adds the keys returned by the function as new columns.
processed = flattened_babi.map(get_question_and_facts)

In [9]:
# Check one example: `question`, `sentences` and `answer` now sit next to the `story.*` columns.
processed['train'][2]

{'story.answer': ['', '', 'bedroom'],
 'story.id': ['1', '2', '3'],
 'story.supporting_ids': [[], [], ['2']],
 'story.text': ['The garden is north of the office.',
  'The bedroom is north of the garden.',
  'What is north of the garden?'],
 'story.type': [0, 0, 1],
 'question': 'What is north of the garden?',
 'sentences': 'The garden is north of the office. The bedroom is north of the garden.',
 'answer': 'bedroom'}

In [10]:
def get_start_end_idx(story):
    """Locate the answer string inside the story's sentences.

    Args:
        story: Mapping with the keys ``'sentences'`` (the context text) and
            ``'answer'`` (the answer string to search for).

    Returns:
        A dict with ``'str_idx'``, the character offset of the first occurrence
        of the answer in the sentences, and ``'end_idx'``, the exclusive end
        offset (``str_idx + len(answer)``).

    Note:
        ``str.find`` returns -1 when the answer does not occur in the
        sentences; in that case ``'str_idx'`` is -1 and ``'end_idx'`` is
        ``len(answer) - 1``.
    """
    context = story['sentences']
    answer = story['answer']
    start = context.find(answer)
    return {'str_idx': start, 'end_idx': start + len(answer)}

In [11]:
# Add the `str_idx` / `end_idx` character offsets of the answer to every example.
# NOTE(review): this rebinds `processed`, so the cell overwrites the name it reads from.
processed = processed.map(get_start_end_idx)

In [12]:
# Same example again, now with the `str_idx` / `end_idx` character offsets attached.
processed['train'][2]

{'story.answer': ['', '', 'bedroom'],
 'story.id': ['1', '2', '3'],
 'story.supporting_ids': [[], [], ['2']],
 'story.text': ['The garden is north of the office.',
  'The bedroom is north of the garden.',
  'What is north of the garden?'],
 'story.type': [0, 0, 1],
 'question': 'What is north of the garden?',
 'sentences': 'The garden is north of the office. The bedroom is north of the garden.',
 'answer': 'bedroom',
 'str_idx': 39,
 'end_idx': 46}

# Modeling

In [13]:
from transformers import DistilBertTokenizer, DistilBertTokenizerFast, PreTrainedTokenizerFast, AutoTokenizer


In [14]:
# Checkpoint name. NOTE(review): defined here, but the visible `from_pretrained` calls pass
# literal strings instead of this constant.
MODEL_NAME = 'distilbert-base-uncased'
# tkzr_slow = DistilBertTokenizer.from_pretrained(MODEL_NAME)
# NOTE(review): AutoTokenizer presumably returns a fast tokenizer by default, so the name
# `tkzr_slow` may be misleading — confirm. No later visible cell uses this variable.
tkzr_slow = AutoTokenizer.from_pretrained('distilbert/distilbert-base-uncased')
# Disabled: this call has no positional checkpoint argument, which presumably is what produced
# the TypeError about `pretrained_model_name_or_path` shown in this cell's saved output.
# tokenizer = DistilBertTokenizerFast.from_pretrained(tokenizer_object=tkzr_slow)

TypeError: from_pretrained() missing 1 required positional argument: 'pretrained_model_name_or_path'

In [15]:
# fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tkzr_slow)

In [16]:
# Load the fast DistilBERT tokenizer from a local directory; this `tokenizer` is the one used below.
# NOTE(review): the folder name is spelled 'distilber-...'; it has to match the directory on disk.
tokenizer = DistilBertTokenizerFast.from_pretrained('./data/distilber-fast-tokenizer/')

In [17]:
# Tokenize a single context string as a quick check of the tokenizer output.

encoding = tokenizer(
    processed['train'][2]['sentences'], 
    truncation=True, 
    padding=True, 
    max_length=tokenizer.model_max_length
)

In [18]:
# Show the encoding next to the raw text it was produced from.
print(encoding)
print(processed['train'][2]['sentences'])

{'input_ids': [101, 1996, 3871, 2003, 2167, 1997, 1996, 2436, 1012, 1996, 5010, 2003, 2167, 1997, 1996, 3871, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
The garden is north of the office. The bedroom is north of the garden.


## Finding the token index of a given character index

In [19]:
print(encoding.char_to_token(5))  # character offset in the text -> index of the token containing it (char 5 -> token 2 here)

2


In [20]:
# Character offsets have to be converted to token indices: tokenizers can split a word
# into several tokens, so the answer may cover more than one token.
def tokenize_align(example):
    """Tokenize one example as a (sentences, question) pair and locate the answer tokens.

    Uses the notebook-level ``tokenizer``.

    Args:
        example: Mapping with the keys ``'sentences'``, ``'question'``,
            ``'str_idx'`` (character offset where the answer starts) and
            ``'end_idx'`` (exclusive character offset where it ends).

    Returns:
        A dict with ``'input_ids'``, ``'attention_mask'``, ``'start_positions'``
        and ``'end_positions'``. The two positions are the token indices that
        ``char_to_token`` reports for the first and the last answer character;
        whenever it reports ``None``, ``tokenizer.model_max_length`` is stored
        instead.
    """
    encoded = tokenizer(
        example['sentences'],
        example['question'],
        truncation=True,
        padding=True,
        max_length=tokenizer.model_max_length,
    )
    # NOTE(review): no sequence index is passed, so the offsets are presumably resolved
    # against the first sequence (the sentences) — confirm against the tokenizer docs.
    first_token = encoded.char_to_token(example['str_idx'])
    # `end_idx` is exclusive, so the last answer character sits one position earlier.
    last_token = encoded.char_to_token(example['end_idx'] - 1)
    return {
        'input_ids': encoded['input_ids'],
        'attention_mask': encoded['attention_mask'],
        'start_positions': tokenizer.model_max_length if first_token is None else first_token,
        'end_positions': tokenizer.model_max_length if last_token is None else last_token,
    }

In [21]:
# Apply `tokenize_align` to every example, adding the token ids, attention mask and answer token positions.
qa_dataset = processed.map(tokenize_align)

In [22]:
# Drop the flattened `story.*` columns; the derived columns are kept.
qa_dataset = qa_dataset.remove_columns(['story.answer', 'story.id', 'story.supporting_ids', 'story.text', 'story.type'])

In [23]:
# Spot-check an example: the answer should map to a single token position.
qa_dataset['train'][2]

{'question': 'What is north of the garden?',
 'sentences': 'The garden is north of the office. The bedroom is north of the garden.',
 'answer': 'bedroom',
 'str_idx': 39,
 'end_idx': 46,
 'input_ids': [101,
  1996,
  3871,
  2003,
  2167,
  1997,
  1996,
  2436,
  1012,
  1996,
  5010,
  2003,
  2167,
  1997,
  1996,
  3871,
  1012,
  102,
  2054,
  2003,
  2167,
  1997,
  1996,
  3871,
  1029,
  102],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'start_positions': 10,
 'end_positions': 10}

In [24]:
# Spot-check a second example, one whose answer word occurs twice in the sentences.
qa_dataset['train'][20]

{'question': 'What is the bathroom south of?',
 'sentences': 'The office is south of the bedroom. The office is north of the bathroom.',
 'answer': 'office',
 'str_idx': 4,
 'end_idx': 10,
 'input_ids': [101,
  1996,
  2436,
  2003,
  2148,
  1997,
  1996,
  5010,
  1012,
  1996,
  2436,
  2003,
  2167,
  1997,
  1996,
  5723,
  1012,
  102,
  2054,
  2003,
  1996,
  5723,
  2148,
  1997,
  1029,
  102],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'start_positions': 2,
 'end_positions': 2}

# Training

In [25]:
# Handles to the two splits, used for training and evaluation below.
train_ds = qa_dataset['train']
test_ds = qa_dataset['test']

In [26]:
from transformers import DistilBertForQuestionAnswering
# Pretrained DistilBERT with a question-answering head on top.
# NOTE(review): return_dict=False presumably makes the model return plain tuples rather than
# output objects — confirm against the transformers docs.
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased", return_dict=False)

# Alternative kept for reference: load a model saved locally instead.
# pytorch_model = DistilBertForQuestionAnswering.from_pretrained("model/pytorch") # from local

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
from torch.utils.data import DataLoader

# Expose only the model inputs and the answer positions, returned as PyTorch tensors.
# `set_format` is called for its effect on the existing dataset objects (nothing is reassigned).
columns_to_return = ['input_ids','attention_mask', 'start_positions', 'end_positions']
train_ds.set_format(type='pt', columns=columns_to_return)
test_ds.set_format(type='pt', columns=columns_to_return)

In [29]:
# Confirm that indexing now yields tensors for the selected columns only.
train_ds[0]

{'input_ids': tensor([ 101, 1996, 2436, 2003, 2167, 1997, 1996, 3829, 1012, 1996, 3871, 2003,
         2148, 1997, 1996, 3829, 1012,  102, 2054, 2003, 2167, 1997, 1996, 3829,
         1029,  102]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1]),
 'start_positions': tensor(2),
 'end_positions': tensor(2)}

In [30]:
from sklearn.metrics import f1_score

def compute_metrics(pred):
    """Compute macro-averaged F1 for the predicted start and end positions.

    Args:
        pred: Object exposing ``label_ids`` and ``predictions``. In both,
            index 0 is read for the start position and index 1 for the end
            position; each predictions entry is reduced with ``argmax(-1)``
            to get the predicted position.

    Returns:
        A dict with the keys ``'f1_start'`` and ``'f1_end'``.
    """
    scores = {}
    for slot, key in ((0, 'f1_start'), (1, 'f1_end')):
        gold = pred.label_ids[slot]
        # The highest-scoring index along the last axis is the predicted position.
        guessed = pred.predictions[slot].argmax(-1)
        scores[key] = f1_score(gold, guessed, average='macro')
    return scores

In [31]:
from transformers import Trainer, TrainingArguments

# Training configuration.
training_args = TrainingArguments(
    output_dir='results',          # output directory
    overwrite_output_dir=True,     # allow overwriting the contents of the output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    per_device_eval_batch_size=8,   # batch size per device during evaluation
    warmup_steps=20,                # number of warmup steps for the learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir=None,            # no explicit log directory; presumably the library default applies
    logging_steps=50             # log the training loss every 50 steps
)

trainer = Trainer(
    model=model,                 # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_ds,         # training dataset
    eval_dataset=test_ds,           # evaluation dataset
    compute_metrics=compute_metrics             # function that turns predictions into the reported metrics
)

# Run the training loop; the loss table in the output is logged every `logging_steps` steps.
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
50,1.5842
100,0.6971
150,0.4692
200,0.4752
250,0.3368
300,0.2914
350,0.3412


TrainOutput(global_step=375, training_loss=0.5786111284891764, metrics={'train_runtime': 18.626, 'train_samples_per_second': 161.065, 'train_steps_per_second': 20.133, 'total_flos': 19904183208000.0, 'train_loss': 0.5786111284891764, 'epoch': 3.0})

In [32]:
# Evaluate on the test split; the result holds the loss and the `compute_metrics` values under `eval_`-prefixed keys.
trainer.evaluate(test_ds)

{'eval_loss': 0.3127962350845337,
 'eval_f1_start': 0.8020190299373403,
 'eval_f1_end': 0.8060767612423697,
 'eval_runtime': 1.2885,
 'eval_samples_per_second': 776.07,
 'eval_steps_per_second': 97.009,
 'epoch': 3.0}