<a href="https://colab.research.google.com/github/dbamman/nlp21/blob/main/HW8/HW_8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Download Data and Get Set Up

### In this homework, we'll be using a larger vocabulary of case-sensitive GloVe word embedding vectors - this will take a few minutes to download.

In [None]:
!wget https://people.ischool.berkeley.edu/~jongillick/nlp/devQA.pkl
!wget https://people.ischool.berkeley.edu/~jongillick/nlp/trainQA.pkl
!wget https://people.ischool.berkeley.edu/~dbamman/glove.1M.300d.cased.txt

In [None]:
from tqdm.notebook import tqdm
import numpy as np
import pickle

import sys, json, re, time
from collections import Counter

import spacy

nlp = spacy.load('en')
nlp.remove_pipe('parser')

import torch
import torch.nn as nn
from sklearn.utils import shuffle

In [None]:
# Reserved rows of the embedding matrix / reserved token ids.
PAD_INDEX = 0             # reserved for padding words
UNKNOWN_INDEX = 1         # reserved for unknown words
# NOTE(review): read_embeddings stores the first word of the embedding file at
# row 2, i.e. the same value as SEP_INDEX -- confirm SEP_INDEX is meant to be unused.
SEP_INDEX = 2

def read_embeddings(filename, vocab_size=50000):
  """
  Load pretrained word vectors from a space-separated text file.

  Each line is expected to hold a word followed by its vector components.
  Rows 0 and 1 of the returned matrix stay all-zero (they are the rows used
  for PAD_INDEX and UNKNOWN_INDEX), so the first word of the file lands on
  row 2 and at most `vocab_size - 2` words are read.

  Arguments:
  - filename:     path to the embedding file; the embedding dimension is
                  inferred from the number of columns on its first line
  - vocab_size:   number of rows of the returned matrix (reserved rows included)

  Returns
  - embeddings:   torch.FloatTensor matrix of size (vocab_size x word_embedding_dim)
  - vocab:        dictionary mapping word (str) to row index (int) in the matrix
  """
  reserved_rows = 2

  # Peek at the first line only to learn how wide the vectors are.
  with open(filename, encoding="utf-8") as handle:
    first_columns = handle.readline().split(" ")
  word_embedding_dim = len(first_columns) - 1

  word_to_row = {}
  matrix = np.zeros((vocab_size, word_embedding_dim))

  with open(filename, encoding="utf-8") as handle:
    for line_number, raw_line in tqdm(enumerate(handle)):
      row = line_number + reserved_rows
      if row >= vocab_size:
        break

      word, *components = raw_line.rstrip().split(" ")
      # numpy converts the string components to floats on assignment
      matrix[row] = np.array(components)
      word_to_row[word] = row

  # FloatTensor stores 32-bit floats
  # https://pytorch.org/docs/stable/tensors.html
  return torch.FloatTensor(matrix), word_to_row

print("Loading word embeddings. This will take a minute or two...")

embs, vocab = read_embeddings("glove.1M.300d.cased.txt", vocab_size=1000000)

def make_reverse_vocab(vocab):
    """
    Build the inverse of `vocab`: a dict mapping index (int) -> word (str).

    Indices 1 and 0 are labelled 'UNKNOWN' and 'PAD' in the result.

    The input dictionary is NOT modified.  The previous implementation wrote
    'UNKNOWN' and 'PAD' keys into `vocab` itself, which silently remapped the
    real tokens "UNKNOWN" / "PAD" to the reserved indices for every later
    word -> id lookup made through the same dictionary.

    Arguments:
    - vocab:  dictionary mapping word (str) to index (int)

    Returns
    - dictionary mapping index (int) to word (str)
    """
    rv = {index: word for word, index in vocab.items()}
    # Reserved rows have no entry in `vocab`; give them readable names.
    rv[1] = 'UNKNOWN'
    rv[0] = 'PAD'
    return rv

reverse_vocab = make_reverse_vocab(vocab)

# **IMPORTANT**: GPU is not enabled by default

You must switch runtime environments if the output of the next block of code shows an error saying "ValueError: Expected a cuda device, but got: cpu".

Go to Runtime > Change runtime type > Hardware accelerator > GPU

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Running on {}".format(device))

# Load QAPair Data

In [None]:
class QAPair:
	"""
	One question / context / answer example.

	On construction the context and the question are tokenized with the
	module-level `nlp` pipeline, per-token tag features are collected for the
	context, and (for answerable questions) the character-level answer span is
	mapped onto token indices of the tokenized context.
	"""
	def __init__(self, idd, number_annotators_for_answer, is_impossible, question, context, answer_text, answer_character_start):
		"""
		Arguments:
		- idd:                           identifier of the question
		- number_annotators_for_answer:  number of annotators for the majority answer
		- is_impossible:                 True when the question cannot be answered from the context
		- question:                      the question text
		- context:                       the paragraph to search for the answer
		- answer_text:                   the answer text within the context (None when impossible)
		- answer_character_start:        character offset of the answer within the context (None when impossible)
		"""
		self.idd=idd
		# SQUAD has multiple answers per question; this is the number of annotators for the majority answer
		# let's only use answers that have at least two annotators
		self.number_annotators_for_answer=number_annotators_for_answer

		# SQUAD 2.0 has questions that cannot be answered from the context.  Is this question impossible to answer?
		self.is_impossible=is_impossible

		# The question
		self.question=question

		# The answer text within the context
		self.answer_text=answer_text

		# The character offset of the answer within the context
		self.answer_start=answer_character_start

		# The paragraph to try to find the answer in
		self.context=context

		spacy_context_tokens=nlp(self.context)
		# This is the tokenized context
		self.context_tokens=[tok.text for tok in spacy_context_tokens]

		# This is the tokenized question
		self.question_tokens=[tok.text for tok in nlp(self.question)]

		# Per-token features of the context: the `tag_` attribute of each token, and
		# "<ent_iob_>-<ent_type_>" strings (e.g. "O-" when ent_type_ is empty)
		self.context_pos=[tok.tag_ for tok in spacy_context_tokens]
		self.context_ner_iob=["%s-%s" % (tok.ent_iob_, tok.ent_type_) for tok in spacy_context_tokens]

		# The index of the token in self.context_tokens that corresponds to the answer start.  The answer is the sequence
		# self.context_tokens[self.answer_start_token:self.answer_end_token]
		self.answer_start_token=None

		# The index of the token in self.context_tokens that corresponds to the answer end.  The answer is the sequence
		# self.context_tokens[self.answer_start_token:self.answer_end_token]
		self.answer_end_token=None

		if not self.is_impossible:

			# Character offset one past the last answer character
			self.answer_end=self.answer_start+len(self.answer_text)

			# get index of answer start and end *token* in tokenized context
			# currentCharacter tracks the running character offset of the current token
			currentCharacter=0

			# NOTE(review): token_start and token_end are assigned but never read below
			token_start=None
			token_end=None

			for idx, word in enumerate(spacy_context_tokens):

				# ws = number of characters between the end of this token and the start of the next one
				ws=0
				end=word.idx+len(word)

				if idx < len(spacy_context_tokens)-1:
					nextStart=spacy_context_tokens[idx+1].idx
					ws=nextStart-end

				# The answer starts exactly where this token starts
				if self.answer_start == currentCharacter:
					self.answer_start_token=idx

				# The answer ends exactly where this token ends; idx + 1 makes the end index exclusive
				if self.answer_end == currentCharacter + len(word):
					self.answer_end_token=idx + 1

				currentCharacter+=len(word) + ws

		else:
			self.answer_end=None

		# Sanity check: when both boundaries were found on token edges, the token slice
		# must reproduce the answer text once spaces are removed from both sides.
		# If either boundary falls inside a token, the corresponding index stays None.
		if self.answer_start_token is not None and self.answer_end_token is not None:
			assert re.sub(" ", "", ' '.join(self.context_tokens[self.answer_start_token:self.answer_end_token])) == re.sub(" ", "", self.answer_text)

	def __str__(self):
		"""Return the main fields of the example as one tab-separated line."""

		return '\t'.join([str(x) for x in [self.idd, self.number_annotators_for_answer, self.is_impossible, self.question, self.context, self.answer_start, self.answer_end, self.answer_start_token, self.answer_end_token]])

def read(filename, limit=100000000):
    """
    Parse a SQuAD-2.0-style JSON file into a list of QAPair objects.

    For every answerable question, the (text, answer_start) pair given by the
    largest number of annotators is kept, together with that number of
    annotators.  Answerable questions with an empty answer list are skipped.
    Impossible questions produce a QAPair with no answer and
    `number_annotators_for_answer=None`.

    Fixes relative to the previous version:
    - the majority answer is now actually used (the old loop ignored the
      key returned by `most_common()` and reused the values of the *last*
      annotator's answer);
    - the impossible branch no longer reads the undefined / stale variable `v`;
    - the file is opened explicitly as UTF-8.

    Arguments:
    - filename:  path to the JSON file
    - limit:     maximum number of top-level articles (entries of "data") to read

    Returns
    - list of QAPair
    """
    with open(filename, encoding="utf-8") as file:
        data = json.load(file)

    question_answer_pairs = []

    for datum in tqdm(data["data"][:limit]):
        for paragraph in datum["paragraphs"]:
            context = paragraph["context"]
            for qa in paragraph["qas"]:
                question = qa["question"]
                idd = qa["id"]
                is_impossible = qa["is_impossible"]

                if is_impossible:
                    question_answer_pairs.append(
                        QAPair(idd, None, is_impossible, question, context, None, None))
                    continue

                answer_counts = Counter(
                    (answer["text"], answer["answer_start"]) for answer in qa["answers"])
                if not answer_counts:
                    continue

                # most_common(1) yields the single most frequent (text, start) pair
                (text, answer_start), votes = answer_counts.most_common(1)[0]
                assert text == context[answer_start:answer_start + len(text)], \
                    "answer text does not match the context at answer_start"
                question_answer_pairs.append(
                    QAPair(idd, votes, is_impossible, question, context, text, answer_start))

    return question_answer_pairs

# NOTE(security): pickle.load can execute arbitrary code contained in the file.
# These files are downloaded above; only load them if you trust their source.
# Unpickling also requires the QAPair class to be defined already.
with open('devQA.pkl', 'rb') as f:
    devQA = pickle.load(f)

with open('trainQA.pkl', 'rb') as f:
    trainQA = pickle.load(f)

# Examples at or above these token counts are dropped to keep training fast.
max_paragraph_length = 150
max_question_length = 30


def _is_usable_example(qa):
  """True for answerable examples whose answer aligned to tokens and that fit the length limits."""
  return (
      not qa.is_impossible
      and qa.answer_start_token is not None
      and qa.answer_end_token is not None
      and len(qa.context_tokens) < max_paragraph_length
      and len(qa.question_tokens) < max_question_length
  )


train_qa_pairs = [qa_example for qa_example in trainQA if _is_usable_example(qa_example)]
dev_qa_pairs = [qa_example for qa_example in devQA if _is_usable_example(qa_example)]

# Sort by context length so that each batch pads to a similar length.
train_qa_pairs = sorted(train_qa_pairs, key=lambda x: len(x.context_tokens))
dev_qa_pairs = sorted(dev_qa_pairs, key=lambda x: len(x.context_tokens))

# QAPair stores an exclusive end index; the model and the evaluation code
# work with the index of the last answer token, so shift it by one.
for tqap in train_qa_pairs:
  tqap.answer_end_token = tqap.answer_end_token - 1

for dqap in dev_qa_pairs:
  dqap.answer_end_token = dqap.answer_end_token - 1

Now that all the data is loaded, let's take a look at it. We'll be working with question & answer pairs from the SQuAD dataset using a class called QAPair. In SQuAD, questions are paired with a paragraph of text called the "context", and the answer comes in the form of a span of text that is highlighted in that paragraph. We will be training a model to predict the start and end points of that span. Let's take a look at what's in a QAPair datapoint. 


In [None]:
qa_pair = dev_qa_pairs[11]
print("Context paragraph:")
print(f"{qa_pair.context}\n")
print("Question:")
print(f"{qa_pair.question}\n")
print("Answer:")
print(f"{qa_pair.answer_text}\n")

Next, let's look more closely at the way this text data is structured and at the features that we will be using in our QA model.  The question and the context have been tokenized, and POS and NER features have been automatically tagged for the context (using the spaCy library). The answer label is provided via `answer_start_token` and `answer_end_token`.

In [None]:
print("Tokenized Context")
print(f"{qa_pair.context_tokens}\n")
print("Context POS tags")
print(f"{qa_pair.context_pos}\n")
print("Context NER tags")
print(f"{qa_pair.context_ner_iob}\n")
print("Answer Text:")
print(f"{qa_pair.answer_text}\n")
print("Answer Start Token:")
print(f"{qa_pair.answer_start_token}\n")
print("Answer End Token:")
print(f"{qa_pair.answer_end_token}\n")

# Deliverable 1: Adding Features for Part of Speech and Named Entities

`pos_tag_list` and `ner_tag_list` contain the sets of all possible POS and NER features that the words in a QAPair are tagged with. When we pre-process our data for training using `get_batches`, these strings will be converted to integer-valued IDs using the `pos_vocab` and `ner_vocab` dictionaries defined below.

In [None]:
# Every POS tag that can appear in QAPair.context_pos.
pos_tag_list = [
    '$', "''", ',', '-LRB-', '-RRB-', '.', ':', 'ADD', 'AFX', 'CC',
    'CD', 'DT', 'EX', 'FW', 'HYPH', 'IN', 'JJ', 'JJR', 'JJS', 'LS',
    'MD', 'NFP', 'NN', 'NNP', 'NNPS', 'NNS', 'PDT', 'POS', 'PRP', 'PRP$',
    'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG',
    'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB', 'XX', '_SP', '``',
]

# Every "<IOB>-<entity type>" tag that can appear in QAPair.context_ner_iob.
ner_tag_list = [
    'B-CARDINAL', 'B-DATE', 'B-EVENT', 'B-FAC', 'B-GPE', 'B-LANGUAGE',
    'B-LAW', 'B-LOC', 'B-MONEY', 'B-NORP', 'B-ORDINAL', 'B-ORG',
    'B-PERCENT', 'B-PERSON', 'B-PRODUCT', 'B-QUANTITY', 'B-TIME',
    'B-WORK_OF_ART',
    'I-CARDINAL', 'I-DATE', 'I-EVENT', 'I-FAC', 'I-GPE', 'I-LAW',
    'I-LOC', 'I-MONEY', 'I-NORP', 'I-ORG', 'I-PERCENT', 'I-PERSON',
    'I-PRODUCT', 'I-QUANTITY', 'I-TIME', 'I-WORK_OF_ART',
    'O-',
]

# Tag -> integer id.  Ids start at 1 because 0 means PAD.
pos_vocab = {tag: tag_id for tag_id, tag in enumerate(pos_tag_list, start=1)}
ner_vocab = {tag: tag_id for tag_id, tag in enumerate(ner_tag_list, start=1)}

In [None]:
class NeuralQA(nn.Module):
  """
  Span-prediction question-answering model (homework template).

  A question LSTM output is summarised by an attention layer, the context is
  encoded by a second LSTM, and two bilinear-style scorers produce a
  distribution over context positions for the answer start and the answer end.
  The POS / NER embedding pieces are left as `...` placeholders to be filled in.
  """

  def __init__(self, pretrained_embeddings, pos_tag_list, ner_tag_list):
    """
    Arguments:
    - pretrained_embeddings:  2-D tensor (vocab_size x embedding_dim) used as frozen word embeddings
    - pos_tag_list:           list of POS tag strings (stored on the module)
    - ner_tag_list:           list of NER tag strings (stored on the module)
    """
    super(NeuralQA, self).__init__()
    self.word_embeddings = nn.Embedding.from_pretrained(pretrained_embeddings, freeze=True)
    self.vocab_size, self.embedding_dim=pretrained_embeddings.shape
    self.lstm_dim=128

    self.pos_tag_list=pos_tag_list
    self.ner_tag_list=ner_tag_list
    self.pos_dim_embedding_dim=16
    self.ner_dim_embedding_dim=16
    
    self.question_lstm = nn.LSTM(self.embedding_dim, self.lstm_dim, bidirectional=True, batch_first=True, dropout=0.3, num_layers=2)
    # The context LSTM input width is: word embedding + attended question embedding + POS embedding + NER embedding
    self.context_lstm = nn.LSTM(2*self.embedding_dim+self.pos_dim_embedding_dim+self.ner_dim_embedding_dim, self.lstm_dim, bidirectional=True, batch_first=True, dropout=0.3, num_layers=3)

    # p attends over the output of the question LSTM
    self.p_attention = nn.MultiheadAttention(2*self.lstm_dim, 1)

    # p_to_q attends from the context *embeddings* to the question *embeddings*
    self.p_to_q_attention = nn.MultiheadAttention(self.embedding_dim, 1)
    
    self.W_start=nn.Linear(2*self.lstm_dim,2*self.lstm_dim)
    self.W_end=nn.Linear(2*self.lstm_dim,2*self.lstm_dim)
    self.drop_layer_030 = nn.Dropout(p=0.3)

    # YOUR CODE GOES HERE
    self.pos_embeddings = ...
    self.ner_embeddings = ...
    # END OF YOUR CODE

  def forward(self, question_ids, context_ids, pos_ids, ner_ids):
    """
    Score every context position as answer start / answer end.

    Arguments:
    - question_ids:  2-D integer tensor of word ids (first axis is the batch)
    - context_ids:   2-D integer tensor of word ids (first axis is the batch)
    - pos_ids:       POS tag ids for the context -- presumably same shape as context_ids; confirm against get_batches
    - ner_ids:       NER tag ids for the context -- presumably same shape as context_ids; confirm against get_batches

    Returns
    - p_start, p_end:  tensors normalised to sum to 1 along dim=1
    """

    batch_size, _=question_ids.shape
    # Constant (all-ones) query used to pool the question LSTM output
    question_query=torch.FloatTensor(np.ones((batch_size,1,2*self.lstm_dim))).to(device)

    question_embeds=self.word_embeddings(question_ids)
    context_embeds=self.word_embeddings(context_ids)

    question_embeds=self.drop_layer_030(question_embeds)
    context_embeds=self.drop_layer_030(context_embeds)

    # Swap the first two axes before the attention layers
    question_embeds=question_embeds.transpose(0,1)
    context_embeds=context_embeds.transpose(0,1)
    
    # NOTE(review): question_lstm was built with batch_first=True, yet it receives the
    # transposed tensor here -- confirm which axis is meant to be the batch.
    question_lstm_out, _=self.question_lstm(question_embeds)

    # NOTE(review): question_query has the batch on its first axis while question_lstm_out
    # comes from the transposed input; p_attention was built without batch_first -- confirm
    # the layouts agree for batch sizes > 1.
    p_attn_output, _ = self.p_attention(question_query, question_lstm_out, question_lstm_out)
    p_q_attn_output, _ = self.p_to_q_attention(context_embeds, question_embeds, question_embeds)

    p_q_attn_output=p_q_attn_output.transpose(0,1) # max_seq_length x batch_size x self.embedding_dim -> batch_size x max_seq_length x self.embedding_dim

    # Undo the earlier transpose so the batch is the first axis again
    context_embeds=context_embeds.transpose(0,1)
    question_embeds=question_embeds.transpose(0,1)

    # Word embedding and question-aware embedding side by side along the feature axis
    context_representation=torch.cat((context_embeds, p_q_attn_output), dim=2)

    # YOUR CODE GOES HERE 
    pos_embeds=...
    ner_embeds=...
    context_representation=...
    # END OF YOUR CODE

    context_lstm_out, _=self.context_lstm(context_representation)
    # batch_size x max_seq_length x 2*lstm_dim -> batch_size x 2*lstm_dim x max_seq_length
    # (an earlier comment said 200; W_start / W_end are sized for 2*self.lstm_dim)
    context_lstm_out=context_lstm_out.transpose(1,2)

    p_start=self.W_start(p_attn_output) 
    p_start = torch.matmul(p_start, context_lstm_out)

    p_end=self.W_end(p_attn_output) 
    p_end = torch.matmul(p_end, context_lstm_out)

    # Drop the singleton axis at position 1
    p_start=p_start.squeeze(1)
    p_end=p_end.squeeze(1)

    # Manual softmax over dim=1 (exp, then divide by the sum).
    # NOTE(review): no max-subtraction is done before exp, and padded positions
    # are included in the normalisation -- confirm this is intended.
    p_start=torch.exp(p_start)
    p_start=p_start/(torch.sum(p_start, dim=1).unsqueeze(-1))
    
    p_end=torch.exp(p_end)
    p_end=p_end/(torch.sum(p_end, dim=1).unsqueeze(-1))

    return p_start, p_end

In [None]:
def get_ids(tokens, vocab):
  """
  Convert tokens to integer ids.

  Arguments:
  - tokens:  iterable of token strings
  - vocab:   dictionary mapping token (str) to id (int)

  Returns
  - list of ids, one per token; tokens missing from `vocab` become 1 (UNKNOWN)
  """
  unknown_id = 1  # UNKNOWN
  return [vocab[token] if token in vocab else unknown_id for token in tokens]

def get_tag_ids(tokens, tag_vocab):
  """
  Convert tag strings to integer ids.

  Arguments:
  - tokens:     iterable of tag strings
  - tag_vocab:  dictionary mapping tag (str) to id (int)

  Returns
  - list of ids, one per tag

  Raises
  - KeyError when a tag is not present in `tag_vocab`
  """
  return [tag_vocab[tag] for tag in tokens]

def get_batches(qa_pairs, batch_size=32):
    """
    Turn QAPair examples into padded, id-encoded mini-batches.

    Examples are taken in the order given and cut into consecutive groups of
    `batch_size`.  Within a batch, question ids are padded with PAD_INDEX to
    the longest question, and context / POS / NER ids are padded with
    PAD_INDEX to the longest context.

    Arguments:
    - qa_pairs:    list of QAPair
    - batch_size:  number of examples per batch

    Returns
    - batches_x:  list of dicts with keys 'question_ids', 'context_ids', 'pos_ids', 'ner_ids'
    - batches_y:  list of dicts with keys 'answer_start_indexes', 'answer_end_indexes'
    """
    def _pad_rows(rows, width):
        # Extend each row in place up to `width` entries
        for row in rows:
            row.extend([PAD_INDEX] * (width - len(row)))

    all_question_ids = [get_ids(pair.question_tokens, vocab) for pair in qa_pairs]
    all_context_ids = [get_ids(pair.context_tokens, vocab) for pair in qa_pairs]
    all_pos_ids = [get_tag_ids(pair.context_pos, pos_vocab) for pair in qa_pairs]
    all_ner_ids = [get_tag_ids(pair.context_ner_iob, ner_vocab) for pair in qa_pairs]
    all_answer_starts = [pair.answer_start_token for pair in qa_pairs]
    all_answer_ends = [pair.answer_end_token for pair in qa_pairs]

    batches_x = []
    batches_y = []

    for offset in range(0, len(qa_pairs), batch_size):
        window = slice(offset, offset + batch_size)

        questions = all_question_ids[window]
        _pad_rows(questions, max(len(row) for row in questions))

        contexts = all_context_ids[window]
        context_width = max(len(row) for row in contexts)
        _pad_rows(contexts, context_width)

        # Tag features are aligned with the context tokens, so they share its width
        pos_tags = all_pos_ids[window]
        _pad_rows(pos_tags, context_width)

        ner_tags = all_ner_ids[window]
        _pad_rows(ner_tags, context_width)

        batches_x.append({
            'question_ids': questions,
            'context_ids': contexts,
            'pos_ids': pos_tags,
            'ner_ids': ner_tags,
        })
        batches_y.append({
            'answer_start_indexes': all_answer_starts[window],
            'answer_end_indexes': all_answer_ends[window],
        })

    return batches_x, batches_y

In [None]:
@torch.no_grad()
def evaluate(model, dev_batches_x, dev_batches_y):
  """
  Compute the mean loss and the Exact Match accuracy of `model` on a dev set.

  The model is switched to eval mode and the whole evaluation runs under
  torch.no_grad(), so no autograd graph is built for the forward passes.

  For each example the predicted start is the argmax of the (masked) start
  scores; the predicted end is the position within the next 15 tokens
  (offsets 0..14, padding excluded) that maximises P_start * P_end.  An
  example counts as correct only when both predicted indices equal the labels.

  Arguments:
  - model:          the model to evaluate; called as model(question_ids, context_ids, pos_ids, ner_ids)
  - dev_batches_x:  list of input batches as produced by get_batches
  - dev_batches_y:  list of label batches as produced by get_batches

  Returns
  - eval_loss:    mean of the per-batch losses (nan when there are no batches)
  - exact_match:  fraction of examples predicted exactly (0.0 when there are no examples)
  """
  batch_losses = []
  model.eval()
  correct = total = 0.

  for batch_x, batch_y in zip(dev_batches_x, dev_batches_y):
    question_ids = torch.LongTensor(batch_x['question_ids']).to(device)
    context_ids = torch.LongTensor(batch_x['context_ids']).to(device)
    pos_ids = torch.LongTensor(batch_x['pos_ids']).to(device)
    ner_ids = torch.LongTensor(batch_x['ner_ids']).to(device)

    # 1 for real tokens, 0 for padding
    context_mask = (context_ids > 0).int()

    answer_start_indexes = batch_y['answer_start_indexes']
    answer_end_indexes = batch_y['answer_end_indexes']

    batch_size, context_size = context_ids.shape

    # One-hot label matrices over context positions
    start_labels = torch.zeros((batch_size, context_size), dtype=torch.float, device=device)
    end_labels = torch.zeros((batch_size, context_size), dtype=torch.float, device=device)

    for index, label in enumerate(answer_start_indexes):
      start_labels[index, label] = 1

    for index, label in enumerate(answer_end_indexes):
      end_labels[index, label] = 1

    start_preds, end_preds = model(question_ids, context_ids, pos_ids, ner_ids)

    # Zero out the scores of padded positions
    start_preds = start_preds * context_mask
    end_preds = end_preds * context_mask

    start_max = torch.argmax(start_preds, dim=1)

    # NOTE(review): if cross_entropy_loss reduces to a scalar, multiplying by
    # context_mask only rescales that scalar rather than masking per position -- confirm.
    start_loss = (cross_entropy_loss(start_preds, start_labels) * context_mask).mean()
    end_loss = (cross_entropy_loss(end_preds, end_labels) * context_mask).mean()

    loss = start_loss + end_loss

    batch_losses.append(float(loss.detach().cpu()))

    for index, (start, end) in enumerate(zip(answer_start_indexes, answer_end_indexes)):
      bestProb = 0
      bestPair = None, None

      start_index = start_max[index]
      w_idx = context_ids[index][start_index]

      if w_idx > 0:
        # end token can be the same as the start token
        # NOTE(review): offsets stop at 14 here, while get_top_spans allows i+15 -- confirm which is intended.
        for offset in range(0, 15):
          end_index = start_index + offset

          if end_index >= context_size:
            continue

          # Skip padded positions
          w_idx2 = context_ids[index][end_index]
          if w_idx2 <= 0:
            continue

          prob = start_preds[index][start_index] * end_preds[index][end_index]
          if prob > bestProb:
            bestProb = prob
            bestPair = start_index, end_index

      total += 1

      if start == bestPair[0] and end == bestPair[1]:
        correct += 1

  eval_loss = np.mean(batch_losses) if batch_losses else float("nan")
  exact_match = correct / total if total else 0.0
  return eval_loss, exact_match

# Training the Model

After completing Deliverable 1.1 and 1.2, you will be able to train your network to answer questions.  We evaluate the model using an accuracy metric called "Exact Match". This metric tests whether the span predicted by the model exactly matches the labeled answer.  This means that both the `answer_start` and `answer_end` tokens need to be correctly predicted in order to count as a correct answer.

Because Question Answering is a challenging task that requires reasoning not just about the syntax and semantics of text, but about any entities (e.g. The Empire State Building or California) that might appear in the data, we need a fairly large amount of data to train this model, and we use a larger, case-sensitive vocabulary of pre-trained GloVe vectors.  The training code might take 10 minutes or more to run, so keep that in mind when working on your assignment! 

In [None]:
model=NeuralQA(embs, pos_tag_list, ner_tag_list).to(device)
cross_entropy_loss = nn.BCELoss()
optimizer = torch.optim.Adamax(model.parameters())

In [None]:
train_batches_x, train_batches_y = get_batches(train_qa_pairs, batch_size=128)
dev_batches_x, dev_batches_y = get_batches(dev_qa_pairs, batch_size=128)

In [None]:
# Training script: report the untrained dev score once, then train for 10 epochs,
# evaluating on the dev set and printing losses / Exact Match after every epoch.
train_loss = None
model.eval()
dev_loss, exact_match = evaluate(model, dev_batches_x, dev_batches_y)
print(f"Dev Loss: {dev_loss}, Accuracy(Exact Match): {exact_match}")
t0 = time.time()
for epoch in range(10):
  print(f"Epoch {epoch}")
  model.train()
  # Shuffle the order of the (already built) batches; batch contents stay fixed
  train_batches_x, train_batches_y = shuffle(train_batches_x, train_batches_y)
  epoch_losses = []
  for i in tqdm(range(len(train_batches_x))):
    batch_x = train_batches_x[i]
    batch_y = train_batches_y[i]

    question_ids = torch.LongTensor(batch_x['question_ids']).to(device)
    context_ids = torch.LongTensor(batch_x['context_ids']).to(device)
    pos_ids = torch.LongTensor(batch_x['pos_ids']).to(device)
    ner_ids = torch.LongTensor(batch_x['ner_ids']).to(device)

    # 1 for real tokens, 0 for padding (question_mask is not used below)
    question_mask = (question_ids > 0).int()
    context_mask = (context_ids > 0).int()

    answer_start_indexes = batch_y['answer_start_indexes']
    answer_end_indexes = batch_y['answer_end_indexes']

    batch_size, context_size = context_ids.shape

    # One-hot label matrices over context positions
    start_labels = torch.FloatTensor(np.zeros((batch_size, context_size))).to(device)
    end_labels = torch.FloatTensor(np.zeros((batch_size, context_size))).to(device)

    for index, label in enumerate(answer_start_indexes):
      start_labels[index, label] = 1

    for index, label in enumerate(answer_end_indexes):
      end_labels[index, label] = 1

    start_preds, end_preds = model(question_ids, context_ids, pos_ids, ner_ids)
    # NOTE(review): cross_entropy_loss is created as nn.BCELoss() with no arguments; if it
    # reduces to a scalar, multiplying by context_mask only rescales that scalar instead of
    # masking padded positions -- confirm (a per-element loss would need reduction='none').
    start_loss = (cross_entropy_loss(start_preds, start_labels) * context_mask).mean()
    end_loss = (cross_entropy_loss(end_preds, end_labels) * context_mask).mean()

    loss = start_loss + end_loss
    
    optimizer.zero_grad()
    loss.backward()
    # Gradient clipping is currently disabled:
    # torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
    optimizer.step()
    epoch_losses.append(float(loss.detach().cpu()))
  train_loss = np.mean(epoch_losses)
  model.eval()
  dev_loss, exact_match = evaluate(model, dev_batches_x, dev_batches_y)
  # Elapsed wall-clock time since training started, shown as minutes:seconds
  training_time = time.time()-t0
  secs = int(training_time % 60)
  mins = int(training_time / 60)
  print(f"Train Loss: {train_loss}, Dev Loss: {dev_loss}, Accuracy(Exact Match): {exact_match}, training time: {mins}:{secs}")


# Deliverable 2: Answering Questions

In [None]:
def get_top_spans(start_probs, end_probs, context_length, max_span=15):
  """
  Score every candidate answer span and return them best-first.

  "we choose the best span from token i to token i′ such that i≤i′≤i+15 and
  Pstart(i)×Pend(i′) is maximized"

  Arguments:
  - start_probs:     indexable of per-position start probabilities
  - end_probs:       indexable of per-position end probabilities
  - context_length:  number of (non-padding) context positions to consider
  - max_span:        largest allowed value of i′ - i (15 by default)

  Returns
  - dict mapping "i,j" (str) to Pstart(i)*Pend(j), ordered by descending score;
    ties keep their (i, j) ascending order
  """
  span_pairs = {}
  for i in range(context_length):
    start_prob = float(start_probs[i])
    # Only visit the end positions inside the window instead of scanning all j
    last_end = min(i + max_span, context_length - 1)
    for j in range(i, last_end + 1):
      span_pairs[f"{i},{j}"] = start_prob * float(end_probs[j])
  return dict(sorted(span_pairs.items(), key=lambda item: item[1], reverse=True))

def ids_to_readable_string(ids):
  """Look up each id in `reverse_vocab` and join the resulting words with single spaces."""
  words = [reverse_vocab[token_id] for token_id in ids]
  return ' '.join(words)

def get_answer(context_ids, start_index, end_index):
  """Return the words of context_ids[start_index..end_index] (end inclusive) as one string."""
  answer_ids = context_ids[start_index:end_index+1]
  return ids_to_readable_string(answer_ids)

def get_top_answers(context_ids, context_length, start_probs, end_probs, k):
  """
  Return the text of the `k` highest-scoring answer spans, best first.

  When the context yields fewer than `k` candidate spans (very short
  contexts), all available spans are returned instead of raising IndexError.

  Arguments:
  - context_ids:     list of word ids of the context
  - context_length:  number of (non-padding) context positions
  - start_probs:     per-position start probabilities
  - end_probs:       per-position end probabilities
  - k:               maximum number of answers to return

  Returns
  - list of answer strings (at most `k`)
  """
  top_spans = get_top_spans(start_probs, end_probs, context_length)
  answers = []
  # Materialise the ordered keys once; max() keeps a negative k from slicing from the end
  for span_key in list(top_spans)[:max(k, 0)]:
    start_index, end_index = (int(s) for s in span_key.split(','))
    answers.append(get_answer(context_ids, start_index, end_index))
  return answers

def answer_question(your_paragraph, your_question):
  """
  Run the trained global `model` on a free-form paragraph / question pair and
  print the context, the question and the 10 best predicted answer spans.

  The model is put in eval mode (so dropout is inactive) and the forward pass
  runs under torch.no_grad(), since no gradients are needed for inference.

  Arguments:
  - your_paragraph:  context text to search for the answer
  - your_question:   question text

  Returns
  - None (results are printed)
  """
  # is_impossible=True makes QAPair skip the answer alignment, since no gold answer exists
  your_qa_pair = QAPair(idd='123456789',number_annotators_for_answer=None,is_impossible=True,
        question=your_question,context=your_paragraph,answer_text=None,answer_character_start=None)
  batches_x, _ = get_batches([your_qa_pair], batch_size=1)
  batch_x = batches_x[0]

  question_ids = torch.LongTensor(batch_x['question_ids']).to(device)
  context_ids = torch.LongTensor(batch_x['context_ids']).to(device)
  pos_ids = torch.LongTensor(batch_x['pos_ids']).to(device)
  ner_ids = torch.LongTensor(batch_x['ner_ids']).to(device)

  # 1 for real tokens, 0 for padding
  context_mask = (context_ids > 0).int()

  model.eval()
  with torch.no_grad():
    start_preds, end_preds = model(question_ids, context_ids, pos_ids, ner_ids)
  start_preds = start_preds*context_mask
  end_preds = end_preds*context_mask

  batch_index = 0
  readable_q = ids_to_readable_string(batch_x['question_ids'][batch_index])
  readable_par = ids_to_readable_string(batch_x['context_ids'][batch_index])
  print("\nContext:")
  print(f"{readable_par}\n")
  print("Question:")
  print(f"{readable_q}\n")
  print("Top Predicted Answers:")
  top_answers = get_top_answers(batch_x['context_ids'][batch_index], int(context_mask.sum()), start_preds[batch_index], end_preds[batch_index], k=10)
  for answer in top_answers:
    print(answer)

Now that we have a trained model, let's try answering some questions of your own. Choose a paragraph from any page on Wikipedia (or write your own if you are feeling adventurous), copy that paragraph into the Colab (make sure it's fewer than 150 words, since we restricted our training data to short paragraphs to make the training time more manageable), and write a question that can be answered by highlighting a span from that paragraph. Try to find one paragraph/question pair that the model is able to answer, and another paragraph/question pair that the model is not able to answer.

In [None]:
# YOUR CODE GOES HERE
your_paragraph_1 = ...
your_question_1 = ...
# END OF YOUR CODE

answer_question(your_paragraph_1, your_question_1)

In [None]:
# YOUR CODE GOES HERE
your_paragraph_2 = ...
your_question_2 = ...
# END OF YOUR CODE

answer_question(your_paragraph_2, your_question_2)