# Multiple choice question answering script for MC500 data set
A Python script for the MC500, RACE and DREAM data sets, using the pre-trained model `bert-large-uncased-whole-word-masking-finetuned-squad`.

## Libraries needed
`transformers`, `pytorch`, `pandas`, `sklearn`, `nltk`, `gensim`, `numpy`


In [1]:
from transformers import BertTokenizer, BertForQuestionAnswering
import torch
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report
from nltk.tokenize import word_tokenize, sent_tokenize
import numpy as np
import random
import gensim
import os
import pathlib
import json

## Preconfigurations for BERT model

In [2]:
# The tokenizer and the question-answering model are loaded from the same
# pre-trained checkpoint, so the checkpoint name is spelled out only once.
_CHECKPOINT = 'bert-large-uncased-whole-word-masking-finetuned-squad'

tokenizer = BertTokenizer.from_pretrained(_CHECKPOINT)
model = BertForQuestionAnswering.from_pretrained(_CHECKPOINT)

# Move the model to the GPU when CUDA is available; otherwise it is left as loaded.
if torch.cuda.is_available():
    model.to('cuda')

## Read data sets

In [3]:
def readMcTest(data_path, question_paths, answer_paths):
    """Load MCTest question and answer files.

    Args:
        data_path: Directory that holds the files. A trailing path separator
            is optional.
        question_paths: File names of tab-separated question files. Each row
            is read as 23 unnamed columns: column 2 is the passage, columns
            3/8/13/18 are the four questions and the four columns after each
            question are its options.
        answer_paths: File names of tab-separated answer files with four
            columns per row (one answer per question).

    Returns:
        A tuple ``(passages, questions, options, answer_data)`` where
        ``passages`` is a list of passage strings, ``questions`` is a
        DataFrame with columns 0..3, ``options`` is a DataFrame with columns
        '1A'..'4D', and ``answer_data`` is a DataFrame with columns '1'..'4'.
    """
    # os.path.join (instead of plain string concatenation) keeps the path
    # valid whether or not data_path ends with a separator.
    question_frames = [
        pd.read_csv(os.path.join(data_path, _question_path), sep='\t', names=range(0, 23))
        for _question_path in question_paths
    ]
    question_data = pd.concat(question_frames, ignore_index=True, sort=False)
    print('Total question count ', len(question_data))

    answer_frames = [
        pd.read_csv(os.path.join(data_path, _answer_path), sep='\t', names=['1', '2', '3', '4'])
        for _answer_path in answer_paths
    ]
    answer_data = pd.concat(answer_frames, ignore_index=True, sort=False)
    print('Total answer count ', len(answer_data))

    # The passages carry a literal backslash + "newline" marker; swap it for a space.
    passages = [_passage.replace('\\newline', ' ') for _passage in question_data[2]]

    # Take explicit copies so relabelling the columns never touches question_data.
    questions = question_data[[3, 8, 13, 18]].copy()
    questions.columns = range(0, 4)

    options = question_data[[4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 16, 17, 19, 20, 21, 22]].copy()
    options.columns = ['1A', '1B', '1C', '1D', '2A', '2B', '2C', '2D',
                       '3A', '3B', '3C', '3D', '4A', '4B', '4C', '4D']

    return (passages, questions, options, answer_data)

In [4]:
def readDREAM(docPath):
    """Load a DREAM JSON file into a DataFrame with one row per dialogue.

    The file is expected to hold a list of stories where ``story[0]`` is a
    list of utterance strings and ``story[1]`` is a list of question dicts
    with the keys ``"question"``, ``"choice"`` and ``"answer"``.

    NOTE: only the first question of every dialogue (``story[1][0]``) is
    used; any further questions in ``story[1]`` are ignored.

    Args:
        docPath: Path of the JSON file.

    Returns:
        A DataFrame with the columns ``article``, ``question``,
        ``choice 0`` .. ``choice N``, ``answer`` (answer text),
        ``answer choice`` (letter, 'A' for the first choice) and
        ``answer index`` (0-based). The last two are only set when the answer
        text equals one of the choices.
    """
    with open(docPath, encoding="utf-8") as f:
        data = json.load(f)

    dataset = []
    for story in data:
        temp = {}

        # Pre-processing of the article: every sentence of a turn that starts
        # with the speaker tag "M:" is prefixed with "Men: ", every sentence
        # of a "W:" turn with "Woman: ". The tag is only recognised at the
        # start of the turn, so text such as "5 PM: ..." inside an utterance
        # is left alone. The label strings are kept exactly as before.
        article_parts = []
        for sentence in story[0]:
            if sentence.startswith("M:"):
                label, spoken = "Men: ", sentence[len("M:"):].lstrip()
            elif sentence.startswith("W:"):
                label, spoken = "Woman: ", sentence[len("W:"):].lstrip()
            else:
                # No known speaker tag: keep the turn untouched.
                article_parts.append(sentence)
                continue
            article_parts.extend(label + sent + " " for sent in sent_tokenize(spoken))
        temp["article"] = "".join(article_parts)

        first_question = story[1][0]
        choices = first_question["choice"]

        temp["question"] = first_question["question"]
        for i, choice in enumerate(choices):
            temp[f"choice {i}"] = choice

        # answer choice = A/B/C/D, answer index = 0/1/2/3, answer = answer in string format
        temp["answer"] = first_question["answer"]
        for i, choice in enumerate(choices):
            if choice == temp["answer"]:
                temp["answer choice"] = chr(i + 65)      # from 0 to "A"
                temp["answer index"] = i
                break

        dataset.append(temp)

    return pd.DataFrame(dataset)

In [5]:
def readRACE(relativeDocPath):
    """Load every RACE story file in a directory into a DataFrame.

    Each file is expected to be a JSON object with the keys ``"article"``,
    ``"questions"``, ``"options"`` (one list of choices per question) and
    ``"answers"`` (one letter per question).

    Args:
        relativeDocPath: Directory containing the story files.

    Returns:
        A DataFrame with one row per question and the columns ``article``,
        ``question``, ``choice 0`` .. ``choice N``, ``answer index``
        (0-based), ``answer`` (text of the correct choice) and
        ``answer choice`` (letter).
    """
    dataset = []

    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order (and therefore every per-row printout) reproducible between runs.
    for filename in sorted(os.listdir(relativeDocPath)):
        with open(os.path.join(relativeDocPath, filename), 'r', encoding='utf-8') as f:
            story = json.load(f)

        # json.load has already turned "\n" escapes into real newlines, so
        # both the literal two-character sequence and real newlines are
        # replaced by a space.
        article = story["article"].replace('\\n', ' ').replace('\n', ' ')

        for i, question in enumerate(story["questions"]):
            temp = {"article": article, "question": question}
            for j, choice in enumerate(story["options"][i]):
                temp[f"choice {j}"] = choice

            # answer choice = A/B/C/D, answer index = 0/1/2/3, answer = answer in string format
            answer_letter = story["answers"][i]
            temp["answer index"] = ord(answer_letter) - 65      # from "A" to 0
            temp["answer"] = temp[f"choice {temp['answer index']}"]
            temp["answer choice"] = answer_letter

            dataset.append(temp)

    return pd.DataFrame(dataset)


## Utils

### Develop similarity model

In [6]:
def similarity_model(ans_str):
    """Build a TF-IDF similarity index over a collection of answer strings.

    Args:
        ans_str: Iterable of strings; each one becomes a document in the index.

    Returns:
        A tuple ``(dictionary, sims, tf_idf)``: the gensim ``Dictionary`` built
        from the lower-cased tokens, the ``MatrixSimilarity`` index over the
        TF-IDF weighted documents, and the fitted ``TfidfModel``.
    """
    # Lower-case every token so matching is case-insensitive.
    tokenized_docs = [
        [token.lower() for token in word_tokenize(candidate)]
        for candidate in ans_str
    ]

    dictionary = gensim.corpora.Dictionary(tokenized_docs)
    bow_corpus = list(map(dictionary.doc2bow, tokenized_docs))
    tf_idf = gensim.models.TfidfModel(bow_corpus)

    sims = gensim.similarities.docsim.MatrixSimilarity(
        tf_idf[bow_corpus],
        num_features=len(dictionary),
    )
    return (dictionary, sims, tf_idf)


### Cosine similarity

In [7]:
def cos_sim(dictionary, sim_model, tf_idf, ans_str):
    """Query a similarity index with a single answer string.

    Args:
        dictionary: Object providing ``doc2bow`` for the query tokens.
        sim_model: Similarity index that is queried with the weighted query.
        tf_idf: Model used to weight the bag-of-words query.
        ans_str: Query text; it is tokenised and lower-cased first.

    Returns:
        Whatever ``sim_model[...]`` yields for the query. With a gensim
        ``MatrixSimilarity`` index this is presumably one similarity score
        per indexed document (verify against the caller).
    """
    lowered_tokens = [token.lower() for token in word_tokenize(ans_str)]

    # Bag-of-words -> TF-IDF weights -> similarity query against the corpus.
    weighted_query = tf_idf[dictionary.doc2bow(lowered_tokens)]
    return sim_model[weighted_query]


### Pick predicted answer choice based on probabilities

In [8]:
def pick_answer_from_probabilities(probabilities, choice_count = 4):
    """Pick the option letter with the highest score.

    Args:
        probabilities: Sequence of scores, one per option in order A, B, C, ...
        choice_count: Number of options the question has (3 gives A-C,
            4 gives A-D; other positive counts continue the alphabet).

    Returns:
        The letter of the first option holding the maximum score. When the
        maximum score is 0 (no option matched at all), a letter is drawn
        uniformly at random from *all* ``choice_count`` options.
    """
    labels = [chr(ord('A') + offset) for offset in range(choice_count)]

    # Options without a supplied score default to 0.0 so max() never has to
    # compare a placeholder string with a number.
    option_probabilities = {label: 0.0 for label in labels}
    for opt, proba in zip(labels, probabilities):
        option_probabilities[opt] = proba

    print('Probabilities for this question:', option_probabilities)

    max_probability = max(option_probabilities.values())
    if max_probability == 0:
        # Every option is eligible for the random fallback, including the last one.
        random_picked_answer = random.choice(labels)
        print('Random picked option: {}'.format(random_picked_answer))
        return random_picked_answer

    for key, val in option_probabilities.items():
        if val == max_probability:
            print('Predicted option: {}'.format(key))
            return key

## Predict answer using the model

In [9]:
def answer_question(question, text):
    """Extract an answer span for ``question`` from ``text`` with the QA model.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        question: Question string (first segment of the model input).
        text: Passage string (second segment of the model input).

    Returns:
        The predicted answer as a string of tokens joined by spaces, with
        '##' word pieces merged into the preceding token. If the predicted
        end position lies before the start position, only the start token is
        returned.
    """
    input_ids = tokenizer.encode(question, text, padding=True, truncation=True, max_length=510, add_special_tokens = True)

    # Everything up to and including the first [SEP] is segment A (the
    # question); the remainder is segment B (the passage).
    sep_index = input_ids.index(tokenizer.sep_token_id)
    num_seg_a = sep_index + 1
    num_seg_b = len(input_ids) - num_seg_a
    segment_ids = [0]*num_seg_a + [1]*num_seg_b

    assert len(segment_ids) == len(input_ids)

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    input_tensors = torch.tensor([input_ids]).to(device)
    segment_id_tensors = torch.tensor([segment_ids]).to(device)

    # Pure inference: disable gradient tracking so no autograd graph is
    # built and kept alive for every question.
    with torch.no_grad():
        outputs = model(input_tensors,
                        token_type_ids=segment_id_tensors,
                        return_dict=True)

    # Find the tokens with the highest `start` and `end` scores; convert to
    # plain ints so they can be used directly for list slicing.
    answer_start = int(torch.argmax(outputs.start_logits))
    answer_end = int(torch.argmax(outputs.end_logits))

    tokens = tokenizer.convert_ids_to_tokens(input_ids)

    # Answer starts with the first token.
    answer = tokens[answer_start]

    for token in tokens[answer_start + 1:answer_end + 1]:
        # If it's a subword token, then recombine it with the previous token.
        if token[0:2] == '##':
            answer += token[2:]
        else:
            answer += ' ' + token

    return answer


## Running MC500 test data set

In [10]:
  # Evaluate the extractive QA model on the MC500 test split: for every
  # question, extract an answer span, score it against the four options with
  # TF-IDF similarity and record the best-matching option letter.
  passages, questions, options, answer_data = readMcTest('./data/MCTest/', ['mc500.test.tsv'], ['mc500.test.ans'])

  # Flatten the answer key once, row by row, so it lines up with the order in
  # which predictions are appended below.
  all_correct_options = answer_data.values.flatten().tolist()

  all_predicted_options = []
  for i in range(len(passages)):
    for q in range(len(questions.loc[i])):
      print('Answering passage no ', str(i), ', question ',str(q+1))

      # Strip the question-type prefix before handing the question to the model.
      predicted_answer_string = answer_question(questions.loc[i][q].replace('multiple: ', '').replace('one: ', ''), passages[i])
      print('Predicted answer string: "{}"'.format(predicted_answer_string))

      # The answer columns are labelled '1'..'4'; look the answer up by label
      # rather than relying on positional fallback for an integer key.
      correct_ans = answer_data.loc[i, str(q + 1)]
      print('Correct option is '+ correct_ans)

      option_strings = [options.loc[i][str(q+1)+testing_opt] for testing_opt in ['A', 'B', 'C', 'D']]
      options_map = {'A': option_strings[0], 'B': option_strings[1], 'C': option_strings[2], 'D': option_strings[3] }

      dictionary, sims, tf_idf = similarity_model(option_strings)

      probabilities = cos_sim(dictionary, sims, tf_idf, predicted_answer_string)

      picked_answer = pick_answer_from_probabilities(probabilities)
      print('Answer predicted: "{}"\n'.format(options_map[picked_answer]))

      all_predicted_options.append(picked_answer)

  assert len(all_predicted_options) == len(all_correct_options)


  print('Accuracy', accuracy_score(all_correct_options, all_predicted_options))
  print(classification_report(all_correct_options, all_predicted_options))

ion: {'A': 1.0, 'B': 0.0, 'C': 0.0, 'D': 0.0}
Predicted option: A
Answer predicted: "Green"

Answering passage no  132 , question  1
Predicted answer string: "to see if he can help"
Correct option is C
Probabilities for this question: {'A': 0.0, 'B': 0.0, 'C': 0.0, 'D': 0.0}
Random picked option: A
Answer predicted: "Oil"

Answering passage no  132 , question  2
Predicted answer string: "a machine that pulls oil out of the earth"
Correct option is D
Probabilities for this question: {'A': 0.07856742, 'B': 0.0, 'C': 0.48112524, 'D': 0.80013233}
Predicted option: D
Answer predicted: "A pump-jack is a machine that pulls Oil out of the Earth."

Answering passage no  132 , question  3
Predicted answer string: "very tough time keeping them working"
Correct option is A
Probabilities for this question: {'A': 0.78446454, 'B': 0.0, 'C': 0.0, 'D': 0.0}
Predicted option: A
Answer predicted: "Keeping his pump-jacks working"

Answering passage no  132 , question  4
Predicted answer string: "oil"
Corr

## Running RACE middle test data set

In [11]:
  # Evaluate on the RACE middle-school test split: extract an answer span per
  # question, match it against the four choices and collect the chosen letters.
  race_data = readRACE('./data/RACE/test/middle')
  choice_columns = ['choice 0', 'choice 1', 'choice 2', 'choice 3']

  all_predicted_options = []
  for i in range(len(race_data)):
    row = race_data.loc[i]

    predicted_answer_string = answer_question(row['question'], row['article'])
    print('Predicted answer string: "{}"'.format(predicted_answer_string))

    correct_ans = row['answer choice']
    print('Correct option is '+ correct_ans)

    # Choice texts in column order, then keyed by their option letter.
    option_strings = [row[choice] for choice in choice_columns]
    options_map = dict(zip(['A', 'B', 'C', 'D'], option_strings))

    dictionary, sims, tf_idf = similarity_model(option_strings)
    probabilities = cos_sim(dictionary, sims, tf_idf, predicted_answer_string)

    picked_answer = pick_answer_from_probabilities(probabilities)
    print('Answer precited: "{}"\n'.format(options_map[picked_answer]))

    all_predicted_options.append(picked_answer)

  # One prediction per question is required before scoring.
  assert len(all_predicted_options) == len(race_data)

  print('Accuracy', accuracy_score(race_data['answer choice'], all_predicted_options))
  print(classification_report(race_data['answer choice'], all_predicted_options))

 'B': 0.0, 'C': 0.0, 'D': 0.67936623}
Predicted option: D
Answer precited: "Because Poor wanted the writer to pay for the dinner."

Predicted answer string: "read"
Correct option is C
Probabilities for this question: {'A': 0.0, 'B': 0.0, 'C': 0.0, 'D': 0.0}
Random picked option: A
Answer precited: "Poor is an old man"

Predicted answer string: "if you don ' t make money , no one is going to give it to you"
Correct option is A
Probabilities for this question: {'A': 0.41305107, 'B': 0.039561987, 'C': 0.039561987, 'D': 0.029974544}
Predicted option: A
Answer precited: "they make a living by driving"

Predicted answer string: "not only " take " but also " give " ."
Correct option is C
Probabilities for this question: {'A': 0.28623122, 'B': 0.23776507, 'C': 0.0, 'D': 0.0}
Predicted option: A
Answer precited: "They give the poor children a lunch party at the sea each summer."

Predicted answer string: "" take " but also " give ""
Correct option is D
Probabilities for this question: {'A': 0.0

## Running RACE high test data set

In [12]:
  # Evaluate on the RACE high-school test split using the same
  # span-extraction + option-matching procedure.
  race_data = readRACE('./data/RACE/test/high')
  option_letters = ['A', 'B', 'C', 'D']

  all_predicted_options = []
  for i in range(len(race_data)):
    question_text = race_data['question'][i]
    article_text = race_data['article'][i]

    predicted_answer_string = answer_question(question_text, article_text)
    print('Predicted answer string: "{}"'.format(predicted_answer_string))

    correct_ans = race_data['answer choice'][i]
    print('Correct option is '+ correct_ans)

    # Collect the four choice texts for this row, in A-D order.
    option_strings = [race_data.loc[i]['choice {}'.format(position)] for position in range(4)]
    options_map = {letter: text for letter, text in zip(option_letters, option_strings)}

    dictionary, sims, tf_idf = similarity_model(option_strings)
    probabilities = cos_sim(dictionary, sims, tf_idf, predicted_answer_string)

    picked_answer = pick_answer_from_probabilities(probabilities)
    print('Answer precited: "{}"\n'.format(options_map[picked_answer]))

    all_predicted_options.append(picked_answer)

  # Every question must have produced exactly one prediction.
  assert len(all_predicted_options) == len(race_data)

  print('Accuracy', accuracy_score(race_data['answer choice'], all_predicted_options))
  print(classification_report(race_data['answer choice'], all_predicted_options))

answer string: "[CLS]"
Correct option is A
Probabilities for this question: {'A': 0.0, 'B': 0.0, 'C': 0.0, 'D': 0.0}
Random picked option: A
Answer precited: "Benefits of Sunflower Oil for Skin"

Predicted answer string: "[CLS]"
Correct option is B
Probabilities for this question: {'A': 0.0, 'B': 0.0, 'C': 0.0, 'D': 0.0}
Random picked option: A
Answer precited: "To explain what caused the explosion of NASA-s unmanned rocket."

Predicted answer string: "really feel sorry for my students"
Correct option is C
Probabilities for this question: {'A': 0.0, 'B': 0.0, 'C': 0.0, 'D': 0.0}
Random picked option: C
Answer precited: "Frustrated."

Predicted answer string: "astronauts at the iss felt a more urgent disappointment"
Correct option is C
Probabilities for this question: {'A': 0.0, 'B': 0.0, 'C': 0.0, 'D': 0.0}
Random picked option: B
Answer precited: "interesting"

Predicted answer string: "" astronauts on board the iss already drink water distilled from sweat and urine"
Correct option is

## Running DREAM test data set

In [13]:
  # Evaluate on the DREAM test split. DREAM rows are read with three choices
  # here, so only options A-C are considered.
  dream_data = readDREAM('./data/DREAM/test.json')
  choice_columns = ['choice 0', 'choice 1', 'choice 2']

  all_predicted_options = []
  for i in range(len(dream_data)):
    row = dream_data.loc[i]

    predicted_answer_string = answer_question(row['question'], row['article'])
    print('Predicted answer string: "{}"'.format(predicted_answer_string))

    correct_ans = row['answer choice']
    print('Correct option is '+ correct_ans)

    # Choice texts in column order, then keyed by their option letter.
    option_strings = [row[choice] for choice in choice_columns]
    options_map = dict(zip(['A', 'B', 'C'], option_strings))

    dictionary, sims, tf_idf = similarity_model(option_strings)
    probabilities = cos_sim(dictionary, sims, tf_idf, predicted_answer_string)

    picked_answer = pick_answer_from_probabilities(probabilities, choice_count = 3)
    print('Answer precited: "{}"\n'.format(options_map[picked_answer]))

    all_predicted_options.append(picked_answer)

  # One prediction per dialogue is required before scoring.
  assert len(all_predicted_options) == len(dream_data)

  print('Accuracy', accuracy_score(dream_data['answer choice'], all_predicted_options))
  print(classification_report(dream_data['answer choice'], all_predicted_options))

abilities for this question: {'A': 0.0, 'B': 0.0, 'C': 0.0}
Random picked option: C
Answer precited: "Come to a meeting."

Predicted answer string: "men : hi , janet , i hear you ' ve just returned from a tour of australia"
Correct option is B
Probabilities for this question: {'A': 0.0, 'B': 0.0, 'C': 0.42640144}
Predicted option: C
Answer precited: "Janet thinks it's a shame for anyone not to visit Australia."

Predicted answer string: "see my parents first"
Correct option is B
Probabilities for this question: {'A': 0.57735026, 'B': 0.0, 'C': 0.0}
Predicted option: A
Answer precited: "To visit his parents."

Predicted answer string: "in the teachers ' office this afternoon"
Correct option is C
Probabilities for this question: {'A': 0.0, 'B': 0.0, 'C': 0.0}
Random picked option: C
Answer precited: "3:30."

Predicted answer string: "men"
Correct option is B
Probabilities for this question: {'A': 0.0, 'B': 0.0, 'C': 0.0}
Random picked option: C
Answer precited: "He is busy."

Predicted a