In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Process Kaggle Dataset

In [None]:
# Data source: https://www.kaggle.com/datasets/radek1/sci-or-not-sci-hypthesis-testing-pack?select=6000_wiki_en_sci_questions_with_excerpts.csv
# NOTE(review): the original comment called this file "ques.csv"; confirm that 'miscellaneous.csv' is that download.
miscel = pd.read_csv('miscellaneous.csv')

In [None]:
miscel.head(2)

Unnamed: 0,prompt,A,B,C,D,E,answer,wikipedia_excerpt
0,Who was responsible for the reorganisation of ...,Territorial brigades,First line divisions,Training Reserve,Second line divisions,British home army,C,British home army in the First World War: The ...
1,What film earned Rakshit Shetty the Karnataka ...,Rakshit Shetty did not win the Karnataka State...,Nam Areal Ondina,Ulidavaru Kandanthe,The information is not provided in the Wikiped...,Simple Agi Ondh Love Story,C,Rakshit Shetty: Rakshit made his acting debut ...


In [None]:
def get_correct_answer(row):
    """Replace the answer letter stored in ``row['answer']`` with that option's text.

    The value under ``'answer'`` is converted to ``str`` and stripped, then used
    as a key into ``row`` itself. ``row`` is modified in place and returned.
    """
    option_key = str(row['answer']).strip()
    row['answer'] = row[option_key]
    return row

In [None]:
# Resolve every answer letter to its option text, then discard the option columns.
miscel = (
    miscel
    .apply(get_correct_answer, axis=1)
    .drop(columns=['A', 'B', 'C', 'D', 'E'])
)

In [None]:
miscel.sample(2)

Unnamed: 0,prompt,answer,wikipedia_excerpt
5077,What happens to Aidan in the excerpt from Lost...,Aidan grabs a suitcase thrown to him by a man ...,Lost in Blue: Shipwrecked: Aidan gets onto a l...
2272,"How many square miles does White House, Tennes...",11 square miles,"White House, Tennessee: 0 square miles (23 2 k..."


In [None]:
!python -m spacy download en_core_web_lg

In [None]:
import spacy
nlp = spacy.load("en_core_web_lg")

In [None]:
def find_src_noun_chunk(token, doc):
  """Return the first noun chunk of ``doc`` whose token range contains ``token``.

  A chunk matches when ``chunk.start <= token.i < chunk.end``.
  Returns None when no chunk in ``doc.noun_chunks`` matches.
  """
  position = token.i
  return next(
      (chunk for chunk in doc.noun_chunks if chunk.start <= position < chunk.end),
      None,
  )

In [None]:
import random

def remove_information(text, nlp, num_pre=1):
  """Remove up to ``num_pre`` "adposition + noun chunk" phrases from ``text``.

  A removable phrase starts at a token tagged ADP and runs to the end of the
  noun chunk that contains one of its PROPN / NOUN / PRON children. If more
  phrases are found than ``num_pre``, the ones to remove are chosen at random.

  Args:
    text: text to process
    nlp: callable that turns ``text`` into a parsed document (tokens with
      ``pos_``, ``children``, ``idx`` and a ``noun_chunks`` iterable)
    num_pre: number of phrases to remove, default 1

  Returns:
    str: text with the chosen phrases removed and whitespace runs collapsed to
    single spaces; if no target phrase is found only the whitespace is normalised
  """
  doc = nlp(text)
  rm_spans = []  # (start_char, end_char) of every removable phrase

  # A while loop is needed so the index can really be advanced past a phrase;
  # reassigning the loop variable of `for ... in range(...)` has no effect.
  t_i = 0
  while t_i < len(doc):
    token = doc[t_i]
    next_i = t_i + 1
    if token.pos_ == "ADP":
      for child in token.children:
        if child.pos_ not in ('PROPN', 'NOUN', 'PRON'):
          continue
        chunk = find_src_noun_chunk(child, doc)
        # Skip children without a chunk, and chunks that do not come after the
        # adposition (removing from the adposition onwards would hit other text).
        if chunk is None or chunk.start <= token.i:
          continue

        # Use the real character offsets instead of assuming the chunk follows
        # the adposition after exactly one space.
        rm_spans.append((token.idx, chunk.end_char))
        next_i = max(next_i, chunk.end)
        break
    t_i = next_i

  chosen = random.sample(rm_spans, min(num_pre, len(rm_spans)))

  # Blank with same-length padding so the offsets of the other spans stay valid.
  for start, end in chosen:
    text = text[:start] + " " * (end - start) + text[end:]
  return ' '.join(text.split())

In [None]:
import random

def add_incomplete_question(row):
  """Store a gapped copy of ``row['prompt']`` under ``row['incomplete_prompt']``.

  The number of phrases to remove is drawn at random from 1 to 4 and passed to
  ``remove_information`` together with the module-level ``nlp``. Returns ``row``.
  """
  phrases_to_drop = random.choices([1, 2, 3, 4], k=1)[0]
  gapped = remove_information(row['prompt'], nlp, phrases_to_drop)
  row['incomplete_prompt'] = gapped
  return row

In [None]:
new_miscel = miscel.apply(add_incomplete_question, axis=1)

In [None]:
# additional cleansing: drop one specific prompt, then renumber the rows

new_miscel = (
    new_miscel
    .loc[new_miscel['prompt'].ne("What are some models that attempt to account for all observations without invoking supplemental non-baryonic matter?")]
    .reset_index(drop=True)
)

In [None]:
new_miscel

Unnamed: 0,prompt,answer,wikipedia_excerpt,incomplete_prompt
0,Who was responsible for the reorganisation of ...,Training Reserve,British home army in the First World War: The ...,Who was responsible of New Army reserve units ?
1,What film earned Rakshit Shetty the Karnataka ...,Ulidavaru Kandanthe,Rakshit Shetty: Rakshit made his acting debut ...,What film earned Rakshit Shetty the Karnataka ...
2,What is the population of Maklavan?,"Maklavan has a population of 2,170 individuals...","Maklavan: Maklavan (, also Romanized as Mākalā...",What is the population ?
3,What was the stud fee for Empire Maker at Gain...,"$100,000","Empire Maker: In September 2015, it was announ...",What was the stud fee in 2016?
4,What books has Brian J. Bowe published for Ens...,"Books about The Ramones, The Clash, and Judas ...",Brian J. Bowe: He co-edited the 2007 anthology...,What books has Brian J. Bowe published ?
...,...,...,...,...
5984,Who was Edward Kingsley?,"Edward Kingsley was a pioneer in ""art house"" f...",Kingsley-International Pictures: The company w...,Who was Edward Kingsley?
5985,What is the purpose of the box set compilation...,"""Singles 1965–1967"" is a box set compilation o...",Singles 1965–1967: Singles 1965–1967 is a box ...,"What is the purpose ""Singles 1965–1967"" , and ..."
5986,"When was the town of De Kalb, New York organized?",1806,"De Kalb, New York: The town was organized in 1...","When was the town , New York organized?"
5987,What is the reason that Newton's second law ca...,The interaction between particles at the quant...,"Arthur C. Clarke bibliography: k a ""The Haunte...",What is the reason that Newton's second law ca...


In [None]:
# more cleansing: keep only rows where neither the excerpt nor the prompt contains "||"

# A boolean mask selects by row identity; the previous loop fed index labels to
# .iloc, which is only correct while the index happens to be 0..n-1.
# regex=False matches the literal two characters; na=False treats missing text as "no marker".
has_marker = (
    new_miscel['wikipedia_excerpt'].str.contains("||", regex=False, na=False)
    | new_miscel['prompt'].str.contains("||", regex=False, na=False)
)
new_miscel = new_miscel.loc[~has_marker].reset_index(drop=True)

In [None]:
new_miscel = new_miscel.rename(columns={'prompt': 'question', 'wikipedia_excerpt': 'context', 'incomplete_prompt': 'incomplete_question'})

In [None]:
def add_source(row):
  """Set ``row['source_dataset']`` to ``'kaggle'`` in place and return ``row``."""
  origin = 'kaggle'
  row['source_dataset'] = origin
  return row
# Add the constant column in one vectorized step instead of a row-by-row apply.
new_miscel = new_miscel.assign(source_dataset='kaggle')

In [None]:
new_miscel.head()

Unnamed: 0,question,answer,context,incomplete_question,source_dataset
0,Who was responsible for the reorganisation of ...,Training Reserve,British home army in the First World War: The ...,Who was responsible of New Army reserve units ?,kaggle
1,What film earned Rakshit Shetty the Karnataka ...,Ulidavaru Kandanthe,Rakshit Shetty: Rakshit made his acting debut ...,What film earned Rakshit Shetty the Karnataka ...,kaggle
2,What is the population of Maklavan?,"Maklavan has a population of 2,170 individuals...","Maklavan: Maklavan (, also Romanized as Mākalā...",What is the population ?,kaggle
3,What was the stud fee for Empire Maker at Gain...,"$100,000","Empire Maker: In September 2015, it was announ...",What was the stud fee in 2016?,kaggle
4,What books has Brian J. Bowe published for Ens...,"Books about The Ramones, The Clash, and Judas ...",Brian J. Bowe: He co-edited the 2007 anthology...,What books has Brian J. Bowe published ?,kaggle


# Process RACE Dataset

In [None]:
!pip install datasets

In [None]:
from datasets import load_dataset

# Load the "all" configuration of the "ehovy/race" dataset and turn each of its
# three splits (train / validation / test) into a pandas DataFrame.
race = load_dataset("ehovy/race", "all")
race_train, race_val, race_test = pd.DataFrame(race['train']), pd.DataFrame(race['validation']), pd.DataFrame(race['test'])

In [None]:
race_train.shape, race_val.shape, race_test.shape # 90 - 5 - 5 ratio

((87866, 5), (4887, 5), (4934, 5))

Remove Noise

In [None]:
# Drop rows with missing values and the 'example_id' column from every split.
race_train, race_val, race_test = (
    split.dropna().drop(columns='example_id')
    for split in (race_train, race_val, race_test)
)

In [None]:
# Keep only questions that split into at least three space-separated pieces.
race_train, race_val, race_test = (
    split[[len(question.split(" ")) >= 3 for question in split['question']]]
    for split in (race_train, race_val, race_test)
)

In [None]:
# Rename 'article' to 'context' in every split.
race_train, race_val, race_test = (
    split.rename(columns={'article': 'context'})
    for split in (race_train, race_val, race_test)
)

Extract Useful Data

In [None]:
def remove_trailing_blank(text):
  """Cut ``text`` at its last underscore and end it with a question mark ('?').

  Everything from the last '_' onwards is dropped, together with a single space
  directly before it. Text without any underscore is returned unchanged.
  """
  cut = text.rfind('_')
  if cut == -1:
    return text

  # also swallow one space immediately before the blank
  if cut > 0 and text[cut - 1] == ' ':
    cut -= 1

  return text[:cut] + '?'

In [None]:
# One empty frame per split, all with the same four columns.
new_race_train, new_race_val, new_race_test = (
    pd.DataFrame(columns=['context', 'question', 'answer', 'source_dataset'])
    for _ in range(3)
)

In [None]:
# Collect the flattened rows in a list and build the frame once: pd.concat inside
# the loop copies the whole frame on every iteration, and appending to an existing
# frame duplicates rows when the cell is re-run.
option_ids = ['A', 'B', 'C', 'D']
train_records = []
for _, example in race_train.iterrows():
  # index() raises ValueError for an answer letter outside A-D
  answer_id = option_ids.index(example['answer'])

  # questions containing two or more underscores are skipped
  if example['question'].count('_') >= 2:
    continue

  train_records.append({
      "context": example['context'],
      "question": remove_trailing_blank(example['question']),
      "answer": example['options'][answer_id],  # text of the correct option
      "source_dataset": 'race',
  })

new_race_train = pd.DataFrame(train_records, columns=['context', 'question', 'answer', 'source_dataset'])

In [None]:
# Collect the flattened rows in a list and build the frame once: pd.concat inside
# the loop copies the whole frame on every iteration, and appending to an existing
# frame duplicates rows when the cell is re-run.
option_ids = ['A', 'B', 'C', 'D']
val_records = []
for _, example in race_val.iterrows():
  # index() raises ValueError for an answer letter outside A-D
  answer_id = option_ids.index(example['answer'])

  # questions containing two or more underscores are skipped
  if example['question'].count('_') >= 2:
    continue

  val_records.append({
      "context": example['context'],
      "question": remove_trailing_blank(example['question']),
      "answer": example['options'][answer_id],  # text of the correct option
      "source_dataset": 'race',
  })

new_race_val = pd.DataFrame(val_records, columns=['context', 'question', 'answer', 'source_dataset'])

In [None]:
# Collect the flattened rows in a list and build the frame once: pd.concat inside
# the loop copies the whole frame on every iteration, and appending to an existing
# frame duplicates rows when the cell is re-run.
option_ids = ['A', 'B', 'C', 'D']
test_records = []
for _, example in race_test.iterrows():
  # index() raises ValueError for an answer letter outside A-D
  answer_id = option_ids.index(example['answer'])

  # questions containing two or more underscores are skipped
  if example['question'].count('_') >= 2:
    continue

  test_records.append({
      "context": example['context'],
      "question": remove_trailing_blank(example['question']),
      "answer": example['options'][answer_id],  # text of the correct option
      "source_dataset": 'race',
  })

new_race_test = pd.DataFrame(test_records, columns=['context', 'question', 'answer', 'source_dataset'])

In [None]:
# additional cleansing: remove questions requiring summary-related understanding, not poking at specific information
sum_ques_keywords = ['main idea', 'passage', 'article', 'which is true', 'is true', 'is not true', 'is false', 'is correct', 'is not correct', 'statement', 'author', 'writer', 'conclude', 'conclusion', 'infer', 'inferred', 'tell us'] # key words found in questions requiring summarization

# Boolean mask instead of collecting index labels and feeding them to .iloc
# (labels only coincide with positions while the index is 0..n-1).
needs_summary = new_race_train['question'].map(
    lambda question: any(key in question.lower() for key in sum_ques_keywords)
).astype(bool)

new_race_train = new_race_train.loc[~needs_summary].reset_index(drop=True)

In [None]:
# Same keyword filter for the validation rows (uses sum_ques_keywords from the cell above).
# Boolean mask instead of collecting index labels and feeding them to .iloc.
needs_summary = new_race_val['question'].map(
    lambda question: any(key in question.lower() for key in sum_ques_keywords)
).astype(bool)

new_race_val = new_race_val.loc[~needs_summary].reset_index(drop=True)

In [None]:
# Same keyword filter for the test rows (uses sum_ques_keywords defined in an earlier cell).
# Boolean mask instead of collecting index labels and feeding them to .iloc.
needs_summary = new_race_test['question'].map(
    lambda question: any(key in question.lower() for key in sum_ques_keywords)
).astype(bool)

new_race_test = new_race_test.loc[~needs_summary].reset_index(drop=True)

In [None]:
import random
def add_incomplete_question_for_race(row):
  """Store a gapped copy of ``row['question']`` under ``row['incomplete_question']``.

  The number of phrases to remove is drawn at random from 1 to 4 and passed to
  ``remove_information`` together with the module-level ``nlp``. Returns ``row``.
  """
  phrases_to_drop = random.choices([1, 2, 3, 4], k=1)[0]
  gapped = remove_information(row['question'], nlp, phrases_to_drop)
  row['incomplete_question'] = gapped
  return row

# Splits are processed in the order train, validation, test.
new_race_train, new_race_val, new_race_test = (
    split.apply(add_incomplete_question_for_race, axis=1)
    for split in (new_race_train, new_race_val, new_race_test)
)

Final Dataset

In [None]:
# The training set stacks the RACE training rows on top of the Kaggle rows (index renumbered);
# validation and test use the RACE rows only.
final_train = pd.concat([new_race_train, new_miscel], axis=0, ignore_index=True)
final_val = new_race_val
final_test = new_race_test

In [None]:
# gets only long questions: at least seven space-separated pieces

final_train, final_val, final_test = (
    split[[len(question.split(" ")) >= 7 for question in split['question']]]
    for split in (final_train, final_val, final_test)
)

Extract Only Questions with Long Answers: not used

In [None]:
def count_words(string):
  """Return how many pieces ``str(string)`` has when split on single spaces.

  Consecutive spaces produce empty pieces that are counted too, so the result
  is always one more than the number of space characters.
  """
  as_text = str(string)
  return as_text.count(' ') + 1

In [None]:
AVG_LEN_THRES = 8

In [None]:
filtered_dataset = pd.DataFrame(columns=['context', 'question', 'answer', 'source_dataset', 'incomplete_question'])

In [None]:
# columns: ['context', 'question', 'answer', 'source_dataset', 'incomplete_question']
# NOTE(review): `final_dataset` is not assigned in any cell above (only final_train /
# final_val / final_test are), so this cell depends on leftover kernel state - confirm
# where it is meant to come from.

# Keep rows whose answer has at least AVG_LEN_THRES space-separated pieces.
# One vectorized selection replaces the per-row pd.concat (quadratic, and it kept
# appending to the previous result when the cell was re-run).
answer_word_counts = final_dataset['answer'].map(lambda ans: len(str(ans).split(' '))).astype(int)
filtered_dataset = (
    final_dataset
    .loc[answer_word_counts >= AVG_LEN_THRES,
         ['context', 'question', 'answer', 'source_dataset', 'incomplete_question']]
    .reset_index(drop=True)
)

In [None]:
# remove error: drop one specific question, then renumber the rows
filtered_dataset = (
    filtered_dataset
    .loc[filtered_dataset['question'].ne("What are some models that attempt to account for all observations without invoking supplemental non-baryonic matter?")]
    .reset_index(drop=True)
)

In [None]:
# additional cleansing: drop rows whose context contains "||"

# Boolean mask instead of collecting index labels and feeding them to .iloc.
# regex=False matches the literal two characters; na=False treats missing text as "no marker".
has_marker = filtered_dataset['context'].str.contains("||", regex=False, na=False)
filtered_dataset = filtered_dataset.loc[~has_marker].reset_index(drop=True)

In [None]:
# more cleaning
import re

def remove_space_before_question_mark(row):
  """Tidy ``row['question']`` and ``row['incomplete_question']`` in place.

  In each of the two fields every whitespace run is collapsed to a single
  space, then each ' ?' is replaced by '?'. Returns ``row``.
  """
  for column in ('question', 'incomplete_question'):
    squeezed = re.sub(r'\s+', ' ', row[column])
    row[column] = squeezed.replace(' ?', '?')
  return row

final_dataset = final_dataset.apply(remove_space_before_question_mark, axis=1)
filtered_dataset = filtered_dataset.apply(remove_space_before_question_mark, axis=1)

In [None]:
final_dataset.to_csv('final_dataset.csv', index=False)

In [None]:
filtered_dataset.to_csv("filtered_dataset.csv", index=False)

In [None]:
# download the file
from google.colab import files

filename = 'final_dataset.csv'
files.download(filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
filtered_dataset.head(1)

Unnamed: 0,context,question,answer,source_dataset
0,Many people keep the stamps in a drawer at hom...,The problem of keeping stamps at home is that ?,you do not know how many are left,race


# New Method for Removing Random Spans in Question

Modify the "incomplete_question" column of the dataset built above (`final_dataset`).

In [None]:
# additional cleansing
import re

def remove_space_before_question_mark(row):
  """Collapse whitespace runs and drop the space before '?' in the question fields.

  ``row['question']`` is always cleaned. ``row['incomplete_question']`` is cleaned
  as well when that field exists and holds a string, so that applying this
  function after the incomplete questions have been regenerated does not leave
  ' ?' in them.

  Args:
    row: mapping-like row (e.g. a pandas Series or a dict) with a 'question' field
  Returns:
    the same ``row``, modified in place
  """
  columns = ['question']
  # the incomplete question is optional: it may not have been generated yet
  if 'incomplete_question' in row and isinstance(row['incomplete_question'], str):
    columns.append('incomplete_question')

  for column in columns:
    ques = re.sub(r'\s+', ' ', row[column])
    row[column] = ques.replace(' ?', '?')
  return row

final_dataset = final_dataset.apply(remove_space_before_question_mark, axis=1)

In [None]:
# gets only long questions: at least seven space-separated pieces

final_dataset = final_dataset[
    [len(question.split(" ")) >= 7 for question in final_dataset['question']]
]

In [None]:
!python -m spacy download en_core_web_lg

In [None]:
import spacy
import pandas as pd
import numpy as np
from spacy import displacy

pd.set_option('display.max_colwidth', None)

nlp = spacy.load('en_core_web_lg')

In [None]:
import random
import re

def remove_random_information(nlp, text, max_span_len=5, num_pre=3, min_start=2):
    """Remove random pieces of information from text.

    Up to ``num_pre`` alphabetic tokens at index ``min_start`` or later are picked
    at random as roots. For each root, tokens of its subtree are collected (never
    tokens before ``min_start``, never tokens at or after the next root, at most
    ``max_span_len`` tokens) and the characters from the first to the last
    collected token are deleted from the text.

    Args:
        nlp: callable that turns ``text`` into a parsed document (tokens with
            ``is_alpha``, ``subtree``, ``i``, ``idx`` and ``text``)
        text (str): text to process
        max_span_len (int): maximum number of tokens in each removed span
        num_pre (int): number of spans to remove
        min_start (int): index of the first token that may be removed
    Return:
        str: new gapped question, with whitespace runs collapsed to one space
    """
    doc = nlp(text)
    # is_alpha: only purely alphabetic tokens may be picked as roots
    candidate_ids = [i for i in range(min_start, len(doc)) if doc[i].is_alpha]
    root_inds = sorted(random.sample(candidate_ids, min(len(candidate_ids), num_pre)))

    char_spans = []  # (start_char, end_char) of every span to delete
    for pos, root_idx in enumerate(root_inds):
        next_root = root_inds[pos + 1] if pos + 1 < len(root_inds) else None
        span_tokens = []
        for child in doc[root_idx].subtree:
            # Skip protected leading tokens instead of abandoning the whole span.
            if child.i < min_start:
                continue
            # Do not run into the next root's territory.
            if next_root is not None and child.i >= next_root:
                break
            if len(span_tokens) >= max_span_len:
                break
            span_tokens.append(child)

        # An empty span only affects this root; the remaining roots are still processed.
        if not span_tokens:
            continue

        first, last = span_tokens[0], span_tokens[-1]
        char_spans.append((first.idx, last.idx + len(last.text)))

    # Mark deleted characters once, then rebuild the text from what is left.
    removed = [False] * len(text)
    for start, end in char_spans:
        for char_pos in range(start, min(end, len(text))):
            removed[char_pos] = True

    new_text = ''.join(ch for ch, gone in zip(text, removed) if not gone)
    return re.sub(r'\s+', ' ', new_text)

In [None]:
def preprocess_questions(row):
    """Store a randomly gapped copy of ``row['question']`` under ``row['incomplete_question']``.

    Uses ``remove_random_information`` with its default settings and the
    module-level ``nlp``. Returns ``row``.
    """
    gapped = remove_random_information(nlp, row['question'])
    row['incomplete_question'] = gapped
    return row

In [None]:
final_dataset = final_dataset.apply(preprocess_questions, axis=1)
final_dataset = final_dataset.apply(remove_space_before_question_mark, axis=1)

In [None]:
final_dataset.shape

(47450, 5)

In [None]:
final_dataset.to_csv('final_dataset_2.csv', index=False)

In [None]:
final_dataset.head(15)

Unnamed: 0,context,question,answer,source_dataset,incomplete_question
0,"After an extra day in space,the crew of Discovery returned to Earth,landing at Edwards Air Force Base in California early on Tuesday morning.The shuttle and its crew spent two weeks in space,most of it aboard the International Space Station.It was the first flight of NASA's spaceshuttle fleet since February 2003,when the shuttle Columbia came apart while reentering Earth's atmosphere.\nNASA officials delayed Discovery's return for one day because of cloudy weather in Florida,where the shuttle was supposed to lanD. On Tuesday morning,mission controllers directed Discovery to a landing site in California,where the skies were clear.The crew members will have to wait until Wednesday to see their families when they all meet together in Houston at the Johnson Space Center.\nDiscovery had a very busy mission in space,compared to past missions.The space shuttle docked with the International Space Station most of the journey,delivering badly needed supplies and repairing damaged parts.The crew spent a lot of time testing new repair techniques on their own shuttle,conducting three different spacewalks(where astronauts exit the space shuttle to do work outside).On the third spacewalk,astronaut Steve Robinson went underneath the shuttle to remove material sticking out from between the spacecraft's protective tiles.\nThere were concerns in the last remaining days of Discovery's mission that a torn heat blanket--another shield against overheating in the space shuttle--may pose a problem to crew members. NASA officials and technicians worked on ways that the crew could fix the problem, but later decided it was not a risk.\nWith the shuttle and its crew safely back on firm ground, NASA is hailing this mission, labeled STS114,as a huge success. ""I hope this shows people that we're coming back,"" NASA spaceflight chief Bill Readdy said after Discovery's successful landing. ""We've got some more work to do. 
We know what we need to do and we'll do it.",Which is the most probable period of time that Discovery was aboard the International Space Station?,Nine days.,race,Which is most probable period of time that Discovery was aboard International Station?
1,"Anyone who cares about what schools and colleges teach and how their students learn will be interested in the memoir of Ralph W. Tyler, who is one of the most famous men in American education.\nBorn in Chicago in 1902, brought up and schooled in Nebraska, the 19-year-old college graduate Ralph Tyler became _ while teaching as a science teacher in South Dakota and changed his major from medicine to education.\nGraduate work at the University of Chicago found him connected with honorable educators Charles Judd and W. W. Charters, whose ideas of teaching and testing had an effect on his later work. In 1927, he became a teacher of Ohio State University where he further developed a new method of testing.\nTyler became well-known nationality in 1938, when he carried his work with the Eight-Year Study from Ohio State University to the University of Chicago at the invitation of Robert Hutchins.\nTyler was the first director of the Center for Advanced Study in the Behavioral Sciences at Stanford, a position he held for fourteen years. There, he firmly believed that researchers should be free to seek an independent spirit in their work.\nAlthough Tyler officially retired in 1967, he never actually retired. He served on a long list of educational organizations in the United States and abroad. Even in his 80s he traveled across the country to advise teachers and management people on how to set objectives that develop the best teaching and learning within their schools.",Who are most probably interested in Ralph W. Tyler's memoir?,Serious educators.,race,Who are ?
2,"Children in England mustn't work until they are 13. They need to have a work permit to start working.\nThe jobs teenagers can do\nDelivering newspapers\nMany teenagers will get up early to deliver newspapers to houses in their local area before going to school. They are known as Paper-boys or Papergirls.\nBabysitting: Looking after young children in their home while their parents have gone out for the evening is a popular job for teenagers, as they get money for watching children and television all at the same time!\nHelping the Milkman: From the age of 14 some teenagers help the milkman deliver milk to houses.\nOther popular jobs : Working in a shop; Office work; Washing cars ; In a cafe or restaurant. The hours teenagers (13 and 14 year olds )can work:\nSchool Days\nNot more than 2 hours in one day during the following periods:\nMorning 7 a. m. --start of school or Evening\nclose of school-- 7 p. m.\nSaturdays: Up to 5 hours between 7 a.m. and 7 p.m.\nSundays\nUp to 2 hours between 7 a.m. and 11 a. m.\nTerm time\nUp to 12 hours a week (Including weekends)",In England how old do children have to be before they can work?,13.,race,In England how old do have to be can work?
3,"Lucy is a student in Class Two , Grade Seven . She is eleven years old . She had a beautiful toothbrush . But it was broken last Sunday . She was very sad because not only it was beautiful ,but also it was blue --- her favorite color . So her mother went shopping with her to buy a toothbrush on Sunday afternoon .\nThere are many toothbrushes in the shop . They bought a blue one . There is a blue bird in it . And it is made in Guangzhou . It's ten yuan . _ . But it is so beautiful . And she likes it very much . Then they went home . Lucy can brush teeth now . How happy she is !\nA, B, C, D.",Which sentence has the same meaning as the underline one?,It's not cheap,race,Which sentence has underline one?
4,"Niall O'Meara: O'Meara missed most of the 2018 National League due to injury but returned to the panel in April On 30 June 2019, O'Meara was named on the bench when Tipperary faced Limerick in the Munster final He was introduced as a substitute for Michael Breen at midfield but ended the game on the losing side following a 2-26 to 2-14 defeat",What position did Niall O'Meara play when Tipperary faced Limerick in the Munster final in June 2019?,Midfield,kaggle,What position did O'Meara play when Tipperary faced in June 2019?
5,"Do you ever fight with your little brother or sister? How would you like to travel around the world with them in a musical group? Sisters Johanna and Klara Soderburg are doing just that in their band, First Aid Kit! The two sisters are sweeping the world with a perfect mix of pop and folk music.\nThe Soderburg sisters come from Sweden. Their music sounds like music from the 1960s. They both play several instruments. And, since they are sisters, their voices fit together perfectly!\nLike many other teen stars before them, the two girls got their start on the website, YouTube. Their singing caught the attention of a Swedish musician. It wasn't long before people all over the world discovered their music!\nThe sisters don't just sing or play instruments; they also write their own music and design all of their album art!They have worked very hard, and they _ four new songs at the beginning of 2011! If you want to hear the songs, check out their music in YouTube for yourself! Listen to see why First Aid Kit is your new favorite band!",Johanna and Klara Soderburg are famous for?,a perfect mix of music,race,Johanna and Soderburg are ?
7,"To the mom I used to be:\nTwo years ago, you were happy and whole. You had a plan for life -- start a new business, get involved in the Parent-teacher Association, teach your sons how to ride bicycles, spend as much time with your friends and parents as possible, watch your children mature and grow old with your husband. You were a "" _ "" person who often felt so much joy. You always looked forward, smiled and danced.\nTwo years ago, on December 14, 2012, the world changed and you changed with it. Disturbed young men with access to high-powered guns went to your sons' school and killed six educators and twenty first-graders. Your eldest son Jake survived, but was changed by the day he discovered some monsters are real. He describes it as the day ""when evils came to my school."" Your youngest son, Dylan, whom you thought of as a pure love, with his charming eyes and infectious giggle ,was killed. Shot multiple times, he died instantly in the arms of his special-education assistant who also died while trying to protect him.\nThe tragedy changed all your life, not only because of losing your child, but because of the hole inside you that can never be filled. Your eldest son has been forced to grow up too fast because of the loss of his brother. The pain has altered the lines on your husband's face. The way you look at the world has changed. Your interactions with friends and family seem foreign.\n... ...\nBut the things have moved on with hopefulness. You are now someone far more realistic. You control your feelings because you fear if you really let it out, you would never recover. It would destroy you. You know what you should do is try everything to protect more children. You're fighting a good fight, what the whole society really needs.\nWith love,\nNicole Hockley, Dylan's mom",Which can best describe Dylan's mom's attitude towards life today?,Positive.,race,Which can best describe mom's attitude towards ?
8,"WASHINGTON--More than one in 10 of the nation's airline pilots are cleared to carry a handgun while flying, and the number will continue to grow, according to a Transportation Security Administration projection.\nThe Federal Air Marshal Service, a TSA agency that runs the armed-pilots program, reports that 85,000 to 90,000 pilots and crewmembers flying domestic passengers and cargo planes are eligible to carry a gun. That puts the number of armed pilots at about 9,500.\nThe TSA projects the program to grow to 16.5% of eligible pilots by the year 2011. Aviation experts were surprised and alarmed that so many pilots are toting guns in the sky.\n""That's a big number compared to what I thought it would be,"" said aviation-security consultant Rich Roth, who said he had predicted there would be fewer than 1,000 armed pilots. The 5-year-old program trains pilots for one week and arms them with 40-caliber semiautomatic pistols.\n""That's a scary number,"" said Joseph Gutheinz, a former Transportation Department special agent and aviation attorney in Houston. ""By allowing so many pilots the opportunity to fly armed, we're giving terrorists opportunity to identify somebody who has a gun and overpower him.""\nCapt. Bob Hesselbein, head of security for the Air Line Pilots Association, said the number of armed pilots is ""a tremendous deterrent "" to hijackings. ""The challenge of an organized terrorist team is to take control of the cabin, then the flight deck.""\nArmed pilots have come under scrutiny since March 22 when the gun of a US Airways pilot fired in the cockpit of Flight 1536 as it approached Charlotte from Denver. No one was hurt, and the plane landed safely after the bullet pierced the fuselage.",Joseph Gutheinz's attitude towards pilots' carrying guns while flying is?,negative,race,Joseph Gutheinz's attitude towards pilots' carrying guns while flying is?
10,"Many people keep the stamps in a drawer at home because they'll never know when they might need them! The trouble is that they have no idea of how many stamps they have left. When they want to send something by post, it's very annoying to find out that they have run out of stamps!\nThe Internet is possible to solve the problem. The United States Postal Service has come up with a system which allows people to print their own postage stamps at their own home! This is the perfect solution for those people that have busy lives!\nWhenever you go to the post office to buy a stamp, you are buying a boring and standard postage stamp. They are red in color, and you can easily buy them on the Internet as well. For example, you can just order a regular 1st class stamp and then print it off using your own printer .\nThese days, you can have much more fun with your stamps. You can buy them online instead of going to the post office. You can decorate, preview and print your stamps all at your own home. You are able to use any of your own photos as part of your postage stamp as well.\nThese stamps aren't just allowed to be used in the United States; they can be used to send your mail throughout the world. They are perfect to use for many special occasions, such as weddings, festivals and even birthdays! They are fantastic!",The problem of keeping stamps at home is that?,you do not know how many are left,race,The problem is ?
11,"Jack London is a famous American writer. His most widely known book is the Call of the Wild, the story of the adventure of a large dog in the frozen north.\nJack London was born on January 12, 1876, in San Francisco, California. His family was very poor, and Jack had to leave school to make money. He worked hard at many different jobs.\nLater, Jack returned to school, but he didn't stay. He wrote, ""Life and pocket book were both too short."" In 1897, he went to Alaska to find gold. Instead, he found ideas there for his book stories. He returned home and started to write. His writings were successful, and he became rich and famous in his twenties.\nJack London was not a happy man, however. In poor health, he took his own life in 1916. He was then only 40 years old.",What do we know about Jack London?,He killed himself because of poor health.,race,What do know about Jack London?


# Process NarrativeQA Dataset

In [None]:
!pip install datasets

In [None]:
from datasets import load_dataset

# Load the NarrativeQA dataset; the captured output below reports train, test and validation splits
nar = load_dataset("deepmind/narrativeqa")

Downloading readme:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/24 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/24 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/24 [00:00<?, ?files/s]

Downloading data:   0%|          | 0.00/8.56M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/44.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/101M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/222M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/60.8M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/121M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/243M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/58.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/10.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/24.9M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/68.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/32747 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10557 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3461 [00:00<?, ? examples/s]

Loading dataset shards:   0%|          | 0/18 [00:00<?, ?it/s]

In [None]:
def get_longest_ans(ans_list):
  """
  Pick the answer that has the most tokens

  Args:
    ans_list (List(Dict)): candidate answers; each dict is read through its
      'tokens' entry (whose length is compared) and its 'text' entry
  Returns:
    str: 'text' of the first answer with the highest token count, or "" when
      the list is empty or no answer has any tokens
  """
  # max() keeps the first of several equally long candidates
  top = max(ans_list, key=lambda cand: len(cand['tokens']), default=None)
  if top is None or len(top['tokens']) == 0:
    return ""
  return top['text']

In [None]:
import pandas as pd

# Build every record first and create the DataFrame once: calling pd.concat
# for each example copies the whole frame every time (quadratic in the split size).
# Context is the document summary; answer is the longest of the reference answers.
new_nar_train = pd.DataFrame(
  [
    {
      'context': data['document']['summary']['text'],
      'question': data['question']['text'],
      'answer': get_longest_ans(data['answers']),
      'source_dataset': 'narqa',
    }
    for data in nar['train']
  ],
  columns=['context', 'question', 'answer', 'source_dataset'],
)

In [None]:
# Build every record first and create the DataFrame once: calling pd.concat
# for each example copies the whole frame every time (quadratic in the split size).
# Context is the document summary; answer is the longest of the reference answers.
new_nar_val = pd.DataFrame(
  [
    {
      'context': data['document']['summary']['text'],
      'question': data['question']['text'],
      'answer': get_longest_ans(data['answers']),
      'source_dataset': 'narqa',
    }
    for data in nar['validation']
  ],
  columns=['context', 'question', 'answer', 'source_dataset'],
)

In [None]:
# Build every record first and create the DataFrame once: calling pd.concat
# for each example copies the whole frame every time (quadratic in the split size).
# Context is the document summary; answer is the longest of the reference answers.
new_nar_test = pd.DataFrame(
  [
    {
      'context': data['document']['summary']['text'],
      'question': data['question']['text'],
      'answer': get_longest_ans(data['answers']),
      'source_dataset': 'narqa',
    }
    for data in nar['test']
  ],
  columns=['context', 'question', 'answer', 'source_dataset'],
)

In [None]:
# Append the NarrativeQA rows to each previously assembled split (row-wise, fresh 0..n-1 index)
last_final_train, last_final_val, last_final_test = (
  pd.concat([existing_split, narqa_split], axis=0, ignore_index=True)
  for existing_split, narqa_split in (
    (final_train, new_nar_train),
    (final_val, new_nar_val),
    (final_test, new_nar_test),
  )
)

In [None]:
# more cleaning
import re

def remove_space_before_question_mark(row):
  """
  Tidy the whitespace of row['question']

  Every run of whitespace is collapsed into a single space, then the space
  directly in front of a question mark is dropped.

  Args:
    row: mapping-like row (e.g. a pandas Series) with a string 'question' entry
  Returns:
    the same row with 'question' rewritten
  """
  row['question'] = re.sub(r'\s+', ' ', row['question']).replace(' ?', '?')
  return row

In [None]:
# Clean the question text of every split, row by row
last_final_train, last_final_val, last_final_test = (
  split.apply(remove_space_before_question_mark, axis=1)
  for split in (last_final_train, last_final_val, last_final_test)
)

In [None]:
# Number of rows per source dataset in the training split
last_final_train.source_dataset.value_counts()

source_dataset
race      42536
narqa     32747
kaggle     5764
Name: count, dtype: int64

In [None]:
# Number of rows per source dataset in the validation split
last_final_val.source_dataset.value_counts()

source_dataset
narqa    3461
race     2350
Name: count, dtype: int64

In [None]:
# Number of rows per source dataset in the test split
last_final_test.source_dataset.value_counts()

source_dataset
narqa    10557
race      2412
Name: count, dtype: int64

In [None]:
# Lowercase keywords that extract_key_words compares against each space-separated,
# lowercased token of a question (besides the wh-words, the list also contains "that")
wh_question_words = [
  "what",
  "who",
  "when",
  "where",
  "why",
  "which",
  "how",
  "whose",
  "that"
]

In [None]:
# helper: look at up to 50 questions containing "how" among 500 randomly drawn training rows
# (plain substring test, so e.g. "show" counts as well)
sampled_questions = last_final_train.sample(500)['question']
ques = [q for q in sampled_questions if 'how' in q.lower()][:50]
ques

In [None]:
# scratch check: list.index gives the position of the first matching element
a = [1, 4, 3,4 , 5, 6]
a.index(4)

1

In [None]:
def extract_key_words(row, keywords=None):
  """
  Add an 'incomplete_question' entry holding a short cue taken from row['question']

  The cue is the first question keyword that appears as a word of the question
  (e.g. "What"); when none appears, it is the first two plus the last two words.

  Args:
    row: mapping-like row (e.g. a pandas Series from DataFrame.apply(axis=1))
      with a string 'question' entry
    keywords: lowercase keywords to look for; defaults to the module-level
      wh_question_words list
  Returns:
    the same row with 'incomplete_question' set (capitalized, '?' removed)
  """
  if keywords is None:
    keywords = wh_question_words

  # split() without arguments skips repeated/leading/trailing whitespace,
  # so no empty tokens leak into the fallback below
  words = [word.lower() for word in row['question'].split()]

  # first word that is one of the keywords, if any
  incomplete_question = next((word for word in words if word in keywords), "")

  if not incomplete_question: # no predefined keyword found -> get two first and two last words
    # with four words or fewer the two slices would overlap and repeat words,
    # so such short questions are kept whole
    picked = words if len(words) <= 4 else words[:2] + words[-2:]
    incomplete_question = ' '.join(picked)

  # drop question marks, then re-join so a removed stand-alone '?' leaves no gap
  incomplete_question = ' '.join(incomplete_question.replace('?', '').split())
  row['incomplete_question'] = incomplete_question.capitalize()
  return row

In [None]:
# Add the incomplete_question column to every split, row by row
last_final_train, last_final_val, last_final_test = (
  split.apply(extract_key_words, axis=1)
  for split in (last_final_train, last_final_val, last_final_test)
)

In [None]:
# Inspect five random training rows, including the new incomplete_question column
last_final_train.sample(5)

Unnamed: 0,context,question,answer,source_dataset,incomplete_question
32133,Trains have long been an important form of tra...,The world's first high speed trains started tr...,200 km/hr,race,The world's traveling at
56049,"The ""William"" of the book's title is Kaiser W...",From which House did Kaiser Wilhilm descend?,The House of Hohenzollern.,narqa,Which
42956,C3 (railcar): The C car is normally at the opp...,How are the C3 cars operated within a consist?,The C3 cars must always face the same directio...,kaggle,How
50296,"The eponymous heroine, Isabel Thorne, is a yo...",What happened to Grimm?,He was captured,narqa,What
28318,"In 2003, Bethany Hamilton,13, lost her left ar...",What can we learn from Anna Sophia Robb's words?,She was absorbed in the story.,race,What


In [None]:
# Write each split to its CSV file without the index column
for split_frame, csv_name in (
  (last_final_train, 'train_set.csv'),
  (last_final_val, 'validation_set.csv'),
  (last_final_test, 'test_set.csv'),
):
  split_frame.to_csv(csv_name, index=False)

# Prep SQuAD Dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
!pip install datasets

In [None]:
from datasets import load_dataset

squad = load_dataset("rajpurkar/squad")

# one DataFrame for each of the two splits used below
squad_train = pd.DataFrame(squad['train'])
squad_val = pd.DataFrame(squad['validation'])

In [None]:
# # get long questions only: skipped

# condition = [len(q.split(" ")) >= 3 for q in race_train['question']]
# race_train = race_train[condition]

# condition = [len(q.split(" ")) >= 3 for q in race_val['question']]
# race_val = race_val[condition]

# condition = [len(q.split(" ")) >= 3 for q in race_test['question']]
# race_test = race_test[condition]

In [None]:
def reformat_squad_ans(row):
  """
  Replace the 'answers' entry of a row with its longest answer string

  Args:
    row: mapping-like row whose 'answers' entry is a dict holding the
      candidate answer strings under 'text'
  Returns:
    the same row with 'answers' set to the first longest candidate
    (by character count), or "" when there are no candidates
  """
  candidates = row['answers']['text']
  # max() keeps the first of several equally long candidates
  row['answers'] = max(candidates, key=len, default="")
  return row

In [None]:
# Reduce the answers of every row to a single string, for both splits
squad_train, squad_val = (
  split.apply(reformat_squad_ans, axis=1)
  for split in (squad_train, squad_val)
)

In [None]:
# Remove the id/title columns and rename 'answers' to the singular 'answer'
squad_train = squad_train.drop(columns=['id', 'title']).rename(columns={"answers": 'answer'})
squad_val = squad_val.drop(columns=['id', 'title']).rename(columns={"answers": 'answer'})

In [None]:
# extract keywords

# Lowercase keywords searched for in each question
# (besides the wh-words, the list also contains "that")
wh_question_words = [
  "what",
  "who",
  "when",
  "where",
  "why",
  "which",
  "how",
  "whose",
  "that"
]

def extract_key_words(row, keywords=None):
  """
  Add an 'incomplete_question' entry holding a short cue taken from row['question']

  The cue is the first question keyword that appears as a word of the question
  (e.g. "What"); when none appears, it is the first two plus the last two words.

  Args:
    row: mapping-like row (e.g. a pandas Series from DataFrame.apply(axis=1))
      with a string 'question' entry
    keywords: lowercase keywords to look for; defaults to the module-level
      wh_question_words list
  Returns:
    the same row with 'incomplete_question' set (capitalized, '?' removed)
  """
  if keywords is None:
    keywords = wh_question_words

  # split() without arguments skips repeated/leading/trailing whitespace,
  # so no empty tokens leak into the fallback below
  words = [word.lower() for word in row['question'].split()]

  # first word that is one of the keywords, if any
  incomplete_question = next((word for word in words if word in keywords), "")

  if not incomplete_question: # no predefined keyword found -> get two first and two last words
    # with four words or fewer the two slices would overlap and repeat words,
    # so such short questions are kept whole
    picked = words if len(words) <= 4 else words[:2] + words[-2:]
    incomplete_question = ' '.join(picked)

  # drop question marks, then re-join so a removed stand-alone '?' leaves no gap
  incomplete_question = ' '.join(incomplete_question.replace('?', '').split())
  row['incomplete_question'] = incomplete_question.capitalize()
  return row

In [None]:
# Add the incomplete_question column to both SQuAD splits, row by row
squad_train = squad_train.apply(extract_key_words, axis=1)
squad_val = squad_val.apply(extract_key_words, axis=1)

In [None]:
# combine all datasets

# Splits written by the earlier processing step
train_set = pd.read_csv('train_set.csv')
validation_set = pd.read_csv('validation_set.csv')
test_set = pd.read_csv('test_set.csv')

# The SQuAD frames built above have no 'source_dataset' column; label them here,
# otherwise their rows would hold NaN in that column after the concat.
final_train_set = pd.concat(
  [train_set, squad_train.assign(source_dataset='squad')], axis=0, ignore_index=True
)
final_validation_set = pd.concat(
  [validation_set, squad_val.assign(source_dataset='squad')], axis=0, ignore_index=True
)
# no SQuAD rows are added to the test split
final_test_set = test_set

In [None]:
# Write each combined split to its CSV file without the index column
for split_frame, csv_name in (
  (final_train_set, 'final_train_set.csv'),
  (final_validation_set, 'final_validation_set.csv'),
  (final_test_set, 'final_test_set.csv'),
):
  split_frame.to_csv(csv_name, index=False)

In [None]:
# TODO: one step is not included in this notebook: take the last noun chunk of the question with spaCy and append it to the extracted keyword, so that "Why is she playing the guitar?" becomes "Why guitar". That code was written elsewhere and has not been ported here yet.