This notebook converts the NewsQA train, dev, and test CSV splits into SQuAD format for evaluation.

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the training split from train.csv in the working directory.
train_file = pd.read_csv("train.csv")

In [3]:
# Preview the first rows of the training split as loaded.
train_file.head()

Unnamed: 0,story_id,story_text,question,answer_token_ranges
0,./cnn/stories/42d01e187213e86f5fe617fe32e716ff...,"NEW DELHI , India -LRB- CNN -RRB- -- A high co...",What was the amount of children murdered ?,60:61
1,./cnn/stories/c48228a52f26aca65c31fad273e66164...,-LRB- CNN -RRB- -- Fighting in the volatile Su...,Where was one employee killed ?,8:12
2,./cnn/stories/c65ed85800e4535f4bbbfa2c34d7d963...,Johannesburg -LRB- CNN -RRB- -- Miffed by a vi...,who did say South Africa did not issue a visa ...,23:26
3,./cnn/stories/0cf66b646e9b32076513c050edf32a79...,-LRB- CNN -RRB- -- England international footb...,How many years old was the businessman ?,97:98
4,./cnn/stories/13012604e3203c18df09289dfedd14cd...,"BAGHDAD , Iraq -LRB- CNN -RRB- -- At least 6,0...",What frightened the families ?,125:143


In [4]:
# Load the dev and test splits the same way as the training split.
valid_file, test_file = (pd.read_csv(csv_name) for csv_name in ("dev.csv", "test.csv"))

In [5]:
## Attach an "id" column holding each training row's index label

train_file = train_file.assign(id=train_file.index)
train_file.head()

Unnamed: 0,story_id,story_text,question,answer_token_ranges,id
0,./cnn/stories/42d01e187213e86f5fe617fe32e716ff...,"NEW DELHI , India -LRB- CNN -RRB- -- A high co...",What was the amount of children murdered ?,60:61,0
1,./cnn/stories/c48228a52f26aca65c31fad273e66164...,-LRB- CNN -RRB- -- Fighting in the volatile Su...,Where was one employee killed ?,8:12,1
2,./cnn/stories/c65ed85800e4535f4bbbfa2c34d7d963...,Johannesburg -LRB- CNN -RRB- -- Miffed by a vi...,who did say South Africa did not issue a visa ...,23:26,2
3,./cnn/stories/0cf66b646e9b32076513c050edf32a79...,-LRB- CNN -RRB- -- England international footb...,How many years old was the businessman ?,97:98,3
4,./cnn/stories/13012604e3203c18df09289dfedd14cd...,"BAGHDAD , Iraq -LRB- CNN -RRB- -- At least 6,0...",What frightened the families ?,125:143,4


In [6]:
# Preview the dev split; the "id" column has not been added to it yet.
valid_file.head()

Unnamed: 0,story_id,story_text,question,answer_token_ranges
0,./cnn/stories/d191e20468fc7675fcfa55c33fab1e65...,"TEHRAN , Iran -LRB- CNN -RRB- -- Iran 's parli...",Iran criticizes who ?,13:17
1,./cnn/stories/248826b18cd578159c43c5df340e9a30...,"LONDON , England -LRB- CNN -RRB- -- Israeli mi...",What happened to the U.N. compound ?,590:595
2,./cnn/stories/1d77d0687a32e8147f87d0f97cb72242...,WASHINGTON -LRB- CNN -RRB- -- There are no imm...,Who said there is no immediate plans for deplo...,22:24
3,./cnn/stories/00359f516cdf8b1800c7102711bd9aa4...,"LOS ANGELES , California -LRB- CNN -RRB- -- Fo...",Will Lieberman investigate further ?,343:348
4,./cnn/stories/7c0bda3744be6f7d95eef695e59a4e40...,-LRB- CNN -RRB- -- A Colorado prosecutor Frida...,Who spent nine years in prison ?,18:20


In [7]:
# Preview the test split; the "id" column has not been added to it yet.
test_file.head()

Unnamed: 0,story_id,story_text,question,answer_token_ranges
0,./cnn/stories/289a45e715707cf650352f3eaa123f85...,-LRB- CNN -RRB- -- Comcast rolled out a Web-ba...,What is going live on Tuesday ?,8:14
1,./cnn/stories/bce33bb5b5cff6b93065aa0cf91917c8...,-LRB- CNN -RRB- -- NASA wo n't have to maneuve...,What was the space station crew forced to take...,302:306
2,./cnn/stories/017df5c4fe1e79eb26957ff6a8b4c1e4...,-LRB- CNN -RRB- -- Decorating in the midst of ...,for what People just do n't trust their instin...,652:671
3,./cnn/stories/44f55c84c4a580853e384c860bb2ba3a...,WASHINGTON -LRB- CNN -RRB- -- During the presi...,who is Sonia Sotomayor ?,29:30
4,./cnn/stories/e117408ad19cc69e15b1e21b9ae54f10...,-LRB- CNN -RRB- -- The partnership started as ...,Where did Lewis Partnership begin ?,4:22


In [8]:
## Give the dev and test splits the same index-based "id" column

valid_file = valid_file.assign(id=valid_file.index)
test_file = test_file.assign(id=test_file.index)

In [9]:
# Build the column order with "id" first and every other column in its existing order.
# "id" is picked by name rather than by rotating whichever column happens to be last:
# this notebook writes its results back to the CSVs it loads, so on a re-run "id" is
# already in front and a positional rotation would move an unrelated column instead.
cols = ["id"] + [name for name in train_file.columns if name != "id"]
cols

['id', 'story_id', 'story_text', 'question', 'answer_token_ranges']

In [10]:
# Reorder the training columns to the order held in `cols`.
train_file = train_file.loc[:, cols]
train_file.head()

Unnamed: 0,id,story_id,story_text,question,answer_token_ranges
0,0,./cnn/stories/42d01e187213e86f5fe617fe32e716ff...,"NEW DELHI , India -LRB- CNN -RRB- -- A high co...",What was the amount of children murdered ?,60:61
1,1,./cnn/stories/c48228a52f26aca65c31fad273e66164...,-LRB- CNN -RRB- -- Fighting in the volatile Su...,Where was one employee killed ?,8:12
2,2,./cnn/stories/c65ed85800e4535f4bbbfa2c34d7d963...,Johannesburg -LRB- CNN -RRB- -- Miffed by a vi...,who did say South Africa did not issue a visa ...,23:26
3,3,./cnn/stories/0cf66b646e9b32076513c050edf32a79...,-LRB- CNN -RRB- -- England international footb...,How many years old was the businessman ?,97:98
4,4,./cnn/stories/13012604e3203c18df09289dfedd14cd...,"BAGHDAD , Iraq -LRB- CNN -RRB- -- At least 6,0...",What frightened the families ?,125:143


In [11]:
# Apply the same column order to the dev and test splits.
valid_file, test_file = valid_file[cols], test_file[cols]
valid_file.head()

Unnamed: 0,id,story_id,story_text,question,answer_token_ranges
0,0,./cnn/stories/d191e20468fc7675fcfa55c33fab1e65...,"TEHRAN , Iran -LRB- CNN -RRB- -- Iran 's parli...",Iran criticizes who ?,13:17
1,1,./cnn/stories/248826b18cd578159c43c5df340e9a30...,"LONDON , England -LRB- CNN -RRB- -- Israeli mi...",What happened to the U.N. compound ?,590:595
2,2,./cnn/stories/1d77d0687a32e8147f87d0f97cb72242...,WASHINGTON -LRB- CNN -RRB- -- There are no imm...,Who said there is no immediate plans for deplo...,22:24
3,3,./cnn/stories/00359f516cdf8b1800c7102711bd9aa4...,"LOS ANGELES , California -LRB- CNN -RRB- -- Fo...",Will Lieberman investigate further ?,343:348
4,4,./cnn/stories/7c0bda3744be6f7d95eef695e59a4e40...,-LRB- CNN -RRB- -- A Colorado prosecutor Frida...,Who spent nine years in prison ?,18:20


In [12]:
## Turn each id into a string and append a per-split suffix
## ("a" = train, "b" = dev, "c" = test) so ids stay unique across the three splits

train_file["id"] = train_file["id"].astype(str) + "a"
valid_file["id"] = valid_file["id"].astype(str) + "b"
test_file["id"] = test_file["id"].astype(str) + "c"

train_file.head()

Unnamed: 0,id,story_id,story_text,question,answer_token_ranges
0,0a,./cnn/stories/42d01e187213e86f5fe617fe32e716ff...,"NEW DELHI , India -LRB- CNN -RRB- -- A high co...",What was the amount of children murdered ?,60:61
1,1a,./cnn/stories/c48228a52f26aca65c31fad273e66164...,-LRB- CNN -RRB- -- Fighting in the volatile Su...,Where was one employee killed ?,8:12
2,2a,./cnn/stories/c65ed85800e4535f4bbbfa2c34d7d963...,Johannesburg -LRB- CNN -RRB- -- Miffed by a vi...,who did say South Africa did not issue a visa ...,23:26
3,3a,./cnn/stories/0cf66b646e9b32076513c050edf32a79...,-LRB- CNN -RRB- -- England international footb...,How many years old was the businessman ?,97:98
4,4a,./cnn/stories/13012604e3203c18df09289dfedd14cd...,"BAGHDAD , Iraq -LRB- CNN -RRB- -- At least 6,0...",What frightened the families ?,125:143


In [13]:
# Checkpoint the three splits (now carrying the string "id" column) to disk.
# NOTE: these are the same paths the splits were read from, so the input files are overwritten.
for frame, csv_name in ((train_file, "train.csv"), (valid_file, "dev.csv"), (test_file, "test.csv")):
    frame.to_csv(csv_name, index=False)

In [14]:
# Confirm the dev split carries the "b"-suffixed string ids.
valid_file.head()

Unnamed: 0,id,story_id,story_text,question,answer_token_ranges
0,0b,./cnn/stories/d191e20468fc7675fcfa55c33fab1e65...,"TEHRAN , Iran -LRB- CNN -RRB- -- Iran 's parli...",Iran criticizes who ?,13:17
1,1b,./cnn/stories/248826b18cd578159c43c5df340e9a30...,"LONDON , England -LRB- CNN -RRB- -- Israeli mi...",What happened to the U.N. compound ?,590:595
2,2b,./cnn/stories/1d77d0687a32e8147f87d0f97cb72242...,WASHINGTON -LRB- CNN -RRB- -- There are no imm...,Who said there is no immediate plans for deplo...,22:24
3,3b,./cnn/stories/00359f516cdf8b1800c7102711bd9aa4...,"LOS ANGELES , California -LRB- CNN -RRB- -- Fo...",Will Lieberman investigate further ?,343:348
4,4b,./cnn/stories/7c0bda3744be6f7d95eef695e59a4e40...,-LRB- CNN -RRB- -- A Colorado prosecutor Frida...,Who spent nine years in prison ?,18:20


In [15]:
# Scratch check of str.find: it returns the index of the first character of the match
# (22 for this string). str.find is used further down when locating answers in the story text.
test_string = "This is a test of the find method, disregard"
test_string.find("find method, disregard")

22

In [16]:
# Count how often each token-range string occurs (dropna=False also counts missing values).
# The output shows that a single value can hold several comma-separated ranges.
train_file.answer_token_ranges.value_counts(dropna=False)

4:6                585
7:9                530
4:5                380
8:10               361
6:8                340
                  ... 
299:307              1
351:352,365:366      1
1227:1234            1
228:253              1
948:955              1
Name: answer_token_ranges, Length: 16810, dtype: int64

In [18]:
## A value may list several comma-separated token ranges; keep only the first one
range_lists = train_file["answer_token_ranges"].str.split(",")
train_file["answer_token_lists"] = range_lists.map(lambda parts: parts[0])

In [19]:
# Check the new answer_token_lists column.
train_file.head()

Unnamed: 0,id,story_id,story_text,question,answer_token_ranges,answer_token_lists
0,0a,./cnn/stories/42d01e187213e86f5fe617fe32e716ff...,"NEW DELHI , India -LRB- CNN -RRB- -- A high co...",What was the amount of children murdered ?,60:61,60:61
1,1a,./cnn/stories/c48228a52f26aca65c31fad273e66164...,-LRB- CNN -RRB- -- Fighting in the volatile Su...,Where was one employee killed ?,8:12,8:12
2,2a,./cnn/stories/c65ed85800e4535f4bbbfa2c34d7d963...,Johannesburg -LRB- CNN -RRB- -- Miffed by a vi...,who did say South Africa did not issue a visa ...,23:26,23:26
3,3a,./cnn/stories/0cf66b646e9b32076513c050edf32a79...,-LRB- CNN -RRB- -- England international footb...,How many years old was the businessman ?,97:98,97:98
4,4a,./cnn/stories/13012604e3203c18df09289dfedd14cd...,"BAGHDAD , Iraq -LRB- CNN -RRB- -- At least 6,0...",What frightened the families ?,125:143,125:143


In [21]:
## Split each "start:end" range into integer start and end token indices

range_bounds = train_file["answer_token_lists"].str.split(":")
train_file["answer_token_start"] = range_bounds.map(lambda bounds: bounds[0]).astype(int)
train_file["answer_token_end"] = range_bounds.map(lambda bounds: bounds[1]).astype(int)

In [22]:
# Check the integer answer_token_start / answer_token_end columns.
train_file.head()

Unnamed: 0,id,story_id,story_text,question,answer_token_ranges,answer_token_lists,answer_token_start,answer_token_end
0,0a,./cnn/stories/42d01e187213e86f5fe617fe32e716ff...,"NEW DELHI , India -LRB- CNN -RRB- -- A high co...",What was the amount of children murdered ?,60:61,60:61,60,61
1,1a,./cnn/stories/c48228a52f26aca65c31fad273e66164...,-LRB- CNN -RRB- -- Fighting in the volatile Su...,Where was one employee killed ?,8:12,8:12,8,12
2,2a,./cnn/stories/c65ed85800e4535f4bbbfa2c34d7d963...,Johannesburg -LRB- CNN -RRB- -- Miffed by a vi...,who did say South Africa did not issue a visa ...,23:26,23:26,23,26
3,3a,./cnn/stories/0cf66b646e9b32076513c050edf32a79...,-LRB- CNN -RRB- -- England international footb...,How many years old was the businessman ?,97:98,97:98,97,98
4,4a,./cnn/stories/13012604e3203c18df09289dfedd14cd...,"BAGHDAD , Iraq -LRB- CNN -RRB- -- At least 6,0...",What frightened the families ?,125:143,125:143,125,143


In [32]:
## Rebuild the answer text from the story tokens in [answer_token_start, answer_token_end)

def _answer_text(row):
    """Join the whitespace-split story tokens covered by the row's token range with single spaces."""
    story_tokens = row["story_text"].split()
    selected = story_tokens[row["answer_token_start"]:row["answer_token_end"]]
    return " ".join(selected)

span_columns = ["story_text", "answer_token_start", "answer_token_end"]
train_file["answer_text"] = train_file[span_columns].apply(_answer_text, axis=1)

In [33]:
# Check the extracted answer_text column.
train_file.head()

Unnamed: 0,id,story_id,story_text,question,answer_token_ranges,answer_token_lists,answer_token_start,answer_token_end,answer_text
0,0a,./cnn/stories/42d01e187213e86f5fe617fe32e716ff...,"NEW DELHI , India -LRB- CNN -RRB- -- A high co...",What was the amount of children murdered ?,60:61,60:61,60,61,19
1,1a,./cnn/stories/c48228a52f26aca65c31fad273e66164...,-LRB- CNN -RRB- -- Fighting in the volatile Su...,Where was one employee killed ?,8:12,8:12,8,12,Sudanese region of Darfur
2,2a,./cnn/stories/c65ed85800e4535f4bbbfa2c34d7d963...,Johannesburg -LRB- CNN -RRB- -- Miffed by a vi...,who did say South Africa did not issue a visa ...,23:26,23:26,23,26,Archbishop Desmond Tutu
3,3a,./cnn/stories/0cf66b646e9b32076513c050edf32a79...,-LRB- CNN -RRB- -- England international footb...,How many years old was the businessman ?,97:98,97:98,97,98,29-year-old
4,4a,./cnn/stories/13012604e3203c18df09289dfedd14cd...,"BAGHDAD , Iraq -LRB- CNN -RRB- -- At least 6,0...",What frightened the families ?,125:143,125:143,125,143,a series of killings and threats by Muslim ext...


In [34]:
## Getting the start character of the answer in the story text
#
# The offset is derived from the answer's token index rather than from
# story_text.find(answer_text): a substring search returns the first match, which is the
# wrong position whenever the answer string also occurs earlier in the story (for example
# a short answer such as "19" appearing inside an earlier token).

def _token_char_offset(text, token_index):
    """Return the character offset in ``text`` where whitespace token ``token_index`` begins.

    Tokens are the pieces produced by ``text.split()``, the same tokenization used to
    extract the answer text. Returns -1 (the ``str.find`` "not found" value) when
    ``token_index`` is negative or past the last token.
    """
    cursor = 0
    for position, token in enumerate(text.split()):
        # Only whitespace lies between ``cursor`` and the next token, so the first
        # match at or after ``cursor`` is that token itself.
        begin = text.find(token, cursor)
        if position == token_index:
            return begin
        cursor = begin + len(token)
    return -1

train_file["answer_start_char"] = train_file[["story_text", "answer_token_start"]].apply(
    lambda row: _token_char_offset(row["story_text"], row["answer_token_start"]), axis=1)

In [35]:
# Check the answer_start_char column.
train_file.head()

Unnamed: 0,id,story_id,story_text,question,answer_token_ranges,answer_token_lists,answer_token_start,answer_token_end,answer_text,answer_start_char
0,0a,./cnn/stories/42d01e187213e86f5fe617fe32e716ff...,"NEW DELHI , India -LRB- CNN -RRB- -- A high co...",What was the amount of children murdered ?,60:61,60:61,60,61,19,305
1,1a,./cnn/stories/c48228a52f26aca65c31fad273e66164...,-LRB- CNN -RRB- -- Fighting in the volatile Su...,Where was one employee killed ?,8:12,8:12,8,12,Sudanese region of Darfur,44
2,2a,./cnn/stories/c65ed85800e4535f4bbbfa2c34d7d963...,Johannesburg -LRB- CNN -RRB- -- Miffed by a vi...,who did say South Africa did not issue a visa ...,23:26,23:26,23,26,Archbishop Desmond Tutu,114
3,3a,./cnn/stories/0cf66b646e9b32076513c050edf32a79...,-LRB- CNN -RRB- -- England international footb...,How many years old was the businessman ?,97:98,97:98,97,98,29-year-old,540
4,4a,./cnn/stories/13012604e3203c18df09289dfedd14cd...,"BAGHDAD , Iraq -LRB- CNN -RRB- -- At least 6,0...",What frightened the families ?,125:143,125:143,125,143,a series of killings and threats by Muslim ext...,697


In [36]:
## Pack the answer text and its start character into a SQuAD-style dictionary

def _to_squad_answer(row):
    """Wrap the row's answer text and start offset in single-element lists, keyed as in SQuAD."""
    return {"text": [row["answer_text"]], "answer_start": [row["answer_start_char"]]}

train_file["answers"] = train_file[["answer_text", "answer_start_char"]].apply(_to_squad_answer, axis=1)

In [37]:
# Check the assembled answers column.
train_file.head()

Unnamed: 0,id,story_id,story_text,question,answer_token_ranges,answer_token_lists,answer_token_start,answer_token_end,answer_text,answer_start_char,answers
0,0a,./cnn/stories/42d01e187213e86f5fe617fe32e716ff...,"NEW DELHI , India -LRB- CNN -RRB- -- A high co...",What was the amount of children murdered ?,60:61,60:61,60,61,19,305,"{'text': ['19'], 'answer_start': [305]}"
1,1a,./cnn/stories/c48228a52f26aca65c31fad273e66164...,-LRB- CNN -RRB- -- Fighting in the volatile Su...,Where was one employee killed ?,8:12,8:12,8,12,Sudanese region of Darfur,44,"{'text': ['Sudanese region of Darfur'], 'answe..."
2,2a,./cnn/stories/c65ed85800e4535f4bbbfa2c34d7d963...,Johannesburg -LRB- CNN -RRB- -- Miffed by a vi...,who did say South Africa did not issue a visa ...,23:26,23:26,23,26,Archbishop Desmond Tutu,114,"{'text': ['Archbishop Desmond Tutu'], 'answer_..."
3,3a,./cnn/stories/0cf66b646e9b32076513c050edf32a79...,-LRB- CNN -RRB- -- England international footb...,How many years old was the businessman ?,97:98,97:98,97,98,29-year-old,540,"{'text': ['29-year-old'], 'answer_start': [540]}"
4,4a,./cnn/stories/13012604e3203c18df09289dfedd14cd...,"BAGHDAD , Iraq -LRB- CNN -RRB- -- At least 6,0...",What frightened the families ?,125:143,125:143,125,143,a series of killings and threats by Muslim ext...,697,{'text': ['a series of killings and threats by...


In [38]:
## Applying these same transformations to the dev and test sets
#
# The steps live in one helper so both splits go through identical code.
# The answer's character offset is computed from its token index, not with
# story_text.find(answer_text): a substring search returns the first match and is wrong
# whenever the answer string also appears earlier in the story.

def _token_start_char(text, token_index):
    """Return the character offset in ``text`` where whitespace token ``token_index`` begins.

    Tokens are the pieces produced by ``text.split()``. Returns -1 (the ``str.find``
    "not found" value) when ``token_index`` is negative or past the last token.
    """
    cursor = 0
    for position, token in enumerate(text.split()):
        # Only whitespace lies between ``cursor`` and the next token, so the first
        # match at or after ``cursor`` is that token itself.
        begin = text.find(token, cursor)
        if position == token_index:
            return begin
        cursor = begin + len(token)
    return -1


def _add_squad_answers(frame):
    """Add the ``answers`` column and its intermediate columns to ``frame`` in place.

    Reads ``answer_token_ranges`` and ``story_text``; writes ``answer_token_lists``,
    ``answer_token_start``, ``answer_token_end``, ``answer_text``,
    ``answer_start_char`` and ``answers``.
    """
    # Keep only the first range when several comma-separated ranges are listed.
    frame["answer_token_lists"] = frame["answer_token_ranges"].str.split(",").apply(lambda x: x[0])

    # "start:end" -> two integer columns.
    bounds = frame["answer_token_lists"].str.split(":")
    frame["answer_token_start"] = bounds.apply(lambda x: x[0]).astype(int)
    frame["answer_token_end"] = bounds.apply(lambda x: x[1]).astype(int)

    # Answer text: the story tokens inside the range, joined with single spaces.
    frame["answer_text"] = frame[["story_text", "answer_token_start", "answer_token_end"]].apply(
        lambda row: " ".join(row["story_text"].split()[row["answer_token_start"]:row["answer_token_end"]]), axis=1)

    # Character position of the first answer token within the story text.
    frame["answer_start_char"] = frame[["story_text", "answer_token_start"]].apply(
        lambda row: _token_start_char(row["story_text"], row["answer_token_start"]), axis=1)

    # SQuAD-style answers dictionary.
    frame["answers"] = frame[["answer_text", "answer_start_char"]].apply(
        lambda row: {"text": [row["answer_text"]], "answer_start": [row["answer_start_char"]]}, axis=1)


for frame in (valid_file, test_file):
    _add_squad_answers(frame)

In [39]:
# Check the transformed dev split.
valid_file.head()

Unnamed: 0,id,story_id,story_text,question,answer_token_ranges,answer_token_lists,answer_token_start,answer_token_end,answer_text,answer_start_char,answers
0,0b,./cnn/stories/d191e20468fc7675fcfa55c33fab1e65...,"TEHRAN , Iran -LRB- CNN -RRB- -- Iran 's parli...",Iran criticizes who ?,13:17,13:17,13,17,U.S. President-elect Barack Obama,75,{'text': ['U.S. President-elect Barack Obama']...
1,1b,./cnn/stories/248826b18cd578159c43c5df340e9a30...,"LONDON , England -LRB- CNN -RRB- -- Israeli mi...",What happened to the U.N. compound ?,590:595,590:595,590,595,hit and set on fire,3246,"{'text': ['hit and set on fire'], 'answer_star..."
2,2b,./cnn/stories/1d77d0687a32e8147f87d0f97cb72242...,WASHINGTON -LRB- CNN -RRB- -- There are no imm...,Who said there is no immediate plans for deplo...,22:24,22:24,22,24,President Obama,122,"{'text': ['President Obama'], 'answer_start': ..."
3,3b,./cnn/stories/00359f516cdf8b1800c7102711bd9aa4...,"LOS ANGELES , California -LRB- CNN -RRB- -- Fo...",Will Lieberman investigate further ?,343:348,343:348,343,348,intends to follow up with,1980,"{'text': ['intends to follow up with'], 'answe..."
4,4b,./cnn/stories/7c0bda3744be6f7d95eef695e59a4e40...,-LRB- CNN -RRB- -- A Colorado prosecutor Frida...,Who spent nine years in prison ?,18:20,18:20,18,20,Tim Masters,112,"{'text': ['Tim Masters'], 'answer_start': [112]}"


In [41]:
## Remove the intermediate preprocessing columns, leaving the loaded columns plus "answers"
dead_cols = ["answer_token_lists", "answer_token_start", "answer_token_end", "answer_text", "answer_start_char"]

train_file = train_file.drop(columns=dead_cols)
valid_file = valid_file.drop(columns=dead_cols)
test_file = test_file.drop(columns=dead_cols)

In [42]:
# Final look at the training split before it is written out.
train_file.head()

Unnamed: 0,id,story_id,story_text,question,answer_token_ranges,answers
0,0a,./cnn/stories/42d01e187213e86f5fe617fe32e716ff...,"NEW DELHI , India -LRB- CNN -RRB- -- A high co...",What was the amount of children murdered ?,60:61,"{'text': ['19'], 'answer_start': [305]}"
1,1a,./cnn/stories/c48228a52f26aca65c31fad273e66164...,-LRB- CNN -RRB- -- Fighting in the volatile Su...,Where was one employee killed ?,8:12,"{'text': ['Sudanese region of Darfur'], 'answe..."
2,2a,./cnn/stories/c65ed85800e4535f4bbbfa2c34d7d963...,Johannesburg -LRB- CNN -RRB- -- Miffed by a vi...,who did say South Africa did not issue a visa ...,23:26,"{'text': ['Archbishop Desmond Tutu'], 'answer_..."
3,3a,./cnn/stories/0cf66b646e9b32076513c050edf32a79...,-LRB- CNN -RRB- -- England international footb...,How many years old was the businessman ?,97:98,"{'text': ['29-year-old'], 'answer_start': [540]}"
4,4a,./cnn/stories/13012604e3203c18df09289dfedd14cd...,"BAGHDAD , Iraq -LRB- CNN -RRB- -- At least 6,0...",What frightened the families ?,125:143,{'text': ['a series of killings and threats by...


In [43]:
# Write the finished splits to disk without the index column.
# NOTE: these are the same paths the splits were read from, so the input files are overwritten.
for frame, csv_name in ((train_file, "train.csv"), (valid_file, "dev.csv"), (test_file, "test.csv")):
    frame.to_csv(csv_name, index=False)