In [1]:
from sacremoses import MosesDetokenizer, MosesTokenizer
import pandas as pd
import csv


In [2]:
data = pd.read_csv('data/questions.tsv', sep='\t', engine='python', warn_bad_lines=True, quoting=csv.QUOTE_NONE)



  data = pd.read_csv('data/questions.tsv', sep='\t', engine='python', warn_bad_lines=True, quoting=csv.QUOTE_NONE)


In [3]:
md = MosesDetokenizer(lang='en')
mt = MosesTokenizer(lang='en')

In [4]:
def moses_tokenize(sentence):
    return mt.tokenize(sentence)

In [5]:
def moses_detokenize(sentence):
    return md.detokenize(sentence)

In [6]:
def dumb_quotes(x):
    return (x
        .replace('\u201c', '"').replace('\u201d', '"') # double quotes
        .replace('\u2018', "'").replace('\u2019', "'") # single quotes
    )

In [7]:
def dumb_spaces(sentence):
    return (sentence
            .replace(" ' ", "'")
            .replace(" - ", "-"))

In [8]:
example = data['Sentence'][0]
example

'The nation ’ s largest gun - rights group is taking some Texans to task over their headline - generating demonstrations advocating the legal , open carrying of weapons .'

In [9]:
space_ctrl = dumb_spaces(dumb_quotes(example))
mtok = moses_tokenize(space_ctrl)
print(f"Get rid of extra spaces: {space_ctrl}")
print(f"Tokenize: {mtok}")
print(f"Detokenized: {moses_detokenize(mtok)}")

Get rid of extra spaces: The nation's largest gun-rights group is taking some Texans to task over their headline-generating demonstrations advocating the legal , open carrying of weapons .
Tokenize: ['The', 'nation', '&apos;s', 'largest', 'gun-rights', 'group', 'is', 'taking', 'some', 'Texans', 'to', 'task', 'over', 'their', 'headline-generating', 'demonstrations', 'advocating', 'the', 'legal', ',', 'open', 'carrying', 'of', 'weapons', '.']
Detokenized: The nation's largest gun-rights group is taking some Texans to task over their headline-generating demonstrations advocating the legal, open carrying of weapons.


In [10]:
example.encode('unicode_escape')

b'The nation \\u2019 s largest gun - rights group is taking some Texans to task over their headline - generating demonstrations advocating the legal , open carrying of weapons .'

In [11]:
def preprocess_sent(sentence):
    return moses_detokenize(dumb_spaces(dumb_quotes(sentence)).split())

In [12]:
preprocess_sent(example)

"The nation's largest gun-rights group is taking some Texans to task over their headline-generating demonstrations advocating the legal, open carrying of weapons."

In [13]:
data['Context'] = data['Sentence'].apply(preprocess_sent)

In [14]:
filtered_data = data.filter(['Article_Id', 'Context', 'Question'], axis=1)

In [15]:
filtered_data

Unnamed: 0,Article_Id,Context,Question
0,1,The nation's largest gun-rights group is takin...,"What does \""to task\"" mean?"
1,1,The nation's largest gun-rights group is takin...,What is this group called?
2,1,The nation's largest gun-rights group is takin...,Which group?
3,1,The nation's largest gun-rights group is takin...,Why don't you just come out and say the NRA?
4,1,Officials with the National Rifle Association ...,How many people is a small number?
...,...,...,...
19811,1500,John Bennardo is crisscrossing the country to ...,Why are $2 bills seen as so much more desirable?
19812,1500,John Bennardo is crisscrossing the country to ...,What magic are they referring to?
19813,1500,"""I think everyone's curious about it,"" he said.",Why is everyone so curious about it?
19814,1500,"""I think everyone's curious about it,"" he said.",Why do they feel everyone would be curious abo...


In [16]:
article_ids = filtered_data['Article_Id']
is_validation = (
    (article_ids <= 100) |
    (article_ids > 1050) & (article_ids <= 1100))
validation = filtered_data[is_validation]
len(validation)

1991

In [17]:
is_test = (
    (article_ids > 100) & (article_ids <= 150) |
    (article_ids > 500) & (article_ids <= 550) |
    (article_ids > 1100) & (article_ids <= 1150)
)
test = filtered_data[is_test]
len(test)

1894

In [18]:
is_train = (
    (article_ids > 150) & (article_ids <= 500) | 
    (article_ids > 550) & (article_ids <= 1050) | 
    (article_ids > 1150)
)
train = filtered_data[is_train]
len(train)

15931

In [19]:
assert (is_train & is_validation).sum() == 0
assert (is_validation & is_test).sum() == 0
assert (is_train & is_test).sum() == 0

train.to_json('./data/train.json', orient='records', lines=True)
test.to_json('./data/test.json', orient='records', lines=True)
validation.to_json('./data/validation.json', orient='records', lines=True)