In [1]:
from sacremoses import MosesDetokenizer
import pandas as pd
import numpy as np
import csv

In [2]:
data = pd.read_csv('data/questions.tsv', sep='\t', engine='python', warn_bad_lines=True, quoting=csv.QUOTE_NONE)



  data = pd.read_csv('data/questions.tsv', sep='\t', engine='python', warn_bad_lines=True, quoting=csv.QUOTE_NONE)


In [3]:
md = MosesDetokenizer(lang='en')

In [4]:
def moses_detokenize(sentence):
    return md.detokenize(sentence)

def dumb_quotes(x):
    return (x
        .replace('\u201c', '"').replace('\u201d', '"') # double quotes
        .replace('\u2018', "'").replace('\u2019', "'") # single quotes
    )

def dumb_spaces(sentence):
    return (sentence
            .replace(" ' ", "'")
            .replace(" - ", "-"))

In [5]:
def preprocess_sent(sentence):
    return moses_detokenize(dumb_spaces(dumb_quotes(sentence)).split())

In [6]:
data['Sentence'] = data['Sentence'].apply(preprocess_sent)

In [33]:
def get_cumulative_context(sentences): 
    current_ctx = ""
    context = []

    for sent in sentences.unique():
        current_ctx += (sent + " ")
        context.append(current_ctx)
    
    return context

In [34]:
def add_cumulative_context_to_group(article_group): 
    return article_group.assign(Context=get_cumulative_context(article_group['Sentence']))

* `head(1)`: gives the first row of each sentence group
* `groupby("Article_Id")`: Access to each article group 
* `apply(do_thing_to_group)`: apply to each group

In [35]:
# data.groupby(['Article_Id', "Sentence_Id"]).head(1).groupby("Article_Id").groups

In [36]:
df_with_cumulative_context = data.groupby(['Article_Id', "Sentence_Id"]).head(1).groupby("Article_Id").apply(add_cumulative_context_to_group)

In [37]:
df_with_cumulative_context['Context'].iloc[0]

"The nation's largest gun-rights group is taking some Texans to task over their headline-generating demonstrations advocating the legal, open carrying of weapons. "

In [38]:
df_with_cumulative_context_filtered = df_with_cumulative_context.filter(["Article_Id", "Sentence_Id", "Context"], axis=1)
df_with_cumulative_context_filtered

Unnamed: 0,Article_Id,Sentence_Id,Context
0,1,1,The nation's largest gun-rights group is takin...
4,1,2,The nation's largest gun-rights group is takin...
6,1,3,The nation's largest gun-rights group is takin...
9,1,4,The nation's largest gun-rights group is takin...
10,1,5,The nation's largest gun-rights group is takin...
...,...,...,...
19804,1500,1,"It's the underdog of U. S. currency, the green..."
19807,1500,2,"It's the underdog of U. S. currency, the green..."
19809,1500,3,"It's the underdog of U. S. currency, the green..."
19811,1500,4,"It's the underdog of U. S. currency, the green..."


"Expand" on the context, based on the `Article_Id` and `Sentence_Id`.

In [39]:
expand_context_to_all_df = pd.merge(data, df_with_cumulative_context_filtered, on=["Article_Id", "Sentence_Id"])
expand_context_to_all_df

Unnamed: 0,Article_Id,Sentence_Id,Sentence,Span,Question,Span_Start_Position,Span_End_Position,Context
0,1,1,The nation's largest gun-rights group is takin...,to task,"What does \""to task\"" mean?",13,15,The nation's largest gun-rights group is takin...
1,1,1,The nation's largest gun-rights group is takin...,largest gun - rights group,What is this group called?,4,9,The nation's largest gun-rights group is takin...
2,1,1,The nation's largest gun-rights group is takin...,gun - rights group,Which group?,5,9,The nation's largest gun-rights group is takin...
3,1,1,The nation's largest gun-rights group is takin...,nation ’ s largest gun - rights group,Why don't you just come out and say the NRA?,1,9,The nation's largest gun-rights group is takin...
4,1,2,Officials with the National Rifle Association ...,small number,How many people is a small number?,67,69,The nation's largest gun-rights group is takin...
...,...,...,...,...,...,...,...,...
19811,1500,4,John Bennardo is crisscrossing the country to ...,"the story of the two and its "" magic """,Why are $2 bills seen as so much more desirable?,14,24,"It's the underdog of U. S. currency, the green..."
19812,1500,4,John Bennardo is crisscrossing the country to ...,""" magic """,What magic are they referring to?,21,24,"It's the underdog of U. S. currency, the green..."
19813,1500,5,"""I think everyone's curious about it,"" he said.","everyone ' s curious about it , """,Why is everyone so curious about it?,3,11,"It's the underdog of U. S. currency, the green..."
19814,1500,5,"""I think everyone's curious about it,"" he said.",everyone ' s curious,Why do they feel everyone would be curious abo...,3,7,"It's the underdog of U. S. currency, the green..."


In [40]:
article_ids = expand_context_to_all_df['Article_Id']

In [32]:
is_validation = (
    (article_ids <= 100) |
    (article_ids > 1050) & (article_ids <= 1100))
validation = expand_context_to_all_df[is_validation]
len(validation)

In [None]:
is_test = (
    (article_ids > 100) & (article_ids <= 150) |
    (article_ids > 500) & (article_ids <= 550) |
    (article_ids > 1100) & (article_ids <= 1150)
)
test = expand_context_to_all_df[is_test]
len(test)

In [None]:
is_train = (
    (article_ids > 150) & (article_ids <= 500) | 
    (article_ids > 550) & (article_ids <= 1050) | 
    (article_ids > 1150)
)
train = expand_context_to_all_df[is_train]
len(train)

In [None]:
assert (is_train & is_validation).sum() == 0
assert (is_validation & is_test).sum() == 0
assert (is_train & is_test).sum() == 0

train.to_json('./data/train-full-context.json', orient='records', lines=True)
test.to_json('./data/test-full-context.json', orient='records', lines=True)
validation.to_json('./data/validation-full-context.json', orient='records', lines=True)