In [1]:
from sacremoses import MosesDetokenizer
import pandas as pd
import numpy as np
import csv

In [2]:
# Load the raw question annotations from the tab-separated file.
# quoting=csv.QUOTE_NONE keeps quote characters inside sentences as literal text.
# The former `warn_bad_lines=True` argument is intentionally omitted: it was
# deprecated in pandas 1.3 and removed in pandas 2.0 (TypeError), and with
# `error_bad_lines` left at its default a malformed row raised an error anyway,
# so dropping the argument keeps the same behaviour on every pandas version.
data = pd.read_csv(
    'data/questions.tsv',
    sep='\t',
    engine='python',
    quoting=csv.QUOTE_NONE,
)



  data = pd.read_csv('data/questions.tsv', sep='\t', engine='python', warn_bad_lines=True, quoting=csv.QUOTE_NONE)


In [3]:
# Single English-language Moses detokenizer instance, reused by moses_detokenize.
md = MosesDetokenizer(lang='en')

In [4]:
def moses_detokenize(sentence):
    """Return the result of running the shared Moses detokenizer on `sentence`.

    `sentence` is handed to `md.detokenize` unchanged; within this notebook it is
    the token list produced by `str.split()`.
    """
    detokenized = md.detokenize(sentence)
    return detokenized

def fix_quotes(x):
    """Replace typographic (curly) quotation marks in `x` with ASCII quotes."""
    ascii_quotes = str.maketrans({
        '\u201c': '"',  # left double quotation mark
        '\u201d': '"',  # right double quotation mark
        '\u2018': "'",  # left single quotation mark
        '\u2019': "'",  # right single quotation mark
    })
    return x.translate(ascii_quotes)

def delete_spaces(sentence):
    """Remove the padding spaces around stand-alone apostrophes and hyphens.

    The substitutions run in order: `" ' "` becomes `"'"`, then `" - "` becomes `"-"`.
    """
    for spaced, tight in ((" ' ", "'"), (" - ", "-")):
        sentence = sentence.replace(spaced, tight)
    return sentence

In [5]:
def preprocess_sent(sentence):
    """Clean one sentence string and return its detokenized form.

    Steps, in order: tighten spaced apostrophes/hyphens (`delete_spaces`), convert
    curly quotes to ASCII (`fix_quotes`), split on whitespace, then re-join the
    tokens with `moses_detokenize`.
    """
    # NOTE(review): spacing is tightened *before* curly quotes are converted, so a
    # spaced curly apostrophe is not collapsed by delete_spaces - confirm intended.
    despaced = delete_spaces(sentence)
    ascii_quoted = fix_quotes(despaced)
    tokens = ascii_quoted.split()
    return moses_detokenize(tokens)

In [6]:
# Clean every sentence; the `Sentence` column is overwritten with the result,
# so this cell is meant to be run once per freshly loaded `data`.
data['Sentence'] = data['Sentence'].map(preprocess_sent)

In [7]:
def get_cumulative_context(sentences):
    """Build the running (cumulative) context for an ordered sequence of sentences.

    Args:
        sentences: iterable of sentence strings in reading order (the notebook
            passes one article's `Sentence` column).

    Returns:
        A list with exactly one entry per input sentence; entry ``i`` is the
        concatenation of sentences ``0..i``, each followed by a single space
        (so every entry ends with a trailing space).
    """
    current_ctx = ""
    context = []

    # Iterate over every sentence rather than `sentences.unique()`: de-duplicating
    # by text yields fewer contexts than rows whenever an article repeats a
    # sentence, which makes `assign(Context=...)` fail with a length mismatch and
    # drops the repeated sentence from the later contexts.
    for sent in sentences:
        current_ctx += sent + " "
        context.append(current_ctx)

    return context

In [8]:
def add_cumulative_context_to_group(article_group):
    """Return a copy of one article's rows with an added `Context` column.

    `Context` is filled with the list that `get_cumulative_context` produces for
    the group's `Sentence` column; the input frame itself is not modified.
    """
    cumulative = get_cumulative_context(article_group['Sentence'])
    return article_group.assign(Context=cumulative)

* `head(1)`: keeps the first row of each sentence group
* `groupby("Article_Id")`: gives access to each article group
* `apply(do_thing_to_group)`: applies the function to each article group

In [9]:
# data.groupby(['Article_Id', "Sentence_Id"]).head(1).groupby("Article_Id").groups

In [10]:
# Build the cumulative context once per sentence:
#   1. keep the first row of every (Article_Id, Sentence_Id) pair,
#   2. regroup those rows by article,
#   3. attach the running context to each article's rows.
# group_keys=False keeps the original flat row index on the result. With the
# pandas >= 2.0 default, Article_Id would also be prepended as an index level,
# and the later merge on "Article_Id" fails because the name is then both an
# index level and a column label.
df_with_cumulative_context = (
    data
    .groupby(['Article_Id', "Sentence_Id"])
    .head(1)
    .groupby("Article_Id", group_keys=False)
    .apply(add_cumulative_context_to_group)
)

In [11]:
# Peek at the first row's context: a single sentence followed by the trailing
# space that get_cumulative_context appends after every sentence.
df_with_cumulative_context['Context'].iloc[0]

"The nation 's largest gun-rights group is taking some Texans to task over their headline-generating demonstrations advocating the legal, open carrying of weapons. "

In [12]:
# Keep only the two join keys and the new `Context` column, so that merging back
# onto `data` adds nothing but the context.
df_with_cumulative_context_filtered = df_with_cumulative_context.filter(
    items=["Article_Id", "Sentence_Id", "Context"],
    axis="columns",
)
df_with_cumulative_context_filtered

Unnamed: 0,Article_Id,Sentence_Id,Context
0,1,1,The nation 's largest gun-rights group is taki...
4,1,2,The nation 's largest gun-rights group is taki...
6,1,3,The nation 's largest gun-rights group is taki...
9,1,4,The nation 's largest gun-rights group is taki...
10,1,5,The nation 's largest gun-rights group is taki...
...,...,...,...
19804,1500,1,"It's the underdog of U. S. currency, the green..."
19807,1500,2,"It's the underdog of U. S. currency, the green..."
19809,1500,3,"It's the underdog of U. S. currency, the green..."
19811,1500,4,"It's the underdog of U. S. currency, the green..."


"Expand" on the context, based on the `Article_Id` and `Sentence_Id`.

In [13]:
# Join the per-sentence context back onto every row of `data`, matching on the
# article and sentence ids (default inner join).
expand_context_to_all_df = data.merge(
    df_with_cumulative_context_filtered,
    on=["Article_Id", "Sentence_Id"],
)
# Strip surrounding whitespace: each context was built with a space after its
# last sentence.
expand_context_to_all_df["Context"] = expand_context_to_all_df["Context"].map(
    lambda ctx: ctx.strip()
)

In [14]:
# Article id of every merged row; the train/validation/test masks are built from it.
article_ids = expand_context_to_all_df['Article_Id']

In [15]:
# Validation split: Article_Id <= 100, or 1050 < Article_Id <= 1100.
is_validation = (
    article_ids.le(100)
    | (article_ids.gt(1050) & article_ids.le(1100))
)
validation = expand_context_to_all_df.loc[is_validation]
len(validation)

1991

In [16]:
# Test split: three id ranges, each exclusive at the lower bound and inclusive
# at the upper bound: (100, 150], (500, 550] and (1100, 1150].
is_test = (
    (article_ids.gt(100) & article_ids.le(150))
    | (article_ids.gt(500) & article_ids.le(550))
    | (article_ids.gt(1100) & article_ids.le(1150))
)
test = expand_context_to_all_df.loc[is_test]
len(test)

1894

In [17]:
# Training split: (150, 500], (550, 1050] and every id above 1150.
is_train = (
    (article_ids.gt(150) & article_ids.le(500))
    | (article_ids.gt(550) & article_ids.le(1050))
    | article_ids.gt(1150)
)
train = expand_context_to_all_df.loc[is_train]
len(train)

15931

In [18]:
# The three split masks must be pairwise disjoint: no row may land in two splits.
assert not (is_train & is_validation).any()
assert not (is_validation & is_test).any()
assert not (is_train & is_test).any()

# Write each split as JSON Lines (one record per line).
train.to_json(
    './data/data-sentence-context/train-full-context.json',
    orient='records',
    lines=True,
)
test.to_json(
    './data/data-sentence-context/test-full-context.json',
    orient='records',
    lines=True,
)
validation.to_json(
    './data/data-sentence-context/validation-full-context.json',
    orient='records',
    lines=True,
)