# Process Dataset
This notebook reads the FEVER dataset, retrieves the evidence articles from wikipedia, tokenizes it and outputs the resulting reformatted dataset as CSV.

In [1]:
import pandas as pd
from nltk.tokenize import sent_tokenize
import wikipedia



In [2]:
wikipedia.set_lang('en')
data_path = '../data/train.jsonl'
out_path = '../data/test-processed.csv'

In [3]:
wiki_base_url = 'https://en.wikipedia.org/wiki/'

def get_wiki_article(name):
    try:
        return wikipedia.summary(name)
    except:
        return None

def aggregate_evidence(evidence):

    article_index_map = {}

    if not type(evidence) == list:
        raise Exception('evidence is not a list')

    if type(evidence[0]) == list:
        for sub in evidence:
            res = aggregate_evidence(sub)
            for k, v in res.items():
                if k in article_index_map:
                    article_index_map[k].extend(v)
                else:
                    article_index_map[k] = v

        return article_index_map

    elif type(evidence[0]) == int:
        return {evidence[2]: [evidence[3]]}

    else:
        raise Exception('evidence is malformed')

def process_sample(sample):
    claim = sample['claim']
    evidence = sample['evidence']
    label = sample['label']

    processed = {
        'claim': claim,
        'orig_evidence': evidence,
        'label': label
    }

    processed_list = []

    if label != 'NOT ENOUGH INFO':
        agg_evidence = aggregate_evidence(evidence)

        for article in agg_evidence.keys():
            article_text = get_wiki_article(article.replace('_', ' '))
            if article_text is None:
                print('could not find article for {}'.format(article))
                continue
            sentences = sent_tokenize(article_text)

            curr_processed = processed.copy()
            curr_processed['article'] = article
            curr_processed['article_text'] = article_text
            curr_processed['sentences'] = sentences
            curr_processed['correct'] = agg_evidence[article]

            processed_list.append(curr_processed)

        return processed_list

    else:
        return [processed]

def get_processed_df(dataframe):
    processed = []
    for i, row in dataframe.iterrows():
        processed.extend(process_sample(row))
    return pd.DataFrame(processed)

In [4]:
df = pd.read_json(data_path, lines=True)

In [7]:
ev = df['evidence'].tolist()[:10]
for e in ev:
    print('–––')
    print(e)

–––
[[[92206, 104971, 'Nikolaj_Coster-Waldau', 7], [92206, 104971, 'Fox_Broadcasting_Company', 0]]]
–––
[[[174271, 187498, 'Roman_Atwood', 1]], [[174271, 187499, 'Roman_Atwood', 3]]]
–––
[[[255136, 254645, 'History_of_art', 2]]]
–––
[[[180804, 193183, 'Adrienne_Bailon', 0]]]
–––
[[[100277, None, None, None]]]
–––
[[[151831, 166598, 'Homeland_-LRB-TV_series-RRB-', 0], [151831, 166598, 'Prisoners_of_War_-LRB-TV_series-RRB-', 0]]]
–––
[[[173384, None, None, None]]]
–––
[[[273626, None, None, None]]]
–––
[[[49158, 58489, 'Boston_Celtics', 3]], [[49159, 58490, 'Boston_Celtics', 3]]]
–––
[[[23513, 28977, 'The_Ten_Commandments_-LRB-1956_film-RRB-', 0]], [[23513, 28978, 'The_Ten_Commandments_-LRB-1956_film-RRB-', 20]]]


In [103]:
df = pd.read_json(data_path, lines=True)
test_data = df[:10]
processed_df = get_processed_df(test_data)
processed_df.to_csv(out_path)

could not find article for The_Ten_Commandments_-LRB-1956_film-RRB-
