In [49]:
import pandas as pd
import sys
import json
import numpy as np

In [50]:
# Input JSON-lines files: the same test split annotated with BLEU-based and
# ROUGE-based spans respectively (paths resolve against the working directory).
bleu_test_file, rouge_test_file = (
    '../../../data/squad_bleu/test-qar_squad_all_bleu.jsonl',
    '../../../data/squad_rouge/test-qar_squad_all_rouge.jsonl',
)

In [51]:
# Placeholders for the two question tables; both are rebound once the
# input files have been parsed.
span_questions, rouge_questions = {}, {}

def get_questions(filename):
    """Load question records from a JSON-lines file.

    Every non-blank line must be a JSON object holding a ``context`` value and
    a ``qas`` list. Only the first element of ``qas`` is read; it must provide
    ``question``, ``human_answers`` and ``answers`` (a list of objects with a
    ``text`` field).

    Args:
        filename: Path of the UTF-8 encoded ``.jsonl`` file to read.

    Returns:
        dict mapping ``hash((context, question))`` to a dict with the keys
        ``'question'``, ``'context'``, ``'answers'`` (the ``human_answers``
        value) and ``'spans'`` (the non-empty ``text`` values of ``answers``).
        A later line with the same (context, question) pair overwrites the
        earlier one.

    Raises:
        ValueError: If a non-blank line cannot be parsed as JSON. The
            underlying ``json.JSONDecodeError`` is chained as the cause.
    """
    questions = {}
    with open(filename, 'r', encoding='utf-8') as fp_inp:
        for line in fp_inp:
            # Tolerate blank separator/trailing lines instead of aborting the load.
            if not line.strip():
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError as err:
                # Chain the decoder error so the offending position is not lost.
                raise ValueError('\"%s\" is not a valid json' % line) from err
            context = record['context']
            qa = record['qas'][0]
            question = qa['question']
            # NOTE: str hashes are salted per interpreter process, so these keys
            # are only comparable between tables built in the same session.
            questions[hash((context, question))] = {
                'question': question,
                'context': context,
                'answers': qa['human_answers'],
                'spans': [a['text'] for a in qa['answers'] if a['text'] != ''],
            }
    return questions

# Parse the BLEU file first, then the ROUGE file, into their question tables.
span_questions, rouge_questions = map(
    get_questions, (bleu_test_file, rouge_test_file)
)

In [52]:
# Show the size of each table as a (span, rouge) pair.
tuple(len(table) for table in (span_questions, rouge_questions))

(57032, 57032)

In [53]:
# Seed NumPy's global RNG so the sample drawn with np.random.permutation is
# reproducible. `seed` must be *called*: assigning to `np.random.seed` only
# rebinds the function to an int and leaves the generator unseeded.
np.random.seed(1)

In [54]:
# Shuffle all question keys and keep the first 400 as the sample.
shuffled_keys = np.random.permutation(list(span_questions))
ids = shuffled_keys[:400]

In [57]:
# Build one row per sampled question, pairing the spans from both tables.
entries = []
for sample_id in ids:
    bleu_rec, rouge_rec = span_questions[sample_id], rouge_questions[sample_id]
    # The two tables must agree on everything except the spans.
    for field in ('question', 'context', 'answers'):
        assert bleu_rec[field] == rouge_rec[field]
    row = {field: bleu_rec[field] for field in ('question', 'context', 'answers')}
    row['bleu_spans'] = bleu_rec['spans']
    row['rouge_spans'] = rouge_rec['spans']
    entries.append(row)

In [63]:
# Tabulate the sampled rows with a fixed column order.
column_order = ['question', 'context', 'bleu_spans', 'rouge_spans', 'answers']
data = pd.DataFrame(entries).loc[:, column_order]
def enum_value(l):
    """Format the items of ``l`` as a 1-based numbered list, one item per line.

    Each item becomes ``"<position>) <item>"`` and the items are joined with
    newline characters; an empty input yields an empty string.
    """
    numbered = []
    for position, item in enumerate(l, start=1):
        numbered.append('%d) %s' % (position, item))
    return '\n'.join(numbered)

# Turn each list-valued column into a numbered, newline-separated string.
for column in ('answers', 'bleu_spans', 'rouge_spans'):
    data[column] = data[column].apply(enum_value)

In [64]:
# Preview the first five formatted rows.
data.head(5)

Unnamed: 0,question,context,bleu_spans,rouge_spans,answers
0,Is the iPhone 4 Front Screen Display White (GS...,"Upon my first order for this screen, everythin...",1) practical advice for replacing the screen g...,1) and compare it with the\n2) and compare it ...,"1) Yes it is and it fits exactly, with all con..."
1,what are the dimensions of this item?,"Reasonable shipping time, and just as ordered....",1) break on the door\n2) on the door (and\n3) ...,1) for the most part the\n2) of RV Designer's ...,"1) normal size for RV doors\n2) Its about 4"" t..."
2,Can you use this device if your do not have a ...,Took less than a minute to setup. I have an ol...,1) to connect to my TV but I had to\n2) connec...,"1) and use those I have a\n2) a techie, I woul...",1) I hooked a HP Pavilion laptop up to my HDTV...
3,What are the sizes of these jars?,"I'm happy with my purchase, but just wanted to...",1) in my bathroom and the rest of the jars\n2)...,1) use in genetic lab to hold\n2) to use in ge...,1) Approximately 7 inches tall and 4-5 inches ...
4,Where do the birds hummer come from? I have ne...,I feed 50-60 hummers all summer long with 6 of...,1) for all the birds that are attracted to the...,1) for all the birds that are attracted to the...,1) It took about a week for the birds to find ...


In [66]:
# Export the annotation sheet without the DataFrame index.
# The path is relative to the working directory, like the input paths above,
# rather than a machine-specific absolute /home/... location.
# NOTE(review): assumes the kernel runs from the notebook's own directory
# (src/evaluation/span_analysis) — confirm before relying on the location.
data.to_csv('span_annotations.csv', index=False)