In [44]:
# imports
import pandas as pd
import numpy as np
pd.set_option('display.max_colwidth', 140)
pd.set_option('display.width', 2000)
import sys,os
from IPython.core.debugger import Tracer
from IPython.core.debugger import BdbQuit_excepthook
%matplotlib inline
%load_ext line_profiler
import matplotlib.pyplot as plt
import matplotlib
import nltk
import io
import json
import time
from pandas import ExcelWriter
from ast import literal_eval
import hashlib
import unicodedata
import subprocess
import datetime
m = hashlib.md5()
from nltk.metrics import *
from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords
import zipfile
import pymongo
import re
import random
from tqdm import tqdm
import gzip

The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler


# Experiment config

In [45]:
EXP_NAME = 'MSMARCO-0'  # experiment name; used as a sub-directory by the legacy evidence writer
EVAL_SET = 'train'  # which split to run on: 'train' or 'dev'
WRITE_EVIDENCE = 0  # 1: write evidence files to disk; the same flag also gates attaching googled results
SEARCHRESULTS_FROM_FILES = 1  # 1: search results are read from pre-googled zip files
FILES_PER_QUESTION = 20  # how many evidence files to build per question
LIMIT_TRAIN_SIZE = -1  # cap on the number of sampled training questions; -1 disables the cap
PRODUCE_ONLY_SAMPLE = 0  # 1: keep only the first 100 questions (Squad, ComplexWebQuestions, TriviaQA)
RUN_EXPERIMENTS = 0  # not referenced by the cells visible here -- TODO confirm where it is used

# Datasets to include (1 = on, 0 = off)
USE_MSMARCO = 1
USE_SEARCHQA = 0
USE_TRIVIAQA = 0
USE_SQUAD = 0
USE_COMPWEBQ = 0

# Dataset specific
# Squad / MSMARCO: 1 = use the dataset's own context/passages as SearchResults instead of googled ones
USE_SQUAD_ORG_CONTEXT = 0
USE_MSMARCO_ORG_CONTEXT = 0
# ComplexWebQuestions: 1 = also add the two split sub-questions as separate questions
ADD_SPLITS_TO_TRAIN = 1

# GENERAL
# Base directory under which the evidence folders are created.
# NOTE(review): absolute, machine-specific path -- must be edited before running elsewhere.
EVIDENCE_DIR = '/Users/alontalmor/Documents/dev/datasets/triviaqa/triviaqa-rc/'

# Tests
# NOTE(review): neither flag is referenced by the cells visible here -- TODO confirm usage
CALC_ANSWER_IN_GOOGLE_PERC = 1
GOOGLE_FILTERED_FILES = 0

# Load Datasets

In [46]:
# Registry of the raw datasets, keyed by dataset name; filled in by the loader cells below.
Data = {}

## Dev Set

### SearchQA

In [47]:
# Exploratory: load a single SearchQA JSON record to inspect its structure.
# NOTE(review): runs unconditionally (not guarded by USE_SEARCHQA) and uses an absolute local path.
filename = '/Users/alontalmor/Documents/dev/datasets/SearchQA/data_json/000488-5243_jeopardy_beginningend_600.json'
with open(filename,'r') as f:
    data = json.load(f)

In [48]:
# Count how often each question string occurs once the record is expanded into a DataFrame.
pd.DataFrame(data)['question'].value_counts()

"From" this to this is an idiom meaning from the start of a meal (or something else) to the end    98
Name: question, dtype: int64

In [49]:
# Show the question text of the loaded record.
data['question']

'"From" this to this is an idiom meaning from the start of a meal (or something else) to the end'

In [50]:
if USE_SEARCHQA:
    def load_searchqa(dirname,filename):
        """Read the JSON member `filename` out of `<dirname>/<filename>.zip`.

        Prints the archive path and returns the parsed JSON object as-is.
        """
        archive_path = os.path.join(dirname, filename + '.zip')
        print(archive_path)
        with zipfile.ZipFile(archive_path, 'r') as archive, archive.open(filename) as member:
            return json.load(member)

    # Only the dev ('val') split is loaded here; USE_SEARCHQA is already known to be set.
    if EVAL_SET == 'dev':
        Data['searchqa'] = load_searchqa('/Users/alontalmor/Documents/dev/datasets/SearchQA/data_json/','val')
    


### MSMARCO

In [51]:
if USE_MSMARCO:
    def load_msmarco(dirname,filename):
        """Load an MS MARCO split from `<dirname>/<filename>.gz` and keep short, answerable questions.

        A question is kept only when its first answer is
          * not one of 'No Answer Present.', 'Yes', 'No' or the empty string, and
          * shorter than 30 characters.
        Rows whose `answers` list is empty (first answer is NaN) are dropped as well;
        previously they raised a TypeError in `apply(len)`.

        Prints the archive path and the number of questions before and after filtering.
        Returns the filtered DataFrame.
        """
        archive_path = os.path.join(dirname, filename + '.gz')
        print(archive_path)
        with gzip.open(archive_path, 'rb') as myzip:
            msmarco = pd.DataFrame(json.loads(myzip.read()))

        print(len(msmarco))

        first_answer = msmarco['answers'].str[0]
        # filter "no answer present", yes/no and empty answers
        uninformative = first_answer.isin(['No Answer Present.', 'Yes', 'No', ''])
        # filter very long answers; NaN lengths compare as False, so missing answers are dropped too
        short_enough = first_answer.str.len() < 30
        msmarco = msmarco[~uninformative & short_enough]

        print(len(msmarco))

        return msmarco

    # USE_MSMARCO is already known to be set inside this block.
    if EVAL_SET == 'dev':
        Data['MSMARCO'] = load_msmarco('/Users/alontalmor/Documents/dev/datasets/MSMARCO/','dev_v2.1.json')


In [52]:
# Disabled check (`if False`): put each question's answers next to the text of its selected passage.
if False:
    # answer exact match comparison
    selected_answers = []
    for ind,q in Data['MSMARCO'].iterrows():
        x = pd.DataFrame(q['passages'])
        # first passage flagged is_selected == 1; .iloc[0] raises IndexError if no passage is flagged
        selected_answers.append(x[x['is_selected'] == 1]['passage_text'].iloc[0])
    Data['MSMARCO']['selected_answers'] = selected_answers
    Data['MSMARCO'][['answers','selected_answers']]

### Squad

In [53]:
def load_squad(dirname,filename):
    """Load a SQuAD split from `<dirname>/<filename>.zip` and flatten it to one row per question.

    Every question row carries the fields of its `qas` entry plus
      * `title`       - title of the article it belongs to,
      * `context_id`  - index of its paragraph in the returned `contexts` list,
      * `EntityPages` / `SearchResults` - empty lists, to be filled in later.

    SQuAD v2.0 unanswerable questions (`is_impossible` is True) are removed and the v2-only
    columns `is_impossible` / `plausible_answers` are dropped. Files without these columns
    (SQuAD v1.1) and files without any question are accepted as well.

    Returns:
        (questions DataFrame, list of paragraph context strings)
    """
    with zipfile.ZipFile(os.path.join(dirname,filename +'.zip'),'r') as myzip:
        with myzip.open(filename) as myfile:
            squad_json = json.load(myfile)

    squad_questions = []
    contexts = []
    for part in squad_json['data']:
        for para in part['paragraphs']:
            contexts.append(para['context'])
            context_id = len(contexts) - 1
            for qas in para['qas']:
                squad_questions.append(dict(qas, title=part['title'], context_id=context_id))
    squad_questions = pd.DataFrame(squad_questions)

    # removing version 2 samples (only when the file actually carries the v2 flag)
    if 'is_impossible' in squad_questions.columns:
        # eq(True) treats a missing flag as "answerable" instead of failing on NaN
        impossible = squad_questions['is_impossible'].eq(True)
        squad_questionsV1 = squad_questions[~impossible].copy(deep=True)
    else:
        squad_questionsV1 = squad_questions.copy(deep=True)
    squad_questionsV1 = squad_questionsV1.drop(
        columns=['is_impossible', 'plausible_answers'], errors='ignore')

    # empty entity pages / empty search results: one independent list per row
    row_count = len(squad_questionsV1)
    squad_questionsV1['EntityPages'] = [[] for _ in range(row_count)]
    squad_questionsV1['SearchResults'] = [[] for _ in range(row_count)]

    return squad_questionsV1, contexts
    
# Dev split: flattened questions go into Data['Squad']; the paragraph texts are kept in
# org_contexts (used later when USE_SQUAD_ORG_CONTEXT is set).
if USE_SQUAD and EVAL_SET == 'dev':
     Data['Squad'], org_contexts = load_squad('../data/Squad/','dev-v2.0.json')
    

### TriviaQA

In [54]:
if USE_TRIVIAQA and EVAL_SET == 'dev':
    with zipfile.ZipFile('../data/TriviaQA/unfiltered-web-dev.json.zip','r') as myzip, \
            myzip.open('unfiltered-web-dev.json') as myfile:
        questions = json.load(myfile)
    triviaqa_dev = pd.DataFrame(questions['Data'])
    Data['TriviaQA'] = triviaqa_dev

    # The shipped evidence is discarded: every question gets an empty EntityPages list and
    # an empty SearchResults list (SearchResults is first cast to object, as before).
    triviaqa_dev['EntityPages'] = [[] for _ in range(len(triviaqa_dev))]
    triviaqa_dev['SearchResults'] = triviaqa_dev['SearchResults'].astype(object)
    triviaqa_dev['SearchResults'] = [[] for _ in range(len(triviaqa_dev))]

### ComplexWebQuestions

In [55]:
# Load the ComplexWebQuestions dev split points and attach the gold answers of the dev set.
if USE_COMPWEBQ and EVAL_SET == 'dev':
    data_dir = '../data/V2_dev_splitpoints/'
    filename = 'dev.json.zip'
    print(filename)
    with zipfile.ZipFile(data_dir + filename, 'r') as myzip:
        with myzip.open(filename.replace('.zip', '')) as myfile:
            curr_batch = pd.DataFrame(json.load(myfile))

    # removing null values; .copy() so the column assignments below act on an independent frame
    has_both_splits = curr_batch[['split_part1', 'split_part2']].notnull().all(axis=1)
    curr_batch = curr_batch[has_both_splits].copy()

    # trajectory id = question ID + composition type + both split parts with spaces removed
    curr_batch['traj_id'] = curr_batch['ID'] + curr_batch['comp'] + curr_batch['split_part1'].str.replace(" ","") \
                            + ',' + curr_batch['split_part2'].str.replace(" ","")
    curr_batch['filename'] = filename

    # Only one file is read for dev, so no cross-file de-duplication is needed here
    # (the former DataFrame.append accumulation was removed in pandas 2.0).
    rl_input_df = curr_batch.set_index('traj_id')

    # dropping exact duplicate splits
    print('size before drop dups: ' + str(len(rl_input_df)))
    rl_input_df = rl_input_df.drop_duplicates(
        ['ID', 'comp', 'split_part1', 'split_part2'])
    print('size after drop dups: ' + str(len(rl_input_df)))

    dataset_filename = '../../../mturk/compqgen/final/complexwebquestions_V1_1/ComplexWebQuestions_dev'
    with open(dataset_filename + '.json', 'r') as outfile:
        complexwebquestions = pd.DataFrame(json.load(outfile))

    # Replace the answers that came with the split points by the answers of the dataset file.
    # NOTE(review): merge() discards the traj_id index set above -- confirm this is intended
    # (the train cell sets the index after the merge).
    rl_input_df = rl_input_df.merge(
        complexwebquestions[['answers', 'ID']], on='ID', how='inner')
    rl_input_df = rl_input_df.rename(columns={'answers_y': 'answers'})
    del rl_input_df['answers_x']
    Data['ComplexWebQuestions'] = rl_input_df

## Training Set

### MSMARCO

In [56]:
# Train split: load and filter MS MARCO v2.1 (prints the path and the size before/after filtering).
if USE_MSMARCO and EVAL_SET == 'train':
    Data['MSMARCO'] = load_msmarco('/Users/alontalmor/Documents/dev/datasets/MSMARCO/','train_v2.1.json')

/Users/alontalmor/Documents/dev/datasets/MSMARCO/train_v2.1.json.gz
808731
160375


### Squad

In [57]:
# Train split: flattened questions go into Data['Squad']; the paragraph texts are kept in org_contexts.
if USE_SQUAD and EVAL_SET == 'train':
    Data['Squad'], org_contexts = load_squad('../data/Squad/','train-v2.0.json')

### TriviaQA

In [58]:
if USE_TRIVIAQA and EVAL_SET == 'train':
    with zipfile.ZipFile('../data/TriviaQA/unfiltered-web-train.json.zip','r') as myzip, \
            myzip.open('unfiltered-web-train.json') as myfile:
        questions = json.load(myfile)
    triviaqa_dataset = pd.DataFrame(questions['Data'])

    # The shipped evidence is discarded: every question gets an empty EntityPages list and
    # an empty SearchResults list (SearchResults is first cast to object, as before).
    row_count = len(triviaqa_dataset)
    triviaqa_dataset['EntityPages'] = [[] for _ in range(row_count)]
    triviaqa_dataset['SearchResults'] = triviaqa_dataset['SearchResults'].astype(object)
    triviaqa_dataset['SearchResults'] = [[] for _ in range(row_count)]

    Data['TriviaQA'] = triviaqa_dataset

### ComplexWebQuestions

In [59]:
# Load the ComplexWebQuestions train split points and attach the gold answers of the train set.
if USE_COMPWEBQ and EVAL_SET == 'train':
    rl_input_df = pd.DataFrame()
    # V2 Code
    data_dir = '../data/V2_train_splitpoints/'
    for dirname, dirnames, filenames in os.walk(data_dir):

        # V2 code: train.json.zip must be read first, because every later file is restricted
        # to the question IDs that are already loaded.
        if 'train.json.zip' in filenames:
            filenames.remove('train.json.zip')
            filenames = ['train.json.zip'] + filenames

        # NOTE(review): only the first two directory entries are considered, and the slice is
        # taken before the '.json.zip' check -- confirm this limit is intended.
        for filename in filenames[0:2]:
            if filename.find('.json.zip') == -1:
                continue

            print(filename)
            # join with the directory os.walk is currently visiting, so files in
            # sub-directories are opened from where they actually live
            with zipfile.ZipFile(os.path.join(dirname, filename),'r') as myzip:
                with myzip.open(filename.replace('.zip','')) as myfile:
                    curr_batch = pd.DataFrame(json.load(myfile))

            # removing null values; .copy() so later column assignments act on an independent frame
            has_both_splits = curr_batch[['split_part1', 'split_part2']].notnull().all(axis=1)
            curr_batch = curr_batch[has_both_splits].copy()

            # V2 Code: keep only questions whose ID was already loaded
            if len(rl_input_df) > 0:
                curr_batch = curr_batch[curr_batch['ID'].isin(rl_input_df['ID'])].copy()

            # trajectory id = question ID + composition type + both split parts with spaces removed
            curr_batch['traj_id'] = curr_batch['ID'] + curr_batch['comp'] + curr_batch['split_part1'].str.replace(" ","") \
                                    + ',' + curr_batch['split_part2'].str.replace(" ","")

            # skip trajectories that were already loaded from an earlier file
            if len(rl_input_df) > 0:
                curr_batch = curr_batch[~curr_batch['traj_id'].isin(rl_input_df['traj_id'])].copy()
            curr_batch['filename'] = filename

            # DataFrame.append was removed in pandas 2.0; concatenate instead
            if len(rl_input_df) > 0:
                rl_input_df = pd.concat([rl_input_df, curr_batch], ignore_index=True)
            else:
                rl_input_df = curr_batch.reset_index(drop=True)

    dataset_filename = '../../../mturk/compqgen/final/complexwebquestions_V1_1/ComplexWebQuestions_train'
    with open(dataset_filename + '.json', 'r') as outfile:
        complexwebquestions = pd.DataFrame(json.load(outfile))

    # Replace the answers that came with the split points by the answers of the dataset file.
    rl_input_df = rl_input_df.merge(
        complexwebquestions[['answers', 'ID']], on='ID', how='inner')
    rl_input_df = rl_input_df.rename(columns={'answers_y': 'answers'})
    del rl_input_df['answers_x']

    rl_input_df = rl_input_df.set_index('traj_id')

    # dropping exact duplicate splits
    print('size before drop dups: ' + str(len(rl_input_df)))
    rl_input_df = rl_input_df.drop_duplicates(['ID', 'comp', 'split_part1', 'split_part2'])
    print('size after drop dups: ' + str(len(rl_input_df)))

In [60]:
if USE_COMPWEBQ:
    dataset_filename = '../data/ComplexWebQuestions_' + EVAL_SET
    with open(dataset_filename + '.json', 'r') as outfile:
        complexwebquestions_org = pd.DataFrame(json.load(outfile)).set_index('ID')

    # For composition questions, copy the intermediate `composition_answer` over from the
    # original dataset file, looked up by question ID (positional assignment, as before).
    is_composition = rl_input_df['comp'] == 'composition'
    composition_ids = rl_input_df[is_composition]['ID']
    rl_input_df.loc[is_composition, 'composition_answer'] = list(
        complexwebquestions_org.loc[composition_ids, 'composition_answer'].astype(object))

In [61]:
# Expand every ComplexWebQuestions row into the list `questions`:
# the original question first and, when ADD_SPLITS_TO_TRAIN is set, its two split
# sub-questions as additional rows (conjunction and composition questions only).
if USE_COMPWEBQ:
    questions = []
    for ind in tqdm(range(len(rl_input_df)), total=len(rl_input_df), ncols=80, desc="scoring"):
        question = rl_input_df.iloc[ind]

        # the original question is always kept, whatever its composition type
        questions.append(question.copy())
        if not ADD_SPLITS_TO_TRAIN:
            continue

        if question['comp'] == 'conjunction':
            # both parts are added as stand-alone questions with the answers of the full question
            for split_column in ('split_part1', 'split_part2'):
                q_copied = question.copy()
                q_copied['question'] = question[split_column]
                questions.append(q_copied)
        elif question['comp'] == 'composition':
            # first hop: answered by the intermediate composition answer
            q_copied = question.copy()
            q_copied['question'] = question['split_part1']
            q_copied['answers'] = [{'aliases':[],'answer':question['composition_answer']}]
            questions.append(q_copied)
            # second hop: the '%composition' placeholder is replaced by the intermediate answer
            # NOTE(review): assumes composition_answer is a string here -- confirm no missing values
            q_copied = question.copy()
            q_copied['question'] = question['split_part2'].replace('%composition',question['composition_answer'])
            questions.append(q_copied)

In [62]:
if USE_COMPWEBQ:
    # Turn the collected rows into a frame and keep one row per distinct question text.
    questions = pd.DataFrame(questions)
    size_before = len(questions)
    print('size before drop dups: ' + str(size_before))
    questions = questions.drop_duplicates(['question'])
    size_after = len(questions)
    print('size after drop dups: ' + str(size_after))
    Data['ComplexWebQuestions'] = questions = questions.reset_index(drop=True)

### ComplexWebQuestions to TriviaQA format


# convert to triviaqa format

The dev and train splits are both converted by the cells below; `EVAL_SET` determines which split was loaded.

In [63]:
# Per-dataset question frames converted to the TriviaQA schema, keyed by dataset name.
Data_triviaqa_format = {}

## MSMARCO

In [64]:
# Convert the MS MARCO questions to the TriviaQA schema
# (QuestionId, Question, SearchResults, EntityPages, QuestionSource, Answer).
if USE_MSMARCO:
    questions = Data['MSMARCO']
    questions_triviaqa_format = pd.DataFrame()
    questions_triviaqa_format['QuestionId'] = questions['query_id']
    questions_triviaqa_format['Question'] = questions['query']
    if USE_MSMARCO_ORG_CONTEXT:
        # use the passages shipped with MS MARCO as search results (title left empty)
        all_contexts = []
        for ind,q in tqdm(questions.iterrows(),total=len(questions), ncols=80, desc="org context"):
            passages = []
            for snippet in q['passages']:
                passages += [{'title':'','snippet':snippet['passage_text']}]
            all_contexts.append(passages)
        questions_triviaqa_format['SearchResults'] = all_contexts
    else:
        questions_triviaqa_format['SearchResults'] = [[] for x in range(len(questions_triviaqa_format))]
    questions_triviaqa_format['EntityPages'] = [[] for x in range(len(questions_triviaqa_format))]
    questions_triviaqa_format['QuestionSource'] = ''

    all_answers = []
    for ind,q in tqdm(questions.iterrows(),total=len(questions), ncols=80, desc="scoring"):
        # non-empty answers, de-duplicated in first-occurrence order so the alias order is reproducible
        filtered_answer = list(dict.fromkeys(answer for answer in q['answers'] if len(answer) > 0))
        if not filtered_answer:
            # no usable answer: mark the row so the notnull() filter below removes it
            all_answers.append(None)
            continue

        normalized_aliases = [' '.join(word_tokenize(word.lower())) for word in filtered_answer]
        all_answers.append({'Aliases': filtered_answer,
                            'NormalizedAliases': normalized_aliases,
                            'NormalizedValue': normalized_aliases[0],
                            'Type': 'FreeForm',
                            'Value': filtered_answer[0]})

    questions_triviaqa_format['Answer'] = all_answers
    questions_triviaqa_format = questions_triviaqa_format[questions_triviaqa_format['Answer'].notnull()]

    Data_triviaqa_format['MSMARCO'] = questions_triviaqa_format

scoring: 100%|████████████████████████| 160375/160375 [00:59<00:00, 2709.61it/s]


In [65]:
# Peek at one raw MS MARCO passage: the last passage of the last loaded question.
# Read from Data directly instead of the loop variable `snippet`, which only exists when
# the USE_MSMARCO_ORG_CONTEXT branch ran, so the cell also works after Restart & Run All.
# NOTE(review): assumes the last question has at least one passage.
Data['MSMARCO']['passages'].iloc[-1][-1]['passage_text'] if USE_MSMARCO else None

'Today, the town serves as a suburb of Galveston and houses workers from the Johnson Space Center. Since 1984, Hitchcock has been home to the Galveston County Fair & Rodeo. The Galveston County Fair & Rodeo began in 1938 and was held at facilities in Runge Park in Arcadia. In the early 1980s, the County Fair had reached its limits of growth at Runge Park, and plans began for a move to Jack Brooks Park in Hitchcock.'

## Squad

In [66]:
# Convert the Squad questions to the TriviaQA schema
# (QuestionId, Question, SearchResults, EntityPages, QuestionSource, Answer).
if USE_SQUAD:
    questions = Data['Squad']
    if PRODUCE_ONLY_SAMPLE:
        questions = questions[0:100]

    questions_triviaqa_format = pd.DataFrame()
    questions_triviaqa_format['QuestionId'] = questions['id']
    questions_triviaqa_format['Question'] = questions['question']
    if USE_SQUAD_ORG_CONTEXT:
        # use the original paragraph of each question as its single search result
        all_contexts = []
        for ind,q in tqdm(questions.iterrows(),total=len(questions), ncols=80, desc="org context"):
            all_contexts.append([{'title':'','snippet':org_contexts[q['context_id']]}])
        questions_triviaqa_format['SearchResults'] = all_contexts
    else:
        questions_triviaqa_format['SearchResults'] = [[] for x in range(len(questions_triviaqa_format))]
    questions_triviaqa_format['EntityPages'] = [[] for x in range(len(questions_triviaqa_format))]
    questions_triviaqa_format['QuestionSource'] = ''

    all_answers = []
    for ind,q in tqdm(questions.iterrows(),total=len(questions), ncols=80, desc="appending answers"):
        # non-empty answer texts, de-duplicated in first-occurrence order so the alias order is reproducible
        answer_texts = list(dict.fromkeys(
            answer['text'] for answer in q['answers'] if len(answer['text']) > 0))
        if not answer_texts:
            # no usable answer: mark the row so the notnull() filter below removes it
            all_answers.append(None)
            continue

        normalized_aliases = [' '.join(word_tokenize(word.lower())) for word in answer_texts]
        all_answers.append({'Aliases': answer_texts,
                            'NormalizedAliases': normalized_aliases,
                            'NormalizedValue': normalized_aliases[0],
                            'Type': 'FreeForm',
                            'Value': answer_texts[0]})

    questions_triviaqa_format['Answer'] = all_answers
    questions_triviaqa_format = questions_triviaqa_format[questions_triviaqa_format['Answer'].notnull()]

    Data_triviaqa_format['Squad'] = questions_triviaqa_format

## ComplexWebQuestions

In [67]:
# Convert the ComplexWebQuestions questions to the TriviaQA schema
# (QuestionId, Question, SearchResults, EntityPages, QuestionSource, Answer).
if USE_COMPWEBQ:
    questions = Data['ComplexWebQuestions']
    if PRODUCE_ONLY_SAMPLE:
        questions = questions[0:100]
    questions_triviaqa_format = pd.DataFrame()
    # the row index is appended because split sub-questions share the ID of their parent question
    questions_triviaqa_format['QuestionId'] = questions['ID'] + '_' + questions.index.astype(str)
    questions_triviaqa_format['Question'] = questions['question']
    questions_triviaqa_format['SearchResults'] = [[] for x in range(len(questions_triviaqa_format))]
    questions_triviaqa_format['EntityPages'] = [[] for x in range(len(questions_triviaqa_format))]
    questions_triviaqa_format['QuestionSource'] = ''
    all_answers = []
    for ind,q in tqdm(questions.iterrows(),total=len(questions), ncols=80, desc="scoring"):
        first_answer = q['answers'][0]['answer'] if len(q['answers']) > 0 else None
        # No answers at all, or a first answer that is missing, empty or not a string
        # (e.g. NaN): the question is marked with None and dropped by the filter below.
        if not isinstance(first_answer, str) or first_answer == '':
            all_answers.append(None)
            continue

        filtered_answer = [answer for answer in q['answers']
                           if isinstance(answer['answer'], str) and len(answer['answer']) > 0]

        # each answer contributes its aliases followed by the answer string itself
        aliases = []
        for answer in filtered_answer:
            aliases += answer['aliases']
            aliases.append(answer['answer'])

        all_answers.append({'Aliases': aliases,
                            'NormalizedAliases': [' '.join(word_tokenize(word.lower())) for word in aliases],
                            'NormalizedValue': ' '.join(word_tokenize(first_answer.lower())),
                            'Type': 'FreeForm',
                            'Value': first_answer})

    questions_triviaqa_format['Answer'] = all_answers
    questions_triviaqa_format = questions_triviaqa_format[questions_triviaqa_format['Answer'].notnull()]
    Data_triviaqa_format['ComplexWebQuestions'] = questions_triviaqa_format

## TriviaQA

In [68]:
# TriviaQA is already in the target schema, so it is registered unchanged.
if USE_TRIVIAQA:
    if PRODUCE_ONLY_SAMPLE:
        # sample mode: keep only the first 100 questions (this also replaces Data['TriviaQA'])
        Data['TriviaQA'] = Data['TriviaQA'][0:100]
    Data_triviaqa_format['TriviaQA'] = Data['TriviaQA']

# Appending snippets to the questions

Google snippets are appended to every sub-dataset in the same way.

## From local mongo (currently only for ComplexWebQuestions)

In [69]:
# Open the local MongoDB collection that caches search results.
# Only needed when ComplexWebQuestions is enabled and evidence is being written.
if USE_COMPWEBQ and WRITE_EVIDENCE:
    MONGODB_URI = 'mongodb://127.0.0.1:27017/webkb'
    mongo_client = pymongo.MongoClient(MONGODB_URI)
    # database named in the URI ('webkb')
    db = mongo_client.get_default_database()
    SearchCache = db['SearchResults_Cache']

In [70]:
# Attach cached search results to the ComplexWebQuestions questions and register the frame
# under 'ComplexWebQuestions_Googled' (the 'ComplexWebQuestions' key is removed either way).
if USE_COMPWEBQ:
    if WRITE_EVIDENCE:
        found = 0
        questions_triviaqa_format = Data_triviaqa_format['ComplexWebQuestions']
        questions_triviaqa_format = questions_triviaqa_format.set_index('QuestionId')
        # start from an all-None object column; questions without a cache hit stay None
        questions_triviaqa_format['SearchResults'] = None
        questions_triviaqa_format['SearchResults'] = questions_triviaqa_format['SearchResults'].astype(object)
        for QuestionId,question in tqdm(questions_triviaqa_format.iterrows(), total=len(questions_triviaqa_format), ncols=80,\
                        desc="appending google search results"):
            # find_one returns the first matching cache entry or None; it replaces the
            # find() + Cursor.count() + next() sequence (Cursor.count() was removed in PyMongo 4)
            cached_item = SearchCache.find_one(
                {'querystr': question['Question'], "page": 0, "type": 'SCREEN'})
            if cached_item is not None:
                found += 1
                questions_triviaqa_format.at[QuestionId,'SearchResults'] = cached_item['results']
        questions_triviaqa_format = questions_triviaqa_format.reset_index()

        Data_triviaqa_format['ComplexWebQuestions_Googled'] = questions_triviaqa_format
        del Data_triviaqa_format['ComplexWebQuestions']
    else:
        Data_triviaqa_format['ComplexWebQuestions_Googled'] = Data_triviaqa_format.pop('ComplexWebQuestions')

## From files

In [71]:
def append_google_from_files(questions_triviaqa_format,googled_dir):
    """Attach pre-googled search results, stored as zipped JSON files, to the questions.

    Every `*.zip` file found anywhere below `googled_dir` must contain one JSON member named
    like the archive without its `.zip` suffix. The member holds a list of records, each with
    `google_results` and the question id under `QuestionId` (or `id`).

    Args:
        questions_triviaqa_format: DataFrame with a `QuestionId` column; it is not modified.
        googled_dir: directory that is walked recursively for result archives.

    Returns:
        A new DataFrame in which `SearchResults` holds the googled results of each question,
        or None for questions without any. Records whose id is not among the questions are
        skipped (and reported), and files that are not `.zip` archives are ignored.
    """
    question_count = 0
    unmatched_count = 0

    questions_triviaqa_format = questions_triviaqa_format.set_index('QuestionId')
    # start from an all-None object column so that list values can be stored per cell
    questions_triviaqa_format['SearchResults'] = None
    questions_triviaqa_format['SearchResults'] = questions_triviaqa_format['SearchResults'].astype(object)
    known_ids = set(questions_triviaqa_format.index)

    for dirname, dirnames, filenames in os.walk(googled_dir):
        dirnames.sort()  # deterministic traversal order
        # ignore stray files such as .DS_Store; sorted so later files win reproducibly
        archives = sorted(name for name in filenames if name.endswith('.zip'))
        for filename in tqdm(archives, total=len(archives), ncols=80, desc='iterating over all googled files'):
            # join with the directory currently being visited, not with the walk root
            with zipfile.ZipFile(os.path.join(dirname, filename),'r') as myzip:
                with myzip.open(filename[:-len('.zip')]) as myfile:
                    googled = json.load(myfile)

            for googled_question in googled:
                question_count += 1
                if 'QuestionId' in googled_question:
                    question_id = googled_question['QuestionId']
                else:
                    question_id = googled_question['id']
                if question_id not in known_ids:
                    # writing an unknown label through .at would add a bogus row (or raise)
                    unmatched_count += 1
                    continue
                questions_triviaqa_format.at[question_id, 'SearchResults'] = googled_question['google_results']

    questions_triviaqa_format = questions_triviaqa_format.reset_index()
    print('number of questions googled')
    print(question_count)
    if unmatched_count > 0:
        print('googled questions skipped because their id is not in the dataset')
        print(unmatched_count)

    return questions_triviaqa_format

In [72]:
# TriviaQA: attach googled results (only when WRITE_EVIDENCE is set) and rename the entry.
if USE_TRIVIAQA and SEARCHRESULTS_FROM_FILES:
    if not WRITE_EVIDENCE:
        Data_triviaqa_format['TriviaQA_Googled'] = Data_triviaqa_format.pop('TriviaQA')
    else:
        # anything other than 'dev' falls back to the train directory
        googled_dir = '../data/triviaqa_googled_dev' if EVAL_SET == 'dev' \
            else '../data/triviaqa_googled_train'
        Data_triviaqa_format['TriviaQA_Googled'] = append_google_from_files(
            Data_triviaqa_format['TriviaQA'], googled_dir)
        del Data_triviaqa_format['TriviaQA']

In [73]:
# MSMARCO: either keep the original passages ('MSMARCO_Org') or attach googled results
# ('MSMARCO_Googled'; results are only read when WRITE_EVIDENCE is set).
if USE_MSMARCO:
    if USE_MSMARCO_ORG_CONTEXT:
        Data_triviaqa_format['MSMARCO_Org'] = Data_triviaqa_format.pop('MSMARCO')
    elif SEARCHRESULTS_FROM_FILES:
        if not WRITE_EVIDENCE:
            Data_triviaqa_format['MSMARCO_Googled'] = Data_triviaqa_format.pop('MSMARCO')
        else:
            # anything other than 'dev' falls back to the train directory
            googled_dir = '/Users/alontalmor/Dropbox/Apps/WebKB/MSMARCO/dev' if EVAL_SET == 'dev' \
                else '/Users/alontalmor/Dropbox/Apps/WebKB/MSMARCO/train'
            Data_triviaqa_format['MSMARCO_Googled'] = append_google_from_files(
                Data_triviaqa_format['MSMARCO'], googled_dir)
            del Data_triviaqa_format['MSMARCO']

In [74]:
# Squad: either keep the original paragraphs ('Squad_Org') or attach googled results
# ('Squad_Googled'; results are only read when WRITE_EVIDENCE is set).
if USE_SQUAD:
    if USE_SQUAD_ORG_CONTEXT:
        Data_triviaqa_format['Squad_Org'] = Data_triviaqa_format.pop('Squad')
    elif SEARCHRESULTS_FROM_FILES:
        if not WRITE_EVIDENCE:
            Data_triviaqa_format['Squad_Googled'] = Data_triviaqa_format.pop('Squad')
        else:
            # anything other than 'dev' falls back to the train directory
            googled_dir = '../data/Squad/dev' if EVAL_SET == 'dev' else '../data/Squad/train'
            Data_triviaqa_format['Squad_Googled'] = append_google_from_files(
                Data_triviaqa_format['Squad'], googled_dir)
            del Data_triviaqa_format['Squad']

In [75]:
# Post-processing: per dataset, drop questions without search results or without an answer
# and report the counts.
for dataset in Data_triviaqa_format.keys():
    print('---------------------------------')
    print(dataset)

    with_results = Data_triviaqa_format[dataset]
    with_results = with_results[with_results['SearchResults'].notnull()]

    print('print how many question we found google results for')
    print((with_results['SearchResults'].apply(len)>0).sum())

    # the search-result filter above already holds here, only the answer filter is left
    Data_triviaqa_format[dataset] = with_results[with_results['Answer'].notnull()]

    print('final question count')
    print(len(Data_triviaqa_format[dataset]))

---------------------------------
MSMARCO_Googled
print how many question we found google results for
0
final question count
160375


# Combine all datasets


This operation also supports sampling (to reduce size or change number of examples from each dataset)

In [76]:
# Combine all converted datasets into one frame; every row is tagged with its dataset name.
SAMPLE_SEED = 0            # fixed seed so that the sampled subsets are reproducible
MAX_DEV_QUESTIONS = 8000   # we don't need more than 8000 questions in the mixed dev set

tagged_frames = []
for dataset in Data_triviaqa_format.keys():
    # assign() returns a new frame, so the per-dataset frames are left untouched
    tagged_frames.append(Data_triviaqa_format[dataset].assign(dataset=dataset))

# a single concat replaces DataFrame.append in a loop (removed in pandas 2.0, and quadratic)
if tagged_frames:
    questions_triviaqa_format = pd.concat(tagged_frames, ignore_index=True)
else:
    questions_triviaqa_format = pd.DataFrame()

if EVAL_SET == 'train' and LIMIT_TRAIN_SIZE != -1:
    if len(questions_triviaqa_format) >= LIMIT_TRAIN_SIZE:
        questions_triviaqa_format = questions_triviaqa_format.sample(
            n=LIMIT_TRAIN_SIZE, random_state=SAMPLE_SEED)

if EVAL_SET == 'dev':
    if len(questions_triviaqa_format) >= MAX_DEV_QUESTIONS:
        questions_triviaqa_format = questions_triviaqa_format.sample(
            n=MAX_DEV_QUESTIONS, random_state=SAMPLE_SEED)

# the per-dataset frames are no longer needed
del Data_triviaqa_format
    

In [77]:
# Sanity check: how many questions have 0, 1, 2, ... search results attached.
questions_triviaqa_format['SearchResults'].apply(len).value_counts()

0    160375
Name: SearchResults, dtype: int64

# Building Evidence Files

In [78]:
#print('checking if there exist cases in which there are no answers?')
#print(len(questions_triviaqa_format[questions_triviaqa_format['Answer'].agg(lambda x: x['Value']).apply(len)==0]['Answer']))

In [79]:
# Top-level container mirroring the layout of a TriviaQA dataset file.
triviaqa_dict = {
    'Data': questions_triviaqa_format,
    'Domain': 'unfiltered-web',
    'Split': EVAL_SET,
    'VerifiedEval': False,
    'Version': 1.0,
}

In [80]:
# Index the questions by QuestionId and keep an untouched deep copy in org_data_df.
data_df = triviaqa_dict['Data'].set_index('QuestionId')
org_data_df = data_df.copy(deep=True)

# The attached results move to 'OrgSearchResults'; 'SearchResults' starts out as None and
# only questions that have original results are kept.
data_df = data_df.rename(columns = {'SearchResults':'OrgSearchResults'}).assign(SearchResults=None)
has_org_results = data_df['OrgSearchResults'].notnull()
data_df = data_df[has_org_results]

## build googled evidence

In [81]:
# OLD (disabled through `if False`): legacy evidence writer, kept for reference.
# For every question the snippets are spread over up to FILES_PER_QUESTION text files under
# EVIDENCE_DIR/evidence/EXP_NAME/<bucket>/, and one SearchResults entry is recorded per file.
if False and not USE_SQUAD_ORG_CONTEXT and not USE_MSMARCO_ORG_CONTEXT:
    if WRITE_EVIDENCE and not os.path.isdir(EVIDENCE_DIR + 'evidence/'):
        os.mkdir(EVIDENCE_DIR + 'evidence/')

    if WRITE_EVIDENCE and not os.path.isdir(EVIDENCE_DIR +'evidence/' + EXP_NAME):
        os.mkdir(EVIDENCE_DIR + 'evidence/' + EXP_NAME)

    # running question counter; int(train_file_ind / 100) selects the bucket folder
    train_file_ind = int(0)
    for i in tqdm(range(len(data_df)), total=len(data_df), ncols=80, desc="building evidence files"):
        question = data_df.iloc[i]
        questionID = data_df.index[i]
        # building up to FILES_PER_QUESTION text files out of the question's snippets
        SearchResults = []
        files = []
        filenames = []
        file_ind = 0
        google_results = question['OrgSearchResults']
        train_file_ind += 1


        # create the bucket folder of this question on first use
        if WRITE_EVIDENCE and not os.path.isdir(EVIDENCE_DIR + 'evidence/' + EXP_NAME + '/' + str(int(train_file_ind / 100))):
            os.mkdir(EVIDENCE_DIR + 'evidence/' + EXP_NAME + '/' + str(int(train_file_ind / 100)))

        # go over all google snippets (usually 100) and deal them out round-robin over the files
        for ind, g in enumerate(google_results):
            file_ind = file_ind % FILES_PER_QUESTION
            # first visit of this file slot: register the file and its SearchResults entry
            if len(files) <= file_ind:
                file_name = EXP_NAME + '/' + str(int(train_file_ind / 100)) + "/" + \
                    questionID + '_' + str(file_ind) + '.txt'
                # Moving to empty evidence text ( it will be taken from the files)
                SearchResults.append({'Rank':ind, 'Description':'','Title':'','DisplayUrl':'' , \
                              'Url':g['url'] + file_name.replace('/','_').replace('.txt',''),'Filename':file_name })
                
                files.append('')
                filenames.append(file_name)

            # each snippet is written as "<rank>. <title>" followed by the snippet text
            files[file_ind] += str(
                ind) + '. ' + g['title'] + '\n' + g['snippet'] + '\n'

            file_ind += 1

        # saving files
        if WRITE_EVIDENCE:
            for file_str, file_name in zip(files, filenames):
                with open(EVIDENCE_DIR + 'evidence/' + file_name, 'w') as outfile:
                    outfile.write(file_str)

        data_df.at[questionID, 'SearchResults'] = SearchResults

In [82]:
# Disabled check (`if False`): histogram of the folder index derived from each question id
# (sum of the bytes of the id's md5 hex digest, modulo 200 folders per dataset).
if False:
    x = []
    for ind in list(data_df.index):
        m = hashlib.md5()
        m.update(ind.encode())
        questionID_hex = m.hexdigest()
        folder_number = sum(questionID_hex.encode()) % 200
        x.append(folder_number)
    folder_numbers = pd.Series(x)
    folder_numbers.hist()

In [83]:
# write_evidence
def _evidence_folder_index(questionID, n_folders=200):
    """Return the hash-bucket sub-folder name (a decimal string) for a question ID.

    The bucket is the byte-sum of the ID's MD5 hex digest, modulo `n_folders`.
    """
    questionID_hex = hashlib.md5(questionID.encode()).hexdigest()
    return str(sum(questionID_hex.encode()) % n_folders)


def _search_result_entry(rank, dataset, file_name):
    """Build the SearchResults record that points at one evidence file."""
    # Title/Description are deliberately empty: the text is read back from the file.
    return {'Rank': rank, 'Description': '', 'Title': '', 'DisplayUrl': '',
            'Url': dataset + '_' + file_name.replace('/', '_').replace('.txt', ''),
            'Filename': file_name}


def write_evidence(data_df,EVIDENCE_DIR):
    """Attach a 'SearchResults' column that references per-question evidence files.

    Behaviour depends on the notebook-level flag WRITE_EVIDENCE:

    * truthy  -- the snippets in 'OrgSearchResults' are distributed round-robin
      over at most FILES_PER_QUESTION text files per question, written under
      EVIDENCE_DIR + 'multiqa_evidence/<dataset>/<bucket>/'.
    * falsy   -- nothing is written; the expected files are probed on disk in
      order and a record is emitted for each one until the first missing file.

    Args:
        data_df: DataFrame indexed by question ID with an 'OrgSearchResults'
            column (a sequence of dicts with 'title' and 'snippet' keys) and a
            'dataset' column.
        EVIDENCE_DIR: base directory; it is concatenated as a plain string, so
            it must end with a path separator.

    Returns:
        The same `data_df` object, with the 'SearchResults' column set in place.
    """
    evidence_root = EVIDENCE_DIR + 'multiqa_evidence/'
    created_dirs = set()  # directories already ensured, to avoid repeated syscalls
    if WRITE_EVIDENCE:
        os.makedirs(evidence_root, exist_ok=True)

    all_search_results = []
    rows = zip(data_df.index, data_df['OrgSearchResults'], data_df['dataset'])
    for questionID, google_results, dataset in tqdm(rows, total=len(data_df), ncols=80,
                                                    desc="building evidence files"):
        # creating a unique question identifier
        if type(questionID) != str:
            questionID = str(questionID)
        folder_ind = _evidence_folder_index(questionID)
        file_prefix = dataset + '/' + folder_ind + '/' + questionID + '_'

        SearchResults = []
        if WRITE_EVIDENCE:
            target_dir = evidence_root + dataset + '/' + folder_ind
            if target_dir not in created_dirs:
                os.makedirs(target_dir, exist_ok=True)
                created_dirs.add(target_dir)

            files = []
            filenames = []
            # A lone result (e.g. an original context) is stored without the "N. " prefix.
            numbered = len(google_results) > 1

            # go over all google snippets (usually 100)
            for ind, g in enumerate(google_results):
                file_ind = ind % FILES_PER_QUESTION
                if len(files) <= file_ind:
                    # First snippet landing in this slot: register the file.
                    file_name = file_prefix + str(file_ind) + '.txt'
                    SearchResults.append(_search_result_entry(ind, dataset, file_name))
                    files.append('')
                    filenames.append(file_name)

                if numbered:
                    files[file_ind] += str(ind) + '. ' + g['title'] + '\n' + g['snippet'] + '\n'
                elif len(g['title'])>0:
                    files[file_ind] += g['title'] + '\n' + g['snippet'] + '\n'
                else:
                    files[file_ind] += g['snippet']

            # saving files
            for file_str, file_name in zip(files, filenames):
                with open(evidence_root + file_name, 'w') as outfile:
                    outfile.write(file_str)
        else:
            for file_ind in range(FILES_PER_QUESTION):
                file_name = file_prefix + str(file_ind) + '.txt'
                if not os.path.exists(evidence_root + file_name):
                    # Files are numbered contiguously, so stop at the first gap.
                    break
                SearchResults.append(_search_result_entry(file_ind, dataset, file_name))

        all_search_results.append(SearchResults)

    data_df['SearchResults'] = all_search_results

    return data_df

In [84]:
# Attach the 'SearchResults' column: evidence files are written when WRITE_EVIDENCE is set,
# otherwise the already-existing files are looked up on disk.
# To profile line by line, uncomment the %lprun line below instead of the plain call.
#%lprun -f write_evidence write_evidence(data_df,EVIDENCE_DIR)
data_df = write_evidence(data_df,EVIDENCE_DIR)

building evidence files: 100%|█████████| 160375/160375 [09:03<00:00, 295.06it/s]


In [85]:
# number of questions whose SearchResults value is missing
int(data_df['SearchResults'].isnull().sum())

0

# Calc answer in google percentage

In [86]:
# When INSPECT_EVIDENCE is on, every match is printed and the scan stops after ~10 questions.
INSPECT_EVIDENCE = False
if CALC_ANSWER_IN_GOOGLE_PERC:
    # One row per question, one column per evidence file. The width defaults to 100
    # and grows if any question references more files than that.
    n_result_cols = max([100] + [len(results) for results in data_df['SearchResults']])
    answer_found_mat = np.zeros((len(data_df), n_result_cols))
    q_ind = 0
    for ind,question in tqdm(data_df.iterrows(),total=len(data_df), ncols=80, \
                             desc='checking how many gold answer are within question snippets'):
        # Compile each alias once per question instead of once per (alias, file) pair.
        alias_patterns = [(alias, re.compile(r'\b({0})\b'.format(re.escape(alias)), re.IGNORECASE))
                          for alias in question['Answer']['NormalizedAliases']]

        for s_ind,result in enumerate(question['SearchResults']):
            if not alias_patterns:
                # Nothing to look for, so the evidence file is not touched at all.
                continue

            # Read the evidence file once; it is the same text for every alias.
            with open(EVIDENCE_DIR + 'multiqa_evidence/' + result['Filename'], 'r') as evidence_file:
                result_text = evidence_file.read()

            for alias, p in alias_patterns:
                if p.search(result_text) is not None:
                    if INSPECT_EVIDENCE:
                        print('-------------')
                        print('Question  ' + question['Question'])
                        print('Answer:  ' + alias)
                        print('Context:')
                        print(result_text) 
                    answer_found_mat[q_ind , s_ind] = 1
                    # One matching alias is enough for this evidence file.
                    break
        q_ind += 1
        
        if INSPECT_EVIDENCE and q_ind > 10:
            break
    
    # A question counts as answered when at least one of its evidence files matched.
    print('answer are within question snippets {0}%'.format(100.0* (answer_found_mat.sum(axis=1)>0).sum() / len(data_df)))
    print('number of examples with answer are within question snippets {0}'.format((answer_found_mat.sum(axis=1)>0).sum()))

checking how many gold answer are within question snippets: 100%|█| 160375/160375 [22:33<00:00, 118.53it/s]


answer are within question snippets 44.109119251753704%
number of examples with answer are within question snippets 70740


# Sanity Tests

In [87]:
# sanity check: count of missing values in every column
data_df.isnull().sum()

Question            0
OrgSearchResults    0
EntityPages         0
QuestionSource      0
Answer              0
dataset             0
SearchResults       0
dtype: int64

In [88]:
# additional sanity tests: distribution of the number of evidence files per question
if data_df['SearchResults'].isnull().sum()==0:
    # An expression nested inside `if` is not auto-displayed by Jupyter, so print it.
    print(data_df['SearchResults'].apply(len).value_counts())

In [89]:
# mean question length (in characters) for questions without any evidence file
has_no_results = data_df['SearchResults'].map(len) == 0
data_df.loc[has_no_results, 'Question'].map(len).mean()

35.83045977011494

In [90]:
# mean question length (in characters) for questions with at least one evidence file
has_results = data_df['SearchResults'].map(len) > 0
data_df.loc[has_results, 'Question'].map(len).mean()

35.856015639569655

In [91]:
# After writing evidence, verify that every file referenced in SearchResults is on disk.
if WRITE_EVIDENCE:
    found = 0
    not_found = 0
    # Iterate the column directly; materialising each row with iloc is not needed here.
    for search_results in tqdm(data_df['SearchResults'], total=len(data_df), ncols=80,
                               desc="checking if filenames exist"):
        for result in search_results:
            if os.path.exists(EVIDENCE_DIR + 'multiqa_evidence/' + result['Filename']):
                found += 1
            else:
                not_found += 1

    print('found : {0}'.format(found))
    print('not_found : {0}'.format(not_found))

# saving 

In [92]:
# preview a bounded number of rows instead of rendering the whole frame
data_df.head(10)

Unnamed: 0_level_0,Question,OrgSearchResults,EntityPages,QuestionSource,Answer,dataset,SearchResults
QuestionId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
28213,at what age do kids start to hold memories,[],[],,"{'Aliases': ['Before the age of 2–4 years.'], 'NormalizedAliases': ['before the age of 2–4 years .'], 'NormalizedValue': 'before the age...",MSMARCO_Googled,"[{'Rank': 0, 'Description': '', 'Title': '', 'DisplayUrl': '', 'Url': 'MSMARCO_Googled_MSMARCO_Googled_83_28213_0', 'Filename': 'MSMARCO..."
410717,is funner a word?,[],[],,"{'Aliases': ['Yes, funner is a word.'], 'NormalizedAliases': ['yes , funner is a word .'], 'NormalizedValue': 'yes , funner is a word .'...",MSMARCO_Googled,"[{'Rank': 0, 'Description': '', 'Title': '', 'DisplayUrl': '', 'Url': 'MSMARCO_Googled_MSMARCO_Googled_62_410717_0', 'Filename': 'MSMARC..."
604568,what county is columbus city in,[],[],,"{'Aliases': ['Bartholomew'], 'NormalizedAliases': ['bartholomew'], 'NormalizedValue': 'bartholomew', 'Type': 'FreeForm', 'Value': 'Barth...",MSMARCO_Googled,"[{'Rank': 0, 'Description': '', 'Title': '', 'DisplayUrl': '', 'Url': 'MSMARCO_Googled_MSMARCO_Googled_73_604568_0', 'Filename': 'MSMARC..."
1174759,is the bachelor legal,[],[],,"{'Aliases': ['Yes, the bachelor is legal.'], 'NormalizedAliases': ['yes , the bachelor is legal .'], 'NormalizedValue': 'yes , the bache...",MSMARCO_Googled,"[{'Rank': 0, 'Description': '', 'Title': '', 'DisplayUrl': '', 'Url': 'MSMARCO_Googled_MSMARCO_Googled_113_1174759_0', 'Filename': 'MSMA..."
672511,what is BP level j salary,[],[],,"{'Aliases': ['Average $111,082,'], 'NormalizedAliases': ['average $ 111,082 ,'], 'NormalizedValue': 'average $ 111,082 ,', 'Type': 'Free...",MSMARCO_Googled,"[{'Rank': 0, 'Description': '', 'Title': '', 'DisplayUrl': '', 'Url': 'MSMARCO_Googled_MSMARCO_Googled_155_672511_0', 'Filename': 'MSMAR..."
33073,average cost of assisted living in washington state,[],[],,"{'Aliases': ['$4,625 per month.'], 'NormalizedAliases': ['$ 4,625 per month .'], 'NormalizedValue': '$ 4,625 per month .', 'Type': 'Free...",MSMARCO_Googled,"[{'Rank': 0, 'Description': '', 'Title': '', 'DisplayUrl': '', 'Url': 'MSMARCO_Googled_MSMARCO_Googled_43_33073_0', 'Filename': 'MSMARCO..."
612569,what county is seminole fl in,[],[],,"{'Aliases': ['Pinellas County'], 'NormalizedAliases': ['pinellas county'], 'NormalizedValue': 'pinellas county', 'Type': 'FreeForm', 'Va...",MSMARCO_Googled,"[{'Rank': 0, 'Description': '', 'Title': '', 'DisplayUrl': '', 'Url': 'MSMARCO_Googled_MSMARCO_Googled_143_612569_0', 'Filename': 'MSMAR..."
32055,average cost dental implant,[],[],,"{'Aliases': ['$4250'], 'NormalizedAliases': ['$ 4250'], 'NormalizedValue': '$ 4250', 'Type': 'FreeForm', 'Value': '$4250'}",MSMARCO_Googled,"[{'Rank': 0, 'Description': '', 'Title': '', 'DisplayUrl': '', 'Url': 'MSMARCO_Googled_MSMARCO_Googled_13_32055_0', 'Filename': 'MSMARCO..."
610348,what county is nine mile in,[],[],,"{'Aliases': ['Onondaga'], 'NormalizedAliases': ['onondaga'], 'NormalizedValue': 'onondaga', 'Type': 'FreeForm', 'Value': 'Onondaga'}",MSMARCO_Googled,"[{'Rank': 0, 'Description': '', 'Title': '', 'DisplayUrl': '', 'Url': 'MSMARCO_Googled_MSMARCO_Googled_193_610348_0', 'Filename': 'MSMAR..."
31967,average conservatory cost,[],[],,"{'Aliases': ['$10,000 to $30,000'], 'NormalizedAliases': ['$ 10,000 to $ 30,000'], 'NormalizedValue': '$ 10,000 to $ 30,000', 'Type': 'F...",MSMARCO_Googled,"[{'Rank': 0, 'Description': '', 'Title': '', 'DisplayUrl': '', 'Url': 'MSMARCO_Googled_MSMARCO_Googled_124_31967_0', 'Filename': 'MSMARC..."


In [93]:
# The raw search results are no longer needed once the evidence files are referenced.
# Guarded so that re-running this cell does not raise KeyError on the missing column.
if 'OrgSearchResults' in data_df.columns:
    del data_df['OrgSearchResults']

# Training examples without any evidence file are dropped.
if EVAL_SET == 'train':
    data_df = data_df[data_df['SearchResults'].apply(len)>0]

# randomizing the samples:
# NOTE(review): the shuffle is unseeded, so the saved order differs between runs;
# pass random_state=... here if a reproducible order is wanted.
data_df = data_df.sample(frac=1)

print('Amount of examples saved: %d' % len(data_df))

Amount of examples saved: 155503


In [52]:
# Save the examples as a zipped JSON file under ../output/<EXP_NAME>/.
# The file is only produced when WRITE_EVIDENCE is off.
if not WRITE_EVIDENCE:
    output_dir = '../output/' + EXP_NAME
    # Creates '../output/' as well when it is missing; a no-op if both exist.
    os.makedirs(output_dir, exist_ok=True)

    # Anything other than 'dev' is saved under the train file name.
    split_name = 'dev' if EVAL_SET == 'dev' else 'train'
    json_name = 'unfiltered-web-' + split_name + '.json'

    # 'records' is the full spelling of the orient that was abbreviated as 'rows'.
    triviaqa_dict['Data'] = data_df.reset_index().to_dict(orient='records')
    with zipfile.ZipFile(output_dir + '/' + json_name + '.zip', "w", zipfile.ZIP_DEFLATED) as zip_file:
        zip_file.writestr(json_name, json.dumps(triviaqa_dict, sort_keys=True, indent=4))

# experiments

## Comparing two different batches

In [53]:
# Experiment only: read one NewsQA/CNN story file into `x` for manual inspection.
# NOTE(review): the path is an absolute, machine-specific location.
if RUN_EXPERIMENTS:
    path = '/Users/alontalmor/Documents/dev/datasets/NewsQA/newsqa-data-v1/cnn/stories/13012604e3203c18df09289dfedd14cde67cf40b.story'
    with open(path,'r') as f:
        x = f.read()

In [54]:
# Experiment only: load the train batch and keep the questions whose ID starts with 'WebQ'.
if RUN_EXPERIMENTS:
    with zipfile.ZipFile('../output/compwebq_with_triviaqa/unfiltered-web-train.json.zip','r') as myzip:
        with myzip.open('unfiltered-web-train.json') as myfile:
            questions = json.load(myfile)
            compwebq_with_triviaqa = pd.DataFrame(questions['Data'])
    compwebq1 = compwebq_with_triviaqa[compwebq_with_triviaqa['QuestionId'].str.startswith('WebQ')]
    # Free the full frame; only the filtered subset is compared later.
    del compwebq_with_triviaqa
    # An expression nested inside `if` is not auto-displayed by Jupyter, so print it.
    print(len(compwebq1))

In [55]:
#RUN_EXPERIMENTS = 1
# Experiment only: load the dev batch and keep the questions whose ID starts with 'WebQ'.
if RUN_EXPERIMENTS:
    with zipfile.ZipFile('../output/ComplexWebQuestions/unfiltered-web-dev.json.zip','r') as myzip, \
            myzip.open('unfiltered-web-dev.json') as myfile:
        questions = json.load(myfile)
        compwebq_triviaqa_full_dev = pd.DataFrame(questions['Data'])
    is_webq = compwebq_triviaqa_full_dev['QuestionId'].str.startswith('WebQ')
    compwebq2 = compwebq_triviaqa_full_dev[is_webq]
    # Release the unfiltered frame once the subset has been taken.
    del compwebq_triviaqa_full_dev
    print(len(compwebq2))

In [56]:
# Experiment only: count questions whose number of search results differs between the two batches.
if RUN_EXPERIMENTS:
    x = compwebq2.merge(compwebq1,on='QuestionId',how='inner')
    # An expression nested inside `if` is not auto-displayed by Jupyter, so print it.
    print(((x['SearchResults_x'].apply(len) - x['SearchResults_y'].apply(len))!=0).sum())

# Run Google 

In [57]:
#GOOGLE_FILTERED_FILES = True
# Split the questions into batches, zip each batch as JSON and upload it to Dropbox
# under '/google/'.
if GOOGLE_FILTERED_FILES:
    import dropbox  # third-party; only required for this optional upload step

    dataset = 'SearchQA'
    BATCH_SIZE = 200
    
    # 'records' is the full spelling of the orient that was abbreviated as 'rows'.
    question_for_google = Data_triviaqa_format[dataset].to_dict(orient='records')

    # The access token must never be committed with the notebook: it is read from
    # the environment (KeyError if unset). A token that was ever stored in the
    # notebook source should be revoked and replaced.
    dbx = dropbox.Dropbox(os.environ['DROPBOX_ACCESS_TOKEN'])

    print('iterating over files')
    for offset in range(0, len(question_for_google), BATCH_SIZE):
        curr_batch_webanswer_question = question_for_google[offset:offset+BATCH_SIZE]

        new_questions = [{'goog_query': question['Question'], 'QuestionId': question['QuestionId']}
                         for question in curr_batch_webanswer_question]

        for_goog_dict = {'target_dir': dataset + '/' + EVAL_SET ,'questions': new_questions}

        filename = dataset + '-' + EVAL_SET + '-' + str(offset) + '_for_goog.json'
        with zipfile.ZipFile(filename + '.zip', "w", zipfile.ZIP_DEFLATED) as zip_file:
            zip_file.writestr(filename, json.dumps(for_goog_dict, sort_keys=True, indent=4))
        
        print(filename)
        
        try:
            with open(filename + '.zip', "rb") as f:
                dbx.files_upload(f.read(), '/google/' + filename + '.zip', mode = dropbox.files.WriteMode.overwrite)
        finally:
            # The local zip is only a staging file; remove it even if the upload fails.
            os.remove(filename + '.zip')

        

In [58]:
# Rename files whose name contains 'processing' back to 'for_goog'.
if True:
    DIR = '/Users/alontalmor/Dropbox/Apps/WebKB/google/'
    for dirname, dirnames, filenames in os.walk(DIR):
        for filename in tqdm(filenames, total=len(filenames), ncols=80, desc='iterating over all googled files'):
            if 'processing' in filename:
                # os.walk also descends into sub-directories, so paths must be built
                # from the directory currently being visited rather than from DIR.
                os.rename(os.path.join(dirname, filename),
                          os.path.join(dirname, filename.replace('processing','for_goog')))

iterating over all googled files: 100%|█████████| 1/1 [00:00<00:00, 3908.95it/s]


## Google SearchQA

In [59]:
# Collect the question text and id from every SearchQA JSON file under DIR.
if GOOGLE_FILTERED_FILES:
    DIR = '/Users/alontalmor/Documents/dev/datasets/SearchQA/data_json/'
    question_for_google = []
    for dirname, dirnames, filenames in os.walk(DIR):
        for filename in tqdm(filenames, total=len(filenames), ncols=80, desc='iterating over all googled files'):
            # os.walk also descends into sub-directories, so build the path from
            # the directory currently being visited rather than from DIR.
            with open(os.path.join(dirname, filename),'r') as f:
                try:
                    single_question_dict = json.load(f)
                    question_for_google.append({'Question':single_question_dict['question'], \
                                                'QuestionId':single_question_dict['id']})
                except (ValueError, KeyError, TypeError):
                    # Malformed or unexpectedly shaped file: skip it, but say which one.
                    # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors.
                    print('bad json: ' + filename)

In [60]:
# Wrap the collected questions in a DataFrame, stored under the dataset name.
if GOOGLE_FILTERED_FILES:
    Data_triviaqa_format = {}
    dataset = 'SearchQA'
    Data_triviaqa_format[dataset] = pd.DataFrame(question_for_google)