In [242]:
import gzip
import json
import tensorflow as tf
import collections
import os

# Show INFO-level TensorFlow log records so the per-file "Reading: ..." messages
# logged via tf.logging.info are visible in the cell output.
tf.logging.set_verbosity(tf.logging.INFO)

In [260]:
# Input location. Despite the name this is a file path / glob pattern, not a
# directory: it is passed to read_nq_examples(), which expands it with
# tf.gfile.Glob. The commented-out alternative below is a glob over dev shards.
#DATA_DIR = "data/v1.0/dev/nq-dev-0*.jsonl.gz"
DATA_DIR = "data/tiny-dev/nq-dev-sample.jsonl.gz"
# Directory that receives the filtered files; each output keeps the base name
# of the input file it was derived from.
FILTERED_DATA_DIR = "data/filtered_tiny-dev/"
# Candidate types (as returned by get_candidate_type) that survive filtering.
accepted_long_answer_types = ["Paragraph"]

In [261]:
def get_candidate_type(e, idx):
    """Returns the candidate's type: Table, Paragraph, List or Other.

    The type is decided solely from the first document token covered by the
    candidate, i.e. the token at the candidate's "start_token" position.

    Args:
        e: Example mapping. Must provide "long_answer_candidates" (a sequence
            of mappings with a "start_token" index) and "document_tokens"
            (a sequence of mappings with a "token" string).
        idx: Position of the candidate inside e["long_answer_candidates"].

    Returns:
        One of "Table", "Paragraph", "List" or "Other". Tokens that are not
        recognised at all are reported with a warning and mapped to "Other".

    Raises:
        KeyError / IndexError: if the example lacks the expected fields or
            idx / start_token are out of range.
    """
    candidate = e["long_answer_candidates"][idx]
    first_token = e["document_tokens"][candidate["start_token"]]["token"]

    if first_token == "<Table>":
        return "Table"
    if first_token == "<P>":
        return "Paragraph"
    if first_token in ("<Ul>", "<Dl>", "<Ol>"):
        return "List"
    if first_token in ("<Tr>", "<Li>", "<Dd>", "<Dt>"):
        # Known row / item tags: deliberately "Other", and not worth a warning.
        return "Other"

    # Anything else is unexpected; surface it in the log but keep going.
    tf.logging.warning("Unknown candidate type found: %s", first_token)
    return "Other"

def load_example(line):
    """Parse one JSON line and keep only the accepted long-answer candidates.

    Candidates whose type (per get_candidate_type) is not listed in
    accepted_long_answer_types are removed. Annotations are kept only when
    their long answer points at a surviving candidate, and their
    "candidate_index" is rewritten to the candidate's position in the
    filtered list. Annotations whose candidate_index is -1 are dropped too.

    Args:
        line: One JSON-encoded example.

    Returns:
        The parsed example (an OrderedDict) with "long_answer_candidates"
        and "annotations" replaced by their filtered versions.
    """
    example = json.loads(line, object_pairs_hook=collections.OrderedDict)

    # Map every original candidate position to its position in the filtered
    # list, or to -1 when the candidate is discarded.
    old_to_new = {}
    kept_candidates = []
    for old_idx, candidate in enumerate(example["long_answer_candidates"]):
        if get_candidate_type(example, old_idx) in accepted_long_answer_types:
            old_to_new[old_idx] = len(kept_candidates)
            kept_candidates.append(candidate)
        else:
            old_to_new[old_idx] = -1
    example["long_answer_candidates"] = kept_candidates

    kept_annotations = []
    for annotation in example["annotations"]:
        old_idx = annotation["long_answer"]["candidate_index"]
        if old_idx == -1:
            # No long answer was annotated.
            continue
        new_idx = old_to_new[old_idx]
        if new_idx == -1:
            # The annotated candidate was filtered out.
            continue
        annotation["long_answer"]["candidate_index"] = new_idx
        kept_annotations.append(annotation)
    example["annotations"] = kept_annotations

    return example
    
def read_nq_examples(input_file, output_dir=None):
    """Filter NQ JSON-lines files and write the surviving examples as gzip.

    Every file matching `input_file` is read line by line, each line is passed
    through load_example(), and examples left with no long-answer candidates
    or no annotations are discarded. The remaining examples are written, one
    JSON document per line, to a gzip file in `output_dir` that has the same
    base name as the input file. The id of every kept example is printed.

    Args:
        input_file: Path or glob pattern of the input file(s). Files whose
            name ends in ".gz" are decompressed while reading; others are
            read as-is.
        output_dir: Directory for the filtered files. Defaults to the
            module-level FILTERED_DATA_DIR. It is created if missing.

    Returns:
        None. The result is the files written to `output_dir`.
    """
    if output_dir is None:
        output_dir = FILTERED_DATA_DIR

    input_paths = tf.gfile.Glob(input_file)
    if not input_paths:
        tf.logging.warning("No input files matched: %s", input_file)
        return

    tf.gfile.MakeDirs(output_dir)

    for path in input_paths:
        tf.logging.info("Reading: %s", path)

        # The whole input is read before the output is opened, so a parse
        # error never leaves a partial output file behind.
        kept_lines = []
        # Binary mode: gzip data is not text. The raw handle is closed
        # explicitly because GzipFile does not close a caller-supplied fileobj.
        with tf.gfile.Open(path, "rb") as raw_in:
            if path.endswith(".gz"):
                reader = gzip.GzipFile(fileobj=raw_in, mode="rb")
            else:
                reader = raw_in
            for line in reader:
                if not line.strip():
                    # Tolerate blank lines (e.g. a trailing newline).
                    continue
                e = load_example(line)
                if not e["long_answer_candidates"] or not e["annotations"]:
                    continue
                kept_lines.append(json.dumps(e) + "\n")
                print("id :" + str(e["example_id"]))

        # Join once instead of growing a string inside the loop.
        json_bytes = "".join(kept_lines).encode('utf-8')

        output_path = os.path.join(output_dir, os.path.basename(path))
        with tf.gfile.Open(output_path, "wb") as raw_out:
            with gzip.GzipFile(fileobj=raw_out, mode="wb") as fout:
                fout.write(json_bytes)

In [262]:
# Run the filter over DATA_DIR; the filtered gzip files are written under
# FILTERED_DATA_DIR and the id of every kept example is printed below.
read_nq_examples(DATA_DIR)

I0919 23:11:35.628875 140265699596096 <ipython-input-261-40584d435f7f>:66] Reading: data/tiny-dev/nq-dev-sample.jsonl.gz


id :5225754983651766092
id :-3290814144789249484
id :8851020722386421469
id :-7660771254611710392
id :8467542931261548456
id :-8817510910624892182
id :8081436745274892553
id :955374967862684316
id :-4340755100872459608
id :-4752044886865067782
id :3221262508309669486
id :-3632974700795137148
id :6117400998512150504
id :-7123670178204378853
id :-6965315175406025099
id :7761896900514914630
id :7621889567969940898
id :8588157975437869890
id :6334254940258112455
id :-1640714294501064196
id :2471210292422908873
id :-4843437099290518220
id :-7890654922490959584
id :2311014922242737246
id :-3672139806378353884
id :6556718257422998662
id :3062778689902618642
id :2673329117028914160
id :-2162391145049066060
id :966261456620232361
id :-7855437198803764135
id :-3129873875789425845
id :-7709111104155953766
id :-8381009484419676852
id :4398207059107866695
id :-523751984486228566
id :-1573340396671826912
id :-6560319052930436991
id :1658149178541467525
id :6928004204659089755
id :-846330316818577547