In [1]:
import json
import numpy
import random

In [2]:
# Directory prefix for the eQASC json files. It is concatenated directly into
# the file path (no os.path.join), so the trailing slash is required.
DATA_DIR='eqasc/'
# Filename suffix appended after the split name when the path is built.
SUFFIX = '_grc.json'

In [3]:
# Name of the dataset split to load; it is inserted into the file name
# between 'eqasc_' and SUFFIX when the path is built.
split = 'dev'

In [4]:
def load_data(fname):
    """Read a JSON file and return its parsed contents.

    Args:
        fname: Path of the JSON file to read.

    Returns:
        The object produced by ``json.load`` for the file's contents.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # The context manager closes the handle even if parsing fails; JSON text
    # is UTF-8 by specification, so do not depend on the platform default.
    with open(fname, 'r', encoding='utf-8') as json_file:
        return json.load(json_file)

In [5]:
# Assemble "<DATA_DIR>eqasc_<split><SUFFIX>" and load that file.
fname = '{}eqasc_{}{}'.format(DATA_DIR, split, SUFFIX)

data = load_data(fname)
print("len(data) = ", len(data))

len(data) =  926


In [6]:
###############

In [7]:
# Seed the global `random` generator so the random.choice-based example
# pickers below pick the same rows on every fresh run.
random.seed(123)

In [8]:
def show_random_example(data, show_correct_choice_only=True):
    """Print one randomly chosen question together with its reasoning chains.

    The row is drawn with ``random.choice``, so seed ``random`` beforehand if
    the output needs to be repeatable.

    Args:
        data: Non-empty sequence of question rows. Each row is read as
            ``row['answerKey']``, ``row['question']['stem']`` and
            ``row['question']['choices']``; every printed choice must provide
            ``'label'``, ``'text'``, ``'df'`` and ``'chains'``.
        show_correct_choice_only: When true (the default) only the choice
            whose label equals the answer key is printed; otherwise every
            choice is printed.

    Returns:
        None. All results are written to stdout.

    Raises:
        IndexError: If ``data`` is empty (raised by ``random.choice``).
    """
    row = random.choice(data)
    correct_choice = row['answerKey']
    ques = row['question']['stem'].lower().strip()
    print("==== ques = ", ques)

    for c in row['question']['choices']:  # each question has 8 choices typically
        if show_correct_choice_only and c['label'] != correct_choice:
            continue

        option = c['text'].lower().strip()
        # 'df' is the declarative form of question+answer
        df = c['df']
        print("-"*44)
        print("==== option = ", option)

        for j, chain in enumerate(c['chains']):
            print("="*24)
            # The first two chain elements are the facts; each carries its
            # sentence under 'text' ('score' holds the retrieval score).
            # Any further element (e.g. the 'overlapping_entities' metadata)
            # is only shown through the JSON dump below.
            f1, f2 = chain[0]['text'], chain[1]['text']
            # we consider reasoning chain as (f1,f2,df)
            print("f1= ", f1)
            print("f2= ", f2)
            print("df= ", df)
            # 'grc' (generalized reasoning chain) is obtained after variablizing (f1,f2,df)
            print(" ---->>> j=", j, "chain = ", json.dumps(chain, indent=4))
            
            

In [9]:
show_random_example(data, show_correct_choice_only=True)

==== ques =  what can be used to find your way if you're lost in the dark?
--------------------------------------------
==== option =  the north star
f1=  Art can find the lost Art can find the lost Art can find the lost.
f2=  North Star clip art was created by Peter Reynolds, author and illustrator of The North Star.
df=  the north star can be used to find your way if you ' re lost in the dark
 ---->>> j= 0 chain =  [
    {
        "text": "Art can find the lost Art can find the lost Art can find the lost.",
        "score": 20.327402
    },
    {
        "text": "North Star clip art was created by Peter Reynolds, author and illustrator of The North Star.",
        "score": 29.083858
    },
    {
        "overlapping_entities": {
            "txt12_candidates": {
                "txt1_candidates": [
                    "art"
                ],
                "txt2_candidates": [
                    "art"
                ]
            },
            "txt1c_candidates": {
             

In [10]:
show_random_example(data, show_correct_choice_only=True)

==== ques =  what are therapsids?
--------------------------------------------
==== option =  animals
f1=  Therapsids are reptiles ancestral to mammals.
f2=  Animals can be classified into mammals, reptiles, birds.
df=  animals are therapsids
 ---->>> j= 0 chain =  [
    {
        "text": "Therapsids are reptiles ancestral to mammals.",
        "score": 17.322662
    },
    {
        "text": "Animals can be classified into mammals, reptiles, birds.",
        "score": 24.840178
    },
    {
        "overlapping_entities": {
            "txt12_candidates": {
                "txt1_candidates": [
                    "mammals"
                ],
                "txt2_candidates": [
                    "mammals"
                ]
            },
            "txt1c_candidates": {
                "txt1_candidates": [
                    "therapsids"
                ],
                "txtc_candidates": [
                    "therapsids"
                ]
            },
            "txt2c_candid

In [11]:
###############

In [12]:
# it might be easier to read some of the grc chains by using alphabet variables instead of '[unusedi]' tokens

def utility_to_replace_unused_with_vars(grc_chain, vars_to_use = ['X','Y','Z','U','V','W']):
    """Rewrite '[unused…]' placeholder tokens in a chain as readable variables.

    Each sentence is split on whitespace; every token that starts with
    ``'[unused'`` is replaced by a variable name. The same token always maps
    to the same name across all three sentences, and names are handed out in
    order of first appearance. All other tokens are kept as they are, and the
    tokens of each sentence are re-joined with single spaces.

    Args:
        grc_chain: Sequence of exactly three strings.
        vars_to_use: Names to assign, in order. The default list is only
            read, never mutated. If the chain contains more distinct
            placeholders than names, the names are reused with a numeric
            suffix (``X1``, ``Y1``, …, then ``X2``, …); if ``vars_to_use``
            is empty, ``VAR0``, ``VAR1``, … are used.

    Returns:
        A list of three rewritten strings.

    Raises:
        AssertionError: If ``grc_chain`` does not have exactly three entries.
    """
    assert len(grc_chain) == 3
    mapper = {}
    n_names = len(vars_to_use)
    ret = []
    for row in grc_chain:
        t = []
        for token in row.strip().split():
            if token.startswith('[unused'):
                if token not in mapper:
                    idx = len(mapper)
                    if idx < n_names:
                        name = vars_to_use[idx]
                    elif n_names:
                        # Ran out of names: cycle through them with a suffix
                        # instead of failing with an IndexError.
                        name = '{}{}'.format(vars_to_use[idx % n_names], idx // n_names)
                    else:
                        name = 'VAR{}'.format(idx)
                    mapper[token] = name
                t.append(mapper[token])
            else:
                t.append(token)
        ret.append(' '.join(t))
    return ret

In [13]:
grc = [
            " [unused2]  and  [unused9]  are theropsids",
            "some  [unused2]  gave rise to a group of  [unused8]  called  [unused9] ",
            " [unused8]  are  [unused9] "
        ]
# a valid reasoning chain

In [14]:
utility_to_replace_unused_with_vars(grc)


['X and Y are theropsids',
 'some X gave rise to a group of Z called Y',
 'Z are Y']

In [15]:
grc = [" [unused3]  are reptiles ancestral to  [unused1] ",
            " [unused5]  can be classified into  [unused1]  reptiles birds",
            " [unused5]  are  [unused3] "]
# a valid reasoning chain
# 'reptiles' got wrong POS

In [16]:
utility_to_replace_unused_with_vars(grc)

['X are reptiles ancestral to Y',
 'Z can be classified into Y reptiles birds',
 'Z are X']

In [17]:
grc = [ "tetraceratops is the oldest known  [unused0] ",
         "bantams are among the oldest known domestic  [unused2] ",
         " [unused2]  are  [unused0] " ]
# an invalid reasoning chain

In [18]:
utility_to_replace_unused_with_vars(grc)

['tetraceratops is the oldest known X',
 'bantams are among the oldest known domestic Y',
 'Y are X']

In [19]:
############### Inspecting eqasc-perturbed

In [20]:
fname = 'eqasc_perturbed/turk_modified_test.tsv'
# QID	Fact1	Fact2	Combined	Fact1Edited	Fact2Edited	CombinedEdited	Fact1Change	Fact2Change	CombinedChange

In [21]:
# Read the TSV into a header row plus a list of data rows.
# The context manager closes the file handle once reading is done. Only the
# line terminator is removed before splitting: a full strip() would also eat
# leading/trailing tabs, dropping empty first/last fields and shifting columns.
with open(fname,'r') as tsv_file:
    data = [row.rstrip('\r\n').split('\t') for row in tsv_file]
headers,data = data[0],data[1:]

In [22]:
headers

['QID',
 'Fact1',
 'Fact2',
 'Combined',
 'Fact1Edited',
 'Fact2Edited',
 'CombinedEdited',
 'Fact1Change',
 'Fact2Change',
 'CombinedChange']

In [23]:
data[0]

['3018Q3ZVOIPYTHOB6LJ337FXF57ARA',
 'Bryophytes occupy niches in moist habitats.',
 'Mosses and liverworts are bryophytes.',
 'mosses occupy niches in moist habitats',
 'Bryophytes occupy homes in moist habitats.',
 'Mosses and liverworts are bryophytes.',
 'mosses occupy homes in moist habitats',
 'niche -> "home"',
 'NIL -> NIL',
 'niche -> "home"']

In [24]:
def see_random_example_eqasc_pert(data):
    """Print one randomly picked eQASC-perturbed row, grouped by column role.

    The row is chosen with ``random.choice``. Its first field is printed as
    the id, followed by three consecutive groups of three fields each:
    fields 1-3 ("original"), fields 4-6 ("edited") and fields 7-9 ("edits").
    In the TSV header these correspond to QID, Fact1/Fact2/Combined,
    their *Edited counterparts and their *Change counterparts.

    Args:
        data: Non-empty sequence of rows, each an indexable list of fields.

    Returns:
        None. The four lines are written to stdout.
    """
    picked = random.choice(data)
    print("idx=", picked[0])
    for label, start in (("original=", 1), ("edited=", 4), ("edits=", 7)):
        print(label, picked[start:start + 3])
    

In [25]:
# Re-seed the global `random` generator so the random.choice picks in the
# following cells start from a fixed state.
random.seed(123)

In [26]:
see_random_example_eqasc_pert(data)

idx= 320DUZ38G7LI5KI1KG24X24923MGJZ
original= ['a radio is used for communication', 'Internal communication is very important during any emergency situation.', 'radios are very important during any emergency situation']
edited= ['a radio is used for conversation', 'Internal conversation is very important during any emergency situation.', 'radios are very important during any emergency situation']
edits= ['communication -> "conversation"', 'communication -> "conversation"', 'NIL -> NIL']


In [27]:
see_random_example_eqasc_pert(data)

idx= 3CFVK00FWLKM3HHVBO5V1Q4CE4GL6E
original= ['Most bryophytes are small.', 'Bryophytes include the mosses and liverworts.', 'most mosses are small']
edited= ['Most bryophytes are microscopic.', 'Bryophytes include the mosses and liverworts.', 'most mosses are microscopic.']
edits= ['small -> "microscopic"', 'NIL -> NIL', 'small -> "microscopic"']


In [28]:
see_random_example_eqasc_pert(data)

idx= 33FBRBDW6OYG4R6DRQ9UILAGTOVC8D
original= ['Bladder infections can be treated with antibiotics prescribed by a doctor.', 'If it is viral, no antibiotics are indicated.', 'bladder infections are not viral in nature']
edited= ['Bladder infections can be treated with medicines prescribed by a doctor.', 'If it is viral, no medicines are indicated.', 'bladder infections are not viral in nature']
edits= ['antibiotic -> "medicine"', 'antibiotic -> "medicine"', 'NIL -> NIL']
