# Dataset BioASQ: QAPair

### 0. Initial setup (0)

In [1]:
import json
import sys
from bs4 import BeautifulSoup
import requests
import sys,os,os.path
import re
import spacy
import numpy as np
import random
import seaborn as sns
import tqdm
import pprint

In [2]:
GOLD_PATH = 'gold_standard/'

# One (path, edition) tuple per BioASQ task-b training set, editions 4 through 8.
files = [
    (f"{GOLD_PATH}BioASQ-trainingDataset{edition}b.json", edition)
    for edition in range(4, 9)
]
files

[('gold_standard/BioASQ-trainingDataset4b.json', 4),
 ('gold_standard/BioASQ-trainingDataset5b.json', 5),
 ('gold_standard/BioASQ-trainingDataset6b.json', 6),
 ('gold_standard/BioASQ-trainingDataset7b.json', 7),
 ('gold_standard/BioASQ-trainingDataset8b.json', 8)]

In [3]:
# Load spaCy's small English pipeline; the "en_core_web_sm" model package must be installed.
nlp = spacy.load("en_core_web_sm")

In [4]:
# Shared pretty-printer (4-space indent) used below to display QAPair dicts.
pp = pprint.PrettyPrinter(indent=4)

In [5]:
from scipy.spatial import distance

def sent_vec(sent):
    '''Return a 700-element vector sampled uniformly from [-1, 1).

    `sent` is never read, so the result is unrelated to the sentence and
    changes on every call.
    NOTE(review): looks like a stand-in for a real sentence encoder - confirm.
    '''
    return np.random.uniform(low=-1, high=1, size=700)

def similarity(sent1, sent2):
    '''Cosine similarity (1 - cosine distance) between the sent_vec vectors of two sentences.'''
    # Vectorise sent1 first, then sent2, so random draws are consumed in the same order.
    embeddings = [sent_vec(sent1), sent_vec(sent2)]
    cosine_distance = distance.cosine(*embeddings)
    return 1 - cosine_distance

In [6]:
# Smoke test: vectorise one sample sentence and display the resulting array.
# sent_vec draws random values and ignores its argument, so this output differs on every run.
sent_vec('''
    The Ctf4 protein has been shown to be a central member of the replication fork 
    and links the replicative MCM helicase and DNA polymerase α primase.
''')

array([ 0.15582305,  0.78096102,  0.72429312, -0.44352366,  0.24549707,
       -0.31775188, -0.51294194,  0.81415767, -0.48208483, -0.96903407,
       -0.27734005, -0.68966563,  0.49766366,  0.70394791,  0.95143505,
       -0.99132957, -0.04003307,  0.72868834, -0.65962629, -0.10529696,
        0.05268768,  0.35063321, -0.56807871,  0.58273888,  0.5220808 ,
       -0.70311798, -0.82641349,  0.9811752 , -0.46775802,  0.47195212,
        0.01025471, -0.88382602,  0.97402015,  0.7295231 , -0.43287794,
        0.32930443, -0.56286281,  0.96805267, -0.10355674,  0.10145866,
        0.62617541, -0.92687983, -0.09815115,  0.17812702, -0.10138327,
        0.41358631,  0.48164028,  0.67446822,  0.0413781 , -0.06729631,
       -0.24867183,  0.17966288,  0.03194968, -0.40970445, -0.92589636,
        0.95490541, -0.09973196,  0.05991556, -0.12635984, -0.85464657,
       -0.93344866, -0.4648318 , -0.77394693,  0.44096477,  0.31192006,
        0.84051442, -0.63378221,  0.87011827,  0.2944085 , -0.12

In [8]:
def get_answer_id(answer):
    '''Return the part of the snippet's 'document' URL that follows its last '/'.'''
    _, _, document_id = answer['document'].rpartition('/')
    return document_id

def get_pair_id(question, answer):
    '''Compose a QAPair id: question id, document id and snippet span, joined by '-'.'''
    question_id = question['id']
    document_id = get_answer_id(answer)
    # Each span end is a (section name, character offset) pair; offsets are stringified for join.
    span_begin = (answer['beginSection'], str(answer['offsetInBeginSection']))
    span_end = (answer['endSection'], str(answer['offsetInEndSection']))

    return '-'.join((question_id, document_id, *span_begin, *span_end))

In [9]:
def build_pair(question, answer, trainset, is_answer, doc_related, bioSentVecCosSim):
    '''Assemble one QAPair dict from a question, one of its snippets and the given stats.

    `is_answer`, `doc_related` and `bioSentVecCosSim` are stored unchanged;
    `trainset` is stored with a "b" suffix (e.g. 4 -> "4b").
    '''
    return {
        'id': get_pair_id(question, answer),
        'questionId': question['id'],
        'passage': answer['text'],
        'trainset': f"{trainset}b",
        'doc_url': answer['document'],
        'is_answer': is_answer,
        'doc_related': doc_related,
        'bioSentVecCosSim': bioSentVecCosSim,
        'beginSection': answer['beginSection'],
        'offsetInBeginSection': answer['offsetInBeginSection'],
        'endSection': answer['endSection'],
        'offsetInEndSection': answer['offsetInEndSection'],
    }

### 1. True Pairs 

In [10]:
def get_qa_true_pairs(question, trainset):
    '''Build one gold QAPair per snippet of a BioASQ question.

    Every pair is created with is_answer, doc_related and bioSentVecCosSim all set to 1.
    '''
    return [
        build_pair(question, snippet, trainset, 1, 1, 1)
        for snippet in question['snippets']
    ]

In [11]:
true_pairs = []

# Only the first training set and its first 10 questions are loaded here.
for (file, trainset) in files[:1]:
    # The context manager closes the handle as soon as parsing finishes (the
    # bare open() call left it open), and the explicit UTF-8 encoding keeps
    # non-ASCII passage text intact regardless of the platform default.
    with open(file, "r", encoding="utf-8") as fh:
        bioasq = json.load(fh)
    for question in bioasq['questions'][:10]:
        true_pairs.extend(get_qa_true_pairs(question, trainset))

pp.pprint(true_pairs[0])

{   'beginSection': 'abstract',
    'bioSentVecCosSim': 1,
    'doc_related': 1,
    'doc_url': 'http://www.ncbi.nlm.nih.gov/pubmed/23378035',
    'endSection': 'abstract',
    'id': '52bf1d3c03868f1b0600000d-23378035-abstract-164-abstract-481',
    'is_answer': 1,
    'offsetInBeginSection': 164,
    'offsetInEndSection': 481,
    'passage': 'Muenke syndrome is characterized by coronal craniosynostosis '
               '(bilateral more often than unilateral), hearing loss, '
               'developmental delay, and carpal and/or tarsal bone coalition. '
               'Tarsal coalition is a distinct feature of Muenke syndrome and '
               'has been reported since the initial description of the '
               'disorder in the 1990s. ',
    'questionId': '52bf1d3c03868f1b0600000d',
    'trainset': '4b'}


### 2. False Easy Pairs

In [12]:
def get_qa_false_easy(gold, n, threshold=0.01, pairs=None, similarity_fn=None):
    '''Get up to n false-easy passages for a true (gold) QA pair.

    A false-easy passage belongs to a different question than `gold` and the
    absolute value of its similarity to the gold passage is below `threshold`.

    Args:
        gold: QAPair dict; its 'passage' and 'questionId' entries are read.
        n: number of passages requested.
        threshold: exclusive upper bound on the absolute similarity.
        pairs: sequence of QAPair dicts to draw candidates from; defaults to
            the notebook-level `true_pairs` list.
        similarity_fn: callable (gold_passage, candidate_passage) -> float;
            defaults to the notebook-level `similarity`.

    Returns:
        List of distinct passage strings, in the order they were printed.
        Shorter than n when the pool does not hold enough qualifying passages.
    '''
    if pairs is None:
        pairs = true_pairs
    if similarity_fn is None:
        similarity_fn = similarity

    gold_sent = gold['passage']

    print("GOLD SENTENCE:\n\n" + gold_sent + "\n\n")
    print("FALSE EASY SENTENCES:\n")

    false_easy = []
    seen = set()

    # Visit every candidate exactly once, in random order. Sampling with
    # replacement inside `while n > 0` never terminated when fewer than n
    # candidates could satisfy the threshold.
    for idx in np.random.permutation(len(pairs)):
        if len(false_easy) >= n:
            break

        rand_pair = pairs[idx]
        if rand_pair['questionId'] == gold['questionId']:
            continue  # snippets of the gold's own question are not negatives

        rand_sent = rand_pair['passage']
        if rand_sent in seen:
            continue  # the same passage text can occur in several pairs

        s = abs(similarity_fn(gold_sent, rand_sent))
        if s < threshold:
            false_easy.append(rand_sent)
            seen.add(rand_sent)
            print(f"{len(false_easy)}. ({s})\n'{rand_sent}'\n")

    if len(false_easy) < n:
        print("Searching list was exhausted.")

    return false_easy

In [13]:
# Request 5 false-easy passages for the first gold pair, with a looser
# threshold (0.02) than the function default (0.01).
false_easy = get_qa_false_easy(true_pairs[0], 5, threshold=0.02)

GOLD SENTENCE:

Muenke syndrome is characterized by coronal craniosynostosis (bilateral more often than unilateral), hearing loss, developmental delay, and carpal and/or tarsal bone coalition. Tarsal coalition is a distinct feature of Muenke syndrome and has been reported since the initial description of the disorder in the 1990s. 


FALSE EASY SENTENCES:

1. (0.015919076479490757)
'We show that Ctf4 function is conserved and that Drosophila can be effectively used as a model to further probe the precise function of Ctf4 as a member of the replication fork and possible roles in development.'

2. (0.004622304350921036)
'This case report suggests the possibility that discoloration from tetracycline may not be limited to tooth development in the child, but may also affect the adult dentition'

3. (0.016132953663895)
'Alteplase has been used successfully in evolving myocardial infarction (MI) to reopen occluded coronary arteries. '

4. (0.005086029719344909)
'coupling MCM2-7 to replicative

### 3. False Hard Pairs

In [32]:
def get_qa_false_hard(gold, pairs, n, min_threshold, max_threshold=0.9, similarity_fn=None):
    '''Get up to n false-hard passages for a true (gold) QA pair.

    A false-hard passage comes from a different question than `gold`, yet its
    similarity to the gold passage lies strictly between `min_threshold` and
    `max_threshold`.

    Args:
        gold: QAPair dict; 'passage' is read, and 'questionId' when present.
        pairs: sequence of QAPair dicts to draw candidates from.
        n: number of passages requested.
        min_threshold: exclusive lower bound on the similarity.
        max_threshold: exclusive upper bound on the similarity.
        similarity_fn: callable (gold_passage, candidate_passage) -> float;
            defaults to the notebook-level `similarity`.

    Returns:
        List of distinct passage strings, in the order they were printed.
        Shorter than n when `pairs` does not hold enough qualifying passages.
    '''
    if similarity_fn is None:
        similarity_fn = similarity

    gold_sent = gold['passage']
    gold_question = gold.get('questionId')

    print("GOLD SENTENCE:\n\n" + gold_sent + "\n\n")
    print("FALSE HARD SENTENCES:\n")

    false_hard = []
    seen = set()

    # Visit every candidate exactly once, in random order. Drawing with
    # replacement never shrank the search list, so the loop could spin forever
    # and the "exhausted" message below could not fire for a non-empty pool.
    for idx in np.random.permutation(len(pairs)):
        if len(false_hard) >= n:
            break

        rand_pair = pairs[idx]
        # Snippets of the gold's own question are true answers, not negatives
        # (same rule as get_qa_false_easy). Skipped only when the id is known.
        if gold_question is not None and rand_pair.get('questionId') == gold_question:
            continue

        rand_sent = rand_pair['passage']
        if rand_sent in seen:
            continue  # the same passage text can occur in several pairs

        s = similarity_fn(gold_sent, rand_sent)
        if min_threshold < s < max_threshold:
            false_hard.append(rand_sent)
            seen.add(rand_sent)
            print(f"{len(false_hard)}. ({s})\n'{rand_sent}'\n")

    if len(false_hard) < n:
        print("Searching list was exhausted.")

    return false_hard

In [33]:
# Request 5 false-hard passages for the first gold pair, drawing candidates
# from true_pairs with a similarity floor of 0.08 (max_threshold stays at its default, 0.9).
false_hard = get_qa_false_hard(true_pairs[0], true_pairs, 5, min_threshold=0.08)

GOLD SENTENCE:

Muenke syndrome is characterized by coronal craniosynostosis (bilateral more often than unilateral), hearing loss, developmental delay, and carpal and/or tarsal bone coalition. Tarsal coalition is a distinct feature of Muenke syndrome and has been reported since the initial description of the disorder in the 1990s. 


FALSE HARD SENTENCES:

1. (0.08013437833775183)
'We previously found that the C2-domain of the Saccharomyces cerevisiae Inn1 protein plays an essential but uncharacterised role at the cleavage site during cytokinesis.'

2. (0.11536282073475324)
'And-1/Ctf4 is therefore a new replication initiation factor that brings together the MCM2-7 helicase and the DNA pol alpha-primase complex, analogous to the linker between helicase and primase or helicase and polymerase that is seen in the bacterial replication machinery'

3. (0.09098856202352656)
'In order to determine the mechanism by which captopril inhibited tumor growth, we investigated the impact of this drug