# Dataset BioASQ: QAPair

### 0. Initial setup (0)

In [3]:
import json
import sys
from bs4 import BeautifulSoup
import requests
import sys,os,os.path
import re
import spacy
import numpy as np
import random
import seaborn as sns
import tqdm
import pprint

In [4]:
GOLD_PATH = 'gold_standard/'
files = [(f"{GOLD_PATH}BioASQ-trainingDataset{i}b.json", i) for i in [4, 5, 6, 7, 8]]
files

[('gold_standard/BioASQ-trainingDataset4b.json', 4),
 ('gold_standard/BioASQ-trainingDataset5b.json', 5),
 ('gold_standard/BioASQ-trainingDataset6b.json', 6),
 ('gold_standard/BioASQ-trainingDataset7b.json', 7),
 ('gold_standard/BioASQ-trainingDataset8b.json', 8)]

In [5]:
nlp = spacy.load("en_core_web_sm")

In [6]:
pp = pprint.PrettyPrinter(indent=4)

In [7]:
from scipy.spatial import distance

def sent_vec(sent):
    return np.random.uniform(-1, 1, 700)

def similarity(sent1, sent2):
    vec1 = sent_vec(sent1)
    vec2 = sent_vec(sent2)
    return 1 - distance.cosine(vec1, vec2)

In [8]:
sent_vec('''
    The Ctf4 protein has been shown to be a central member of the replication fork 
    and links the replicative MCM helicase and DNA polymerase α primase.
''')

array([ 0.14887991, -0.55898568, -0.46558396, -0.47485176, -0.14581406,
        0.07868517, -0.92112647, -0.0205458 ,  0.11846369,  0.37008566,
        0.23777416, -0.27052212,  0.81177091, -0.41379938, -0.66905965,
        0.06272629, -0.05629426,  0.33613811,  0.64839521,  0.9901346 ,
       -0.88928653, -0.25614798,  0.03945133, -0.53914887,  0.79463762,
        0.513582  , -0.11020852, -0.53196193,  0.20065928,  0.31343279,
        0.12787348,  0.24138185, -0.75990616, -0.85397143,  0.01928039,
        0.66462009, -0.12618818, -0.22327217,  0.20314155,  0.67834459,
        0.27155994,  0.692159  ,  0.63018563,  0.11560437, -0.81722459,
       -0.22414666, -0.69751583,  0.27708565, -0.98848721,  0.23049235,
       -0.09869052, -0.58860394, -0.5765967 , -0.73925185, -0.57395012,
        0.3880691 ,  0.80434074,  0.2801701 ,  0.72708985, -0.79694199,
       -0.4813272 ,  0.13800516, -0.26810304,  0.91461627,  0.76372565,
        0.2105721 , -0.73980424,  0.65099268,  0.50500089, -0.68

In [9]:
def get_answer_id(answer):
    return answer['document'].split('/')[-1]

def get_pair_id(question, answer):
    params = [
        question['id'], # question id
        get_answer_id(answer),
        answer['beginSection'],
        str(answer['offsetInBeginSection']),
        answer['endSection'],
        str(answer['offsetInEndSection'])
    ]

    return '-'.join(params)

In [10]:
def build_pair(question, answer, trainset, is_answer, doc_related, bioSentVecCosSim):
    '''Build a single QAPair object given its stats (is_answer, doc_related, bioSentVecCosSim).'''
    
    pair = {}
    pair['id'] = get_pair_id(question, answer)
    pair['questionId'] = question['id']
    pair['passage'] = answer['text']
    pair['trainset'] = f"{trainset}b"
    pair['doc_url'] = answer['document']
    pair['is_answer'] = is_answer
    pair['doc_related'] = doc_related
    pair['bioSentVecCosSim'] = bioSentVecCosSim
    pair['beginSection'] = answer['beginSection']
    pair['offsetInBeginSection'] = answer['offsetInBeginSection']
    pair['endSection'] = answer['endSection']
    pair['offsetInEndSection'] = answer['offsetInEndSection']
    return pair

### 1. True Pairs 

In [11]:
def get_qa_true_pairs(question, trainset):
    '''Transform a BioASQ question into a QAPair gold object.'''    
    pairs = []
    
    for answer in question['snippets']:
        pairs.append(build_pair(question, answer, trainset, 1, 1, 1))
        
    return pairs

In [12]:
true_pairs = []

for (file, trainset) in files[:1]:
    bioasq = json.load(open(file, "r"))
    for question in bioasq['questions'][:10]:
        true_pairs.extend(get_qa_true_pairs(question, trainset))
        
pp.pprint(true_pairs[0])

{   'beginSection': 'abstract',
    'bioSentVecCosSim': 1,
    'doc_related': 1,
    'doc_url': 'http://www.ncbi.nlm.nih.gov/pubmed/23378035',
    'endSection': 'abstract',
    'id': '52bf1d3c03868f1b0600000d-23378035-abstract-164-abstract-481',
    'is_answer': 1,
    'offsetInBeginSection': 164,
    'offsetInEndSection': 481,
    'passage': 'Muenke syndrome is characterized by coronal craniosynostosis '
               '(bilateral more often than unilateral), hearing loss, '
               'developmental delay, and carpal and/or tarsal bone coalition. '
               'Tarsal coalition is a distinct feature of Muenke syndrome and '
               'has been reported since the initial description of the '
               'disorder in the 1990s. ',
    'questionId': '52bf1d3c03868f1b0600000d',
    'trainset': '4b'}


### 2. False Easy Pairs

In [13]:
def get_qa_false_easy(gold, n, threshold=0.01):
    global true_pairs
    '''Get n false-easy snippets from a true (gold) QA object.'''
    original_n = n
    
    gold_sent = gold['passage']
    
    print("GOLD SENTENCE:\n\n" + gold_sent + "\n\n")
    print("FALSE EASY SENTENCES:\n")
    false_easy = set()
    
    while n > 0:
        rand_pair = true_pairs[np.random.randint(len(true_pairs))]
        
        if rand_pair['questionId'] != gold['questionId']:
            gold_sent = gold['passage']
            rand_sent = rand_pair['passage']
            
            s = abs(similarity(gold_sent, rand_sent))
            
            if s < threshold and rand_sent not in false_easy:
                print(f"{original_n - n + 1}. ({s})\n'{rand_sent}'\n")
                false_easy.add(rand_sent)
                n -= 1
                
    return list(false_easy)

In [14]:
false_easy = get_qa_false_easy(true_pairs[0], 5, threshold=0.02)

GOLD SENTENCE:

Muenke syndrome is characterized by coronal craniosynostosis (bilateral more often than unilateral), hearing loss, developmental delay, and carpal and/or tarsal bone coalition. Tarsal coalition is a distinct feature of Muenke syndrome and has been reported since the initial description of the disorder in the 1990s. 


FALSE EASY SENTENCES:

1. (0.018803117087473176)
'Influence of the human cohesion establishment factor Ctf4/AND-1 on DNA replication.'

2. (0.012641665898401433)
'is study determined the effect of nonsteroidal anti-inflammatory drug (NSAID) administration on blood pressure in hypertensive patients taking hydrochlorothiazide (HCTZ). '

3. (0.018683252230306535)
'Using cell viability and fluorescent activated cell sorting analysis tests, we demonstrated that captopril inhibited the viability of LNM35 cells by inducing apoptosis, providing insight about the mechanisms underlying its antitumorigenic activities. In view of these experimental findings, we conclu

### 3. False Hard Questions

In [93]:


def get_qa_false_hard(gold, pairs, n, min_threshold, max_threshold=0.9):
    '''Get n false-easy snippets from a true (gold) QA object.'''
    original_n = n
    
    print("GOLD SENTENCE:\n\n" + gold['passage'] + "\n\n")
    print("FALSE HARD SENTENCES:\n")
    
    false_hard = set()
    pairs_to_check = np.arange(len(pairs))
    check = []
    while n > 0 and len(pairs_to_check) > 0:
        check_idx = np.random.randint(len(pairs_to_check))
        idx = pairs_to_check[check_idx]
        check.append(idx)
        pairs_to_check = np.delete(pairs_to_check, check_idx)
        rand_pair = pairs[idx]
            
        gold_sent = gold['passage']
        rand_sent = rand_pair['passage']

        s = abs(similarity(gold_sent, rand_sent))

        if s > min_threshold and s < max_threshold and rand_sent not in false_hard:
            print(f"{original_n - n + 1}. ({s}) [{rand_pair['id'].split('-')[1]} {idx}]\n'{rand_sent}'\n")
            false_hard.add(rand_sent)
            n -= 1
    
    if len(pairs_to_check) == 0:
        print("Searching list was exhausted.")
        
    return list(false_hard)

In [96]:
false_hard = get_qa_false_hard(true_pairs[0], true_pairs, 5, min_threshold=0.1)

GOLD SENTENCE:

Muenke syndrome is characterized by coronal craniosynostosis (bilateral more often than unilateral), hearing loss, developmental delay, and carpal and/or tarsal bone coalition. Tarsal coalition is a distinct feature of Muenke syndrome and has been reported since the initial description of the disorder in the 1990s. 


FALSE HARD SENTENCES:

1. (0.12385855135789026) [23116225 213]
'This review provides an overview of the efficacy and safety of antihypertensive therapy based on olmesartan medoxomil ± hydrochlorothiazide and amlodipine/olmesartan medoxomil in high-risk patient populations enrolled in studies that reported ambulatory BP endpoints'

2. (0.10649297232912369) [23044018 24]
'Muenke syndrome is an autosomal-dominant craniosynostosis syndrome characterized by unilateral or bilateral coronal craniosynostosis, hearing loss, intellectual disability, and relatively subtle limb findings such as carpal bone fusion and tarsal bone fusion'

Searching list was exhausted.
