In [1]:
import json
import sys
from bs4 import BeautifulSoup
import requests
import sys,os,os.path
import re
import logging
import logging.config
import spacy
import numpy as np
import random
import seaborn as sns
import tqdm

logging.config.fileConfig("logging.conf")

# SECURITY: a proxy URL of the form http://user:password@host:port/ contains
# credentials, so it must not be stored in the notebook. It is read from the
# BIOASQ_PROXY_URL environment variable instead. When that variable is unset,
# whatever HTTP_PROXY / HTTPS_PROXY the process already has is left untouched.
# NOTE(review): the credentials that used to be hard-coded in this cell should
# be treated as leaked and rotated.
proxy_url = os.environ.get("BIOASQ_PROXY_URL")
if proxy_url:
    os.environ['HTTP_PROXY'] = proxy_url
    os.environ['HTTPS_PROXY'] = proxy_url

In [2]:
negative_docs = []

def read_pubmed_abstract(url, timeout=30):
    """Download a PubMed page and return the text of its abstract.

    Parameters
    ----------
    url : str
        Address of the PubMed article page to fetch.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up
        (default 30). Without a timeout a stalled connection blocks forever.

    Returns
    -------
    str or None
        Text of the first ``<p>`` inside the first ``<div class="abstr">``,
        or ``None`` when the page has no such div or the div has no paragraph.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from ``requests.get`` on connection errors or timeout.
    """
    r = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(r.text, 'html.parser')
    mydivs = soup.find_all("div", {"class": "abstr"})
    if not mydivs:
        return None
    paragraph = mydivs[0].find('p')
    # An abstract div without a <p> child previously raised AttributeError.
    if paragraph is None:
        return None
    return paragraph.text

In [3]:
# spaCy sentence tokenizer demo on a single PubMed abstract
nlp = spacy.load("en_core_web_sm")
abstract = read_pubmed_abstract("http://www.ncbi.nlm.nih.gov/pubmed/15829955")
if abstract is None:
    # read_pubmed_abstract returns None when the page carries no abstract div;
    # passing None to the spaCy pipeline would raise instead of reporting it.
    print('No abstract found for the requested PubMed page')
else:
    abstract = nlp(abstract)
    for i, sentence in enumerate(abstract.sents):
        print('-->Sentence %d: %s' % (i, sentence.text))

-->Sentence 0: The identification of common variants that contribute to the genesis of human inherited disorders remains a significant challenge.
-->Sentence 1: Hirschsprung disease (HSCR) is a multifactorial, non-mendelian disorder in which rare high-penetrance coding sequence mutations in the receptor tyrosine kinase
-->Sentence 2: RET contribute to risk in combination with mutations at other genes.
-->Sentence 3: We have used family-based association studies to identify a disease interval, and integrated this with comparative and functional genomic analysis to prioritize conserved and functional elements within which mutations can be sought.
-->Sentence 4: We now show that a common non-coding RET variant within a conserved enhancer-like sequence in intron 1 is significantly associated with HSCR susceptibility and makes a 20-fold greater contribution to risk than rare alleles do.
-->Sentence 5: This mutation reduces in vitro enhancer activity markedly, has low penetrance, has differe

In [11]:
# Build a pool of random PubMed URLs; ids are drawn uniformly from the range
# that starts at https://www.ncbi.nlm.nih.gov/pubmed/15829959 (ids since 2005).
max_random_docs = 900000
random_docs = [
    'https://www.ncbi.nlm.nih.gov/pubmed/' + str(random.randint(15829959, 28829959))
    for _ in range(max_random_docs)
]

In [12]:
# BioASQ training files (task b, editions 4 to 8), processed in this order.
dataset_files = [
    'BioASQ-trainingDataset4b.json',
    'BioASQ-trainingDataset5b.json',
    'BioASQ-trainingDataset6b.json',
    'BioASQ-trainingDataset7b.json',
    'BioASQ-trainingDataset8b.json',
]

## Generate Gold Dataset

In [6]:
"""
Extract the information for training dataset
Query
body, documents, ideal_answer, concepts, type, id, snippets
Snippet
offsetInBeginSection, offsetInEndSection, text, beginSection, document, endSection
"""
# Flatten every training file into one list of gold (question, snippet) pairs.
# Each snippet dict is extended in place with the question text ('body'), the
# question id ('id'), the source file ('trainset') and the positive labels
# 'label' = 1 and 'doc_related' = 1. gold_docs collects every document URL
# listed by any question, including questions that have no snippets.
gold_snippets = []
gold_docs = []
for dataset in dataset_files:
    logging.debug("BioASQ Generating train files for {}".format(dataset))
    # Context manager so the file handle is closed as soon as it is parsed.
    with open('train-data/'+dataset,'r') as dataset_file:
        data = json.load(dataset_file)
    for query in data['questions']:
        if 'snippets' not in query:
            print('No snippets for {}'.format(query['id']))
        else:
            for snippet in query['snippets']:
                snippet['body'] = query['body']
                snippet['id'] = query['id']
                snippet['trainset'] = dataset
                snippet['label'] = 1
                snippet['doc_related'] = 1
                gold_snippets.append(snippet)
        gold_docs.extend(query['documents'])

print("Gold snippets size {}".format(len(gold_snippets)))
# Closing the output file guarantees the JSON is fully flushed to disk.
with open('train-data/train_pairs/gold_pairs.json','w') as gold_file:
    json.dump(gold_snippets, gold_file)

No snippets for 51406e6223fec90375000009
No snippets for 54d643023706e89528000007
No snippets for 51593dc8d24251bc05000099
No snippets for 532819afd6d3ac6a3400000f
No snippets for 5158a5b8d24251bc05000097
No snippets for 5172f8118ed59a060a000019
No snippets for 517545168ed59a060a00002b
Gold snippets size 173253


## Generate Negative Snippets from Related Documents

In [None]:
import sys
sys.path.insert(0, "common")
import bioasq_util as bioasq_util
import ranking
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize

import importlib
importlib.reload(ranking)
stops = stopwords.words('english')

index_name = '2018_pubmed_baseline_title_abs_mesh'
doc_relative_url = 'http://www.ncbi.nlm.nih.gov/pubmed/'

logging.getLogger('elasticsearch').setLevel(logging.ERROR)

# Build negative pairs from documents that ARE related to the question.
#
# Question fields available: body, documents, ideal_answer, concepts, type,
# id, snippets.
# Snippet fields available: offsetInBeginSection, offsetInEndSection, text,
# beginSection, document, endSection.
#
# For every question that has snippets, each listed document is fetched with
# bioasq_util.get_doc, the gold snippet texts are blanked out of
# "title + abstract", and what remains is split by ranking.split_chunks.
# Every chunk is labelled label = 0, doc_related = 1. One output file is
# written per training file.
for dataset in dataset_files:
    related_docs_snippets = []
    logging.info("BioASQ Generating train files for {}".format(dataset))
    with open('train-data/'+dataset,'r') as dataset_file:
        data = json.load(dataset_file)
    for query in tqdm.tqdm(data['questions'],position=0):
        if 'snippets' in query:
            for doc in query['documents']:
                # Fetch once: the lookup was previously issued twice per
                # document (once for the None test, once for the values).
                doc_fields = bioasq_util.get_doc(doc.replace(doc_relative_url,''), index_name, remove_tags=True)
                if doc_fields is None:
                    continue
                doc_id, doc_title, doc_abstract = doc_fields
                if doc_title is None or doc_abstract is None:
                    continue
                text = doc_title + " " +  doc_abstract
                # Remove gold sentences so they cannot reappear as negatives.
                # NOTE(review): `doc_id in snippet['document']` is a substring
                # test, so id "123" also matches a URL ending in "1234" -
                # confirm whether an exact id match is intended.
                for snippet in query['snippets']:
                    if doc_id in snippet['document']:
                        text = text.replace(snippet['text']," ")
                # Split the remaining text into candidate snippets.
                text_chunks = ranking.split_chunks(text)
                for chunk in text_chunks:
                    chunk['body'] = query['body']
                    chunk['id'] = query['id']
                    chunk['trainset'] = dataset
                    chunk['label'] = 0
                    chunk['doc_related'] = 1
                    related_docs_snippets.append(chunk)
        # Cap the number of negatives kept per training file.
        if len(related_docs_snippets) > 540000:
            break
    # The label used to read "Gold snippets size" although the list holds
    # negative snippets taken from related documents.
    print("Related-document negative snippets size {}".format(len(related_docs_snippets)))
    with open('train-data/train_pairs/related_docs_negative_pairs_'+dataset,'w') as pairs_file:
        json.dump(related_docs_snippets, pairs_file)

 22%|██▏       | 289/1307 [00:38<01:03, 16.11it/s]

In [9]:
# related_docs_snippets holds negative snippets from related documents, not
# gold snippets, so the label says so (it previously read "Gold snippets size").
print("Related-document negative snippets size {}".format(len(related_docs_snippets)))

Gold snippets size 240004


In [None]:
# Length distributions (in space-separated terms) of gold answers and questions.
# Answers longer than 200 terms are only counted (big_a) and kept out of the
# plotted distribution.
all_gold_len = []
all_q = []
big_a = 0
for x in gold_snippets:
    all_q.append(len(x['body'].split(' ')))
    # Split once; the same count drives both the threshold test and the append.
    answer_len = len(x['text'].split(' '))
    if answer_len > 200:
        big_a += 1
    else:
        all_gold_len.append(answer_len)
sns.distplot(all_gold_len)
sns.distplot(all_q)

# As the length of the answer sequences is around 150 terms, we should end up
# with only answer candidates within this boundary.
# (Kept as a comment: a trailing bare string would be echoed as the cell's
# output value.)

## Generate Negative Dataset from Unrelated Documents

In [15]:
from elasticsearch import Elasticsearch
import bioasq_util

# Replace the client that bioasq_util uses for its lookups. The host can be
# overridden with the BIOASQ_ES_HOST environment variable; when it is unset
# the original server address is used.
es_host = os.environ.get('BIOASQ_ES_HOST', '168.176.36.10:9200')
bioasq_util.es = Elasticsearch(hosts=[es_host])
index_name = '2018_pubmed_baseline_title_abs_mesh'
doc_relative_url = 'http://www.ncbi.nlm.nih.gov/pubmed/'

In [38]:
# Sample sentences from documents that are not gold documents.
#
# random_docs are built with an https:// prefix, while the gold document URLs
# are stripped elsewhere with an http:// prefix, so a plain set difference of
# the URL strings can never remove anything. Compare on the trailing PubMed id.
gold_pmids = {doc.rsplit('/', 1)[-1] for doc in gold_docs}
negative_docs = [doc for doc in set(random_docs)
                 if doc.rsplit('/', 1)[-1] not in gold_pmids]
nlp = spacy.load("en_core_web_sm")
random_snippets = []
# generate random snippets
logging.debug("Generate random snippets from not related document colection size {}".format(len(negative_docs)))
for x in tqdm.tqdm(range(50000),position=0):
    if x % 10 == 0:
        # Periodic checkpoint so a crash does not lose the sentences so far.
        logging.debug("Generate random snippet for file {}".format(x))
        with open('train-data/train_pairs/random_snippets.json','w') as checkpoint_file:
            json.dump({'random_snippets':random_snippets}, checkpoint_file)
    # randrange excludes the upper bound; randint(0, len(...)) included it and
    # could index one past the end of the list.
    doc_idx = random.randrange(len(negative_docs))
    abstract = bioasq_util.get_doc(negative_docs[doc_idx].split('/')[-1], index_name, remove_tags=True)
    # get_doc may return None, and the abstract field (index 2) may itself be
    # None; either would make the spaCy call fail.
    if abstract is not None and abstract[2] is not None:
        abstract = nlp(abstract[2])
        for token in abstract.sents:
            # Keep sentences whose length is between 51 and 499 characters.
            if 50 < len(token.text) < 500:
                random_snippets.append({'document':negative_docs[doc_idx],
                                        'snippet':token.text})
with open('train-data/train_pairs/random_snippets.json','w') as snippets_file:
    json.dump({'random_snippets':random_snippets}, snippets_file)

100%|██████████| 50000/50000 [2:45:28<00:00,  5.04it/s]  


In [None]:
# Number of entries currently held in random_snippets.
len(random_snippets)

In [None]:
negative_snippets_random_doc = []

# Pair every gold (question, snippet) slot with a sentence from an unrelated
# document, producing negatives with label = 0 and doc_related = 0.
#
# The replacement sentence is drawn from random_snippets, whose entries are
# {'document': url, 'snippet': text}. This replaces a call to
# get_random_snippet(), which is not defined in this notebook, and the use of
# a leftover doc_idx, which could attach a document URL unrelated to the
# sampled text.
# NOTE(review): confirm that sampling from random_snippets matches what the
# missing get_random_snippet() helper was meant to do.
if not random_snippets:
    raise ValueError("random_snippets is empty; run the random snippet sampling cell first")

for dataset in dataset_files:
    logging.debug("BioASQ Generating negative_snippets_random_doc train files for {}".format(dataset))
    with open('train-data/'+dataset,'r') as dataset_file:
        data = json.load(dataset_file)
    for query in data['questions']:
        if 'snippets' not in query:
            print('No snippets for {}'.format(query['id']))
        else:
            for snippet in query['snippets']:
                random_pair = random.choice(random_snippets)
                # Document and text come from the same sampled entry, so the
                # recorded URL always matches the sentence.
                snippet['document'] = random_pair['document']
                snippet['text'] = random_pair['snippet']
                snippet['body'] = query['body']
                snippet['id'] = query['id']
                snippet['trainset'] = dataset
                snippet['label'] = 0
                snippet['doc_related'] = 0
                # Offsets and sections of the gold snippet no longer apply.
                snippet['offsetInBeginSection'] = None
                snippet['offsetInEndSection'] = None
                snippet['beginSection'] = None
                snippet['endSection'] = None
                negative_snippets_random_doc.append(snippet)
            logging.debug("qid {}, total snippets {}".format(query['id'],len(negative_snippets_random_doc)))

with open('train-data/train_pairs/negative_snippets_random_doc.json','w') as negatives_file:
    json.dump(negative_snippets_random_doc, negatives_file)

In [82]:
# Preview only the first few records; echoing the whole list would flood the
# notebook output.
negative_snippets_random_doc[:5]

[{'offsetInBeginSection': None,
  'offsetInEndSection': None,
  'text': 'Graft arteriosclerosis (GA), the major cause of late cardiac allograft failure, is characterized by a diffuse, concentric arterial intimal hyperplasia composed of infiltrating host T cells, macrophages, and predominantly graft-derived smooth muscle-like cells that proliferate and elaborate extracellular matrix, resulting in luminal obstruction and allograft ischemia.',
  'beginSection': None,
  'document': 'https://www.ncbi.nlm.nih.gov/pubmed/20520254',
  'endSection': None,
  'body': 'What symptoms characterize the Muenke syndrome?',
  'id': '52bf1d3c03868f1b0600000d',
  'trainset': 'BioASQ-trainingDataset4b.json',
  'label': 0,
  'doc_related': 0},
 {'offsetInBeginSection': None,
  'offsetInEndSection': None,
  'text': 'One year ago, a 42-year-old woman underwent aortic root replacement because of a pseudoaneurysm that developed at the site of an anastomosis after ascending aortic replacement for acute aortic di

In [72]:
# Show the document URL at index doc_idx. Both negative_docs and doc_idx come
# from other cells, so the value depends on what was last executed.
negative_docs[doc_idx]

'https://www.ncbi.nlm.nih.gov/pubmed/20520254'

In [28]:
print(len(random_docs))
# random_docs use an https:// prefix while the gold document URLs are stripped
# elsewhere with an http:// prefix, so subtracting the raw URL sets never
# removes a gold document. Compare on the trailing PubMed id instead.
gold_pmids = {doc.rsplit('/', 1)[-1] for doc in gold_docs}
negative_docs = [doc for doc in set(random_docs)
                 if doc.rsplit('/', 1)[-1] not in gold_pmids]
print(len(negative_docs))

1000
1000
