In [2]:
import json
import random
from pathlib import Path
import numpy as np
import pandas as pd

import sys
sys.path.append('../src')

from step_00_functions import *

In [3]:
# functions
def pick_irrelevant_question(main_question, all_questions, all_relevant_pmids):
    """Randomly choose a question whose documents share nothing with the relevant set.

    A question qualifies when its id differs from ``main_question['id']`` and none
    of its ``documents`` appear in ``all_relevant_pmids``.

    Returns:
        One qualifying question picked with ``random.choice``, or ``None`` when
        no question qualifies.
    """
    pool = []
    for candidate in all_questions:
        # Never pair a question with itself.
        if candidate['id'] == main_question['id']:
            continue
        # Keep only questions with zero overlap against the relevant PMIDs.
        if set(candidate['documents']).isdisjoint(all_relevant_pmids):
            pool.append(candidate)
    if not pool:
        return None
    return random.choice(pool)

In [5]:
# load the data
# The two commented lines below derive the path from this file's location via __file__;
# they are disabled and the relative path that follows is used instead.
# project_root = Path(__file__).parent.parent.resolve()
# data_path = project_root / "data" / "raw" / "BioASQ-training4b" / "BioASQ-trainingDataset4b.json"

# Path to the BioASQ 4b training JSON, relative to the current working directory.
data_path = Path('./../data/raw/BioASQ-training4b/BioASQ-trainingDataset4b.json')

# Sanity check: evaluates to True when the file is present.
data_path.exists()

True

In [6]:
# Parse the training file into `data` (later cells read data['questions']).
# The encoding is pinned to UTF-8 so the read does not depend on the platform's
# locale default, which open() uses when no encoding is given.
with open(data_path, encoding='utf-8') as f:
    data = json.load(f)

In [9]:
# Randomly select a small subset of the questions to act as the "relevant" set.
# NOTE(review): this comment previously said "~10%", but `num_questions // 600` keeps
# about one question in 600 (never fewer than one) — confirm which was intended.
random.seed(8675309)  # fixed seed so the sample is repeatable

num_questions = len(data['questions'])
subset_size = max(1, num_questions // 600)
subset_questions = random.sample(data['questions'], subset_size)

In [None]:
# Questions that were not sampled; irrelevant examples are later drawn from these.
# Membership is decided by question id (the same key pick_irrelevant_question compares),
# using a set so each lookup is O(1) instead of comparing whole question dicts
# against the sampled list.
selected_ids = {q['id'] for q in subset_questions}
nonselected_questions = [q for q in data['questions'] if q['id'] not in selected_ids]


In [None]:
# Reference set: every PMID cited by a sampled ("relevant") question.
all_relevant_pmids = {pmid for question in subset_questions for pmid in question['documents']}

# Every PMID cited by a question that was not sampled. A later cell displays this
# name, so it must be assigned for a top-to-bottom run to succeed.
# NOTE: this set is not filtered against all_relevant_pmids, so a PMID can appear in both.
non_relevant_pmids = {pmid for question in nonselected_questions for pmid in question['documents']}

In [12]:
# Inspect the PMIDs collected from the sampled questions.
all_relevant_pmids

{'http://www.ncbi.nlm.nih.gov/pubmed/10652315',
 'http://www.ncbi.nlm.nih.gov/pubmed/11502188',
 'http://www.ncbi.nlm.nih.gov/pubmed/14747656',
 'http://www.ncbi.nlm.nih.gov/pubmed/14761982',
 'http://www.ncbi.nlm.nih.gov/pubmed/15522301',
 'http://www.ncbi.nlm.nih.gov/pubmed/1634521',
 'http://www.ncbi.nlm.nih.gov/pubmed/17653994',
 'http://www.ncbi.nlm.nih.gov/pubmed/18840008',
 'http://www.ncbi.nlm.nih.gov/pubmed/21215336',
 'http://www.ncbi.nlm.nih.gov/pubmed/21527722',
 'http://www.ncbi.nlm.nih.gov/pubmed/21975791',
 'http://www.ncbi.nlm.nih.gov/pubmed/23235657',
 'http://www.ncbi.nlm.nih.gov/pubmed/3517024',
 'http://www.ncbi.nlm.nih.gov/pubmed/9115173',
 'http://www.ncbi.nlm.nih.gov/pubmed/9356261'}

In [18]:
# Inspect the PMIDs attributed to the non-selected questions.
# NOTE(review): presumably built from nonselected_questions — confirm where this name is assigned.
non_relevant_pmids

{'http://www.ncbi.nlm.nih.gov/pubmed/2072458',
 'http://www.ncbi.nlm.nih.gov/pubmed/2620957',
 'http://www.ncbi.nlm.nih.gov/pubmed/22977535',
 'http://www.ncbi.nlm.nih.gov/pubmed/10654932',
 'http://www.ncbi.nlm.nih.gov/pubmed/24595274',
 'http://www.ncbi.nlm.nih.gov/pubmed/21035437',
 'http://www.ncbi.nlm.nih.gov/pubmed/23242182',
 'http://www.ncbi.nlm.nih.gov/pubmed/7690557',
 'http://www.ncbi.nlm.nih.gov/pubmed/3178044',
 'http://www.ncbi.nlm.nih.gov/pubmed/22647359',
 'http://www.ncbi.nlm.nih.gov/pubmed/7652577',
 'http://www.ncbi.nlm.nih.gov/pubmed/20513433',
 'http://www.ncbi.nlm.nih.gov/pubmed/21278336',
 'http://www.ncbi.nlm.nih.gov/pubmed/24092417',
 'http://www.ncbi.nlm.nih.gov/pubmed/8894691',
 'http://www.ncbi.nlm.nih.gov/pubmed/19445664',
 'http://www.ncbi.nlm.nih.gov/pubmed/23404595',
 'http://www.ncbi.nlm.nih.gov/pubmed/21192222',
 'http://www.ncbi.nlm.nih.gov/pubmed/15475613',
 'http://www.ncbi.nlm.nih.gov/pubmed/21628529',
 'http://www.ncbi.nlm.nih.gov/pubmed/16953954'

In [13]:
# Accumulator for the labelled rows; each entry appended later is a dict with
# "question", "abstract" and "label" keys.
structured_data = list()

In [19]:
# Upper bound on how many documents are fetched per question on each side.
MAX_ABSTRACTS_PER_QUESTION = 10


def _labelled_rows(question_text, pmids, label):
    """Fetch each PMID's abstract and pair it with the question under ``label``.

    Args:
        question_text: The question body stored in every row.
        pmids: Document identifiers, passed one at a time to ``fetch_abstract``.
        label: Value stored under "label" (1 = relevant, 0 = irrelevant).

    Returns:
        A list of ``{"question", "abstract", "label"}`` dicts, one per PMID for
        which ``fetch_abstract`` returned a truthy value; the rest are skipped.
    """
    rows = []
    for pmid in pmids:
        abstract = fetch_abstract(pmid)
        if abstract:
            # NOTE: the disabled get_embedding step is intentionally omitted;
            # add an "embedding" key here if it is re-enabled.
            rows.append({
                "question": question_text,
                "abstract": abstract,
                "label": label
            })
    return rows


# Start from an empty list so re-running this cell rebuilds the rows instead of
# appending a second copy to whatever an earlier run left behind.
structured_data = []

for main_question in subset_questions:
    q_text = main_question['body']

    # Relevant abstracts (label=1): this question's own documents.
    relevant_pmids = main_question['documents'][:MAX_ABSTRACTS_PER_QUESTION]
    structured_data.extend(_labelled_rows(q_text, relevant_pmids, 1))

    # Pick one question whose documents share nothing with the relevant set.
    irrelevant_question = pick_irrelevant_question(main_question, nonselected_questions, all_relevant_pmids)

    if irrelevant_question:
        # Irrelevant abstracts (label=0): the other question's documents paired
        # with the text of the current question.
        irrelevant_pmids = irrelevant_question['documents'][:MAX_ABSTRACTS_PER_QUESTION]
        structured_data.extend(_labelled_rows(q_text, irrelevant_pmids, 0))

In [20]:
# One DataFrame row per collected dict; columns come from the dict keys.
df = pd.DataFrame(data=structured_data)

In [21]:
# Display the labelled question/abstract pairs.
df

Unnamed: 0,question,abstract,label
0,Which drug is considered as the first line tre...,Fibromyalgia is a syndrome of unknown origin w...,1
1,Which drug is considered as the first line tre...,Amitriptyline is a tricyclic antidepressant th...,1
2,Which drug is considered as the first line tre...,Valproic acid and its sodium salt (sodium valp...,1
3,Which drug is considered as the first line tre...,"Oral pregabalin, a calcium channel alpha(2)del...",1
4,Which drug is considered as the first line tre...,Cervical carcinomas result from cellular trans...,0
5,Which drug is considered as the first line tre...,Cervical cancer is one of the leading causes o...,0
6,Which drug is considered as the first line tre...,"Certain types of human papillomavirus (HPV), s...",0
7,Which drug is considered as the first line tre...,High-risk human papillomavirus type 16 (HPV-16...,0
8,Which drug is considered as the first line tre...,The recognition of a causal relationship betwe...,0
9,Which drug is considered as the first line tre...,Mutational activation of the K-Ras oncogene is...,0
