In [323]:
import pandas as pd
import numpy as np
import time
from src.utils import PubFetcher
from xml.etree import ElementTree as ET
from pprint import pprint
from Bio import Entrez

import warnings
warnings.filterwarnings('ignore')


In [270]:
path = '~/biosift_project/biosift/dataset/PMID_SoftLabels.csv'

In [271]:
df = pd.read_csv(path)

In [272]:
df

Unnamed: 0,PMID,Split,Number of Annotators,Aggregate,Has Human Subjects,Has Target Disease,Cohort Study or Clinical Trial,Has Quantitative Outcome Measure,Has Study Drug(s),Has Population Size,Has Comparator Group
0,11777291,Train,3,1,1.0,1.000000,0.666667,0.666667,1.000000,1.0,0.000000
1,1320241,Train,3,1,1.0,1.000000,0.666667,0.000000,1.000000,1.0,1.000000
2,9216092,Train,3,1,1.0,0.666667,0.666667,1.000000,1.000000,1.0,0.000000
3,30553135,Train,3,1,1.0,0.333333,0.666667,0.666667,0.666667,1.0,0.666667
4,17186001,Test,3,1,1.0,0.333333,1.000000,1.000000,1.000000,1.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
9995,7005123,Train,10,1,0.9,0.100000,0.900000,0.600000,1.000000,0.9,1.000000
9996,28478715,Train,10,1,1.0,1.000000,0.900000,0.900000,1.000000,1.0,0.500000
9997,26440203,Train,10,1,1.0,1.000000,0.300000,1.000000,1.000000,1.0,0.900000
9998,28111428,Train,10,1,1.0,1.000000,0.900000,1.000000,1.000000,1.0,0.700000


In [307]:
ids = df.PMID
ids = ids.to_list()

In [308]:
# print(type(ids))

<class 'list'>


In [309]:
pf = PubFetcher()

In [367]:
batch_size = 250

def extract_abstract(xml_data):
    abstracts = []
    root = ET.fromstring(xml_data)
    for pubmed_article in root.findall(".//PubmedArticle"):
        pmid = pubmed_article.find(".//MedlineCitation/PMID").text
        abstract_elem = pubmed_article.find(".//Article/Abstract")
        if abstract_elem is not None:
            abstract_text = " ".join(abstract_elem.itertext())
            abstracts.append({"PubMedID": pmid, "Abstract": abstract_text})
    return abstracts


result_df = pd.DataFrame(columns=["PubMedID", "Abstract"])

i = 0
for i in range(0, len(ids), batch_size):
    batch_pmids = ids[i:i + batch_size]
    
    raw_data = pf.fetch(ids=batch_pmids, retmode=None, rettype=None, split=False)

    if isinstance(raw_data, str):
        print(f"API call successful for {len(batch_pmids)} PMIDs.")
        # Abstract data
        abstract_data = extract_abstract(raw_data)
        result_df = result_df.append(abstract_data, ignore_index=True)
        # results
        # pprint(raw_data)
    else:
        print(f"Failed to fetch. Status code: {raw_data}")

    time.sleep(3)

API call successful for 250 PMIDs.
API call successful for 250 PMIDs.
API call successful for 250 PMIDs.
API call successful for 250 PMIDs.
API call successful for 250 PMIDs.
API call successful for 250 PMIDs.
API call successful for 250 PMIDs.
API call successful for 250 PMIDs.
API call successful for 250 PMIDs.
API call successful for 250 PMIDs.
API call successful for 250 PMIDs.
API call successful for 250 PMIDs.
API call successful for 250 PMIDs.
API call successful for 250 PMIDs.
API call successful for 250 PMIDs.
API call successful for 250 PMIDs.
API call successful for 250 PMIDs.
API call successful for 250 PMIDs.
API call successful for 250 PMIDs.
API call successful for 250 PMIDs.
API call successful for 250 PMIDs.
API call successful for 250 PMIDs.
API call successful for 250 PMIDs.
API call successful for 250 PMIDs.
API call successful for 250 PMIDs.
API call successful for 250 PMIDs.
API call successful for 250 PMIDs.
API call successful for 250 PMIDs.
API call successful 

In [373]:
result_df

Unnamed: 0,PubMedID,Abstract
0,11777291,High-dose preparations of simvastatin and ator...
1,1320241,to compare lisinopril and nifedipine in the ma...
2,9216092,This investigation aimed to compare bacterial ...
3,30553135,Preterm premature rupture of fetal membranes (...
4,17186001,Limited data are available about the effect of...
...,...,...
9992,7005123,"Plasma renin activity (PRA), antidiuretic horm..."
9993,28478715,Current guidelines make no specific recommenda...
9994,26440203,To evaluate the efficacy of metformin administ...
9995,28111428,This phase II study examined whether the addit...


In [378]:
result_df.to_csv("pmid_w_abstracts.csv", index=False)

In [372]:
temp_df = result_df.copy()

In [374]:
batch_size = 250

def extract_mesh_ids(xml_data, result_df):
    root = ET.fromstring(xml_data)
    for article in root.findall(".//PubmedArticle"):
        pmid = article.find(".//MedlineCitation/PMID").text
        mesh_ids = []
        mesh_headings = article.findall(".//MeshHeading")
        for head in mesh_headings:
            mID = head.find(".//DescriptorName").attrib["UI"]
            mesh_ids.append(mID)
        mesh_ids_str = ", ".join(mesh_ids)
        
        result_df.loc[result_df['PubMedID'] == pmid, 'MeshID'] = mesh_ids_str

i = 0
for i in range(0, len(ids), batch_size):
    batch_pmids = ids[i:i + batch_size]
    
    raw_data = pf.fetch(ids=batch_pmids, retmode=None, rettype=None, split=False)

    if isinstance(raw_data, str):
        print(f"API call successful for {len(batch_pmids)} PMIDs.")
        # MeSH ID data
        extract_mesh_ids(raw_data, temp_df)
        # pprint(raw_data)
    else:
        print(f"Failed to fetch. Status code: {raw_data}")

    time.sleep(3)

API call successful for 250 PMIDs.
API call successful for 250 PMIDs.
API call successful for 250 PMIDs.
API call successful for 250 PMIDs.
API call successful for 250 PMIDs.
API call successful for 250 PMIDs.
API call successful for 250 PMIDs.
API call successful for 250 PMIDs.
API call successful for 250 PMIDs.
API call successful for 250 PMIDs.
API call successful for 250 PMIDs.
API call successful for 250 PMIDs.
API call successful for 250 PMIDs.
API call successful for 250 PMIDs.
API call successful for 250 PMIDs.
API call successful for 250 PMIDs.
API call successful for 250 PMIDs.
API call successful for 250 PMIDs.
API call successful for 250 PMIDs.
API call successful for 250 PMIDs.
API call successful for 250 PMIDs.
API call successful for 250 PMIDs.
API call successful for 250 PMIDs.
API call successful for 250 PMIDs.
API call successful for 250 PMIDs.
API call successful for 250 PMIDs.
API call successful for 250 PMIDs.
API call successful for 250 PMIDs.
API call successful 

In [375]:
tot_df = temp_df.copy()

In [376]:
tot_df

Unnamed: 0,PubMedID,Abstract,MeshID
0,11777291,High-dose preparations of simvastatin and ator...,"D000328, D000368, D000924, D004305, D005260, D..."
1,1320241,to compare lisinopril and nifedipine in the ma...,"D000328, D000368, D000369, D000959, D015982, D..."
2,9216092,This investigation aimed to compare bacterial ...,"D000293, D000328, D000368, D000658, D000891, D..."
3,30553135,Preterm premature rupture of fetal membranes (...,"D000305, D000328, D000900, D002585, D057239, D..."
4,17186001,Limited data are available about the effect of...,"D000328, D018687, D019540, D016642, D004336, D..."
...,...,...,...
9992,7005123,"Plasma renin activity (PRA), antidiuretic horm...","D000328, D000450, D004338, D004573, D005665, D..."
9993,28478715,Current guidelines make no specific recommenda...,"D000368, D000369, D000925, D005260, D006470, D..."
9994,26440203,To evaluate the efficacy of metformin administ...,"D000022, D016022, D016640, D005260, D006801, D..."
9995,28111428,This phase II study examined whether the addit...,"D000328, D000077716, D000368, D000369, D000971..."


In [377]:
tot_df.to_csv("mesh_data_with_abstracts.csv", index=False)

## For Future Experimentation in Improvement

In [332]:
# flat_df = result_df.groupby('PubMedID')[['MeshID', 'Abstract']].agg(list).reset_index()
# flat_df['MeshID'] = flat_df['MeshID'].apply(lambda x: ', '.join(map(str, x)) if isinstance(x, list) else '')
# flat_df['Abstract'] = flat_df['Abstract'].apply(lambda x: x if isinstance(x, str) else '')

# flat_df.to_csv("mesh_data_with_abstracts.csv", index=False)

In [363]:
query = "Diabetes+Mellitus%2Fdrug+therapy%5BMajr%5D+OR+Hypertension%2Fdrug+therapy%5BMajr%5D+OR+Asthma%2Fdrug+therapy%5BMajr%5D+OR+Hypothyroidism%2Fdrug+therapy%5BMajr%5D+OR+Sleep+Wake+Disorders%2Fdrug+therapy%5BMajr%5D+OR+Hyperlipidemias%2Fdrug+therapy%5BMajr%5D+OR+Depression%2Fdrug+therapy%5BMajr%5D"

final_df = pd.DataFrame(columns=['PMID', 'Abstract'])

handle = Entrez.esearch(db="pubmed", term=query, retmode="xml")
record = Entrez.read(handle)
handle.close()

pmids = record['IdList']

for pmid in pmids:
    handle = Entrez.efetch(db="pubmed", id=pmid, retmode="xml")
    record = Entrez.read(handle)
    handle.close()
    try:
        abstract = record[0]["PubmedArticle"]["MedlineCitation"]["Article"]["Abstract"]["AbstractText"]
    except KeyError:
        abstract = "Abstract not available"

    final_df = final_df.append({"PMID": pmid, "Abstract": abstract}, ignore_index=True)

In [364]:
final_df

Unnamed: 0,PMID,Abstract
0,37845742,Abstract not available
1,37828874,Abstract not available
2,37763194,Abstract not available
3,37688291,Abstract not available
4,37682585,Abstract not available
5,37679897,Abstract not available
6,37653806,Abstract not available
7,37626416,Abstract not available
8,37582740,Abstract not available
9,37559158,Abstract not available
