In [245]:
import pandas as pd
import time
from src.utils import PubFetcher
from xml.etree import ElementTree as ET
from pprint import pprint
from Bio import Entrez

import warnings
# NOTE(review): this silences every warning category for the rest of the kernel
# session, including deprecation/FutureWarnings raised by libraries used below.
warnings.filterwarnings('ignore')


In [246]:
# Location of the PMID soft-label CSV; `~` refers to the current user's home directory.
path = '~/biosift_project/biosift/dataset/PMID_SoftLabels.csv'

In [247]:
# Load the label table; the PMID column is what the later cells use to fetch records.
df = pd.read_csv(path)

In [257]:
# Show the loaded table (the rendered output elides the middle rows with `...`).
df

Unnamed: 0,PMID,Split,Number of Annotators,Aggregate,Has Human Subjects,Has Target Disease,Cohort Study or Clinical Trial,Has Quantitative Outcome Measure,Has Study Drug(s),Has Population Size,Has Comparator Group
0,11777291,Train,3,1,1.0,1.000000,0.666667,0.666667,1.000000,1.0,0.000000
1,1320241,Train,3,1,1.0,1.000000,0.666667,0.000000,1.000000,1.0,1.000000
2,9216092,Train,3,1,1.0,0.666667,0.666667,1.000000,1.000000,1.0,0.000000
3,30553135,Train,3,1,1.0,0.333333,0.666667,0.666667,0.666667,1.0,0.666667
4,17186001,Test,3,1,1.0,0.333333,1.000000,1.000000,1.000000,1.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
9995,7005123,Train,10,1,0.9,0.100000,0.900000,0.600000,1.000000,0.9,1.000000
9996,28478715,Train,10,1,1.0,1.000000,0.900000,0.900000,1.000000,1.0,0.500000
9997,26440203,Train,10,1,1.0,1.000000,0.300000,1.000000,1.000000,1.0,0.900000
9998,28111428,Train,10,1,1.0,1.000000,0.900000,1.000000,1.000000,1.0,0.700000


In [249]:
# Plain Python list of every PMID in the label table, in row order.
ids = df["PMID"].tolist()

In [250]:
# Sanity check: confirm `ids` is a built-in list rather than a pandas Series.
print(type(ids))

<class 'list'>


In [251]:
# Fetcher from src.utils; the batch loop below calls pf.fetch(...) once per PMID batch.
# NOTE(review): constructed with no arguments — any API key/e-mail configuration is
# presumably handled inside PubFetcher; confirm.
pf = PubFetcher()

In [261]:
# Number of PMIDs sent to pf.fetch(...) per request in the batch loop below.
batch_size = 250

def extract_mesh_ids(xml_data):
    """Collect (PubMed ID, MeSH descriptor UI) pairs from a PubMed XML payload.

    Parameters
    ----------
    xml_data : str or bytes
        XML document containing zero or more ``PubmedArticle`` elements
        below the root.

    Returns
    -------
    list of dict
        One ``{"PubMedID": <pmid text>, "MeshID": <descriptor UI>}`` entry per
        ``MeshHeading`` found, in document order. Articles without any
        ``MeshHeading`` contribute no entries.

    Raises
    ------
    xml.etree.ElementTree.ParseError
        If ``xml_data`` is not well-formed XML.

    Notes
    -----
    Articles lacking a ``MedlineCitation/PMID`` value, and headings lacking a
    ``DescriptorName`` with a ``UI`` attribute, are skipped instead of raising
    ``AttributeError`` / ``KeyError`` and aborting the whole batch.
    """
    mesh_ids = []
    root = ET.fromstring(xml_data)
    for pubmed_article in root.findall(".//PubmedArticle"):
        pmid_node = pubmed_article.find(".//MedlineCitation/PMID")
        # Compare against None explicitly: childless Elements are falsy.
        if pmid_node is None or not pmid_node.text:
            continue
        pmid = pmid_node.text
        for mesh_heading in pubmed_article.findall(".//MeshHeading"):
            descriptor = mesh_heading.find(".//DescriptorName")
            mesh_id = descriptor.get("UI") if descriptor is not None else None
            if mesh_id is None:
                # Malformed heading: nothing usable to record.
                continue
            mesh_ids.append({"PubMedID": pmid, "MeshID": mesh_id})
    return mesh_ids

# Fetch every PMID in batches and gather one {"PubMedID", "MeshID"} record per
# MeSH heading. Records are accumulated in a plain list and converted to a
# DataFrame once at the end: DataFrame.append was deprecated in pandas 1.4 and
# removed in 2.0, and it copied the whole frame on every call.
mesh_records = []

for i in range(0, len(ids), batch_size):
    batch_pmids = ids[i:i + batch_size]

    raw_data = pf.fetch(ids=batch_pmids, retmode=None, rettype=None, split=False)

    # A string result is treated as the XML payload; anything else is reported
    # as a failure (presumably a status code — confirm against PubFetcher.fetch).
    if isinstance(raw_data, str):
        print(f"API call successful for {len(batch_pmids)} PMIDs.")
        mesh_records.extend(extract_mesh_ids(raw_data))
    else:
        # NOTE(review): PMIDs from a failed batch are not retried or recorded,
        # so they are silently absent from the saved table.
        print(f"Failed to fetch. Status code: {raw_data}")

    # Pause between requests (presumably to respect the remote API's rate limit).
    time.sleep(3)

# Passing `columns` keeps the expected schema even when no records were collected.
result_df = pd.DataFrame(mesh_records, columns=["PubMedID", "MeshID"])
result_df.to_csv("mesh_data.csv", index=False)

print("Data saved to mesh_data.csv")

In [264]:
# Search PubMed for a free-text query, then print the matching records in
# MEDLINE text format.
# NOTE(review): no Entrez.email is set in any visible cell; NCBI asks callers to
# identify themselves — confirm it is configured elsewhere.
terms = "Cancers and drug therapy"
# NOTE(review): retmax=None presumably leaves the service's default result cap in
# effect, so only the first page of hits comes back — pass an explicit retmax
# if more results are needed.
handle = Entrez.esearch(db="pubmed", term=terms, retmax=None)
try:
    record = Entrez.read(handle)
finally:
    # Release the response handle even if parsing fails.
    handle.close()
pmids = record["IdList"]

# `records` is a file-like response handle, not parsed records: read it once,
# then close it so the handle is not leaked.
records = Entrez.efetch(db="pubmed", id=pmids, rettype="medline", retmode="text")
try:
    print(records.read())
finally:
    records.close()


PMID- 37861407
OWN - NLM
STAT- Publisher
LR  - 20231020
IS  - 1557-3265 (Electronic)
IS  - 1078-0432 (Linking)
DP  - 2023 Oct 20
TI  - Phase 1 Dose-Escalation Study of the Safety and Pharmacokinetics of AGS15E 
      Monotherapy in Patients With Metastatic Urothelial Carcinoma.
LID - 10.1158/1078-0432.CCR-22-3627 [doi]
AB  - PURPOSE: Effective treatment of locally advanced or metastatic urothelial 
      carcinoma (mUC) remains an unmet need. Antibody-drug conjugates (ADCs) providing 
      targeted drug delivery have shown antitumor activity in this setting. AGS15E is 
      an investigational ADC that delivers the cytotoxic drug monomethyl auristatin E 
      to cells expressing SLITRK6, a UC-associated antigen. PATIENTS AND METHODS: This 
      was a multicenter, single-arm, phase 1 dose-escalation and expansion trial of 
      AGS15E in patients with mUC (NCT01963052). During dose escalation, AGS15E was 
      administered intravenously at 6 levels (0.10, 0.25, 0.50, 0.75, 1.00, 1

In [241]:
# Number of distinct PMIDs in the input label table.
df['PMID'].nunique()

10000

In [242]:
# Number of distinct PMIDs that produced at least one MeSH row.
# NOTE(review): the recorded output (9997) is below the 10000 input PMIDs.
# extract_mesh_ids emits no rows for articles without MeshHeading elements, and
# failed batches are only printed, so either could explain the gap — confirm.
result_df['PubMedID'].nunique()

9997

In [254]:
# Restore pandas' default row-display limit before printing the per-PMID counts.
pd.reset_option("display.max_rows")
# pd.set_option("display.max_rows", None)  # uncomment to print every row instead
# Number of MeSH rows per PubMed ID (value_counts sorts most frequent first).
print(result_df['PubMedID'].value_counts())

7579368     55
23876492    54
29097388    52
10507773    52
12606523    51
            ..
31395495     5
10989257     5
27766590     5
10870743     4
14586484     4
Name: PubMedID, Length: 9997, dtype: int64


In [255]:
# Re-write result_df to "mesh_data.csv", the same file the batch-fetch cell
# already saved, overwriting it with the frame's current contents.
result_df.to_csv('mesh_data.csv', index=False)

In [256]:
# NOTE(review): no visible cell adds a "DrugName" column to result_df (it is built
# with only "PubMedID" and "MeshID"), so this lookup depends on kernel state from
# code not shown here and would raise KeyError on a fresh Restart & Run All — confirm.
result_df['DrugName']

0                           NaN
1                           NaN
2                           NaN
3                           NaN
4                           NaN
                  ...          
208146                 Afatinib
208147              Simvastatin
208148    Anti-Bacterial Agents
208149              Penicillins
208150              Amoxicillin
Name: DrugName, Length: 208151, dtype: object