<a href="https://colab.research.google.com/github/ZhaochenYe999/CBB752_FinalProject/blob/main/part2a_extract_pmids.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import time
import requests
import xml.etree.ElementTree as ET
import json

In [None]:
def View(df, rows=None, cols=None, width=None):
    """Show the head of ``df`` under temporarily overridden pandas display options.

    Args:
        df: Object exposing ``head``; its result is handed to ``display``.
        rows: Used for ``display.max_rows`` and as the argument to ``df.head``.
        cols: Used for ``display.max_columns``.
        width: Used for ``display.max_colwidth``.

    The options are set through ``pd.option_context`` and therefore only
    apply while the ``display`` call runs. ``display`` is not imported in
    this file; it is presumably the function the notebook runtime provides.
    """
    display_settings = {
        "display.max_rows": rows,
        "display.max_columns": cols,
        "display.max_colwidth": width,
        "display.expand_frame_repr": False,
    }

    # option_context expects alternating (option name, value) arguments.
    context_args = []
    for option_name, option_value in display_settings.items():
        context_args.extend((option_name, option_value))

    with pd.option_context(*context_args):
        display(df.head(rows))

In [None]:
def get_pmid(gene, max_results=20):
    """Return PubMed IDs matching ``gene`` from the NCBI E-utilities ESearch endpoint.

    Args:
        gene: Search term, sent as the ``term`` query parameter.
        max_results: Sent as ``retmax`` to cap the number of returned IDs.

    Returns:
        list[str]: Text of every ``<Id>`` element in the XML response. An
        empty list is returned (after printing a message) when the HTTP
        status is not 200, so callers can always iterate or take ``len()``.

    Raises:
        requests.RequestException: On connection problems or when the
            request exceeds the timeout.
        xml.etree.ElementTree.ParseError: If the response body is not
            well-formed XML.
    """
    query = {
        "db": "pubmed",
        "term": gene,
        "retmode": "xml",
        "retmax": max_results,
    }
    # Passing the query as ``params`` lets requests URL-encode the search
    # term instead of splicing it raw into the URL.
    response = requests.get(
        url="https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi",
        params=query,
        timeout=30,  # do not hang the notebook on a stalled connection
    )
    if response.status_code != 200:
        print(f"Lookup of {gene} failed; status: {response.status_code}")
        return []

    root = ET.fromstring(response.text)
    return [id_node.text for id_node in root.findall(".//Id")]

In [None]:
# Load the gene list CSV from the project repository, then discard the
# "Unnamed: 0" column (presumably an index saved with the file — TODO confirm).
gene_df = pd.read_csv(
    "https://raw.githubusercontent.com/ZhaochenYe999/CBB752_FinalProject/refs/heads/main/genelist/genelist_NonSynoymousVariants.csv"
)
gene_df = gene_df.drop(columns=["Unnamed: 0"])

In [None]:
# Keep the 11 rows with the highest Mutation_count; 11 rather than 10
# because of a tie in the counts.
gene_df_sorted = gene_df.sort_values("Mutation_count", ascending=False).iloc[:11]

In [None]:
# Build a long-format table with one (gene, pmid) row per PubMed hit.
genes = gene_df_sorted["Gene_symbol"].tolist()

rows = []

for gene in genes:
    # A failed lookup may yield None instead of a list; treat it as "no hits"
    # so len() and the iteration below cannot raise TypeError.
    pmids = get_pmid(gene) or []
    print(f"{gene}: {len(pmids)} PMIDs")
    rows.extend({"gene": gene, "pmid": pmid} for pmid in pmids)
    # Pause between requests — presumably to stay within NCBI's request-rate
    # guidance for E-utilities clients without an API key.
    time.sleep(0.4)

# Explicit columns keep "gene" and "pmid" present even when no PMIDs were found.
df = pd.DataFrame(rows, columns=["gene", "pmid"])

In [None]:
!pip install biopython

In [None]:
from Bio import Entrez
# Contact address configured on the Entrez module before any request is made.
# NOTE(review): a personal address is hard-coded here — consider reading it
# from configuration or an environment variable instead.
Entrez.email = "inna.cohen@gmail.com"

def get_abstract(pmid):
    """Fetch the plain-text PubMed abstract record for ``pmid`` via ``Entrez.efetch``.

    Args:
        pmid: PubMed identifier; converted with ``str`` before the request.

    Returns:
        str: The text read from the efetch handle, or ``"ERROR: <exception>"``
        when anything raises. Failures are reported in-band on purpose so a
        single bad ID does not abort a column-wide ``apply``.
    """
    try:
        handle = Entrez.efetch(db="pubmed", id=str(pmid), rettype="abstract", retmode="text")
        try:
            return handle.read()
        finally:
            # Release the underlying connection even if read() fails.
            handle.close()
    except Exception as e:
        return f"ERROR: {e}"

# Fetch one abstract per PMID (one request per row).
df["abstract"] = df["pmid"].map(get_abstract)


In [None]:
# Flatten each abstract onto a single line: newlines become spaces, then
# leading and trailing whitespace is removed.
df["abstract"] = df["abstract"].str.replace("\n", " ", regex=False)
df["abstract"] = df["abstract"].str.strip()

In [None]:
# Persist the fetched abstracts (without the index) so later cells can
# reload them instead of querying PubMed again.
df.to_csv(path_or_buf="raw_abstracts.csv", index=False)

In [None]:
# Reload the saved abstracts from disk into a separate frame.
df2 = pd.read_csv(filepath_or_buffer="raw_abstracts.csv")

In [None]:
# Preview the first rows of the reloaded abstracts (notebook cell output).
df2.head()

In [None]:
#!pip install huggingface_hub scispacy
#from huggingface_hub import login
#login()


In [None]:
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
from transformers.pipelines import AggregationStrategy

# Hugging Face model identifier used for both the tokenizer and the model.
model_name = "alvaroalon2/biobert_diseases_ner"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)


# Token-classification ("ner") pipeline used by extract_disease_entities.
# aggregation_strategy="simple" asks the pipeline to group tokens into
# entities; the extraction code reads the "entity_group" and "word" keys of
# each result.
ner = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

def extract_disease_entities(text):
    """Return the ``word`` of every entity that ``ner`` tags as ``DISEASE`` in ``text``.

    Args:
        text: Input passed unchanged to the ``ner`` pipeline.

    Returns:
        list: Entity words in the order the pipeline reported them.
    """
    disease_words = []
    for hit in ner(text):
        if hit['entity_group'] == 'DISEASE':
            disease_words.append(hit['word'])
    return disease_words

In [None]:
def extract_disease_entities(text):
    """Return the unique, lower-cased disease mentions that ``ner`` finds in ``text``.

    Entities whose ``entity_group`` is ``DISEASE`` are collected. A piece
    starting with ``##`` is treated as a word-piece continuation and glued
    onto the preceding disease piece, however many continuations follow.
    Each mention is then stripped of surrounding ``.,:;- `` characters and
    lower-cased.

    Args:
        text: Abstract text. Non-string values (for example the NaN pandas
            uses for a missing cell) and blank strings yield an empty list
            without calling the model.

    Returns:
        list[str]: De-duplicated mentions in first-seen order; never
        contains empty strings.
    """
    # Missing or blank abstracts have nothing to tag.
    if not isinstance(text, str) or not text.strip():
        return []

    raw = ner(text)

    # Only keep disease entities
    entities = [ent['word'] for ent in raw if ent.get('entity_group', '') == 'DISEASE']

    # Join subword tokens: every "##" piece extends the previous mention, so a
    # run such as ["hypo", "##thyroid", "##ism"] becomes a single word.
    merged = []
    for piece in entities:
        fragment = piece.replace("##", "")
        if piece.startswith("##") and merged:
            merged[-1] += fragment
        else:
            merged.append(fragment)

    # Normalise, drop mentions that were only punctuation, and remove
    # duplicates while keeping a deterministic (first-seen) order.
    normalised = (mention.strip(".,:;- ").lower() for mention in merged)
    return list(dict.fromkeys(mention for mention in normalised if mention))


In [None]:
# Sample PubMed abstract record (citation, author list, and abstract body)
# used to spot-check the extractor on a single input.
abstract = """
1. Endokrynol Pol. 2020;71(3):213-226. doi: 10.5603/EP.a2020.0025. Identification of related long non-coding RNAs and mRNAs in subclinical hypothyroidism complicated with type 2 diabetes by transcriptome analysis - a preliminary study. Jiang Q(1)(2)(3), Sun L(4), Lu Y(5), Han S(4), Hou L(5), Lou K(5), Li J(5), Wang L(5), Pang S(5). Author information: (1)Department of Endocrinology, Jinan Central Hospital, Cheeloo College of Medicine, Shandong University, Jinan, China. jiangqiangjinan@sina.com. (2)Department of Endocrinology, Jinan Central Hospital Affliated to Shandong First Medical University, Jinan, China. jiangqiangjinan@sina.com. (3)Central Laboratory, Jinan central Hospital, Cheeloo College of Medicine, Shandong University, Jinan, China. jiangqiangjinan@sina.com. (4)Central Laboratory, Jinan central Hospital, Cheeloo College of Medicine, Shandong University, Jinan, China. (5)Department of Endocrinology, Jinan Central Hospital, Cheeloo College of Medicine, Shandong University, Jinan, China. INTRODUCTION: The pathology mechanism of subclinical hypothyroidism and subclinical hypothyroidism complicated with type 2 diabetes remained uncertain. We aimed to find potential related long non-coding RNAs (lncRNAs) and mRNAs in the above diseases. MATERIAL AND METHODS: Transcriptome sequencing was performed in three patients with subclinical hypothyroidism (S), three patients with subclinical hypothyroidism complicated with type 2 diabetes (SD), and three healthy controls (N). Differentially expressed mRNAs (DEmRNAs) and differentially expressed lncRNAs (DElncRNAs) were screened in S vs. N, SD vs. N, and SD vs. S group, and the nearby and co-expressed DEmRNAs of DElncRNAs were screened in S vs. N and SD vs. N. Moreover, functional analysis of DEmRNAs was then performed by Metascape. RESULTS: In total, 465, 1058, and 943 DEmRNAs were obtained in S vs. N, SD vs. N, SD vs. S, respectively, and 191 overlapping genes were obtained in S vs. N and SD vs. N group. 
Among which, LAIR2, PNMA6A, and SFRP2 were deduced to be involved in subclinical hypothyroidism, and GPR162, APOL4, and ANK1 were deduced to be associated with subclinical hypothyroidism complicated with type 2 diabetes. A total of 50, 100, and 88 DElncRNAs were obtained in S vs. N, SD vs. N and SD vs. S, respectively. Combining with the interaction network of DElncRNA-DEmRNA, PAX8-AS1, co-expressed with KIR3DL1, was identified to function in subclinical hypothyroidism, and JHDM1D-AS1, co-expressed with ANK1, was deduced to play a role in subclinical hypothyroidism complicated with type 2 diabetes. CONCLUSIONS: Dysfunctional lncRNAs and mRNAs may be involved in the development of subclinical hypothyroidism and subclinical hypothyroidism complicated with type 2 diabetes. DOI: 10.5603/EP.a2020.0025 PMID: 36624669
"""

# Print the disease mentions extracted from the sample for a manual check.
print(extract_disease_entities(abstract))


In [None]:
# Run disease extraction on every abstract; each row receives the list
# returned by extract_disease_entities. The Series produced by apply is
# already a fresh object, so it is assigned directly.
df2["disease_entities"] = df2["abstract"].apply(extract_disease_entities)

In [None]:
# Write the annotated table to abstracts.csv. No index=False is passed, so
# the row index is written as the first column.
df2.to_csv(path_or_buf="abstracts.csv")

In [None]:
# Save df2 to abstracts.csv, overwriting any existing file of that name;
# the row index is included because index=False is not passed.
df2.to_csv(path_or_buf="abstracts.csv")

In [None]:
# Preview the first rows of df2 (notebook cell output).
df2.head()

In [None]:
# Export the current contents of df2 to abstracts.csv (row index included,
# since index=False is not passed).
df2.to_csv(path_or_buf="abstracts.csv")

In [None]:
# Display the full df2 frame as the cell output.
df2