In [None]:
import pandas as pd
import requests
from xml.etree import ElementTree

def _element_text(element):
    """Return all text inside ``element``, including text nested in inline markup."""
    # ``element.text`` stops at the first child tag (e.g. <i>, <sup>), which
    # silently truncates titles and abstracts; ``itertext`` walks everything.
    return "".join(element.itertext()).strip()


def _parse_pubmed_article(article_xml, pmid):
    """Build the ``{'pubmedid', 'title', 'abstract'}`` record for one efetch document."""
    title_element = article_xml.find(".//ArticleTitle")
    title = _element_text(title_element) if title_element is not None else ""
    if not title:
        # Records without an ArticleTitle used to raise AttributeError and
        # abort the whole run.
        title = "Title not available"

    # Prefer the primary abstract; fall back to any AbstractText so records
    # that only carry another abstract block still yield text.
    section_elements = (
        article_xml.findall(".//Abstract/AbstractText")
        or article_xml.findall(".//AbstractText")
    )
    sections = []
    for section in section_elements:
        section_text = _element_text(section)
        if not section_text:
            continue
        label = section.get("Label")
        sections.append(f"{label}: {section_text}" if label else section_text)
    abstract = " ".join(sections) if sections else "Abstract not available"

    return {'pubmedid': pmid, 'title': title, 'abstract': abstract}


def get_pubmed_articles(query, max_results=1000, timeout=30, request_delay=0.34):
    """Search PubMed for ``query`` and fetch title and abstract for each hit.

    Parameters
    ----------
    query : str
        Search term sent to the E-utilities ``esearch`` endpoint.
    max_results : int, optional
        Value sent as ``retmax`` (maximum number of PMIDs requested).
    timeout : float, optional
        Seconds to wait for each HTTP request before giving up.
    request_delay : float, optional
        Seconds to pause between consecutive ``efetch`` calls. The default
        keeps the loop under roughly three requests per second; pass ``0``
        to disable the pause.

    Returns
    -------
    tuple
        ``(articles, total_found, errors)`` where ``articles`` is a list of
        dicts with keys ``'pubmedid'``, ``'title'`` and ``'abstract'``,
        ``total_found`` is the hit count reported by the search (or the
        number of PMIDs returned when no count is present), and ``errors``
        is the list of PMIDs that could not be fetched or parsed.

    Raises
    ------
    requests.HTTPError
        If the initial search request returns an error status.
    xml.etree.ElementTree.ParseError
        If the search response is not well-formed XML.
    """
    import time  # local import so the notebook's import cell stays untouched

    search_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    fetch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"

    search_params = {
        "db": "pubmed",
        "retmode": "xml",
        "term": query,
        "retmax": max_results
    }
    search_response = requests.get(search_url, params=search_params, timeout=timeout)
    # Fail loudly here: without a valid search result there is nothing to fetch.
    search_response.raise_for_status()
    search_xml = ElementTree.fromstring(search_response.content)
    pmids = [id_element.text for id_element in search_xml.findall(".//IdList/Id")]

    count_text = (search_xml.findtext(".//Count") or "").strip()
    total_found = int(count_text) if count_text.isdigit() else len(pmids)

    articles = []
    errors = []
    for index, pmid in enumerate(pmids):
        if index and request_delay and request_delay > 0:
            time.sleep(request_delay)

        fetch_params = {
            "db": "pubmed",
            "retmode": "xml",
            "id": pmid
        }
        try:
            fetch_response = requests.get(fetch_url, params=fetch_params, timeout=timeout)
        except requests.RequestException:
            # A single dropped connection should not discard everything
            # collected so far; record the PMID like any other failure.
            fetch_response = None

        article_xml = None
        if fetch_response is not None and fetch_response.status_code == 200:
            try:
                article_xml = ElementTree.fromstring(fetch_response.content)
            except ElementTree.ParseError:
                article_xml = None

        if article_xml is None:
            errors.append(pmid)
            print(f"Error fetching article with PMID {pmid}")
            continue

        articles.append(_parse_pubmed_article(article_xml, pmid))
    return articles, total_found, errors

# Example usage: run one PubMed search and keep both the fetched records and
# the PMIDs that failed.
query = 'Bcl-x'
articles, total_found, errors = get_pubmed_articles(query)

# Fetched records -> one row per article, written to their own workbook.
excel_filename = "pubmed_articles_Bcl.xlsx"
df = pd.DataFrame(articles)
df.to_excel(excel_filename, index=False)
print(f"Excel file '{excel_filename}' saved successfully.")

# Failed PMIDs go to a second workbook (a separate file, not a sheet in the
# first one) so they can be retried later.
errors_excel_filename = "pubmed_errors_Bcl.xlsx"
errors_df = pd.DataFrame({"PubMed IDs with Errors": errors})
errors_df.to_excel(errors_excel_filename, index=False)
print(f"Excel file '{errors_excel_filename}' saved successfully.")


Error fetching article with PMID 37811881
Error fetching article with PMID 33687950
Error fetching article with PMID 32801295
Error fetching article with PMID 31436044
Error fetching article with PMID 30796752
Error fetching article with PMID 30536015
Error fetching article with PMID 30535997
Error fetching article with PMID 28617432
Excel file 'pubmed_articles_Bcl.xlsx' saved successfully.
Excel file 'pubmed_errors_Bcl.xlsx' saved successfully.


In [9]:
# Install medspaCy with the %pip magic so the package lands in the environment
# of the running kernel rather than whatever `pip` the shell resolves to.
# NOTE(review): no cell visible here imports medspacy -- confirm it is still needed.
%pip install medspacy

Collecting medspacy
  Downloading medspacy-1.1.5.tar.gz (111 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m111.2/111.2 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting spacy<3.6,>=3.4.1 (from medspacy)
  Downloading spacy-3.5.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.6/6.6 MB[0m [31m29.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting PyRuSH>=1.0.8 (from medspacy)
  Downloading PyRuSH-1.0.8-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.4/67.4 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
Collecting medspacy-quickumls==3.0 (from medspacy)
  Downloading medspacy_quic

In [1]:
# Upgrade spaCy in the running kernel's environment (%pip instead of !pip).
# NOTE(review): the install log of the scispaCy model archives in this notebook
# shows them requiring spacy<3.5.0, so this unpinned upgrade is presumably
# rolled back by that cell -- confirm whether it is needed at all.
%pip install -U spacy




In [2]:
# Install scispaCy into the running kernel's environment (%pip instead of !pip).
# NOTE(review): the captured log resolved scispacy 0.5.4 while the model
# archives installed in this notebook are v0.5.1 -- confirm the versions are
# meant to differ, and consider pinning.
%pip install scispacy


Collecting scispacy
  Downloading scispacy-0.5.4-py3-none-any.whl (45 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/45.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.4/45.4 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Collecting scipy<1.11 (from scispacy)
  Downloading scipy-1.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.4/34.4 MB[0m [31m39.9 MB/s[0m eta [36m0:00:00[0m
Collecting conllu (from scispacy)
  Downloading conllu-4.5.3-py2.py3-none-any.whl (16 kB)
Collecting nmslib>=1.7.3.6 (from scispacy)
  Downloading nmslib-2.1.1.tar.gz (188 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m188.7/188.7 kB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pysbd (from scispacy)
  Downloading pysbd-0.3.4-py3-none-any.whl (71

In [3]:
# NOTE(review): no visible cell imports a `render` module; the `displacy.render`
# calls in this notebook come from `from spacy import displacy`. This PyPI
# package looks unrelated -- confirm, and drop the cell if so.
%pip install render


Collecting render
  Downloading render-v1.0.0.tar.gz (2.5 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: render
  Building wheel for render (setup.py) ... [?25l[?25hdone
  Created wheel for render: filename=render-1.0.0-py3-none-any.whl size=3038 sha256=43a6cb3f911b9c45bd7c4bd5f72a03f860758cc902776344838050efebb7596d
  Stored in directory: /root/.cache/pip/wheels/05/30/61/64c5e91469799f7e7bfff7c6f9047ab73e856f389f898e9e67
Successfully built render
Installing collected packages: render
Successfully installed render-1.0.0


In [5]:
import scispacy
import spacy
#Core models
import en_core_sci_sm
import en_core_sci_md
#NER specific models
import en_ner_bc5cdr_md
#Tools for extracting & displaying data
from spacy import displacy
import pandas as pd

In [7]:
# Load the small scispaCy core pipeline once; it is reused to process the sample text.
nlp_sm = en_core_sci_sm.load()


In [12]:
# Sample abstract text used as input for the entity-extraction demos.
text="Cancer research has emphasized the Bcl-2 family of proteins because of their interaction in apoptosis process, a critical mechanism that regulates cellular survival and death. Recently small molecules from diverse sources have gained much attention in anticancer research due to their promising inhibitory action against Bcl-2 and Bcl-XL that are pointedly known as the members of anti-apoptotic Bcl-2 family of proteins. Pinostrobin (PN) is a natural flavonoid with diverse pharmacological potential emerged as a molecule of interest as anticancer agent. The present study aims to screen the interaction of PN with anti-apoptotic protagonists Bcl-2 and Bcl- XL at the molecular level through docking studies."
# Run the small core pipeline over the text; nothing is displayed here,
# the resulting doc is visualised in a separate cell.
doc = nlp_sm(text)


In [13]:
# Show the entities of `doc` inline using displaCy's 'ent' style.
# NOTE(review): with jupyter=True displaCy displays the markup itself, so
# `displacy_image` presumably ends up as None -- confirm before reusing it.
displacy_image = displacy.render(doc, jupyter=True,style='ent')


In [14]:
# Load the BC5CDR NER model and re-process the same sample text with it.
nlp_bc = en_ner_bc5cdr_md.load()
# NOTE: this rebinds `doc`, replacing the result produced with `nlp_sm`.
doc = nlp_bc(text)
# Display the entities extracted by the BC5CDR model inline.
displacy_image = displacy.render(doc, jupyter=True,style='ent')

In [4]:
# Install the scispaCy model archives (small core, medium core, BC5CDR NER)
# into the running kernel's environment (%pip instead of !pip).
# NOTE(review): this cell sits after the cell that imports en_core_sci_sm,
# en_core_sci_md and en_ner_bc5cdr_md, so a top-to-bottom run on a fresh
# kernel would reach those imports first -- move it above the import cell.
%pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_sm-0.5.1.tar.gz
%pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_md-0.5.1.tar.gz
%pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_ner_bc5cdr_md-0.5.1.tar.gz

Collecting https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_sm-0.5.1.tar.gz
  Downloading https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_sm-0.5.1.tar.gz (15.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.9/15.9 MB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting spacy<3.5.0,>=3.4.1 (from en-core-sci-sm==0.5.1)
  Downloading spacy-3.4.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.5/6.5 MB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
Collecting thinc<8.2.0,>=8.1.0 (from spacy<3.5.0,>=3.4.1->en-core-sci-sm==0.5.1)
  Downloading thinc-8.1.12-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (919 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m919.6/919.6 kB[0m [31m24.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting wasabi