In [1]:
import random
from concurrent import futures
from datetime import datetime
from pathlib import Path
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from tqdm import tqdm

In [2]:
# Query string sent to the PubMed search; every cell below scrapes the results for this term.
SEARCH_TERM = "koala"

In [3]:
# Collect the article URLs from every result page of the PubMed search.
options = Options()
# The `options.headless = True` setter was deprecated and then removed in newer
# Selenium 4 releases (where the assignment silently does nothing); passing the
# command-line flag works on old and new versions alike.
options.add_argument("-headless")
options.set_preference("javascript.enabled", True)
options.set_preference(
    "general.useragent.override",
    "Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 Firefox/38.0",
)

# URL-encode the term so spaces and characters such as "&" do not corrupt the query.
url = (
    "https://www.ncbi.nlm.nih.gov/pubmed/?term="
    + quote_plus(SEARCH_TERM)
    + "&filter=simsearch1.fha"
)
base_url = "https://pubmed.ncbi.nlm.nih.gov"
url_list = []

browser = webdriver.Firefox(options=options)
try:
    browser.get(url)
    soup = BeautifulSoup(browser.page_source, "lxml")

    # The pager input carries the total page count in its "max" attribute.
    # When it is absent (e.g. no results) fall back to scanning a single page.
    page_input = soup.find(id="page-number-input")
    max_attr = page_input.get("max") if page_input is not None else None
    max_page = int(max_attr) if max_attr else 1

    for page_num in tqdm(range(max_page)):
        # Each result on the preview page links to its article via the first <a>.
        soup = BeautifulSoup(browser.page_source, "lxml")
        for doc in soup.find_all(class_="docsum-content"):
            link = doc.find("a")
            href = link.get("href") if link is not None else None
            if href:
                url_list.append(base_url + href)

        # Advance to the next result page, except after the last one.
        if page_num < max_page - 1:
            browser.find_element(
                "xpath", '//*[@title="Navigate to the next page of results."]'
            ).click()
finally:
    # Always shut the browser down, even if scraping fails part-way.
    browser.quit()

print(f"Found papers: {len(url_list)}")


100%|██████████| 93/93 [02:50<00:00,  1.83s/it]


Found papers: 928


In [15]:
def _text_or_none(node):
    """Return ``node.text``, or ``None`` when the element was not found."""
    return node.text if node is not None else None


def _parse_paper(soup, article_url):
    """Build one paper record (a dict) from a parsed PubMed article page.

    Fields whose element is missing from the page are set to ``None``
    (or an empty list for authors) instead of raising, so one unusual
    page does not abort a whole batch.
    """
    title = _text_or_none(soup.find(class_="heading-title"))
    abstract = _text_or_none(soup.find("div", {"id": "eng-abstract"}))
    # select_one returns None when no paragraph mentions "Keywords".
    keywords = _text_or_none(soup.select_one('p:-soup-contains("Keywords")'))

    authors_div = soup.find("div", {"class": "authors"})
    author_items = (
        authors_div.find_all(class_="authors-list-item")
        if authors_div is not None
        else []
    )
    author_list = []
    for author_doc in author_items:
        link = author_doc.find("a")
        if link is not None:
            # The author's name is read from the link's data-ga-label attribute.
            author_list.append(link.get("data-ga-label"))

    citation_doc = soup.find(class_="article-source")
    citation = (
        _text_or_none(citation_doc.find(class_="cit"))
        if citation_doc is not None
        else None
    )

    return {
        # The PMID is the last path segment, with or without a trailing slash.
        "pmid": int(article_url.rstrip("/").split("/")[-1]),
        "title": title,
        "abstract": abstract,
        "keywords": keywords,
        "author_list": author_list,
        "url": article_url,
        "citation": citation,
    }


def get_paper_info(url_list):
    """Scrape title, abstract, keywords, authors and citation for each article.

    Opens one headless Firefox session, visits every URL in ``url_list`` in
    order, and closes the session again (also when an error occurs).

    Parameters
    ----------
    url_list : iterable of str
        PubMed article URLs of the form ``https://pubmed.ncbi.nlm.nih.gov/<pmid>/``.

    Returns
    -------
    list of dict
        One record per URL with the keys ``pmid``, ``title``, ``abstract``,
        ``keywords``, ``author_list``, ``url`` and ``citation``. An empty
        input returns ``[]`` without starting a browser.
    """
    # Materialise first: works for lists, NumPy arrays and generators, and lets
    # us test for emptiness without the ambiguous truth value of an array.
    url_list = list(url_list)
    if not url_list:
        return []

    options = Options()
    # The `options.headless = True` setter was deprecated and then removed in
    # newer Selenium 4 releases; the command-line flag works on all versions.
    options.add_argument("-headless")
    options.set_preference("javascript.enabled", True)
    options.set_preference(
        "general.useragent.override",
        "Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 Firefox/38.0",
    )
    browser = webdriver.Firefox(options=options)

    papers = []
    try:
        for article_url in url_list:
            browser.get(article_url)
            soup = BeautifulSoup(browser.page_source, "lxml")
            papers.append(_parse_paper(soup, article_url))
    finally:
        # Never leave a Firefox process behind, even if a page fails to load.
        browser.quit()
    return papers


In [16]:
# Process the papers in batches: every batch is scraped by one worker thread
# that runs its own browser session inside get_paper_info.
n_parts = 32
n_workers = 4

# array_split yields empty chunks when there are fewer URLs than n_parts; drop
# them so no worker starts a browser for nothing. tolist() turns the NumPy
# string scalars back into plain Python str.
batches = [
    batch.tolist()
    for batch in np.array_split(np.array(url_list), n_parts)
    if len(batch) > 0
]

papers = []
with tqdm(total=len(batches)) as pbar:
    with futures.ThreadPoolExecutor(max_workers=n_workers) as executor:
        pending = [executor.submit(get_paper_info, batch) for batch in batches]
        # Results are gathered in completion order, so the order of `papers`
        # does not follow the order of `url_list`.
        for future in futures.as_completed(pending):
            papers.extend(future.result())
            pbar.update(1)


100%|██████████| 32/32 [16:08<00:00, 30.27s/it]  


In [17]:
# Persist the raw scrape so later steps do not have to repeat it.
papers_df = pd.DataFrame(papers)
output_path = Path("data/pubmed_dump_raw.parquet")
# to_parquet does not create missing directories; make sure the target exists
# so a fresh checkout does not fail after the long scraping run.
output_path.parent.mkdir(parents=True, exist_ok=True)
papers_df.to_parquet(output_path)

In [20]:
# Display the scraped records; title/abstract/keywords still carry the raw page whitespace.
papers_df

Unnamed: 0,pmid,title,abstract,keywords,author_list,url,citation
0,30656465,\n \n \n \n \n \n \n Molecu...,\n\n \n Koala retrovirus (KoRV) is a...,,"[Mohammad Enamul Hoque Kayesh, Osamu Yamato, M...",https://pubmed.ncbi.nlm.nih.gov/30656465/,2019 Mar;164(3):757-765.
1,28669101,\n \n \n \n \n \n \n Charac...,\n\n \n Koala (Phascolarctos cinereu...,\n\n Keywords:\n \n \n ...,"[Yuanyuan Cheng, Adam Polkinghorne, Amber Gill...",https://pubmed.ncbi.nlm.nih.gov/28669101/,2018 Feb;70(2):125-133.
2,29967444,\n \n \n \n \n \n \n Adapta...,"\n\n \n The koala, the only extant s...",,"[Rebecca N Johnson, Denis O'Meally, Zhiliang C...",https://pubmed.ncbi.nlm.nih.gov/29967444/,2018 Aug;50(8):1102-1111.
3,32470998,\n \n \n \n \n \n \n Inbree...,\n\n \n Habitat destruction and frag...,\n\n Keywords:\n \n \n ...,"[Anthony J Schultz, Romane H Cristescu, Jon Ha...",https://pubmed.ncbi.nlm.nih.gov/32470998/,2020 Jul;29(13):2416-2430.
4,31848216,\n \n \n \n \n \n \n Quanti...,\n\n \n The morphology and locomotor...,\n\n Keywords:\n \n \n ...,"[Joshua L Gaschk, Celine H Frère, Christofer J...",https://pubmed.ncbi.nlm.nih.gov/31848216/,2019 Dec 17;222(Pt 24):jeb207506.
...,...,...,...,...,...,...,...
923,33257543,\n \n \n \n \n \n \n Finger...,\n\n \n Fingerprints are unique to p...,\n\n Keywords:\n \n \n ...,"[Seoung-Mok Yum, In-Keun Baek, Dongpyo Hong, J...",https://pubmed.ncbi.nlm.nih.gov/33257543/,2020 Dec 15;117(50):31665-31673.
924,36161902,\n \n \n \n \n \n \n Functi...,\n\n \n Lorises are a group of globa...,\n\n Keywords:\n \n \n ...,"[Ming-Li Li, Sheng Wang, Penghui Xu, Hang-Yu T...",https://pubmed.ncbi.nlm.nih.gov/36161902/,2022 Oct 4;119(40):e2123030119.
925,24906475,\n \n \n \n \n \n \n Chemic...,\n\n \n Structural characterizations...,,"[Tadasu Urashima, Saori Fujita, Kenji Fukuda, ...",https://pubmed.ncbi.nlm.nih.gov/24906475/,2014 Jul;31(5):387-99.
926,25197935,\n \n \n \n \n \n \n A gala...,\n\n \n A specific galactose-binding...,,"[Imtiaj Hasan, Miharu Watanabe, Naoto Ishizaki...",https://pubmed.ncbi.nlm.nih.gov/25197935/,2014 Sep 5;19(9):13990-4003.
