In [1]:
# TODO
# Institution


In [2]:
from selenium import webdriver
from tqdm import tqdm
import pandas as pd
from bs4 import BeautifulSoup
from selenium.webdriver.firefox.options import Options
import random
import numpy as np
from concurrent import futures
from datetime import datetime


In [3]:
SEARCH_TERM='koala'

In [5]:
# get article URLs
options = Options()
options.headless = True
options.set_preference("javascript.enabled", True)
options.set_preference(
    "general.useragent.override",
    "Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 Firefox/38.0",
)
browser = webdriver.Firefox(options=options)
url = "https://www.ncbi.nlm.nih.gov/pubmed/?term=" + SEARCH_TERM + "&filter=simsearch1.fha"
browser.get(url)
soup = BeautifulSoup(browser.page_source, "lxml")
max_page = int(soup.find(id="page-number-input").get("max"))
base_url = "https://pubmed.ncbi.nlm.nih.gov"
papers_list = []
url_list = []

for page_num in tqdm(range(max_page)):

    # article page preview
    titles = []
    doc_urls = []
    soup = BeautifulSoup(browser.page_source, "lxml")
    docs = soup.find_all(class_="docsum-content")

    # get article urls from preview page
    for doc in docs:
        titles.append(doc.find("a").get_text())
        doc_urls.append(doc.find("a").get("href"))
    article_urls = [base_url + doc_ulr for doc_ulr in doc_urls]

    url_list.extend(article_urls)
    # browser.implicitly_wait(1)
    if page_num < max_page - 1:
        browser.find_element(
            "xpath", '//*[@title="Navigate to the next page of results."]'
        ).click()

browser.quit()
print(f"Found papers: {len(url_list)}")


100%|██████████| 93/93 [03:11<00:00,  2.06s/it]


Found papers: 927


In [6]:
len(url_list)


927

In [19]:
def get_paper_info(url_list):
    options = Options()
    options.headless = True
    options.set_preference("javascript.enabled", True)
    options.set_preference(
        "general.useragent.override",
        "Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 Firefox/38.0",
    )
    browser = webdriver.Firefox(options=options)

    papers = []
    # print('O')
    for article_url in url_list:
        # print('X')
        # browser.implicitly_wait(random.randint(1, 2))
        browser.get(article_url)
        soup = BeautifulSoup(browser.page_source, "lxml")
        title = soup.find(class_="heading-title").text
        abstract = soup.find("div", {"id": "enc-abstract"}).text

        try:
            keywords = soup.select_one('p:-soup-contains("Keywords")').text
        except:
            keywords = None

        authors_soup = soup.find("div", {"class": "authors"}).find_all(
            class_="authors-list-item"
        )
        author_list = []
        for author_doc in authors_soup:
            author_name = author_doc.find("a").get("data-ga-label")
            author_list.append(author_name)
        author_list


        citation_doc = soup.find(class_="article-source")
        if citation_doc:
            citation=citation_doc.find(class_="cit").text
        else:
            citation=None
        
        papers.append(
            {
                "pmid": int(article_url.split("/")[-2]),
                "title": title,
                "abstract": abstract,
                "keywords": keywords,
                "author_list": author_list,
                "url": article_url,
                "citation": citation,
            }
        )

    browser.quit()
    return papers


In [20]:
# processs papers in batch
n_parts = 32
batches = np.array_split(np.array(url_list), n_parts)

with tqdm(total=len(batches)) as pbar:
    with futures.ThreadPoolExecutor(max_workers=4) as executor:

        future_dict = {executor.submit(get_paper_info, arg): arg for arg in batches}
        papers = []
        for future in futures.as_completed(future_dict):
            arg = future_dict[future]
            papers.append(future.result())
            pbar.update(1)

papers = [item for sublist in papers for item in sublist]


100%|██████████| 32/32 [12:38<00:00, 23.69s/it]


In [26]:
# data pre-processing


from pandas import NaT


def parse_date(citation):
    if not citation:
        return NaT
    
    date_str = citation.split(";")[0][0:4]
    date = datetime.strptime(date_str, "%Y")
    # if len(date_str) > 8:
    #     date = datetime.strptime(date_str, "%Y %b %d")
    # elif 8 >= len(date_str) > 4:
    #     date = datetime.strptime(date_str, "%Y %b")
    # else:
    #     date = datetime.strptime(date_str, "%Y")
    return date.year


def parse_keywords(key_str):
    if not key_str:
        return

    key_str = key_str.replace("Keywords:", "")
    key_str = key_str.strip()
    key_list = key_str.split("; ")
    return key_list


papers_df = pd.DataFrame(papers)
papers_df.title = papers_df.title.apply(str.strip)
papers_df.abstract = papers_df.abstract.apply(str.strip)
papers_df.abstract = papers_df.abstract.apply(str.replace, args=("\n", ""))
papers_df.keywords = papers_df.keywords.apply(parse_keywords)
papers_df = papers_df.assign(date=papers_df.citation.apply(parse_date))
papers_df


Unnamed: 0,pmid,title,abstract,keywords,author_list,url,citation,date
0,33546104,The Koala Immune Response to Chlamydial Infect...,Chlamydia is a significant pathogen for many s...,"[Chlamydia, immunity, koalas, vaccines.]","[Bonnie L Quigley, Peter Timms]",https://pubmed.ncbi.nlm.nih.gov/33546104/,2021 Feb 3;11(2):380.,2021
1,32770481,"Koala retrovirus epidemiology, transmission mo...",Koala retrovirus (KoRV) is a major threat to k...,,"[Mohammad Enamul Hoque Kayesh, Md Abul Hashem,...",https://pubmed.ncbi.nlm.nih.gov/32770481/,2020 Nov;165(11):2409-2417.,2020
2,29382557,Koala immunology and infectious diseases: How ...,Infectious diseases are contributing to the de...,"[Chlamydia, Immune system, Koala, Koala retrov...","[Danielle Madden, Alessandra Whaite, Elizabeth...",https://pubmed.ncbi.nlm.nih.gov/29382557/,2018 May;82:177-185.,2018
3,24148555,Koala retroviruses: characterization and impac...,Koala retroviruses (KoRV) have been isolated f...,,"[Joachim Denner, Paul R Young]",https://pubmed.ncbi.nlm.nih.gov/24148555/,2013 Oct 23;10:108.,2013
4,26958909,Koala Retroviruses: Evolution and Disease Dyna...,A retroviral etiology for malignant neoplasias...,"[Phascolarctos cinereus, SLC19A2, SLC20A1, end...","[Wenqin Xu, Maribeth V Eiden]",https://pubmed.ncbi.nlm.nih.gov/26958909/,2015 Nov;2(1):119-34.,2015
...,...,...,...,...,...,...,...,...
922,33257543,Fingerprint ridges allow primates to regulate ...,Fingerprints are unique to primates and koalas...,"[capillary evaporation, epidermal ridge functi...","[Seoung-Mok Yum, In-Keun Baek, Dongpyo Hong, J...",https://pubmed.ncbi.nlm.nih.gov/33257543/,2020 Dec 15;117(50):31665-31673.,2020
923,36161902,Functional genomics analysis reveals the evolu...,Lorises are a group of globally threatened str...,"[adaptive evolution, demographic history, slow...","[Ming-Li Li, Sheng Wang, Penghui Xu, Hang-Yu T...",https://pubmed.ncbi.nlm.nih.gov/36161902/,2022 Oct 4;119(40):e2123030119.,2022
924,24906475,Chemical characterization of milk oligosacchar...,Structural characterizations of marsupial milk...,,"[Tadasu Urashima, Saori Fujita, Kenji Fukuda, ...",https://pubmed.ncbi.nlm.nih.gov/24906475/,2014 Jul;31(5):387-99.,2014
925,25197935,A galactose-binding lectin isolated from Aplys...,A specific galactose-binding lectin was shown ...,,"[Imtiaj Hasan, Miharu Watanabe, Naoto Ishizaki...",https://pubmed.ncbi.nlm.nih.gov/25197935/,2014 Sep 5;19(9):13990-4003.,2014


In [27]:
papers_df.to_parquet("pubmed_dump.parquet")


url='https://pubmed.ncbi.nlm.nih.gov/32556174/'
browser = webdriver.Firefox()
browser.get(url)
soup = BeautifulSoup(browser.page_source, "lxml")

