In [21]:
"""
This script runs through all the authors of a PubMed paper, starting from the paper's URL.
It collects the abstracts of the 5 most recent papers of each author credited on that paper.
"""

from splinter import Browser
from bs4 import BeautifulSoup as bs
import pandas as pd
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import re

# Chromedriver setup
def init_browser(executable_path='/usr/local/bin/chromedriver', headless=False):
    """Start a Chrome session driven by splinter.

    Args:
        executable_path: Location of the chromedriver binary. Defaults to the
            path this notebook was originally written against; pass another
            path when chromedriver is installed elsewhere.
        headless: When False (the default) a visible Chrome window is opened,
            which is convenient for debugging; pass True to run without a
            window.

    Returns:
        The splinter ``Browser`` instance. The caller is responsible for
        calling ``quit()`` on it.
    """
    return Browser('chrome', executable_path=executable_path, headless=headless)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/amosfung/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [36]:
#visit the web page
browser = init_browser()

base_url = "https://pubmed.ncbi.nlm.nih.gov" #pubmed base url
article_url = "https://pubmed.ncbi.nlm.nih.gov/32877576/"

authors = []  # one dict per author: "name", "href" and (after scraping) "abstracts"
try:
    #get list of authors and hrefs to their searches
    browser.visit(article_url)
    soup = bs(browser.html, "html.parser")
    authors_div = soup.find("div", class_="authors")
    if authors_div is None:
        # Fail with a readable message instead of an AttributeError on None.
        raise RuntimeError(f"No author list found at {article_url}")
    for child in authors_div.find_all("a", class_='full-name'):
        authors.append({
            "name": child.get_text(),
            "href": child['href']
        })

    for author in authors:
        print(f'Collecting abstracts for {author["name"]}')

        #go to date sorted results for author
        browser.visit(base_url + author['href'] + "&sort=date")
        soup = bs(browser.html, "html.parser")

        # hrefs for the 5 most recent articles listed for this author
        article_hrefs = [
            article['href']
            for article in soup.find_all("a", class_="docsum-title", limit=5)
        ]

        #go to each article by href and keep the abstract text if it exists
        abstract_texts = []
        for article_href in article_hrefs:
            browser.visit(base_url + article_href)
            soup = bs(browser.html, "html.parser")
            abstract = soup.find(id="enc-abstract")
            if abstract:
                abstract_texts.append(abstract.get_text())

        #add the combined text to the author's dict; the space separator keeps
        #the last word of one abstract from fusing with the first of the next
        author["abstracts"] = " ".join(abstract_texts)
finally:
    # Always shut the browser down, even if a page visit or parse step fails.
    browser.quit()


Collecting abstracts for Cheryl Keech
Collecting abstracts for Gary Albert
Collecting abstracts for Iksung Cho
Collecting abstracts for Andreana Robertson
Collecting abstracts for Patricia Reed
Collecting abstracts for Susan Neal
Collecting abstracts for Joyce S Plested
Collecting abstracts for Mingzhu Zhu
Collecting abstracts for Shane Cloney-Clark
Collecting abstracts for Haixia Zhou
Collecting abstracts for Gale Smith
Collecting abstracts for Nita Patel
Collecting abstracts for Matthew B Frieman
Collecting abstracts for Robert E Haupt
Collecting abstracts for James Logue
Collecting abstracts for Marisa McGrath
Collecting abstracts for Stuart Weston
Collecting abstracts for Pedro A Piedra
Collecting abstracts for Chinar Desai
Collecting abstracts for Kathleen Callahan
Collecting abstracts for Maggie Lewis
Collecting abstracts for Patricia Price-Abbott
Collecting abstracts for Neil Formica
Collecting abstracts for Vivek Shinde
Collecting abstracts for Louis Fries
Collecting abstracts 

In [40]:
# Build a DataFrame from the scraped author records: one row per author, with the
# dict keys ("name", "href", "abstracts") as columns. Text cleaning is applied in a later cell.
df = pd.DataFrame(authors)

In [17]:
def cleanHtml(sentence):
    """Replace every HTML-style tag in ``sentence`` with a single space.

    A tag is any shortest run of characters starting with ``<`` and ending
    with ``>`` that does not span a newline. The input is converted with
    ``str()`` first, so non-string values are accepted.

    Returns:
        The text with each tag replaced by one space.
    """
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub(' ', str(sentence))


def cleanPunc(sentence):
    """Strip a fixed set of punctuation characters from ``sentence``.

    Steps, in order:
      1. Delete ``?``, ``!``, ``'``, ``"``, ``#`` and ``|``. Inside a regex
         character class ``|`` is a literal pipe, not an alternation, so it is
         deleted as well.
      2. Replace ``.``, ``,``, ``)``, ``(`` and ``/`` with a space.
      3. Trim leading and trailing whitespace.
      4. Turn any remaining newline into a space.

    Other punctuation (for example ``:``, ``;``, ``-`` or a backslash) is left
    untouched.
    """
    without_marks = re.sub(r'[?|!|\'|"|#]', r'', sentence)
    spaced_out = re.sub(r'[.|,|)|(|\|/]', r' ', without_marks)
    return spaced_out.strip().replace("\n", " ")


# stem data to combine words with similar meanings w/ snowball stemmer
stemmer = SnowballStemmer("english")
def stemming(sentence):
    """Stem every whitespace-separated word of ``sentence``.

    Each word is passed through the module-level English Snowball ``stemmer``
    and the stems are joined back together with single spaces. Leading and
    trailing whitespace is removed from the result.
    """
    stems = (stemmer.stem(token) for token in sentence.split())
    return " ".join(stems).strip()

#update list of stopwords from nltk
stop_words = set(stopwords.words('english'))
#remove some common terms used in pubmed abstract
stop_words.update(['background', 'methods', 'results', 'conclusions'])
# Case-insensitive pattern matching a whole stop word followed by either one
# non-word character or the end of the text. The end-of-text alternative is
# needed so that a stop word that is the very last token is matched too.
# Words are escaped so they are always treated literally, and sorted so the
# compiled pattern does not depend on set iteration order.
re_stop_words = re.compile(
    r"\b(" + "|".join(re.escape(word) for word in sorted(stop_words)) + r")(?:\W|$)",
    re.I,
)
#function to remove stop words
def removeStopWords(sentence):
    """Return ``sentence`` with every match of ``re_stop_words`` replaced by one space.

    ``re_stop_words`` is the module-level compiled pattern; it is only read
    here, so no ``global`` declaration is required.
    """
    return re.sub(re_stop_words, " ", sentence)

In [18]:
#apply cleaning functions to abstract text, one step per line:
#lower-case -> strip tags -> strip punctuation -> (optional stemming) -> drop stop words
df['abstracts'] = (
    df['abstracts']
    .str.lower()
    .apply(cleanHtml)
    .apply(cleanPunc)
    # .apply(stemming)  #optionally use stemming
    .apply(removeStopWords)
)

In [19]:
# Write the cleaned table to authors.csv, resolved relative to the current working
# directory. With pandas' defaults the row index is written as the first column.
df.to_csv("authors.csv")