# Publication Mining from PubTator Central

### Prerequisite Libraries

In [5]:
import calendar
import os
import time
from urllib.parse import quote

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

### Mining Config

In [6]:
# Search-results URL template: the first {} receives the query text and the
# second {} receives the page number.
url = "https://www.ncbi.nlm.nih.gov/research/pubtator/?view=docsum&query={}&page={}"
# Search phrase to mine; it is also embedded in the exported CSV file name.
query = "breast cancer"

### Functions

In [7]:
# Prefix template applied to every progress message printed by the helpers below.
logs_str = "Logs: {}"
# One PhantomJS session, created when this cell runs and shared by all page
# fetches; the executable path is relative to the working directory.
# NOTE(review): PhantomJS support was deprecated in Selenium 3.8 and removed in
# Selenium 4 - confirm the installed Selenium version still provides it.
browser = webdriver.PhantomJS(executable_path="./libs/phantomjs-2.1.1/phantomjs")

def get_publication_perpage(query, page):
    """Scrape one PubTator search-result page and return its publications.

    The page is loaded in the module-level ``browser``; the function waits up
    to 10 seconds for an element with class ``publication`` to be present and
    then parses the rendered HTML with BeautifulSoup.

    Args:
        query: Raw (not yet URL-encoded) search text. It is percent-encoded
            here before being substituted into the ``url`` template.
        page: Page number substituted into the ``url`` template.

    Returns:
        A flat 1-D NumPy array laid out as ``[pmid, title, pmid, title, ...]``.
        Entries that lack a ``data-pmid`` attribute or a ``publication-title``
        link are logged and skipped so the pmid/title pairing stays aligned.

    Raises:
        TimeoutException: If no ``publication`` element appears within the
            10-second wait.
    """
    print(logs_str.format("Getting page {}".format(page)))

    # Percent-encode the query so that spaces and reserved characters such as
    # '&', '#' or '+' cannot truncate or corrupt the URL's query string.
    encoded_query = quote(str(query), safe="")
    browser.get(url.format(encoded_query, page))

    wait = WebDriverWait(browser, 10)
    wait.until(EC.presence_of_element_located((By.CLASS_NAME, "publication")))

    soup = BeautifulSoup(browser.page_source, 'html.parser')

    # Accumulate in a plain list and build the array once; np.append inside
    # the loop would reallocate and copy the whole array on every iteration.
    records = []
    for entry in soup.find_all('div', {'class': 'publication'}):
        title_link = entry.find('a', {'class': 'publication-title'})
        pmid = entry.get('data-pmid')
        if title_link is None or pmid is None:
            # find()/get() return None when the node/attribute is absent;
            # calling .strip()/.get_text() on None would abort the whole run.
            print(logs_str.format("Skipping entry without PMID or title on page {}".format(page)))
            continue
        records.extend((pmid.strip(), title_link.get_text().strip()))

    return np.array(records)

def get_publications(query, start_page=1, target_page=1):
    """Scrape a contiguous range of result pages and merge their publications.

    Args:
        query: Search text forwarded to ``get_publication_perpage``.
        start_page: First page to fetch (inclusive).
        target_page: Last page to fetch (inclusive).

    Returns:
        A flat 1-D NumPy array ``[pmid, title, pmid, title, ...]`` covering
        every fetched page in page order. If ``start_page > target_page`` no
        page is fetched and an empty array is returned.
    """
    print(logs_str.format("🔨 Getting publications for query: {}".format(query)))
    print(logs_str.format("Start working from page {} to page {}".format(start_page, target_page)))

    # Gather the per-page arrays first and join them once at the end:
    # np.append in the loop copies the entire accumulated array for each page.
    page_chunks = [
        get_publication_perpage(query, page)
        for page in range(start_page, target_page + 1)
    ]
    # np.concatenate rejects an empty sequence, so handle the empty range here.
    pubs = np.concatenate(page_chunks) if page_chunks else np.array([])

    print("-" * 50)
    print(logs_str.format("🌟 Job done!"))

    return pubs

def transorm_to_df(pubs):
    """Turn a flat ``[pmid, title, pmid, title, ...]`` array into a table.

    Args:
        pubs: NumPy array whose total element count is a multiple of two.

    Returns:
        A DataFrame with one row per publication and the columns
        ``PMID`` and ``Publication_Title``.
    """
    column_names = ['PMID', 'Publication_Title']
    # Fold the flat sequence into rows of (pmid, title).
    rows = pubs.reshape(-1, len(column_names))
    return pd.DataFrame(rows, columns=column_names)

### Mining

In [8]:
# Scrape result pages 1 through 324 for the configured query, then tabulate
# the flat result array into a two-column DataFrame.
pubs = get_publications(query, start_page=1, target_page=324)
pubs_df = transorm_to_df(pubs)

# Left as the final expression so the summary is rendered as the cell output.
pubs_df.describe()

Logs: 🔨 Getting publications for query: breast cancer
Logs: Start working from page 1 to page 324
Logs: Getting page 1
Logs: Getting page 2
Logs: Getting page 3
Logs: Getting page 4
Logs: Getting page 5
Logs: Getting page 6
Logs: Getting page 7
Logs: Getting page 8
Logs: Getting page 9
Logs: Getting page 10
Logs: Getting page 11
Logs: Getting page 12
Logs: Getting page 13
Logs: Getting page 14
Logs: Getting page 15
Logs: Getting page 16
Logs: Getting page 17
Logs: Getting page 18
Logs: Getting page 19
Logs: Getting page 20
Logs: Getting page 21
Logs: Getting page 22
Logs: Getting page 23
Logs: Getting page 24
Logs: Getting page 25
Logs: Getting page 26
Logs: Getting page 27
Logs: Getting page 28
Logs: Getting page 29
Logs: Getting page 30
Logs: Getting page 31
Logs: Getting page 32
Logs: Getting page 33
Logs: Getting page 34
Logs: Getting page 35
Logs: Getting page 36
Logs: Getting page 37
Logs: Getting page 38
Logs: Getting page 39
Logs: Getting page 40
Logs: Getting page 41
Logs: Get

Unnamed: 0,PMID,Publication_Title
count,4860,4860
unique,4860,4859
top,36800640,ESGO/ESHRE/ESGE Guidelines for the fertility-s...
freq,1,2


### Export Data to CSV

In [9]:
# Output directory, relative to the notebook's working directory.
file_path = "./../data/"
# Create the directory up front: to_csv does not create missing parent
# directories, and failing here would discard the scraped results.
os.makedirs(file_path, exist_ok=True)

# Current UTC time as whole epoch seconds, used to make the file name unique.
ts = calendar.timegm(time.gmtime())
num_pubs = pubs_df.shape[0]
# File name pattern: <timestamp>-pubs-<query with dashes>-<row count>.csv
file_name = "{}-pubs-{}-{}.csv".format(ts, query.replace(" ", "-"), num_pubs)

pubs_df.to_csv(os.path.join(file_path, file_name), index=False)