# Article PDFs download

The AK API provided a list of articles (publications_eng.csv), each with a list of source links (URLs) and, for some articles, a DOI.  

The below code uses the provided information (URLs and DOI) to search for and try to download a PDF of each of the identified articles

We identified three methods for downloading PDFs but only used methods 1 and 2: method 3 was the least reliable, as it carried the greatest risk of downloading files other than the sought-after article.

## 0: Set up

In [None]:
from requests import get 
from bs4 import BeautifulSoup
from functools import reduce
import pandas as pd
import re, json, ast, os

In [None]:
#folder that every downloaded PDF is written to
base_dir = 'papers'

#table of articles to search for (paper ids, candidate URLs and DOIs are read from it below)
publications = pd.read_csv(
    'data_files/publications_eng.csv',
    encoding="ISO-8859-1",
)

#one row per article to search
n_articles = len(publications.index)
print('number of articles to search: ', n_articles)

In [None]:
#note all unique paper ids; 0 = no PDF yet, 1 = PDF already available
paper_ids = list(publications['paper_id'].unique())
found_papers = dict.fromkeys(paper_ids, 0)

#in cases that need to run the script multiple times
#get information on which papers have pdf file for already
#the paper id is taken from the name of the folder that contains the PDF
#NOTE(review): this scans 'papers_final' while downloads are saved flat into
#base_dir as <paper_id>.pdf -- confirm the files are moved into per-paper folders between runs
rootdir = 'papers_final'
for subdir, dirs, files in os.walk(rootdir):
    if not any(file.endswith(".pdf") for file in files):
        continue
    try:
        #basename works with either path separator, unlike splitting on '/'
        paper_id = int(os.path.basename(subdir))
    except ValueError:
        #PDF stored in a folder that is not named after a paper id -- ignore it
        continue
    found_papers[paper_id] = 1

In [None]:
def extract_pdf(url, base_dir, name, timeout=30):
    """
    take url and make request, if retrieved content is pdf, save to base_dir as <name>.pdf
    indicate whether pdf has been found

    url: address to request
    base_dir: directory the PDF is written to (created if it does not exist yet)
    name: file name without extension (converted with str())
    timeout: seconds to wait for the server before giving up on the request

    returns True if a PDF was retrieved and saved, False in every other case
    (failed request, non-200 status, non-PDF content, file could not be written)
    """
    #best effort: any failure of the request simply counts as "no PDF here"
    #(Exception rather than a bare except, so Ctrl-C still stops a long run)
    try:
        content = get(url, timeout=timeout)
    except Exception:
        return False

    if content.status_code != 200:
        return False

    #header may be missing or carry parameters, e.g. 'application/pdf; charset=binary'
    media_type = content.headers.get('content-type', '').split(';')[0].strip().lower()
    if media_type != 'application/pdf':
        return False

    try:
        #without the directory, open() fails and every download would silently be lost
        if base_dir:
            os.makedirs(base_dir, exist_ok=True)
        with open(os.path.join(base_dir, str(name) + '.pdf'), 'wb') as pdf:
            pdf.write(content.content)
    except OSError:
        return False

    return True

## 1: Retrieve PDFs using AK API provided URLs

Visit each URL and if it is a PDF, download it.

### Functions

In [None]:
def get_link(url):
    """
    takes in url
    if url is pointing to one of the well known open source platforms but does not end in 'pdf' - adapt it

    handled platforms (matched by substring of the url): arxiv, peerj, frontiers, plos, rsos
    any other url, or one that already ends in 'pdf', is returned unchanged

    returns link to use in request
    """

    #if url ends in 'pdf' - leave as is
    if url[-3:].lower() == 'pdf':
        return url

    #if url is pointing to one of the well known open source platforms
    #but does not end in 'pdf' -- adapt
    if 'arxiv' in url:
        #remove queries sometimes present at the end of the URL (and a trailing slash)
        no_query = url.split('?')[0].rstrip('/')
        #get article arxiv ID; old-style IDs contain a slash (e.g. hep-th/9901001),
        #so take everything after '/abs/' when that marker is present
        if '/abs/' in no_query:
            article_id = no_query.split('/abs/')[-1]
        else:
            article_id = no_query.split('/')[-1]
        #construct link
        return 'https://arxiv.org/pdf/' + str(article_id) + '.pdf'

    if 'peerj' in url:
        #drop the trailing slash only if there is one, otherwise the article number would be cut
        return url.rstrip('/') + '.pdf'

    if 'frontiers' in url:
        #article pages end in /full or /abstract; the PDF is served under /pdf
        trimmed = url.rstrip('/')
        for suffix in ('/full', '/abstract'):
            if trimmed.endswith(suffix):
                return trimmed[:-len(suffix)] + '/pdf'
        return url

    if 'plos' in url:
        components = url.split('?')
        if len(components) != 2:
            return url
        path, query = components
        segments = path.split('/')
        #the file endpoint hangs off '.../article'; sub-pages such as
        #'.../article/authors' need their last path segment removed first
        if segments[-1] != 'article':
            path = "/".join(segments[:-1])
        return path + "/file?" + query + '&type=printable'

    if 'rsos' in url:
        return url + '.full.pdf'

    #otherwise leave as is
    return url

### Download

In [None]:
#method 1: request every provided URL of a paper until one of them yields a PDF
for index, row in publications.iterrows():

    paper_id = row['paper_id']

    for url in row['url_0':'url_19']:

        #stop as soon as the paper has been found (also skips papers found in earlier runs)
        if found_papers[paper_id] != 0:
            break

        #empty url columns are read in as NaN rather than a string
        if not isinstance(url, str):
            continue

        #form URL to request and try to extract pdf - check if successful
        if extract_pdf(get_link(url), base_dir, paper_id):
            found_papers[paper_id] = 1

    #keep track of progress (percentage of rows processed)
    if index != 0 and index % 100 == 0:
        print(str(round(100 * index / n_articles, 2)) + "%")

## 2: Search for open access article using article DOI and the oaDOI API

oaDOI provides a database of links to open access versions of articles, which can be looked up using the article DOI.  
For articles that have not yet been retrieved and whose DOI is known, we use it to look for further links.  

https://oadoi.org

### Functions

In [None]:
def decode_bytes(bytes_string):
    """
    takes dictionary represented as a JSON document (str or bytes)
    returns dictionary

    parsed with the json module so that string values are kept exactly as sent
    (whitespace preserved, 'true' / 'false' / 'null' inside strings such as URLs left untouched)
    JSON true / false / null become True / False / None

    raises ValueError (json.JSONDecodeError) if the input is not valid JSON
    """
    return json.loads(bytes_string)

def extract_oadoi_data(doi, timeout=30):
    """
    takes doi
    returns list of urls provided by oaDOI

    doi: article DOI, appended as-is to the API address
    timeout: seconds to wait for the server before giving up on the request

    returns an empty list if the request fails, the response cannot be parsed
    or the response lists no open access locations
    """
    #NOTE(review): confirm this endpoint is still served and needs no extra query parameters
    url = 'https://api.oadoi.org/v2/'
    link = url + doi

    try:
        content = get(link, timeout=timeout)
        data_dict = decode_bytes(content.content.decode())
    except Exception:
        #e.g. connection problem or internal server error page that cannot be parsed
        return []

    if not isinstance(data_dict, dict):
        return []

    #key may be absent or empty when no open access version is known
    locations = data_dict.get('oa_locations') or []
    if not isinstance(locations, list):
        return []

    #keep only real links; a location can come without a usable url
    return [
        location['url']
        for location in locations
        if isinstance(location, dict) and isinstance(location.get('url'), str)
    ]

### Download

In [None]:
#method 2: for papers still missing, try every open access link oaDOI knows for the DOI
for index, row in publications.iterrows():
    paper_id = row['paper_id']
    doi = row['doi']

    #check paper has not been found yet + has a valid doi (missing doi is read in as NaN)
    if found_papers[paper_id] == 0 and isinstance(doi, str):

        for url in extract_oadoi_data(doi):
            if extract_pdf(url, base_dir, paper_id):
                found_papers[paper_id] = 1
                #first successful download is enough
                break

    #keep track of progress (percentage of rows processed)
    if index != 0 and index % 100 == 0:
        print(str(round(100 * index / n_articles, 2)) + "%")

## 3: Search for links to PDF on article linked webpage 

For the articles that have not been downloaded, visit the AK API provided URLs, search for links to PDFs within the retrieved HTML and try to download those files    

NOTE: there is no guarantee that the downloaded file is the paper being searched for; it may instead be, e.g., a supplementary information file, a repository-related document or even another paper entirely.  

If this method is used, it is therefore advisable to check the content of the downloaded files.

### Functions

In [None]:
def get_page(url, timeout=30):
    """
    takes url and requests it

    timeout: seconds to wait for the server before giving up on the request

    returns the response body as text if the status code is 200, otherwise None
    errors raised by the request itself are passed on to the caller
    """
    content = get(url, timeout=timeout)
    if content.status_code == 200:
        return content.text
    return None

def get_all_links(content):
    """
    takes HTML of a page as text
    returns list with the href of every <a> tag whose href is an absolute
    http:// or https:// address (relative links are not included)
    """
    #name the parser explicitly so the result does not depend on which parsers are installed
    soup = BeautifulSoup(content, 'html.parser')
    absolute_link = re.compile(r"^https?://")
    return [tag.get('href') for tag in soup.find_all('a', attrs={'href': absolute_link})]

def get_pdf(url, base_dir, paper_id):
    """
    takes url of a web page, looks for links containing 'pdf' on that page
    and tries to download them one by one until the first succeeds
    the file is saved to base_dir as <paper_id>.pdf

    returns True if a PDF was downloaded, False otherwise
    """
    content = get_page(url)
    #page could not be retrieved (status code other than 200) - nothing to search
    if content is None:
        return False

    for link in get_all_links(content):
        if 'pdf' not in link:
            continue
        #some pages have 2 links to pdf - e.g. common for scirate.com -- avoid duplication
        #by stopping at the first successful download
        if extract_pdf(link, base_dir, paper_id):
            return True

    return False

### Download

In [None]:
#method 3: for papers still missing, search the pages behind the provided URLs for PDF links
for index, row in publications.iterrows():

    paper_id = row['paper_id']

    for url in row['url_0':'url_19']:

        #stop as soon as the paper has been found (also skips papers found earlier)
        if found_papers[paper_id] != 0:
            break

        #empty url columns are read in as NaN rather than a string
        if not isinstance(url, str):
            continue

        try:
            if get_pdf(url, base_dir, paper_id):
                found_papers[paper_id] = 1
        except Exception:
            #best effort: a page that cannot be requested or parsed is skipped
            #(Exception rather than a bare except, so Ctrl-C still stops the run)
            continue

    #keep track of progress (percentage of rows processed)
    if index != 0 and index % 100 == 0:
        print(str(round(100 * index / n_articles, 2)) + "%")