# Sample code for downloading articles

## Step 1: Prepare list of articles to download based on publishing journal

In [1]:
import pandas as pd

In [2]:
SCOPUS_1_PATH = './data/scopus_1.csv'
SCOPUS_2_PATH = './data/scopus_2.csv'
SCOPUS_3_PATH = './data/scopus_3.csv'

In [3]:
df_1 = pd.read_csv(SCOPUS_1_PATH)
df_2 = pd.read_csv(SCOPUS_2_PATH)
df_3 = pd.read_csv(SCOPUS_3_PATH)

In [4]:
full_df = pd.concat([df_1, df_2, df_3])
full_df.head()

Unnamed: 0,Authors,Author(s) ID,Title,Year,Source title,Volume,Issue,Art. No.,Page start,Page end,...,ISBN,CODEN,PubMed ID,Language of Original Document,Abbreviated Source Title,Document Type,Publication Stage,Open Access,Source,EID
0,"Chen Y., Liu A., Cheng X.",57203573569;57203568753;7401754355;,Detection of thermokarst lake drainage events ...,2022,Science of the Total Environment,807,,150828,,,...,,STEVA,,English,Sci. Total Environ.,Article,Final,,Scopus,2-s2.0-85116938924
1,"Curra-Sánchez E.D., Lara C., Cornejo-D'Ottone ...",57272348800;36502397300;57219281043;8865507700...,Contrasting land-uses in two small river basin...,2022,Science of the Total Environment,806,,150435,,,...,,STEVA,,English,Sci. Total Environ.,Article,Final,,Scopus,2-s2.0-85115798899
2,"Becek K., Yong G.Y.V., Sukri R.S., Lai D.T.C.",25227469700;57323380300;36931212700;57199649884;,Shorea albida Sym. does not regenerate in the ...,2022,Forest Ecology and Management,504,,119816,,,...,,FECMD,,English,For. Ecol. Manage.,Article,Final,"All Open Access, Hybrid Gold",Scopus,2-s2.0-85118490097
3,"Haro S., Jesus B., Oiry S., Papaspyrou S., Lar...",57207730674;55884902700;57220084562;6603281099...,Microphytobenthos spatio-temporal dynamics acr...,2022,Science of the Total Environment,804,,149983,,,...,,STEVA,,English,Sci. Total Environ.,Article,Final,,Scopus,2-s2.0-85114704794
4,"Nguyen T.T., Pham T.D., Nguyen C.T., Delfos J....",57209166347;57188874343;57254177200;5725417730...,A novel intelligence approach based active and...,2022,Science of the Total Environment,804,,150187,,,...,,STEVA,34517328.0,English,Sci. Total Environ.,Article,Final,,Scopus,2-s2.0-85114661978


In [5]:
len(full_df)

4322

In [6]:
# Get list of all unique publishers
publisher_list = full_df['Publisher'].value_counts().index.tolist()
len(publisher_list)

305

## Step 2: Prepare a dataset of articles that belong to the same journals

In [7]:
# Create a map of journals with list of articles.
journal_dict = {}
for idx, row in full_df.iterrows():
    info = {}
    info['title'] = row['Title']
    info['doi'] = row['DOI']
    info['open-access'] = row['Open Access']
    info['issn'] = row['ISSN']

    if row['Publisher'] not in journal_dict.keys():
        journal_dict[row['Publisher']] = []

    journal_dict[row['Publisher']].append(info)

In [8]:
# Maintain information of any failed download requests
failed_articles_list = {}

In [9]:
from time import sleep
import random
import requests
from bs4 import BeautifulSoup as soup
import urllib

## Prepare list of journals from elsevier

In [10]:
elsevier_list = [i for i in publisher_list if i.startswith('Elsevier')]
elsevier_list

['Elsevier B.V.',
 'Elsevier Inc.',
 'Elsevier Ltd',
 'Elsevier',
 'Elsevier GmbH',
 'Elsevier Science Inc, New York, NY, United States',
 'Elsevier BV',
 'Elsevier Science Ltd, Oxford',
 'Elsevier Science Ltd, Exeter, United Kingdom',
 'Elsevier Masson SAS',
 'Elsevier Masson s.r.l.',
 'Elsevier; PIER, 3',
 'Elsevier Science B.V., Amsterdam',
 'Elsevier; Developments in Soil Science, 20',
 'Elsevier Sci B.V., Amsterdam, Netherlands']

## Step 3: Download all articles from elsevier journals

In [11]:
def download_from_elsevier(root_path, output_filename, response_type, api_key, doi):
    """Download articles from elsevier journals

    Special Instructions: Most articles require the the download 
    request sent from an IP address inside Purdue Network. Please login to
    Purdue WiFi to run this function and get appropraite results.

    Args:
        doi (string): DOI string of requested article
        api_key (string): ELSEVIER API key to use for authentication
        response_type (string): text or html
        output_filename (): _description_
    """
    # Article url link
    url = 'http://api.elsevier.com/content/article/doi:' + doi + '?view=FULL'
    
    # HTTP headers for authentication
    # https://dev.elsevier.com/tecdoc_api_authentication.html - Read this
    headers = {
        'X-ELS-APIKEY': '{}'.format(api_key),
        'Accept': 'text/{}'.format(response_type)
    }
    
    # Make HTTP Get request
    response = requests.get(url, stream=True, headers=headers)
    
    # Save file 
    with open('{}/{}.{}'.format(root_path,output_filename, response_type), 'wb') as f:
        for chunk in response.iter_content(2048):
            f.write(chunk)

In [12]:
ELSEVIER_API_KEY = '85f4783f858cca9e81b30e19d0ba5545'

In [13]:
# Iterate through the elsevier journal names
for idx, journal in enumerate(elsevier_list):
    article_list = journal_dict[journal]
    failed_articles_list[journal] = []
    
    # Retrieve all articles from the journal and iterate through them
    for num, article in enumerate(article_list):
        sleep(random.choice([1, 3]))
        
        doi = article['doi']
        doi = doi.replace('/', '_')
        
        # Try to download the file
        try:
            download_from_elsevier('./', "{}".format(doi),  'xml',  ELSEVIER_API_KEY, article['doi'])
        except Exception as e:
            print('Error while downloading from elsevier:', e)
            print('Error for article with doi {}'.format(article['doi']))
            failed_articles_list[journal].append(article)
        
        print(idx + num)
    

0
1
2
3
4
5
6
7


KeyboardInterrupt: 

## Step 4: Download articles from MDPI

In [14]:
mdpi_list = [i for i in publisher_list if i.startswith('MDPI')]
len(mdpi_list)

3

In [15]:
def download_from_mdpi(root_path, output_file, doi):
    """Download open access articles from MDPI journal

    Args:
        output_file (string): _description_
        doi (_type_): _description_
    """
    # MDPI Search URL link
    url = 'https://www.mdpi.com/search?q=' + urllib.parse.quote_plus(doi)
    
    # Make HTTP Get request
    response = requests.get(url)
    
    # Parse response for the actual article
    article_url = response.url
    full_text_url = article_url + '/htm'
    
    # Make HTTP Get request for the full text of the article
    resp_full_text = requests.get(full_text_url)
    
    # Save file
    with open('{}/{}.html'.format(root_path, output_file), "w", encoding='utf-8') as file:
        file.write(str(resp_full_text.text))

In [16]:
# Iterate through the elsevier journal names
for idx, journal in enumerate(mdpi_list):
    article_list = journal_dict[journal]
    failed_articles_list[journal] = []
    
    # Retrieve all articles from the journal and iterate through them
    for num, article in enumerate(article_list):
        sleep(random.choice([1, 3]))
        
        doi = article['doi']
        file_name = doi.replace('/', '_')
        
        # Try to download the file
        try:
            download_from_mdpi(".",file_name, "{}".format(article['doi']))
        except Exception as e:
            print('Error while downloading from MDPI:', e)
            print('Error for article with doi {}'.format(article['doi']))
            failed_articles_list[journal].append(article)
        
        print(idx + num)
    

0
1
2
3
4
5
6
7


KeyboardInterrupt: 