In [None]:
import pandas as pd
from bs4 import BeautifulSoup
import gc
import requests
import random
import lxml
from IPython.display import clear_output
import numpy as np
import time
import json

In [None]:
# Pool of browser User-Agent strings; one entry is drawn at random per request.
UserAgent = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
]


def requestHeader(url):
    """Return the HTTP headers used to request ``url``.

    The User-Agent is picked at random from ``UserAgent`` and the Referer is
    set to the requested URL itself.
    """
    return dict([
        ('User-Agent', random.choice(UserAgent)),
        ('Referer', url),
        ('Connection', 'keep-alive'),
    ])

In [None]:
# Read the Elsevier API key from a local secrets file kept outside the notebook.
# Surrounding whitespace is stripped: a trailing newline in the file would
# otherwise end up in the middle of the request URL built from this key.
with open('../secret/elsevier_api.txt') as f:
    api_key = f.read().strip()

In [None]:
# Load the combined metadata table; the following cells rely on its 'doi' column.
meta_df = pd.read_csv('../data/240107_combined_sem_metadata.csv')

In [None]:
# The DOI prefix (everything before the first '/') identifies the publisher,
# e.g. '10.1016'. The vectorised .str accessor leaves a missing DOI as NaN,
# whereas calling .split on a float NaN inside .apply raises AttributeError.
meta_df['publisher'] = meta_df['doi'].str.split('/').str[0]

In [None]:
# Keep a single row per DOI and only the two columns the crawl loop reads.
meta_df = meta_df.drop_duplicates(subset='doi').loc[:, ['doi', 'publisher']]

In [None]:
# Shuffle the rows so the DOIs are processed in random order. Stated rationale:
# avoid having to wait 1 second between Nature Portfolio requests (presumably
# because consecutive requests then rarely hit the same publisher -- confirm).
# The NumPy seed is fixed right before sampling so the shuffle is repeatable.
np.random.seed(50)

# frac=1 returns every row in shuffled order; the index is then renumbered from 0.
meta_df = meta_df.sample(frac=1).reset_index(drop=True).copy()

In [None]:
# Lookup table from month number (1-12) to the English month name.
month_dict = dict(enumerate(
    ['January', 'February', 'March', 'April', 'May', 'June',
     'July', 'August', 'September', 'October', 'November', 'December'],
    start=1,
))

In [None]:
def frontiers_aff(doi, timeout=30):
    """Scrape authors, affiliations and article dates from a Frontiers article page.

    The article is fetched through the ``https://doi.org/`` resolver and parsed
    with BeautifulSoup.

    Parameters
    ----------
    doi : str
        DOI of the article (appended to ``https://doi.org/``).
    timeout : float or tuple, optional
        Timeout in seconds handed to ``requests.get`` so that a stalled server
        cannot block the crawl indefinitely. Defaults to 30.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(author_df, aff_df, pub_date_df)``:

        * ``author_df`` -- columns ``doi``, ``full_name``, ``script``; one row
          per author/superscript pair.
        * ``aff_df`` -- columns ``doi``, ``affiliation``, ``script``.
        * ``pub_date_df`` -- columns ``doi``, ``action``, ``date``.

        ``script`` is the superscript label linking authors to affiliations;
        ``'0'`` is used when no superscript is present. A table with no rows
        is returned as an empty ``pd.DataFrame()``.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails or times out.
    AttributeError
        If the page lacks the ``div.authors``, ``ul.notes`` or
        ``p#timestamps`` element this parser looks up.
    IndexError
        If a timestamp entry contains no ``':'``.
    """
    author_array = []
    aff_array = []
    pub_date_array = []
    url = 'https://doi.org/' + doi
    response = requests.get(url, headers=requestHeader(url), timeout=timeout)

    file = BeautifulSoup(response.text, "lxml")

    authors_div = file.find('div', {'class':'authors'})
    # string=True selects every text node; it is the current spelling of the
    # deprecated text=True argument.
    author_names = authors_div.find_all(string=True)
    sup_text_array = [sup.text for sup in authors_div.find_all('sup')]

    # Drop corresponding-author / equal-contribution markers, then discard the
    # text nodes that are empty or that are just a superscript label.
    author_names = np.array([x.replace('*', '').replace('†', '').strip() for x in author_names])
    author_names = author_names[author_names != '']
    author_names = author_names[[author_name not in sup_text_array for author_name in author_names]]

    ###### Authors
    if len(sup_text_array) > 0:
        # Names and superscripts are paired by position.
        # NOTE(review): this assumes every author carries exactly one <sup> element.
        for full_name, sup_text in zip(author_names, sup_text_array):
            # A superscript such as "1,2" yields one row per affiliation label.
            for sup in sup_text.split(','):
                author_array.append(pd.DataFrame({'doi':[doi], 'full_name':[full_name], 'script':[sup]}))
    else:
        for full_name in author_names:
            author_array.append(pd.DataFrame({'doi':[doi], 'full_name':[full_name], 'script':['0']}))

    ######### Affiliations
    aff_objects = file.find('ul', {'class':'notes'}).find_all('li')
    for aff_object in aff_objects:
        if aff_object.find('sup'):
            sup = aff_object.find('sup').text
            # Remove only the first occurrence so the label is stripped but the
            # same characters inside the affiliation text are preserved.
            aff_text = aff_object.text.replace(sup, '', 1)
            aff_array.append(pd.DataFrame({'doi':[doi], 'affiliation':[aff_text], 'script':[sup]}))
        else:
            aff_text = aff_object.text
            aff_array.append(pd.DataFrame({'doi':[doi], 'affiliation':[aff_text], 'script':['0']}))

    ####### Handling dates
    # The timestamps paragraph is parsed as ';'-separated "Action: date" entries.
    full_date_text = file.find('p', {'id':'timestamps'}).text
    for date_text in full_date_text.split(';'):
        date_text = date_text.strip()
        action = date_text.split(':')[0].strip()
        date_str = date_text.split(':')[1].strip().replace('.', '')
        pub_date_array.append(pd.DataFrame({'doi':[doi], 'action':[action], 'date':[date_str]}))

    author_df = pd.DataFrame()
    aff_df = pd.DataFrame()
    pub_date_df = pd.DataFrame()

    # pd.concat raises on an empty list, so empty tables stay plain empty frames.
    if len(author_array) > 0:
        author_df = pd.concat(author_array)
    if len(aff_array) > 0:
        aff_df = pd.concat(aff_array)
    if len(pub_date_array) > 0:
        pub_date_df = pd.concat(pub_date_array)

    # Release the parse tree explicitly.
    file.decompose()

    return author_df, aff_df, pub_date_df

In [None]:
def plos_aff(doi, timeout=30):
    """Scrape authors, affiliations and article-info entries from a PLOS article page.

    The article is fetched through the ``https://doi.org/`` resolver and parsed
    with BeautifulSoup.

    Parameters
    ----------
    doi : str
        DOI of the article (appended to ``https://doi.org/``).
    timeout : float or tuple, optional
        Timeout in seconds handed to ``requests.get`` so that a stalled server
        cannot block the crawl indefinitely. Defaults to 30.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(author_df, aff_df, pub_date_df)``:

        * ``author_df`` -- columns ``doi``, ``full_name``, ``script``.
        * ``aff_df`` -- columns ``doi``, ``affiliation``, ``script``; one row
          per author, with an empty affiliation string when none is found.
        * ``pub_date_df`` -- columns ``doi``, ``action``, ``date``.

        ``script`` is the author's ``data-author-id`` and links the author row
        to its affiliation row. A table with no rows is returned as an empty
        ``pd.DataFrame()``.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails or times out.
    KeyError
        If an author link has no ``data-author-id`` attribute.
    AttributeError
        If the page has no element with class ``articleinfo``.
    IndexError
        If an article-info entry contains no ``':'``.
    """
    author_array = []
    aff_array = []
    pub_date_array = []

    url = 'https://doi.org/' + doi
    response = requests.get(url, headers=requestHeader(url), timeout=timeout)

    file = BeautifulSoup(response.text, "lxml")

    ######### Authors & Affiliations
    for author_object in file.find_all('a', {'class':'author-name'}):
        full_name = author_object.text.replace('\n', '').strip(' ,')
        sup = str(author_object['data-author-id'])
        author_array.append(pd.DataFrame({'doi':[doi], 'full_name':[full_name], 'script':[sup]}))

        # The affiliation paragraph is looked up by the author's id. A missing
        # paragraph is recorded as an empty affiliation; checking for None
        # explicitly avoids a bare except that would hide unrelated errors.
        aff_object = file.find('p', {'id':"authAffiliations-" + sup})
        if aff_object is None:
            aff_text = ''
        else:
            aff_text = aff_object.text.replace('\n', '').strip(' ,')
        aff_array.append(pd.DataFrame({'doi':[doi], 'affiliation':[aff_text], 'script':[sup]}))

    ########## Handling dates
    article_info = file.find(class_='articleinfo')
    for p in article_info.find_all('p'):
        text = p.text
        # The paragraph starting with "Received" holds several ';'-separated
        # entries; every other paragraph is treated as a single entry.
        if text.startswith('Received'):
            true_text = text.split(';')
        else:
            true_text = [text]

        for text_ele in true_text:
            # Label is the text before the first ':'; the value is only the
            # text between the first and second ':'.
            ident = text_ele.split(':')[0]
            value = text_ele.split(':')[1].strip()
            pub_date_array.append(pd.DataFrame({'doi':[doi], 'action':[ident], 'date':[value]}))

    author_df = pd.DataFrame()
    aff_df = pd.DataFrame()
    pub_date_df = pd.DataFrame()

    # pd.concat raises on an empty list, so empty tables stay plain empty frames.
    if len(author_array) > 0:
        author_df = pd.concat(author_array)
    if len(aff_array) > 0:
        aff_df = pd.concat(aff_array)
    if len(pub_date_array) > 0:
        pub_date_df = pd.concat(pub_date_array)

    # Release the parse tree explicitly.
    file.decompose()

    return author_df, aff_df, pub_date_df

In [None]:
def nature_aff(doi, timeout=30):
    """Scrape authors, affiliations and article dates from a Nature Portfolio page.

    The article is fetched through the ``https://doi.org/`` resolver. Authors
    and affiliations come from the page's first ``application/ld+json`` script
    block; dates come from the bibliographic-information list items.

    Parameters
    ----------
    doi : str
        DOI of the article (appended to ``https://doi.org/``).
    timeout : float or tuple, optional
        Timeout in seconds handed to ``requests.get`` so that a stalled server
        cannot block the crawl indefinitely. Defaults to 30.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(author_df, aff_df, pub_date_df)``:

        * ``author_df`` -- columns ``doi``, ``full_name``, ``script``.
        * ``aff_df`` -- columns ``doi``, ``affiliation``, ``script``; one row
          per author/affiliation pair.
        * ``pub_date_df`` -- columns ``doi``, ``action``, ``date``.

        ``script`` is the author's zero-based position in the JSON-LD author
        list, as a string. A table with no rows is returned as an empty
        ``pd.DataFrame()``.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails or times out.
    IndexError
        If the page has no ``application/ld+json`` script block.
    KeyError
        If the JSON-LD lacks ``mainEntity``/``author``, or an entry lacks
        ``name`` or an affiliation lacks ``address``/``name``.
    AttributeError
        If a bibliographic-information list item has no ``<p>`` element.
    """
    author_array = []
    aff_array = []
    pub_date_array = []

    url = 'https://doi.org/' + doi
    response = requests.get(url, headers=requestHeader(url), timeout=timeout)

    file = BeautifulSoup(response.text, "lxml")

    ######### Authors & Affiliations
    ld_json_text = file.find_all('script', {'type':'application/ld+json'})[0].text
    author_objects = json.loads(ld_json_text)['mainEntity']['author']

    for author_count, author_object in enumerate(author_objects):
        full_name = author_object['name']
        sup = str(author_count)
        author_array.append(pd.DataFrame({'doi':[doi], 'full_name':[full_name], 'script':[sup]}))

        # An author entry without (or with a null) 'affiliation' keeps its
        # author row and simply contributes no affiliation rows, instead of
        # aborting the whole article with a KeyError.
        for affiliation_object in author_object.get('affiliation') or []:
            aff_text = affiliation_object['address']['name']
            aff_array.append(pd.DataFrame({'doi':[doi], 'affiliation':[aff_text], 'script':[sup]}))

    ######### Handling dates & pub dates
    for date_object in file.find_all('li', {'class':'c-bibliographic-information__list-item'}):
        date_text = date_object.find('p').text
        # The list item holding the DOI is not a date entry.
        if 'DOI' in date_text:
            continue
        # Label is the text before the first ':'; the value is the text after
        # the last ':'.
        action = date_text.split(':')[0]
        date_str = date_text.split(':')[-1].strip()
        pub_date_array.append(pd.DataFrame({'doi':[doi], 'action':[action], 'date':[date_str]}))

    author_df = pd.DataFrame()
    aff_df = pd.DataFrame()
    pub_date_df = pd.DataFrame()

    # pd.concat raises on an empty list, so empty tables stay plain empty frames.
    if len(author_array) > 0:
        author_df = pd.concat(author_array)
    if len(aff_array) > 0:
        aff_df = pd.concat(aff_array)
    if len(pub_date_array) > 0:
        pub_date_df = pd.concat(pub_date_array)

    # Release the parse tree explicitly.
    file.decompose()

    return author_df, aff_df, pub_date_df

In [None]:
def elsevier_aff(doi, api_key, timeout=30):
    """Fetch authors, affiliations and article dates from the Elsevier article API.

    The FULL view of the article is requested from ``api.elsevier.com`` and the
    response is parsed with BeautifulSoup. Authors and affiliations listed
    inside ``ce:editors`` elements are excluded by their ids.

    Parameters
    ----------
    doi : str
        DOI of the article.
    api_key : str
        Elsevier API key. It is embedded in the request URL, and therefore also
        in the Referer header that ``requestHeader`` builds from that URL.
    timeout : float or tuple, optional
        Timeout in seconds handed to ``requests.get`` so that a stalled server
        cannot block the crawl indefinitely. Defaults to 30.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(author_df, aff_df, pub_date_df)``:

        * ``author_df`` -- columns ``doi``, ``full_name``, ``script``; one row
          per author/superscript pair.
        * ``aff_df`` -- columns ``doi``, ``affiliation``, ``script``.
        * ``pub_date_df`` -- columns ``doi``, ``action``, ``date``.

        ``script`` is the ``ce:sup`` / ``ce:label`` text linking authors to
        affiliations, or ``''`` when the element is absent. A table with no
        rows is returned as an empty ``pd.DataFrame()``.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails or times out.
    KeyError
        If a received/revised/accepted date element lacks a ``year``,
        ``month`` or ``day`` attribute.
    """
    author_array = []
    aff_array = []
    pub_date_array = []

    url = 'https://api.elsevier.com/content/article/doi/' + doi + '?APIKey=' + api_key + '&view=FULL'
    response = requests.get(url, headers=requestHeader(url), timeout=timeout)

    file = BeautifulSoup(response.text, "lxml")

    ######### Editors
    # Collect the ids of editor authors/affiliations so they can be skipped
    # below. Elements without an id are ignored individually rather than
    # discarding every id collected so far.
    editor_ids = []
    editor_aff_ids = []
    for editor_object in file.find_all('ce:editors'):
        for editor_author_object in editor_object.find_all('ce:author'):
            editor_id = editor_author_object.get('author-id')
            if editor_id is not None:
                editor_ids.append(editor_id)
        for editor_aff_object in editor_object.find_all('ce:affiliation'):
            editor_aff_id = editor_aff_object.get('affiliation-id')
            if editor_aff_id is not None:
                editor_aff_ids.append(editor_aff_id)

    ######### Authors
    for author_object in file.find_all('ce:author'):
        # An author without an 'author-id' cannot be matched to an editor and
        # is therefore kept (.get returns None, which is never in editor_ids).
        if author_object.get('author-id') in editor_ids:
            continue

        given_name_object = author_object.find('ce:given-name')
        given_name = given_name_object.text if given_name_object is not None else ''

        surname_object = author_object.find('ce:surname')
        surname = surname_object.text if surname_object is not None else ''

        full_name = given_name + ' ' + surname

        # One row per superscript; a single row with an empty script when the
        # author has none.
        scripts = [sup_object.text for sup_object in author_object.find_all('ce:sup')] or ['']
        for sup in scripts:
            author_array.append(pd.DataFrame({'doi':[doi], 'full_name':[full_name], 'script':[sup]}))

    ######### Affiliations
    for affiliation_object in file.find_all('ce:affiliation'):
        # Same rule as for authors: affiliations without an id are kept.
        if affiliation_object.get('affiliation-id') in editor_aff_ids:
            continue

        textfn_object = affiliation_object.find('ce:textfn')
        aff_text = textfn_object.text if textfn_object is not None else ''

        label_object = affiliation_object.find('ce:label')
        sup = label_object.text if label_object is not None else ''

        aff_array.append(pd.DataFrame({'doi':[doi], 'affiliation':[aff_text], 'script':[sup]}))

    ######### Handling dates
    # Editorial history: the date is assembled from the element's attributes.
    history_tags = (
        ('ce:date-received', 'received'),
        ('ce:date-revised', 'revised'),
        ('ce:date-accepted', 'accepted'),
    )
    for tag_name, action in history_tags:
        for date_object in file.find_all(tag_name):
            date_str = date_object['year'] + '-' + date_object['month'] + '-' + date_object['day']
            pub_date_array.append(pd.DataFrame({'doi':[doi], 'action':[action], 'date':[date_str]}))

    ######### Publication dates
    # Online-availability dates: the element text is stored as-is.
    online_tags = (
        ('xocs:available-online-date', 'available online'),
        ('xocs:vor-available-online-date', 'version of record'),
    )
    for tag_name, action in online_tags:
        for date_object in file.find_all(tag_name):
            pub_date_array.append(pd.DataFrame({'doi':[doi], 'action':[action], 'date':[date_object.text]}))

    author_df = pd.DataFrame()
    aff_df = pd.DataFrame()
    pub_date_df = pd.DataFrame()

    # pd.concat raises on an empty list, so empty tables stay plain empty frames.
    if len(author_array) > 0:
        author_df = pd.concat(author_array)
    if len(aff_array) > 0:
        aff_df = pd.concat(aff_array)
    if len(pub_date_array) > 0:
        pub_date_df = pd.concat(pub_date_array)

    # Release the parse tree explicitly.
    file.decompose()

    return author_df, aff_df, pub_date_df

Test cases

In [None]:
# Smoke test for the Elsevier scraper; elsevier_aff requires the API key as
# its second argument.
author_df, aff_df, pub_date_df = elsevier_aff('10.1016/j.microc.2015.12.017', api_key)

In [None]:
# Smoke test for the PLOS scraper on a single DOI.
author_df, aff_df, pub_date_df = plos_aff('10.1371/journal.pone.0287514')

In [None]:
# Smoke test for the Nature scraper on a single DOI.
author_df, aff_df, pub_date_df = nature_aff('10.1038/s41598-024-52705-0')

In [None]:
# Smoke test for the Frontiers scraper on a single DOI.
author_df, aff_df, pub_date_df = frontiers_aff('10.3389/fmats.2020.563233')

In [None]:
# Smoke test for the Elsevier scraper; elsevier_aff requires the API key as
# its second argument.
author_df, aff_df, pub_date_df = elsevier_aff('10.1016/j.ijhydene.2022.10.031', api_key)

In [None]:
# Smoke test for the Elsevier scraper; elsevier_aff requires the API key as
# its second argument.
author_df, aff_df, pub_date_df = elsevier_aff('10.1016/j.ijhydene.2023.03.046', api_key)

Collect all

In [None]:
# Crawl every DOI in meta_df, dispatching on the DOI prefix to the matching
# publisher scraper. Results are buffered in lists and written to CSV
# checkpoints every 1000 rows; DOIs whose scrape raises are appended to
# ../data/affiliations_timeout.txt for a later retry.
author_array = []
aff_array = []
pub_date_array = []

for index, row in meta_df.iterrows():
    doi = row['doi']
    publisher = row['publisher']

    # Defaults used when the publisher is not handled or the scrape fails.
    author_df = pd.DataFrame()
    aff_df = pd.DataFrame()
    pub_date_df = pd.DataFrame()

    try:
        if publisher == '10.1016':
            # elsevier_aff needs the API key; calling it without the key raised
            # a TypeError that sent every Elsevier DOI to the timeout list.
            author_df, aff_df, pub_date_df = elsevier_aff(doi, api_key)
        elif publisher == '10.1371':
            author_df, aff_df, pub_date_df = plos_aff(doi)
        elif publisher == '10.1038':
            author_df, aff_df, pub_date_df = nature_aff(doi)
        elif publisher == '10.3389':
            author_df, aff_df, pub_date_df = frontiers_aff(doi)
    except Exception as err:
        # `except Exception` keeps the crawl going on scrape errors while still
        # letting KeyboardInterrupt stop the loop; the error is shown so that
        # parse failures can be told apart from real timeouts.
        print('Error processing DOI: ' + doi + ' (' + repr(err) + ')')
        with open('../data/affiliations_timeout.txt', 'a+') as f:
            f.write(doi + '\n')

    author_array.append(author_df)
    aff_array.append(aff_df)
    pub_date_array.append(pub_date_df)

    print(str(index) + ' DOIs processed.')

    # Checkpoint: write the buffered rows and start a fresh buffer.
    if index % 1000 == 0 and index > 0:
        author_df = pd.concat(author_array)
        aff_df = pd.concat(aff_array)
        pub_date_df = pd.concat(pub_date_array)

        author_array = []
        aff_array = []
        pub_date_array = []

        author_df.to_csv('../data/240205_sem_authors_' + str(index) + '.csv', index=False)
        aff_df.to_csv('../data/240205_sem_affiliations_' + str(index) + '.csv', index=False)
        pub_date_df.to_csv('../data/240205_sem_dates_' + str(index) + '.csv', index=False)

    # Keep the cell output short and reclaim memory periodically.
    if index % 10 == 0:
        clear_output()
        gc.collect()

# Final flush of whatever was collected since the last checkpoint. It is
# skipped when the buffers are empty (no rows at all, or the last row landed
# exactly on a checkpoint): pd.concat raises on an empty list, and the file
# name would collide with the checkpoint that was just written.
if author_array:
    author_df = pd.concat(author_array)
    aff_df = pd.concat(aff_array)
    pub_date_df = pd.concat(pub_date_array)

    author_array = []
    aff_array = []
    pub_date_array = []

    author_df.to_csv('../data/240205_sem_authors_' + str(index) + '.csv', index=False)
    aff_df.to_csv('../data/240205_sem_affiliations_' + str(index) + '.csv', index=False)
    pub_date_df.to_csv('../data/240205_sem_dates_' + str(index) + '.csv', index=False)

Collect any that timed out

In [None]:
# Load the DOIs that failed during the main crawl: one DOI per line, no header
# row, so the single column is labelled 0.
df = pd.read_csv('../data/affiliations_timeout.txt', header=None)

In [None]:
# Retry the DOIs listed in df (loaded from the timeout file), dispatching on
# the DOI prefix to the matching publisher scraper. Results are buffered in
# lists and written to CSV checkpoints every 1000 rows.
# NOTE(review): DOIs that fail again are appended to the same
# ../data/affiliations_timeout.txt file df was read from, so that file
# accumulates duplicate entries across retries.
author_array = []
aff_array = []
pub_date_array = []

for index, row in df.iterrows():
    doi = row[0]
    publisher = doi.split('/')[0]

    # Defaults used when the publisher is not handled or the scrape fails.
    author_df = pd.DataFrame()
    aff_df = pd.DataFrame()
    pub_date_df = pd.DataFrame()

    try:
        if publisher == '10.1016':
            # elsevier_aff needs the API key; calling it without the key raised
            # a TypeError that made every Elsevier retry fail again.
            author_df, aff_df, pub_date_df = elsevier_aff(doi, api_key)
        elif publisher == '10.1371':
            author_df, aff_df, pub_date_df = plos_aff(doi)
        elif publisher == '10.1038':
            author_df, aff_df, pub_date_df = nature_aff(doi)
        elif publisher == '10.3389':
            author_df, aff_df, pub_date_df = frontiers_aff(doi)
    except Exception as err:
        # `except Exception` keeps the retry going on scrape errors while still
        # letting KeyboardInterrupt stop the loop; the error is shown so that
        # parse failures can be told apart from real timeouts.
        print('Error processing DOI: ' + doi + ' (' + repr(err) + ')')
        with open('../data/affiliations_timeout.txt', 'a+') as f:
            f.write(doi + '\n')

    author_array.append(author_df)
    aff_array.append(aff_df)
    pub_date_array.append(pub_date_df)

    print(str(index) + ' DOIs processed.')

    # Checkpoint: write the buffered rows and start a fresh buffer.
    if index % 1000 == 0 and index > 0:
        author_df = pd.concat(author_array)
        aff_df = pd.concat(aff_array)
        pub_date_df = pd.concat(pub_date_array)

        author_array = []
        aff_array = []
        pub_date_array = []

        author_df.to_csv('../data/240206_sem_authors_' + str(index) + '.csv', index=False)
        aff_df.to_csv('../data/240206_sem_affiliations_' + str(index) + '.csv', index=False)
        pub_date_df.to_csv('../data/240206_sem_dates_' + str(index) + '.csv', index=False)

    # Keep the cell output short and reclaim memory periodically.
    if index % 10 == 0:
        clear_output()
        gc.collect()

# Final flush of whatever was collected since the last checkpoint. It is
# skipped when the buffers are empty (no rows at all, or the last row landed
# exactly on a checkpoint): pd.concat raises on an empty list, and the file
# name would collide with the checkpoint that was just written.
if author_array:
    author_df = pd.concat(author_array)
    aff_df = pd.concat(aff_array)
    pub_date_df = pd.concat(pub_date_array)

    author_array = []
    aff_array = []
    pub_date_array = []

    author_df.to_csv('../data/240206_sem_authors_' + str(index) + '.csv', index=False)
    aff_df.to_csv('../data/240206_sem_affiliations_' + str(index) + '.csv', index=False)
    pub_date_df.to_csv('../data/240206_sem_dates_' + str(index) + '.csv', index=False)