In [1]:
import gc
import random
import time

import lxml
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from IPython.display import clear_output

In [2]:
# Pool of browser User-Agent strings; one is drawn at random for each request.
UserAgent = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
]


def requestHeader(url):
    """Assemble the HTTP headers sent along with a request.

    Parameters
    ----------
    url : str
        Address placed in the ``Referer`` header.

    Returns
    -------
    dict
        Headers with the keys ``User-Agent`` (randomly chosen from
        ``UserAgent``), ``Referer`` and ``Connection``.
    """
    agent = random.choice(UserAgent)
    header_items = [
        ('User-Agent', agent),
        ('Referer', url),
        ('Connection', 'keep-alive'),
    ]
    return dict(header_items)

Elsevier

In [None]:
# Table of works to scrape; the loops below read its 'doi' column.
works_df = pd.read_csv(filepath_or_buffer='../data/231217_works.csv')

In [7]:
# Load the Elsevier API key from a local file kept outside the data folder.
# The content is stripped because the key is concatenated straight into
# request URLs, and a trailing newline or stray whitespace left by an editor
# would otherwise become part of every URL.
with open('../secret/elsevier_api.txt') as f:
    api_key = f.read().strip()

In [None]:
# Offset into works_df['doi'] at which the Elsevier loop starts (the loop slices
# the DOI array with [max_index:]); presumably raised to resume an interrupted run.
max_index = 0

In [None]:
%%time

# Download SEM figures from Elsevier full-text articles.
#
# For every DOI the full-text document is requested from the Elsevier article
# endpoint, each <ce:figure> whose caption mentions SEM imagery is located, and
# every linked object whose file name ends in 'jpg' is saved to ../img/.
# One metadata row per saved image is collected and flushed to CSV every
# 1000 DOIs and once more after the loop.

# Substrings (matched against the lower-cased caption) that mark an SEM figure.
SEM_KEYWORDS = ('sem image', 'scanning electron micro', 'sem micro', 'sem pic')
# Seconds to wait for a server before giving up, so one stalled connection
# cannot block the whole run.
REQUEST_TIMEOUT = 30

elsevier_key = api_key.strip()  # guard against a trailing newline in the key file

result_df_array = []
count = 0
for doi in works_df['doi'].values[max_index:]:
    file = None  # reset so a failed fetch never touches the previous article's soup
    figure_objects = []
    try:
        url = ('https://api.elsevier.com/content/article/doi/' + doi
               + '?APIKey=' + elsevier_key + '&view=FULL')
        response = requests.get(url, headers=requestHeader(url), timeout=REQUEST_TIMEOUT)
        file = BeautifulSoup(response.text, "lxml")
        figure_objects = file.find_all('ce:figure')
    except Exception as err:
        # `except Exception` (not a bare except) keeps Ctrl-C / kernel interrupt working.
        print('Error reading DOI: ' + str(doi))

    for figure_object in figure_objects:
        try:
            caption_tag = figure_object.find('ce:caption')
            if caption_tag is None:
                print('Figure does not have caption.')
                continue

            caption = caption_tag.text.lower()
            if not any(keyword in caption for keyword in SEM_KEYWORDS):
                continue

            print('SEM figure identified.')
            locator_str = figure_object.find('ce:link')['locator']

            # Each <object ref=locator> holds the URL of one rendition of the figure.
            for figure_type in file.find_all('object', ref=locator_str):
                figure_link = figure_type.text.split('?')[0]
                figure_name = figure_link.split('/')[-1]
                if not figure_name.endswith('jpg'):
                    continue

                img_response = requests.get(figure_link + '?APIKey=' + elsevier_key,
                                            headers=requestHeader(url),
                                            timeout=REQUEST_TIMEOUT)
                # Do not store an HTTP error body under an image file name.
                img_response.raise_for_status()
                with open('../img/' + figure_name, 'wb') as f:
                    f.write(img_response.content)
                result_df_array.append(pd.DataFrame({'doi': [doi],
                                                     'locator': [locator_str],
                                                     'caption': [caption],
                                                     'filename': [figure_name]}))
        except Exception as err:
            print('Error processing figure: ' + repr(err))

    if file is not None:
        file.decompose()

    count += 1
    print(str(count) + ' DOIs processed.')

    if count % 1000 == 0:
        gc.collect()
        # pd.concat raises ValueError on an empty list, so only flush real results.
        if result_df_array:
            result_df = pd.concat(result_df_array)
            result_df.to_csv('../data/sem_images_231217_batch_count_' + str(count) + '_.csv', index=False)
            result_df_array = []
        clear_output()

# Flush whatever was collected since the last checkpoint.
if result_df_array:
    result_df = pd.concat(result_df_array)
    result_df.to_csv('../data/sem_images_231217_batch_count_' + str(count) + '_.csv', index=False)
else:
    print('No objects to concatenate.')

PLOS

In [None]:
# Offset into works_df['doi'] at which the PLOS loop starts (doi_list is built
# with [max_index:]); presumably raised to resume an interrupted run.
max_index = 0

In [None]:
# Download SEM figures from PLOS article pages.
#
# Each DOI is resolved through doi.org, every <div class="figure"> that is not
# a table and whose text mentions SEM imagery is located, and its
# "larger image" link is downloaded to ../img/ as a .png file.
# One metadata row per saved image is collected and flushed to CSV every
# 1000 DOIs and once more after the loop.

# Substrings (matched against the lower-cased figure text) that mark an SEM figure.
SEM_KEYWORDS = ('sem image', 'scanning electron micro', 'sem micro', 'sem pic')
# Seconds to wait for a server before giving up, so one stalled connection
# cannot block the whole run.
REQUEST_TIMEOUT = 30

timeout_errors = []  # DOIs whose article page could not be fetched or parsed

result_df_array = []
count = 0
doi_list = works_df['doi'].values[max_index:]

for doi in doi_list:
    n_complete = len(result_df_array)  # not read in this cell; kept as in the original
    url = 'https://doi.org/' + str(doi)
    file = None  # reset so a failed fetch never touches the previous article's soup

    try:
        response = requests.get(url, headers=requestHeader(url), timeout=REQUEST_TIMEOUT)
        file = BeautifulSoup(response.text, "lxml")
        figure_objects = file.find_all('div', {'class': 'figure'})
    except Exception:
        # `except Exception` (not a bare except) keeps Ctrl-C / kernel interrupt working.
        timeout_errors.append(doi)
        with open('../data/timeout.txt', 'a+') as f:
            # One DOI per line; without the newline the entries run together.
            f.write(str(doi) + '\n')
        figure_objects = []

    for figure_object in figure_objects:
        try:
            figcaption = figure_object.find('div', {'class': 'figcaption'})
            if figcaption is None or 'Table' in figcaption.text:
                continue

            # The full text of the figure block is what gets matched and stored
            # (the original assembled a caption piecewise and then overwrote it
            # with exactly this value).
            caption = figure_object.text.lower()
            if not any(keyword in caption for keyword in SEM_KEYWORDS):
                continue

            print('SEM figure identified.')
            locator_str = figure_object['data-doi'].split('.')[-1]

            href = ''
            fig_link = ''
            for image_object in figure_object.find_all('li'):
                description = image_object.find('div', {'class': 'definition-description'})
                anchor = image_object.find('a')
                # Skip list items without a description or link instead of
                # aborting the whole figure.
                if description is None or anchor is None:
                    continue
                if description.text == 'larger image':
                    href = anchor['href']
                    fig_link = 'https://journals.plos.org/plosone/' + href

            if not fig_link:
                print('Error processing figure')
                continue

            img_response = requests.get(fig_link, headers=requestHeader(url),
                                        timeout=REQUEST_TIMEOUT)
            # Do not store an HTTP error body under an image file name.
            img_response.raise_for_status()

            filename = fig_link.split('/')[-1] + '.png'
            with open('../img/' + filename, 'wb') as f:
                f.write(img_response.content)

            result_df_array.append(pd.DataFrame({'doi': [doi],
                                                 'locator': [locator_str],
                                                 'caption': [caption],
                                                 'filename': [filename]}))
        except Exception as err:
            print('Error processing figure')

    count += 1
    print(str(count) + ' DOIs processed.')
    if file is not None:
        file.decompose()

    if count % 1000 == 0:
        # pd.concat raises ValueError on an empty list, so only flush real results.
        if result_df_array:
            result_df = pd.concat(result_df_array)
            result_df.to_csv('../data/sem_images_240118_batch_count_' + str(count) + '_.csv', index=False)
            result_df_array = []
        else:
            print('No objects to concatenate.')

    if count % 10 == 0:
        # Keep the notebook output short and release parsed pages regularly.
        clear_output()
        gc.collect()

# Flush whatever was collected since the last checkpoint.
if result_df_array:
    result_df = pd.concat(result_df_array)
    result_df.to_csv('../data/sem_images_240118_batch_count_' + str(count) + '_.csv', index=False)
else:
    print('No objects to concatenate.')

Frontiers

In [None]:
# Offset into works_df['doi'] at which the Frontiers loop starts (the loop
# iterates over doi_list[max_index:]); presumably raised to resume an interrupted run.
max_index = 0

In [None]:
# Download SEM figures from Frontiers article pages.
#
# Each DOI is resolved through doi.org (one request per second), every
# <div class="FigureDesc"> whose text mentions SEM imagery and whose anchor
# name contains 'Figure' is located, and the anchor's href is downloaded to
# ../img/. One metadata row per saved image is collected and flushed to CSV
# every 1000 DOIs and once more after the loop.

# Substrings (matched against the lower-cased figure text) that mark an SEM figure.
SEM_KEYWORDS = ('sem image', 'scanning electron micro', 'sem micro', 'sem pic')
# Seconds to wait for a server before giving up, so one stalled connection
# cannot block the whole run.
REQUEST_TIMEOUT = 30

result_df_array = []
count = 0
doi_list = works_df['doi'].values

for doi in doi_list[max_index:]:
    n_complete = len(result_df_array)  # not read in this cell; kept as in the original
    url = 'https://doi.org/' + str(doi)
    file = None  # reset so a failed fetch never touches the previous article's soup

    time.sleep(1)  # pause between article requests
    try:
        response = requests.get(url, headers=requestHeader(url), timeout=REQUEST_TIMEOUT)
        file = BeautifulSoup(response.text, "lxml")
        figure_objects = file.find_all('div', {'class': 'FigureDesc'})
    except Exception as err:
        # A single unreachable page must not abort the whole run.
        # `except Exception` (not a bare except) keeps Ctrl-C / kernel interrupt working.
        print('Error reading DOI: ' + str(doi))
        figure_objects = []

    for figure_object in figure_objects:
        try:
            caption = figure_object.text.lower()
            if not any(keyword in caption for keyword in SEM_KEYWORDS):
                continue

            anchor = figure_object.find('a')
            locator_str = anchor['name']
            if 'Figure' not in locator_str:
                continue

            print('SEM figure identified.')
            fig_link = anchor['href']
            img_response = requests.get(fig_link, headers=requestHeader(url),
                                        timeout=REQUEST_TIMEOUT)
            # Do not store an HTTP error body under an image file name.
            img_response.raise_for_status()

            filename = fig_link.split('/')[-1]
            with open('../img/' + filename, 'wb') as f:
                f.write(img_response.content)
            result_df_array.append(pd.DataFrame({'doi': [doi],
                                                 'locator': [locator_str],
                                                 'caption': [caption],
                                                 'filename': [filename]}))
        except Exception as err:
            print('Error processing figure.')

    if file is not None:
        file.decompose()

    count += 1
    print(str(count) + ' DOIs processed.')

    # pd.concat raises ValueError on an empty list, so only flush real results.
    if count % 1000 == 0 and len(result_df_array) > 0:
        gc.collect()
        result_df = pd.concat(result_df_array)
        result_df.to_csv('../data/sem_images_240104_batch_count_' + str(count) + '_.csv', index=False)
        result_df_array = []
        clear_output()

    if count % 10 == 0:
        # Keep the notebook output short and release parsed pages regularly.
        clear_output()
        gc.collect()

# Flush whatever was collected since the last checkpoint.
if result_df_array:
    result_df = pd.concat(result_df_array)
    result_df.to_csv('../data/sem_images_240104_batch_count_' + str(count) + '_.csv', index=False)
else:
    print('No objects to concatenate.')

Nature Portfolio

In [None]:
# Download SEM figures from Nature Portfolio article pages.
#
# Each DOI is resolved through doi.org, every figure description whose text
# mentions SEM imagery is located, the matching "Full size image" page on
# www.nature.com is opened, and the last <picture> image on that page is
# downloaded to ../img/. Requests are spaced one second apart.
# One metadata row per saved image is collected and flushed to CSV every
# 1000 DOIs and once more after the loop.

# Substrings (matched against the lower-cased figure text) that mark an SEM figure.
SEM_KEYWORDS = ('sem image', 'scanning electron micro', 'sem micro', 'sem pic')
# Seconds to wait for a server before giving up, so one stalled connection
# cannot block the whole run.
REQUEST_TIMEOUT = 30

result_df_array = []
count = 0
doi_list = works_df['doi'].values

for doi in doi_list:
    n_complete = len(result_df_array)  # not read in this cell; kept as in the original
    url = 'https://doi.org/' + str(doi)
    file = None  # reset so a failed fetch never touches the previous article's soup

    time.sleep(1)  # pause between article requests
    try:
        response = requests.get(url, headers=requestHeader(url), timeout=REQUEST_TIMEOUT)
        file = BeautifulSoup(response.text, "lxml")
        figure_objects = file.find_all('div', {'class': 'c-article-section__figure-description'})
    except Exception as err:
        # A single unreachable page must not abort the whole run.
        # `except Exception` (not a bare except) keeps Ctrl-C / kernel interrupt working.
        print('Error reading DOI: ' + str(doi))
        figure_objects = []

    for figure_object in figure_objects:
        try:
            caption = figure_object.text.lower()
            if not any(keyword in caption for keyword in SEM_KEYWORDS):
                continue

            print('SEM figure identified.')
            locator_str = figure_object['id']
            # The second '-'-separated piece of the id is used as the figure number.
            figure_number = locator_str.split('-')[1]
            full_size_anchor = file.find('a', {'aria-label': 'Full size image figure ' + figure_number})
            fig_link = 'https://www.nature.com' + full_size_anchor['href']

            time.sleep(1)
            figure_page_response = requests.get(fig_link, headers=requestHeader(url),
                                                timeout=REQUEST_TIMEOUT)
            figure_page_file = BeautifulSoup(figure_page_response.text, "lxml")
            try:
                # The image source is protocol-relative, hence the 'https:' prefix.
                fig_src_url = 'https:' + figure_page_file.find_all('picture')[-1].find('img')['src']
            finally:
                # Release the figure page tree as is done for the article page.
                figure_page_file.decompose()

            time.sleep(1)
            img_response = requests.get(fig_src_url, headers=requestHeader(url),
                                        timeout=REQUEST_TIMEOUT)
            # Do not store an HTTP error body under an image file name.
            img_response.raise_for_status()

            filename = fig_src_url.split('/')[-1]
            with open('../img/' + filename, 'wb') as f:
                f.write(img_response.content)
            result_df_array.append(pd.DataFrame({'doi': [doi],
                                                 'locator': [locator_str],
                                                 'caption': [caption],
                                                 'filename': [filename]}))
        except Exception as err:
            print('Error processing figure')

    count += 1
    print(str(count) + ' DOIs processed.')
    if file is not None:
        file.decompose()

    if count % 1000 == 0:
        gc.collect()
        # pd.concat raises ValueError on an empty list, so only flush real results.
        if result_df_array:
            result_df = pd.concat(result_df_array)
            result_df.to_csv('../data/sem_images_231227_batch_count_' + str(count) + '_.csv', index=False)
            result_df_array = []
        clear_output()

    if count % 10 == 0:
        # Keep the notebook output short and release parsed pages regularly.
        clear_output()
        gc.collect()

# Flush whatever was collected since the last checkpoint.
if result_df_array:
    result_df = pd.concat(result_df_array)
    result_df.to_csv('../data/sem_images_231227_batch_count_' + str(count) + '_.csv', index=False)
else:
    print('No objects to concatenate.')