In [1]:
import requests
import pandas as pd
import requests
import json
from tqdm import tqdm
from pathlib import Path
import re

In [2]:
def download_pdf(doi, dest_folder, content):
    """Write downloaded PDF bytes to ``dest_folder``, named after the DOI suffix.

    The file name is everything after the first ``/`` of ``doi``, with each of
    the characters ``( ) / \\ * , " ' : space ? ; < >`` replaced by ``_._``,
    followed by ``.pdf``.

    Parameters
    ----------
    doi : str
        Full DOI, e.g. ``10.1007/s00421-019-04170-0``. A value without ``/``
        yields an empty suffix (file name ``.pdf``).
    dest_folder : str or path-like
        Directory to write into. It is not created here.
    content : bytes
        Raw body of the PDF response.

    Raises
    ------
    OSError
        If the target file cannot be opened for writing.
    """
    # partition() returns the suffix verbatim. The previous
    # str(list).strip("[']") round-trip went through repr(), which swallowed
    # leading/trailing brackets and quotes that belong to the DOI and added
    # escaping for quotes and backslashes.
    doi_suffix = doi.partition('/')[2]
    doi_suffix = re.sub(r"""([()/\\*,"': ?;<>])""", '_._', doi_suffix)
    filename = str(dest_folder) + '/' + doi_suffix + '.pdf'
    with open(filename, mode='wb') as f:
        f.write(content)

In [3]:
# Read the Springer API key from the local JSON config file instead of
# embedding it in the notebook.
with open('/Users/antonhesse/Desktop/Anton/Education/UMN/Lab and Research/HSPL/CPET_scoping_review/code/cpet_articles/gathering/full-text_download_code/springer/springer_config.json') as config_file:
    springer_config = json.load(config_file)
api_key = springer_config['api_key']

In [4]:
# Load the unpaywall_info.csv table and keep the open-access rows whose
# publisher is one of the three Springer labels, de-duplicated and re-indexed.
articles = pd.read_csv('/Users/antonhesse/Desktop/Anton/Education/UMN/Lab and Research/HSPL/CPET_scoping_review/data/cpet_articles/unpaywall/unpaywall_info.csv')
springer_publishers = [
    'Springer Science and Business Media LLC',
    'Springer Nature',
    'Springer International Publishing',
]
is_open_access = articles['is_oa'] == True
is_springer = articles['publisher'].isin(springer_publishers)
springer_articles = (
    articles[is_open_access & is_springer]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [5]:
# Row count per publisher label among the retained Springer articles.
publisher_counts = springer_articles['publisher'].value_counts()
publisher_counts

Springer Science and Business Media LLC    934
Springer Nature                              3
Springer International Publishing            1
Name: publisher, dtype: int64

In [6]:
# Gather every PDF that sits inside a sub-folder of the full-text directory and
# keep the file names without their extension for the "already have it" check.
pdf_root = Path('/Users/antonhesse/Desktop/Anton/Education/UMN/Lab and Research/HSPL/CPET_scoping_review/data/cpet_articles/full_texts/pdfs')
pdf_file_paths = [*pdf_root.rglob('*/*.pdf')]
pdfs = [pdf_path.stem for pdf_path in pdf_file_paths]

In [7]:
# Everything after the first "<digit>/" of the DOI, i.e. the text that follows
# the registrant prefix.
re_doi_suffix = re.compile(r'(?<=\d/).*')
springer_articles['doi_suffix'] = springer_articles['doi'].apply(lambda x: re_doi_suffix.search(x).group())

# download_pdf() names files after the suffix with special characters replaced
# by '_._', so a raw-suffix comparison never recognises those files and the
# article would be fetched again on every run. Treat an article as present when
# either the raw or the sanitised suffix matches an existing file stem.
re_unsafe_chars = re.compile(r"""([()/\\*,"': ?;<>])""")
downloaded_stems = set(pdfs)  # set: constant-time membership instead of list scans
sanitised_suffix = springer_articles['doi_suffix'].map(lambda s: re_unsafe_chars.sub('_._', s))
already_downloaded = (
    springer_articles['doi_suffix'].isin(downloaded_stems)
    | sanitised_suffix.isin(downloaded_stems)
)

full_texts_to_download = springer_articles.loc[~already_downloaded, 'doi_suffix'].tolist()

# Filter with the boolean mask rather than inner-merging the list back onto the
# frame: the merge multiplied rows whenever a DOI suffix occurred more than once.
merge = springer_articles[~already_downloaded].reset_index(drop=True)
# Keep 'doi_suffix' as the leading column, matching the layout the merge produced.
merge = merge[['doi_suffix'] + [col for col in merge.columns if col != 'doi_suffix']]
merge.shape

(50, 50)

In [11]:
# Directory that receives the downloaded Springer open-access PDFs.
folder = '/Users/antonhesse/Desktop/Anton/Education/UMN/Lab and Research/HSPL/CPET_scoping_review/data/cpet_articles/full_texts/pdfs/springer_oa'

# Endpoint queried once per DOI; the query string is supplied via `params`.
springer_api_url = 'https://api.springernature.com/meta/v2/json?'

# Browser-like User-Agent sent along with the metadata request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:100.0) Gecko/20100101 Firefox/100.0',
}

In [12]:
# One dict per processed DOI is appended here by the download loop;
# re-running this cell starts the log over.
log = list()

In [13]:
# (connect, read) timeouts in seconds so a single stalled server cannot hang
# the whole run; a timeout surfaces as an exception and is logged below.
request_timeout = (10, 120)

# For every article still missing on disk: look the DOI up in the Springer
# metadata API, follow the PDF link of the first record, and save the file.
# Each iteration appends one dict to `log` with the DOI, the status codes seen,
# whether the body looked like a PDF, and any exception raised.
for i, row in tqdm(merge.iterrows(), total=merge.shape[0]):
    doi = row['doi']
    out = {'doi': doi}
    # NOTE(review): the bare DOI is sent as the query and the first record is
    # assumed to be the matching article — confirm against the API docs.
    params = {
        'api_key': api_key,
        'q': doi,
    }

    try:
        response = requests.get(
            url=springer_api_url,
            params=params,
            headers=headers,
            allow_redirects=True,
            timeout=request_timeout,
        )
        out.update({'query_status_code': response.status_code})

        if response.status_code == 200:
            url_entries = response.json()['records'][0]['url']
            # Prefer the link explicitly tagged as a PDF (presumably entries
            # carry a 'format' key — TODO confirm). When none is tagged, fall
            # back to the second entry, which is what was used before.
            pdf_url = next(
                (
                    entry['value']
                    for entry in url_entries
                    if isinstance(entry, dict)
                    and str(entry.get('format', '')).lower() == 'pdf'
                ),
                None,
            )
            if pdf_url is None:
                pdf_url = url_entries[1]['value']

            # No stream=True: the body is read in full anyway, and a streamed
            # response that is never read keeps its connection checked out.
            pdf_response = requests.get(pdf_url, allow_redirects=True, timeout=request_timeout)
            out.update({'pdf_status_code': pdf_response.status_code})

            if pdf_response.status_code == 200:
                # A 200 can still be an HTML landing page. Saving that as
                # .pdf would mark the article as downloaded on later runs,
                # so only keep bodies that carry the PDF header near the start.
                is_pdf = b'%PDF' in pdf_response.content[:1024]
                out.update({'is_pdf': is_pdf})
                if is_pdf:
                    download_pdf(doi, dest_folder=folder, content=pdf_response.content)

    except Exception as e:
        # Best effort: record the failure and move on to the next article.
        print(f'{e} at index {i} for DOI {doi}')
        out.update({'error': e})

    log.append(out)


100%|██████████| 50/50 [03:48<00:00,  4.57s/it]


In [None]:
# Tabulate the per-DOI log entries; keys missing from an entry become NaN.
log_df = pd.DataFrame(data=log)