## Elsevier API
Goal: Get the % of refDict entries that can be found through the Elsevier API. Assume the open-access (OA) papers from Unpaywall do not overlap with Elsevier papers (in practice, they don't).

Dependency: quantify_datasets/query_out.json

Source: https://raw.githubusercontent.com/ElsevierDev/elsapy/master/exampleProg.py

In [1]:
import urllib
import requests
import os
import math
import json
import pandas as pd

from elsapy.elsclient import ElsClient
from elsapy.elsprofile import ElsAuthor, ElsAffil
from elsapy.elsdoc import FullDoc, AbsDoc
from elsapy.elssearch import ElsSearch

email = 'ivalexander13@berkeley.edu' # replace with your own email address

Elsevier Setup

In [2]:
## Load configuration
# Read the API settings inside a context manager so the file handle is
# released even if json.load raises on malformed JSON.
with open("elsevier_api/config.json") as con_file:
    config = json.load(con_file)

## Initialize client
# The config file must contain an 'apikey' entry; it is handed to ElsClient.
client = ElsClient(config['apikey'])
# Point the client's local_dir at the elsevier_api folder.
client.local_dir = "./elsevier_api/"


In [3]:
# Load the metadata
json_file = './quantify_datasets/query_out.json'
if os.path.isfile(json_file):
    # The with-statement closes the file on exit; no explicit close() needed.
    with open(json_file, 'r') as fp:
        query_out = json.load(fp)
else:
    # query_out is left untouched, but say so: later cells depend on it and
    # would otherwise fail with an unexplained NameError.
    print(f"WARNING: {json_file} not found; query_out was not loaded.")

In [4]:
# Get paper
def get_paper(doi: str):
    """Fetch the ScienceDirect full-text record for a DOI.

    Builds a ``FullDoc`` for ``doi`` and reads it through the module-level
    ``client``. Returns the document's ``data`` attribute when the read
    succeeds, otherwise ``False``.
    """
    document = FullDoc(doi=doi)
    fetched = document.read(client)
    return document.data if fetched else False

In [6]:
# FIXME: temporary manual test of get_paper on a single hard-coded DOI.
get_paper('10.1016/0031-9422(73)80493-5')

{'coredata': {'prism:url': 'https://api.elsevier.com/content/article/pii/0031942273804935',
  'dc:identifier': 'doi:10.1016/0031-9422(73)80493-5',
  'eid': '1-s2.0-0031942273804935',
  'prism:doi': '10.1016/0031-9422(73)80493-5',
  'pii': '0031-9422(73)80493-5',
  'dc:title': 'Pro-tRNA synthetase from Phaseolus aureus and Delonix regia\n             ',
  'prism:publicationName': 'Phytochemistry',
  'prism:aggregationType': 'Journal',
  'pubType': 'fla',
  'prism:issn': '00319422',
  'prism:volume': '12',
  'prism:issueIdentifier': '12',
  'prism:startingPage': '2829',
  'prism:endingPage': '2841',
  'prism:pageRange': '2829-2841',
  'prism:number': '12',
  'dc:format': 'application/json',
  'prism:coverDate': '1973-12-31',
  'prism:coverDisplayDate': 'December 1973',
  'prism:copyright': 'Copyright © 1973 Published by Elsevier Ltd.',
  'prism:publisher': 'Published by Elsevier Ltd.',
  'dc:creator': [{'@_fa': 'true', '$': 'Norris, Roger D.'},
   {'@_fa': 'true', '$': 'Fowden, Leslie'}]

In [81]:
# Output file = doi to full_text dict dictionary (in json)
json_file = './elsevier_api/elsevier_fulltexts.json'
if os.path.isfile(json_file):
    # Resume from an earlier run so DOIs already stored are not fetched again.
    with open(json_file, 'r') as fp:
        elsevier_fulltexts = json.load(fp)
else:
    elsevier_fulltexts = {}

# hyperparam (-1 if max)
max_calls = -1
# Cap on the number of successful fetches performed in a single run.
max_queries = 99999
# Publisher strings in query_out that are treated as Elsevier.
elsevier_publishers = ('Elsevier', 'Elsevier BV')

# Stats
calls = 0               # records visited in query_out
fails = 0               # Elsevier records whose fetch returned nothing
not_elsevier = 0        # records skipped because of their publisher
successes_or_found = 0  # fetched now, or already present in the output file
queries = 0             # successful fetches made during this run

try:
    # Looping through quantify_dataset output json.
    for value in query_out.values():
        doi = value['doi']
        doi_url = value['doi_url']

        # dont go over max calls. (-1 if infinite)
        if calls == max_calls or queries == max_queries:
            print("Query limit reached.")
            break
        calls += 1

        # checks if paper has been successfully fetched before
        if doi in elsevier_fulltexts:
            successes_or_found += 1
            print(f"## Call {calls} found.")
            continue

        # checks if publisher is elsevier
        if value['publisher'] not in elsevier_publishers:
            not_elsevier += 1
            continue

        if (fullpaper := get_paper(doi)):
            elsevier_fulltexts[doi] = fullpaper
            successes_or_found += 1
            queries += 1
            print(f"Call {calls} success.")
        else:
            fails += 1
            print (f"##### Call {calls} failed: {fails}. DOI: {doi_url}.")

except KeyboardInterrupt:
    # A manual interrupt is the expected way to stop a long run early.
    pass
finally:
    # save to file
    # Done in `finally` so papers fetched so far are persisted even when the
    # loop dies on an unexpected error, not only on a manual interrupt.
    with open(json_file, 'w') as fp:
        json.dump(elsevier_fulltexts, fp)

# Print Stats
elsevier_total = calls - not_elsevier
print("")
print("###### STATS ######")
print(f"Total calls: {calls}")
print(f"Total number of queries: {queries}")
print(f"Total number of Elsevier papers: {elsevier_total}")
print(f"Number of Non-Elsevier papers skipped: {not_elsevier}")
print(f"Number of fetch failures: {fails}")
print(f"Papers in storage: {len(elsevier_fulltexts)}")
if elsevier_total:
    print(f"% of success: {successes_or_found / elsevier_total * 100}%")
else:
    # Avoid ZeroDivisionError when every visited record was non-Elsevier
    # (or nothing was visited at all).
    print("% of success: n/a (no Elsevier papers processed)")

## Call 3 found.
## Call 6 found.
## Call 7 found.
## Call 8 found.
## Call 9 found.
## Call 14 found.
## Call 16 found.
## Call 17 found.
## Call 19 found.
## Call 20 found.
## Call 21 found.
## Call 22 found.
## Call 23 found.
## Call 24 found.
## Call 25 found.
## Call 26 found.
## Call 37 found.
## Call 39 found.
## Call 42 found.
## Call 43 found.
## Call 44 found.
## Call 47 found.
## Call 51 found.
## Call 53 found.
## Call 54 found.
## Call 55 found.
## Call 58 found.
## Call 59 found.
## Call 63 found.
## Call 64 found.
## Call 67 found.
## Call 72 found.
## Call 75 found.
## Call 78 found.
## Call 79 found.
## Call 83 found.
## Call 85 found.
## Call 90 found.
## Call 95 found.
## Call 96 found.
## Call 97 found.
## Call 98 found.
## Call 102 found.
## Call 108 found.
## Call 109 found.
## Call 110 found.
## Call 111 found.
## Call 112 found.
## Call 114 found.
## Call 115 found.
## Call 119 found.
## Call 120 found.
## Call 122 found.
## Call 123 found.
## Call 128 found.
##

In [23]:
# Spot-check: show the DOI stored under entry '659773' of query_out.
query_out['659773']['doi']

'10.1021/jo00059a013'

# Converting Finished JSON to usable CSV
Columns:
- reaction_id,
- doi
- substrate
- product
- 
- fulltext
- 

In [2]:
# Load the metadata
json_file = './quantify_datasets/query_out.json'
if os.path.isfile(json_file):
    # The with-statement closes the file on exit; no explicit close() needed.
    with open(json_file, 'r') as fp:
        query_out = json.load(fp)
else:
    # query_out is left untouched, but say so: later cells depend on it and
    # would otherwise fail with an unexplained NameError.
    print(f"WARNING: {json_file} not found; query_out was not loaded.")

In [10]:
# This notebook already requires Python 3 (it uses f-strings and the walrus
# operator), where cPickle no longer exists, so import pickle directly.
import pickle

# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load refDict.p when it comes from a trusted source.
with open('./get_doi/refDict.p', 'rb') as fp:
    refDict = pickle.load(fp)
    
# refDict is ready to use now.