## Quantifying Datasets
Gauge the coverage for various sources for FULL TEXT papers.

List of sources to check:
1. Unpaywall (includes pmid?)
2. bioRxiv
3. ChemRxiv


## 1. Unpaywall

Start with the papers that are missing PMIDs: look them up by DOI and fall back to a title query. Their metadata is stored in get_doi/refDict.p.

In [2]:
import urllib
import requests
import os
import math
import json
import pandas as pd

# Contact address sent as the `email` query parameter of the Unpaywall and NCBI requests in this notebook; replace with your own.
email = 'ivalexander13@berkeley.edu'

In [3]:
# The notebook relies on Python 3 syntax (f-strings), so the Python 2
# `cPickle` fallback could never be taken; import pickle directly.
import pickle

# NOTE(security): pickle.load can execute arbitrary code embedded in the file.
# Only load a refDict.p that you generated yourself or otherwise trust.
with open('./get_doi/refDict.p', 'rb') as fp:
    refDict = pickle.load(fp)

# refDict is ready to use now.

## Helper Functions

Pasted get_doi from get_doi.ipynb. This is the first way of querying. Modified to work with the value (the metadata dict) instead of the key.

In [3]:
import lxml.etree as ET
import urllib
import re

# Matches a <doi ...>VALUE</doi> element without spanning into other elements.
_DOI_ELEMENT = re.compile(r"<doi\b[^>]*>([^<]*)</doi>")


def _crossref_openurl(metadata: dict) -> str:
    """Build the CrossRef OpenURL lookup URL for one reference's metadata."""
    from urllib.parse import quote

    journal = " ".join(metadata["journal"].split())
    volume = "".join(metadata["volume"].split())
    year = "".join(metadata["year"].split())
    start_page = "".join(metadata["pages"].split()).split("-")[0]
    first_author_last_name = "".join(
        metadata["authors"].split(";")[0].split(",")[0].split()
    )
    # Percent-encode every value so characters such as '&', '#' or non-ASCII
    # letters in a name or journal title cannot corrupt the query string.
    params = [
        quote(part, safe="")
        for part in (first_author_last_name, journal, volume, start_page, year)
    ]
    return "https://doi.crossref.org/openurl?pid=mrunali@berkeley.edu&aulast={}&title={}&volume={}&spage={}&date={}&noredirect=true".format(*params)


def _extract_doi(response_text: str):
    """Return the DOI contained in a CrossRef OpenURL response, or None."""
    if "unresolved" in response_text:
        return None
    found = _DOI_ELEMENT.search(response_text)
    if found is None:
        return None
    return found.group(1).strip() or None


def get_doi(metadata: dict):
    """Resolve a reference's DOI through the CrossRef OpenURL service.

    Args:
        metadata: dict with string values under the keys "journal", "volume",
            "year", "pages" and "authors" (authors separated by ';', each
            written as "Last, Initials").

    Returns:
        The DOI string, or None when CrossRef reports the query as unresolved
        or the response contains no <doi> element.

    Raises:
        KeyError: if one of the required metadata keys is missing.
        urllib.error.URLError: if the HTTP request fails.
    """
    # `import urllib` alone does not guarantee that the `request` submodule is loaded.
    import urllib.request

    url = _crossref_openurl(metadata)
    with urllib.request.urlopen(url) as response:
        body = response.read().decode("utf-8")
    return _extract_doi(body)
    

Query Unpaywall by DOI, where the DOI is first resolved from the paper's metadata. Input is a (key, value) item from refDict.

In [4]:
# returns dict if found, else returns False.
def query_meta_doi(key: str, value: dict):
    """Fetch the Unpaywall record for a reference identified by its metadata.

    Args:
        key: reference identifier; accepted for call-site symmetry and unused here.
        value: metadata dict passed on to get_doi.

    Returns:
        The decoded Unpaywall JSON record (a dict), or False when no DOI could
        be resolved or Unpaywall answers with an error payload.
    """
    doi = get_doi(value)
    if not doi:
        # Without a DOI the request would go to ".../v2/None": a wasted call
        # against the daily quota that can only come back as an error.
        return False

    url = f'https://api.unpaywall.org/v2/{doi}?email={email}'
    out_dict = requests.get(url).json()

    if 'error' in out_dict:
        return False
    return out_dict
        
    

In [6]:
# Example: look up a single reference (key '659773') from its bibliographic metadata.
query_meta_doi('659773', {
    'authors': 'Blacklock, T.J.; Sohar, P.; Butcher, J.W.; Lamanec, T.; Grabowski, E.J.J.',
    'title': 'An enantioselective synthesis of the topically-active carbonic anhydrase inhibitor MK-0507:5,6-dihydro-(s)-4-(ethylamino)-(s)-6-mehtyl-4H-thieno[2,3-beta]thiopyran-2-sulfonamide 7,7-dioxide hydrochloride',
    'journal': 'J. Org. Chem.',
    'volume': '58',
    'pages': '1672-1679',
    'year': '1993',
    'pubmedId': '0',
    'textmining': '0'
})


{'doi': '10.1021/jo00059a013',
 'doi_url': 'https://doi.org/10.1021/jo00059a013',
 'title': 'An enantioselective synthesis of the topically-active carbonic anhydrase inhibitor MK-0507: 5,6-dihydro-(S)-4-(ethylamino)-(S)-6-methyl-4H-thieno[2,3-b]thiopyran-2-sulfonamide 7,7-dioxide hydrochloride',
 'genre': 'journal-article',
 'is_paratext': False,
 'published_date': '1993-03-01',
 'year': 1993,
 'journal_name': 'The Journal of Organic Chemistry',
 'journal_issns': '0022-3263,1520-6904',
 'journal_issn_l': '0022-3263',
 'journal_is_oa': False,
 'journal_is_in_doaj': False,
 'publisher': 'American Chemical Society (ACS)',
 'is_oa': False,
 'oa_status': 'closed',
 'has_repository_copy': False,
 'best_oa_location': None,
 'first_oa_location': None,
 'oa_locations': [],
 'updated': '2020-09-10T11:51:33.841618',
 'data_standard': 2,
 'z_authors': [{'family': 'Blacklock',
   'given': 'Thomas J.',
   'sequence': 'first'},
  {'family': 'Sohar', 'given': 'Paul', 'sequence': 'additional'},
  {'fam

## Main Query

For each item in refDict, check whether it was already queried successfully in an earlier run. If not, try to get its DOI (get_doi) and query Unpaywall with it. If the DOI lookup finds nothing, use a title query instead. If that also fails, mark the item as a failure.

Adjustable things:
- max_calls: int, or -1 to query everything (no limit)
- delete the query_out.json file if you want to reset the data
- Interrupt the loop by clicking "I" twice (or Kernel>Interrupt). The data will be saved, and stats shown.

In [12]:
# Query Unpaywall for every reference in refDict, resuming from results saved by
# earlier runs, then persist the results and print summary statistics.
import urllib.parse

# 1. check if json file is saved. Create or load a dict!!
json_file = './quantify_datasets/query_out.json'
if os.path.isfile(json_file):
    with open(json_file, 'r') as fp:
        query_out = json.load(fp)
else:
    query_out = {}

# stats
max_calls = 90000 # max 100K per day (to be safe); or -1 / False / None if querying all.
calls = 0
queries = 0
fails = {}  # key -> [title-query url (None if the failure happened earlier), metadata]
are_oa = 0
successes_or_found = 0

# A falsy or negative max_calls means "no limit". Comparing `calls == False`
# directly would be true at calls == 0 and stop the loop before it starts.
unlimited = not max_calls or max_calls < 0

# MAIN LOOP
try:
    # Looping through no-pmid papers (~12K)
    for key, value in refDict.items():
        key = str(math.floor(key))

        # stop before maxing out query; `>=` because queries can grow by 2 per paper
        if (not unlimited and calls >= max_calls) or queries >= 99999:
            print("Query limit reached.")
            break
        calls += 1

        # Already retrieved in an earlier run: only update the statistics.
        if key in query_out:
            if query_out[key]['is_oa']:
                are_oa += 1
            successes_or_found += 1
            print(f"Call {calls} found")
            continue

        # 2. if not, then construct query
        raw_title = value['title']
        if raw_title is None:  # there are 7 None's
            continue
        title = raw_title.strip()

        # Reset per paper so a failure never records a URL left over from a
        # previous iteration (or hits an unbound name on the first failure).
        url = None
        try:
            # First: DOI Query
            out_dict = query_meta_doi(key, value) # either dict or False
            queries += 1

            # if not found by doi, do title query; an empty result list raises
            # IndexError and is recorded as a failure below
            if not out_dict:
                title_arg = urllib.parse.quote(title, safe='')
                url = f'https://api.unpaywall.org/v2/search/?query={title_arg}&email={email}'
                out_dict = requests.get(url).json()['results'][0]['response']
                queries += 1

            # Read is_oa before storing so a malformed record is never cached.
            paper_is_oa = out_dict['is_oa']

            # 4. append to DICT if successful
            query_out[key] = out_dict
            print(f"#### Call {calls} success")

            # Extra: note down # oa's
            if paper_is_oa:
                are_oa += 1

            successes_or_found += 1 # only reached when either doi or title query is successful
        except KeyboardInterrupt:
            print("### INTERRUPTED")
            break
        except Exception:
            # Best effort by design: remember the failure and move on.
            fails[key] = [url, value]
            print(f"############# Calls: {calls}, Fails: {len(fails)}")

    # Looping through the rest (100K?)

except KeyboardInterrupt:
    pass

# 5. save dict to file (the with-block closes the file)
with open(json_file, 'w') as fp:
    json.dump(query_out, fp)

# Print stats
oa_percent = are_oa / successes_or_found * 100 if successes_or_found else 0.0
print("")
print("###### STATS ######")
print(f"Total calls: {calls}")
print(f"Total number of queries: {queries}")
print(f"Number of failed queries: {len(fails)}")
print(f"Number of open access papers: {are_oa}")
print(f"Number of papers in storage: {successes_or_found}")
print(f"% of open access papers: {oa_percent}%")

Call 1 found
Call 2 found
Call 3 found
Call 4 found
Call 5 found
Call 6 found
Call 7 found
Call 8 found
Call 9 found
############# Calls: 10, Fails: 1
Call 11 found
############# Calls: 12, Fails: 2
Call 13 found
Call 14 found
Call 15 found
Call 16 found
Call 17 found
Call 18 found
Call 19 found
Call 20 found
############# Calls: 21, Fails: 3
Call 22 found
Call 23 found
Call 24 found
Call 25 found
Call 26 found
Call 27 found
Call 28 found
Call 29 found
Call 30 found
Call 31 found
Call 32 found
Call 33 found
Call 34 found
Call 35 found
Call 36 found
Call 37 found
Call 38 found
Call 39 found
Call 40 found
Call 41 found
############# Calls: 42, Fails: 4
Call 43 found
Call 44 found
#### Call 45 success
#### Call 46 success
#### Call 47 success
#### Call 48 success
#### Call 49 success
############# Calls: 50, Fails: 5
#### Call 51 success
#### Call 52 success
############# Calls: 53, Fails: 6
#### Call 54 success
#### Call 55 success
#### Call 56 success
#### Call 57 success
############# 

## Getting PMID -> DOI -> Query (not refDict). 
~100K unique PMIDs.

In [None]:
# Build the list of unique, non-zero PubMed IDs as strings.
def _pmid_to_str(raw_id):
    """Render a numeric PubMed ID as an integer string; 0 becomes None."""
    if raw_id == 0:
        return None
    return str(math.floor(raw_id))


pmids = (
    pd.read_csv("../Data_CSVs/brenda_rxns_incomplete.csv")
    .pubmedId
    .drop_duplicates()
    .dropna()
    .apply(_pmid_to_str)
    # second pass: the conversion can introduce None and repeated strings
    .drop_duplicates()
    .dropna()
)
len(pmids)

Converts a PMID to a DOI using the NCBI PMC ID Converter API.

In [17]:
import re

def doi_from_pmid(pmid):
    """Look up the DOI for a PubMed ID through the NCBI ID converter endpoint.

    Returns the value of the first `doi="..."` attribute found in the
    response body, or False when the response contains none.
    """
    url = f'https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?tool=id_converter_api&email={email}&ids={pmid}'
    response_text = requests.get(url).content.decode("utf-8")
    found = re.search('doi="([^"]*)', response_text)
    return found.group(1) if found else False
        
# Spot check with a single PMID.
doi_from_pmid(19022233)

'10.1016/j.cbi.2008.10.037'

Could not get this to work (alternative PMID-to-DOI lookup via pmid2cite.com):

In [None]:
# Experiment: POST one PMID to the pmid2cite converter page and print the raw response body.
# NOTE(review): this cell is marked in the notebook as not working; the form
# field name 'pmidNo' is presumably taken from the page's HTML — confirm before reuse.
url = 'https://www.pmid2cite.com/pmid-to-doi-converter'
myobj = {'pmidNo': '11056675'}

x = requests.post(url, data= myobj)

print(x.text)

In [None]:
# LOOP AND QUERY
# For every PMID: resolve its DOI, fetch the Unpaywall record for that DOI,
# cache the record in query_out, then save the cache and print statistics.

# Check if json file is saved. Create or load a dict!!
json_file = './quantify_datasets/query_out_pmid.json'
if os.path.isfile(json_file):
    with open(json_file, 'r') as fp:
        query_out = json.load(fp)
else:
    query_out = {}

# stats
pmid_max_calls = 100 #False #90000 # max 100K per day (to be safe); or False / -1 if querying all.
pmid_calls = 0
pmid_queries = 0
pmid_fail_no_doi = [] # pmid strings
pmid_fail_no_result = {} # pmid: [doi, url]
pmid_are_oa = 0
pmid_successes_or_found = 0

# A falsy or negative limit means "no limit". Comparing `pmid_calls == False`
# directly would be true at 0 and end the loop before the first query.
pmid_unlimited = not pmid_max_calls or pmid_max_calls < 0

##### MAIN LOOP #####
try:
    for pmid in pmids:
        # stop before maxing out query
        if (not pmid_unlimited and pmid_calls >= pmid_max_calls) or pmid_queries >= 99999:
            print("Query limit reached.")
            break
        pmid_calls += 1

        # checks if pmid has been successful before.
        if pmid in query_out:
            pmid_successes_or_found += 1
            # Count cached open-access papers too, so the percentage below
            # stays meaningful on re-runs.
            if query_out[pmid].get('is_oa'):
                pmid_are_oa += 1
            print(f"Call {pmid_calls} found")
            continue

        # query
        doi = None
        url = None
        try:
            doi = doi_from_pmid(pmid)
            if not doi:  # if doi isnt available
                print(f"#### Call {pmid_calls} failed. No DOI found.")
                pmid_fail_no_doi.append(pmid)
                continue

            url = f'https://api.unpaywall.org/v2/{doi}?email={email}'
            out_dict = requests.get(url).json()
            pmid_queries += 1
        except (requests.RequestException, ValueError):
            # Network failure or a non-JSON body: record it and keep going so
            # one bad response does not discard the progress made so far.
            pmid_fail_no_result[pmid] = [doi, url]
            print(f"####### Call {pmid_calls} failed. No results.")
            continue

        # Unpaywall reports unknown DOIs with an 'error' payload.
        # TODO: no backup source is implemented yet for these papers.
        if not out_dict or 'error' in out_dict:
            pmid_fail_no_result[pmid] = [doi, url]
            print(f"####### Call {pmid_calls} failed. No results.")
            continue

        # 4. append to DICT if successful
        query_out[pmid] = out_dict
        print(f"## Call {pmid_calls} success")
        pmid_successes_or_found += 1

        # Extra: note down # oa's
        if out_dict.get('is_oa'):
            pmid_are_oa += 1

except KeyboardInterrupt:
    pass

# 5. save dict to file (the with-block closes the file)
with open(json_file, 'w') as fp:
    json.dump(query_out, fp)

# Print stats
pmid_oa_percent = pmid_are_oa / pmid_successes_or_found * 100 if pmid_successes_or_found else 0.0
print("")
print("###### STATS ######")
print(f"Total calls: {pmid_calls}")
print(f"Total number of queries: {pmid_queries}")
print(f"Number of failed doi lookups: {len(pmid_fail_no_doi)}")
print(f"Number of failed queries: {len(pmid_fail_no_result)}")
print(f"Number of open access papers: {pmid_are_oa}")
print(f"Number of papers in storage: {pmid_successes_or_found}")
print(f"% of open access papers: {pmid_oa_percent}%")

In [None]:
# Display the PMIDs for which no DOI could be resolved.
pmid_fail_no_doi

## See publisher stats for Elsevier API (worth it? yes)

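Can be run without re-running the query cells above: only the imports are needed, plus the cell that loads refDict for the totals printed at the end.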

In [5]:
# Reload the Unpaywall results saved by the refDict query loop.
# query_out is left untouched when the file does not exist.
json_file = './quantify_datasets/query_out.json'
if os.path.isfile(json_file):
    with open(json_file, 'r') as fp:
        query_out = json.loads(fp.read())

In [43]:
# Tally the saved records per publisher and count the open-access ones.
publishers = {}
are_oa = 0
for record in query_out.values():
    pub, oa_flag = record['publisher'], record['is_oa']

    # Check by publisher
    publishers[pub] = publishers.get(pub, 0) + 1

    # check if paper is oa
    are_oa += 1 if oa_flag else 0

In [51]:
import operator

# Publishers ranked by number of papers, most frequent first.
sorted_x = sorted(publishers.items(), key=lambda entry: entry[1], reverse=True)

# Publisher names as they appear in the Unpaywall records.
elsevier_names = {'Elsevier', 'Elsevier BV'}
e_s_w_names = elsevier_names | {'Springer Science and Business Media LLC', 'Wiley'}

num_elsevier_papers = sum(count for name, count in sorted_x if name in elsevier_names)
num_E_S_W = sum(count for name, count in sorted_x if name in e_s_w_names)


# printouts
print(f"Number of Elsevier papers in total: {num_elsevier_papers}.")
print(f"Total papers that we have = elsevier + OA = {num_elsevier_papers} + {are_oa} = {num_elsevier_papers + are_oa}")
print(f"% of full texts out of those found in Unpaywall (if all Elsevier papers found) = {num_elsevier_papers + are_oa} / {len(query_out)} = {(num_elsevier_papers + are_oa) / len(query_out) * 100}%")
print(f"% of full texts out of those found in TOTAL (12K) (if all Elsevier papers found) = {num_elsevier_papers + are_oa} / {len(refDict)} = {(num_elsevier_papers + are_oa) / len(refDict) * 100}%")

print(f"% if fetching from Elsevier + Springer + Wiley = {num_E_S_W + are_oa} / {len(refDict)} = {(num_E_S_W + are_oa) / len(refDict) * 100}%")


Number of Elsevier papers in total: 4387.
Total papers that we have = elsevier + OA = 4387 + 2127 = 6514
% of full texts out of those found in Unpaywall (if all Elsevier papers found) = 6514 / 10398 = 62.646662819773034%
% of full texts out of those found in TOTAL (12K) (if all Elsevier papers found) = 6514 / 12064 = 53.995358090185675%
% if fetching from Elsevier + Springer + Wiley = 8534 / 12064 = 70.7393899204244%


In [45]:
# Show the ten publishers with the most papers.
sorted_x[0:10]

[('Elsevier BV', 3654),
 ('Springer Science and Business Media LLC', 1214),
 ('Informa UK Limited', 894),
 ('Wiley', 806),
 ('Elsevier', 733),
 ('Portland Press Ltd.', 448),
 ('American Chemical Society (ACS)', 383),
 ('Oxford University Press (OUP)', 276),
 ('Microbiology Society', 195),
 ('Royal Society of Chemistry (RSC)', 189)]