In [1]:
import sys

In [19]:
#!/usr/bin/env python

import re
import sys
import json
import time
import random
import argparse
import networkx
from pathlib import Path

Client libraries for the services used in this notebook:

- ORCID: https://github.com/ORCID/python-orcid
- Google Scholar: https://github.com/scholarly-python-package/scholarly
- Crossref: https://github.com/fabiobatalha/crossrefapi

In [3]:
# !{sys.executable} -m pip install crossrefapi orcid scholarly[tor]

In [4]:
# Import necessary libraries
import requests
from scholarly import scholarly
import orcid 
import crossref

In [100]:
# Define function to retrieve DOIs for all papers published by a researcher using their ORCID or Google Scholar profile page
def get_paper_dois(profile_page):
    
    # Define empty list to store DOIs
    dois = []
    
    # Check if the profile page is for ORCID or Google Scholar
    if "orcid" in profile_page:
        
        # Define API endpoint and headers
        endpoint = f"https://pub.orcid.org/v3.0/{profile_page.split('/')[-1]}/works"
        headers = {"Accept": "application/vnd.citationstyles.csl+json", "Authorization": "Bearer <access_token>"}
        
        # Send GET request to API endpoint
        response = requests.get(endpoint, headers=headers)
        
        # Check if response was successful
        if response.status_code == 200:
            
            # Extract list of DOIs from response
            works = response.json()["group"]
            for work in works:
                doi = work["external-ids"]["external-id"][0]["external-id-value"]
                dois.append(doi)
            
        else:
            print("Error retrieving data from ORCID API")
            
    elif "scholar.google" in profile_page:
        
        # Retrieve Google Scholar profile using the scholarly package
        search_query = scholarly.search_author_id(profile_page.split('=')[-1])
        profile = scholarly.fill(next(search_query))
        
        # Extract list of DOIs from profile
        for pub in profile.publications:
            if pub.bib.get("doi"):
                dois.append(pub.bib["doi"])
            else:
                # Try to search for the DOI using gslookup
                search_query = f"{pub.bib['title']} {pub.bib['author']}"
                search_results = scholarly.gscholar(search_query)
                for result in search_results:
                    if pub.bib["title"] == result["title"] and pub.bib["author"] == result["author"]:
                        if result.get("eprint"):
                            # Extract DOI from eprint URL
                            doi = result["eprint"].split("/")[-1]
                            if doi.startswith("10."):
                                dois.append(doi)
                        elif result.get("url"):
                            # Extract DOI from URL
                            doi = result["url"].split("/")[-1]
                            if doi.startswith("10."):
                                dois.append(doi)                
    
    else:
        print("Invalid profile page. Please provide a valid ORCID or Google Scholar profile page.")
    
    return dois


In [12]:
orcid_profile = 'https://orcid.org/0000-0001-6781-893X'

In [13]:
# Works listing for the profile as JSON: first page (offset 0) of 50 entries, sorted by date, descending.
# NOTE(review): this path lives under orcid.org rather than the pub.orcid.org API host — presumably a
# website-internal endpoint; confirm it is stable enough to depend on.
orcid_url = f'{orcid_profile}/worksExtendedPage.json?offset=0&sort=date&sortAsc=false&pageSize=50'

In [14]:
resp = requests.get(orcid_url)

In [15]:
resp.ok

True

In [16]:
resp.status_code

200

In [17]:
resp.json()

{'nextOffset': 50,
 'totalGroups': 21,
 'groups': [{'activePutCode': 123746668,
   'defaultPutCode': 123746668,
   'groupId': 0,
   'activeVisibility': 'PUBLIC',
   'userVersionPresent': False,
   'externalIdentifiers': [{'errors': [],
     'externalIdentifierId': {'errors': [],
      'value': '10.1080/15592294.2020.1748917',
      'required': True,
      'getRequiredMessage': None},
     'externalIdentifierType': {'errors': [],
      'value': 'doi',
      'required': True,
      'getRequiredMessage': None},
     'url': {'errors': [],
      'value': 'https://doi.org/10.1080/15592294.2020.1748917',
      'required': True,
      'getRequiredMessage': None},
     'relationship': {'errors': [],
      'value': 'self',
      'required': True,
      'getRequiredMessage': None},
     'normalized': {'errors': [],
      'value': '10.1080/15592294.2020.1748917',
      'required': False,
      'getRequiredMessage': None},
     'normalizedUrl': {'errors': [],
      'value': 'https://doi.org/10.1080

In [21]:
google_scholar_profile='https://scholar.google.com/citations?user=gvzAHzcAAAAJ'

In [22]:
Path(google_scholar_profile).parts

('https:', 'scholar.google.com', 'citations?user=gvzAHzcAAAAJ')

In [24]:
from urllib.parse import parse_qs, urlsplit

# Read the `user` query parameter explicitly; splitting on '=' returns the wrong
# value as soon as the URL carries another parameter (e.g. `&hl=en`).
google_scholar_id = parse_qs(urlsplit(google_scholar_profile).query)['user'][0]

In [27]:
author = scholarly.search_author_id(google_scholar_id)

In [28]:
from scholarly import ProxyGenerator

# Set up a ProxyGenerator object to use free proxies
# This needs to be done only once per session
# NOTE(review): the value returned by FreeProxies() is ignored here — presumably it
# reports whether a working proxy was found; confirm and check it before relying on the proxy.
pg = ProxyGenerator()
pg.FreeProxies()
# Register the generator so that later `scholarly` calls use it.
scholarly.use_proxy(pg)

In [159]:
# Search Google Scholar by name, take the first hit, and fill in the full author record.
# NOTE(review): a name search can be ambiguous; the first hit is assumed to be the intended
# author — looking the profile up by its scholar id would avoid that assumption.
search_query = scholarly.search_author('Alberto Labarga')
author = scholarly.fill(next(search_query))

In [160]:
author.keys()

dict_keys(['container_type', 'filled', 'source', 'scholar_id', 'url_picture', 'name', 'affiliation', 'email_domain', 'interests', 'citedby', 'organization', 'homepage', 'citedby5y', 'hindex', 'hindex5y', 'i10index', 'i10index5y', 'cites_per_year', 'coauthors', 'publications', 'public_access'])

In [161]:
author["name"]

'Alberto Labarga'

In [82]:
author["scholar_id"]

'gvzAHzcAAAAJ'

In [162]:
# Print the title of every publication attached to the filled author record.
# Note: `pub` and `title` stay bound to the last publication after the loop.
for pub in author['publications']:
    title = pub['bib']['title']
    print(title)

New developments in the InterPro database
Web services at the European bioinformatics institute
A public resource facilitating clinical use of genomes
Beta electroencephalograph changes during passive movements: sensory afferences contribute to beta event-related desynchronization in humans
The Semantic Automated Discovery and Integration (SADI) web service design-pattern, API and reference implementation
Frontal and central oscillatory changes related to different aspects of the motor process: a study in go/no-go paradigms
Movement-related changes in cortical oscillatory activity in ballistic, sustained and negative movements
Alpha and beta oscillatory changes during stimulus-induced movement paradigms: effect of stimulus predictability
Gamma band activity in an auditory oddball paradigm studied with the wavelet transform
Alpha and beta oscillatory activity during a sequence of two movements
Priorities for nucleotide trace, sequence and annotation data capture at the Ensembl Trace Arc

In [173]:
pub = author['publications'][0]

In [163]:
len(author['publications'])

49

## Crossref

In [165]:
from crossref.restful import Works

In [166]:
works = Works()

In [174]:
pub

{'container_type': 'Publication',
 'source': <PublicationSource.AUTHOR_PUBLICATION_ENTRY: 'AUTHOR_PUBLICATION_ENTRY'>,
 'bib': {'title': 'New developments in the InterPro database',
  'pub_year': '2007',
  'citation': 'Nucleic acids research 35 (suppl_1), D224-D228, 2007'},
 'filled': False,
 'author_pub_id': 'gvzAHzcAAAAJ:u5HHmVD_uO8C',
 'num_citations': 594,
 'citedby_url': 'https://scholar.google.com/scholar?oi=bibs&hl=en&cites=17463171523175817476',
 'cites_id': ['17463171523175817476']}

In [179]:
from itertools import islice

# Iterating the query object walks through every matching record; only the
# first few candidates are needed here, so stop after a fixed number.
MAX_CROSSREF_RESULTS = 20
results = list(islice(works.query(bibliographic=pub['bib']['title'], author=author['name']), MAX_CROSSREF_RESULTS))


In [None]:
# The query object is iterable but is not itself an iterator, so wrap it in
# iter() before calling next(); `None` is returned when there is no hit.
next(iter(works.query(bibliographic=pub['bib']['title'], author=author['name'])), None)

In [None]:
works.doi('10.1590/0102-311x00133115')

In [119]:
from urllib.parse import quote

In [132]:
 # Try to search for the DOI using Crossref: pick the lookup inputs from the current publication.
# NOTE(review): this rebinds `author` (until now the scholarly author record) to the first
# element of the publication's author field, so earlier cells that read author['name'] or
# author['publications'] will fail if they are re-run after this one.
# NOTE(review): the `pub` displayed earlier has no 'author' key in its 'bib' — presumably
# `pub` was filled or replaced before this cell ran; confirm.
title = pub["bib"]["title"]
author = pub["bib"]["author"][0]

In [133]:
def get_doi(title, author):
    """Look up a publication on Crossref and return the URL of the best match.

    Parameters
    ----------
    title : str
        Title of the publication.
    author : str
        Author name used to narrow the search; ignored when empty.

    Returns
    -------
    str or None
        The ``URL`` field of the top-ranked Crossref record, or ``None`` when
        the request fails or nothing matches.
    """
    # Use Crossref's field queries; `requests` handles the URL encoding.
    params = {
        "query.bibliographic": title,
        "rows": 1,  # only the top-ranked record is used
        "mailto": "alberto.labarga@bsc.es",
    }
    if author:
        params["query.author"] = author

    response = requests.get("https://api.crossref.org/works", params=params, timeout=30)
    if response.status_code != 200:
        return None

    items = response.json().get("message", {}).get("items") or []
    if not items:
        # No match: return None instead of raising IndexError.
        return None
    return items[0].get("URL")


In [134]:
get_doi(title, author)

'http://dx.doi.org/10.1093/nar/gkm291'

In [123]:
f"https://api.crossref.org/works?query={query}"

'https://api.crossref.org/works?query=title:Web%20services%20at%20the%20European%20bioinformatics%20institute+AND+author:A%20Labarga'

In [131]:
[it.get('URL') for it in data["message"]["items"]]

['http://dx.doi.org/10.1093/nar/gkm291',
 'http://dx.doi.org/10.1093/bib/4.4.332',
 'http://dx.doi.org/10.1002/cpbi.74',
 'http://dx.doi.org/10.1002/0471650129.dob0196',
 'http://dx.doi.org/10.1016/s0076-6879(96)66003-0',
 'http://dx.doi.org/10.1093/bib/bbn029',
 'http://dx.doi.org/10.1186/1471-2105-9-229',
 'http://dx.doi.org/10.1093/nar/gkp302',
 'http://dx.doi.org/10.1042/bio02604033',
 'http://dx.doi.org/10.1007/0-387-27478-2_9',
 'http://dx.doi.org/10.1007/978-94-6265-383-2_2',
 'http://dx.doi.org/10.1093/ww/9780199540884.013.37638',
 'http://dx.doi.org/10.1093/nar/gkh405',
 'http://dx.doi.org/10.1093/ww/9780199540884.013.u37638',
 'http://dx.doi.org/10.1186/s12859-019-3159-9',
 'http://dx.doi.org/10.1385/0-89603-358-9:303',
 'http://dx.doi.org/10.1093/bioinformatics/btn387',
 'http://dx.doi.org/10.1093/nar/24.1.6',
 'http://dx.doi.org/10.1093/bib/bbq023',
 'http://dx.doi.org/10.1042/bio03604039']

In [130]:
data["message"]["items"][0].get('URL')

'http://dx.doi.org/10.1093/nar/gkm291'

In [None]:
# Record the DOI of the first Crossref item when the query matched anything.
# NOTE(review): no visible cell defines `data` or `dois` at notebook level (they only
# appear as locals of the functions above), so this cell presumably relies on
# leftover kernel state — confirm before running on a fresh kernel.
if data["message"]["total-results"] > 0:
    doi = data["message"]["items"][0].get("DOI")
    if doi:
        dois.append(doi)

In [101]:
# get_paper_dois returns a plain list of DOIs, so there is nothing to `.get()` from it.
get_paper_dois(google_scholar_profile)

TypeError: 'dict' object is not an iterator

In [78]:
# Pass the profile URL itself: `orcid_url` ends in `worksExtendedPage.json?...`,
# so its last path segment is not the ORCID iD.
get_paper_dois(orcid_profile)

Error retrieving data from ORCID API


[]

In [None]:
'''
Opencitations Network
fetching citation data of a specific research paper using COCI REST API
'''
import pandas as pd  # imported here because no other visible cell imports pandas

COCI_CITATIONS_URL = 'https://opencitations.net/index/coci/api/v1/citations/{doi}'
# Upper bound on the number of DOIs whose citations are fetched. The crawl
# follows citing papers transitively, so it grows quickly; set to None for no limit.
MAX_NETWORK_DOIS = 200

#user input
doi=input("enter doi, ex- 10.1021/ci500020m : ").strip()


doilist=[doi] #list of all DOIs in network
seen_dois={doi} # DOIs already queued, so that no paper is fetched twice
lst = []
citation_columns = None # column names, taken from the first citation record

# `doilist` grows while it is being iterated: every newly discovered citing DOI
# is queued and visited in turn (breadth-first crawl).
for z in doilist:
    res=requests.get(COCI_CITATIONS_URL.format(doi=z), timeout=60)
    if not res.ok:
        print(f"Skipping {z}: COCI returned HTTP {res.status_code}")
        continue
    for record in res.json():
        if citation_columns is None:
            citation_columns = list(record.keys())
        lst.append([record.get(column) for column in citation_columns])
        y = record.get('citing')
        under_limit = MAX_NETWORK_DOIS is None or len(doilist) < MAX_NETWORK_DOIS
        if y and y not in seen_dois and under_limit:
            seen_dois.add(y)
            doilist.append(y)

# Fall back to the two columns the graph cell needs when nothing was found.
df = pd.DataFrame(lst, columns=citation_columns or ['citing', 'cited'])



## Visualisation


In [None]:

import networkx as nx
from pyvis.network import Network
import IPython.display  # import the submodule explicitly; binds `IPython` as before

# Build a directed graph so that every edge keeps its cited -> citing
# orientation; an undirected graph would leave the arrow direction arbitrary.
G = nx.from_pandas_edgelist(df, source='cited', target='citing', create_using=nx.DiGraph)
# `directed` is a boolean flag (the string 'True' only worked because it is truthy).
net=Network(height='1000px',width='100%',bgcolor='#222222',font_color='white',directed=True) #add layout=True for hierarchical tree layout
net.from_nx(G)
net.save_graph('coci.html')
# Render the saved page inline as the cell output.
IPython.display.HTML(filename='coci.html')



In [None]:

#extracting metadata of all the DOIs in the network for further data analysis

metalist=[] # contains metadata of all DOIs in network (one entry per DOI, in doilist order)
for k in doilist:
    metares=requests.get(f'https://opencitations.net/index/coci/api/v1/metadata/{k}', timeout=60)
    # A failed lookup contributes an empty entry so that metalist stays aligned
    # with doilist and an error page is never parsed as metadata.
    datam=metares.json() if metares.ok else []
    metalist.append(datam)

#extracting urls of all the open access DOIs in the network

from urlextract import URLExtract
# The metadata is flattened to text and scanned for anything that looks like a URL.
metastr=str(metalist)
extractor = URLExtract()
urls = extractor.find_urls(metastr)