In [214]:
from urllib.request import Request, urlopen
from urllib.parse import urlencode, quote_plus
import time
import codecs
import pandas as pd
import xml.etree.ElementTree as ET
import pymarc
from pymarc import Record,Field
import marcx
import io
import furl
import sharepa
from sharepa import basic_search
from sharepa import ShareSearch
from sharepa.helpers import pretty_print
from elasticsearch_dsl import Q
import requests


In [2]:
import json

def query_share(url, query):
    # A helper function that will use the requests library,
    # pass along the correct headers,
    # and make the query we want
    headers = {'Content-Type': 'application/json'}
    data = json.dumps(query)
    return requests.post(url, headers=headers, data=data, verify=False).json()

In [3]:
OSF_APP_URL = 'https://osf.io/api/v1/share/search/'
search_url = furl.furl(OSF_APP_URL)
search_url.args['size'] = 3
search_url.args['sort'] = 'providerUpdatedDateTime'
search_url.args['from'] = 5

In [21]:
from datetime import datetime

recent_results = requests.get(search_url.url).json()
for result in recent_results['results']:
    print(
        '{} -- from {} -- updated on {}'.format(
            result['title'].encode('utf-8'),
            result['shareProperties']['source'],
            datetime.strftime(datetime.strptime(result['providerUpdatedDateTime'][:19], "%Y-%m-%dT%H:%M:%S"), '%B %d %Y')
        )
    )

b'Methods for Preregistration' -- from osf -- updated on March 20 2016
b'The role of antonym relationships in toddlers\xe2\x80\x99 lexical-semantic organization of adjectives' -- from osf -- updated on March 20 2016
b'Methods for Preregistration' -- from osf -- updated on March 20 2016


In [24]:
sponsorship_query = {
    "size": 5,
    "query": {
        "filtered": {
            "filter": {
                "exists": {
                    "field": "sponsorships"
                }
            }
        }
    }
}

In [25]:
results = query_share(search_url.url, sponsorship_query)

for item in results['results']:
    print('{} -- from source {} -- sponsored by {}'.format(
            item['title'].encode('utf-8'),
            item['shareProperties']['source'].encode('utf-8'),
            ' '.join(
                [sponsor['sponsor']['sponsorName'] for sponsor in item['sponsorships']]
            )
        )
    )
    print('-------------------')

b'A Phase III, Randomized, Comparative, Open-label Study of Intravenous Iron Isomaltoside 1000 (Monofer\xc2\xae) Administered as Maintenance Therapy by Single or Repeated Bolus Injections in Comparison With Intravenous Iron Sucrose in Subjects With Stage 5 Chronic Kidney Disease on Dialysis Therapy (CKD-5D)' -- from source b'clinicaltrials' -- sponsored by Pharmacosmos A/S 
-------------------
b'Phase IB Study of FOLFIRINOX Plus PF-04136309 in Patients With Borderline Resectable and Locally Advanced Pancreatic Adenocarcinoma' -- from source b'clinicaltrials' -- sponsored by Washington University School of Medicine National Cancer Institute (NCI)
-------------------
b"Discontinuation of Infliximab Therapy in Patients With Crohn's Disease During Sustained Complete Remission: A National Multi-center, Double Blinded, Randomized, Placebo Controlled Study" -- from source b'clinicaltrials' -- sponsored by Copenhagen University Hospital at Herlev 
-------------------
b'Temperature Evaluation b



In [26]:
my_search = ShareSearch()

my_search = my_search.query(
    'query_string',
    query='subjects:*',
    analyze_wildcard=True
)

pretty_print(my_search.to_dict())

{
    "query": {
        "query_string": {
            "analyze_wildcard": true,
            "query": "subjects:*"
        }
    }
}


In [27]:
new_results = my_search.sort('-providerUpdatedDateTime').execute()

for hit in new_results:
    print(
        '{} - with subjects {}\n\n'.format(
            hit.title.encode('utf-8'),
            [sub.encode('utf-8') for sub in hit.subjects]
        )
    )

b'GHRSST Level 4 DMI_OI Global Foundation Sea Surface Temperature Analysis (GDS version 2) for 2016-02-14 (NCEI Accession 0145472)' - with subjects [b'ICE - COVERAGE', b'SEA SURFACE TEMPERATURE', b'AATSR-NR', b'AMSR-E', b'AVHRR-3', b'MODIS', b'SEVIRI', b'satellite data', b'Aqua SATELLITE', b'Envisat SATELLITE', b'MetOp-A', b'MSG2 SATELLITE', b'NOAA-19 SATELLITE', b'Terra SATELLITE', b'Danish Meteorological Institute', b'US NASA; Jet Propulsion Laboratory; Physical Oceanography Distributed Active Archive Center', b'Group for High Resolution Sea Surface Temperature (GHRSST)', b'World-Wide Distribution']


b'Os protestos anti austeridade e o conflito em torno do desemprego' - with subjects [b'mercadoriza\xc3\xa7\xc3\xa3o do trabalho', b'a\xc3\xa7\xc3\xa3o coletiva', b'workfare', b'desemprego', b'ex\xc3\xa9rcito de reserva']


b'A Promo\xc3\xa7\xc3\xa3o da Seguran\xc3\xa7a, Higiene e Sa\xc3\xbade no Trabalho nas Empresas de Constru\xc3\xa7\xc3\xa3o em Cabo Verde' - with subjects [b'constru

In [74]:
columns = ['orcids']
orcids_df = pd.read_excel('/Volumes/jwa_drive1/git/Share/ORCIDs.xlsx',columns=columns)
#orcids_df

In [55]:
orcid_search = ShareSearch()

for i in range(1000,1925):
    orcid = 'http://orcid.org/' + orcids_df.ix[i]['orcids']
    orcid_search = orcid_search.query(
        {
            "bool": {
                "should": [
                    {
                        "match": {
                            "contributors.sameAs": {
                                "query": orcid, 
                                "operator": "and",
                                "type" : "phrase"
                            }
                        }
                    }
                ]
            }
        }
    )

In [56]:
orcid_search.aggs.bucket(
    'sources',
    'terms',
    field='_type',
    size=0,
    min_doc_count=1
)

orcid_results = orcid_search.execute()

In [57]:
print(
    'There are {} documents with contributors who have any of those orcids.'.format(
        orcid_search.count()
    )
)

all_agg_df = pd.DataFrame()
all_agg_df['title'] = [result.title for result in orcid_results]
all_agg_df['docID'] = [result.shareProperties.docID for result in orcid_results]
all_agg_df['source'] = [result.shareProperties.source for result in orcid_results]
all_agg_df

There are 10 documents with contributors who have any of those orcids.


Unnamed: 0,title,docID,source
0,Supramolecular regulation of bioorthogonal cat...,10.1038/nchem.2284,crossref
1,Clinical vignettes and global health considera...,10.1186/s40738-016-0017-6,crossref
2,Depletion of microglia and inhibition of exoso...,10.1038/nn.4132,crossref
3,Comparative analysis of the growth and biologi...,10.1186/s12866-015-0569-3,crossref
4,Unifocal versus multifocal mandibular fracture...,10.1007/s10140-015-1375-9,crossref
5,Neisseria gonorrhoeae modulates cell death in...,10.1128/iai.00732-15,crossref
6,Integrated omics and computational glycobiolog...,10.1074/mcp.m116.058016,crossref
7,Improving Access to Online Health Information ...,10.2196/jmir.5239,crossref
8,Genetic association analyses highlight biologi...,10.1038/ng.3383,crossref
9,Disruptive Innovation: Implementation of Elect...,10.2196/medinform.4801,crossref


In [54]:
df = all_agg_df
df

Unnamed: 0,title,docID,source
0,Lysyl oxidase propeptide stimulates osteoblast...,10.1007/s12079-015-0311-9,crossref
1,Pre-gravid oral contraceptive use in relation ...,10.1007/s10654-015-0053-2,crossref
2,Descriptive epidemiology and short-term outcom...,10.1136/heartjnl-2015-308451,crossref
3,Improved insulin sensitivity 3 months after RY...,10.2337/db14-1765,crossref
4,Mini Mental State Examination and Logical Memo...,10.1186/s13195-016-0176-z,crossref
5,Differential Acceptance of Genomic Medicine Ap...,10.1007/s40670-015-0146-2,crossref
6,Reply to “Decrease in Penicillin Sales in Braz...,10.1128/aac.01128-15,crossref
7,Disruptive Innovation: Implementation of Elect...,10.2196/medinform.4801,crossref


In [58]:
df = df.append(all_agg_df,ignore_index=True)

In [59]:
df

Unnamed: 0,title,docID,source
0,Lysyl oxidase propeptide stimulates osteoblast...,10.1007/s12079-015-0311-9,crossref
1,Pre-gravid oral contraceptive use in relation ...,10.1007/s10654-015-0053-2,crossref
2,Descriptive epidemiology and short-term outcom...,10.1136/heartjnl-2015-308451,crossref
3,Improved insulin sensitivity 3 months after RY...,10.2337/db14-1765,crossref
4,Mini Mental State Examination and Logical Memo...,10.1186/s13195-016-0176-z,crossref
5,Differential Acceptance of Genomic Medicine Ap...,10.1007/s40670-015-0146-2,crossref
6,Reply to “Decrease in Penicillin Sales in Braz...,10.1128/aac.01128-15,crossref
7,Disruptive Innovation: Implementation of Elect...,10.2196/medinform.4801,crossref
8,Supramolecular regulation of bioorthogonal cat...,10.1038/nchem.2284,crossref
9,Clinical vignettes and global health considera...,10.1186/s40738-016-0017-6,crossref


In [60]:
result

{'contributors': [{'additionalName': 'H',
   'familyName': 'Wojcik',
   'givenName': 'Erica',
   'name': 'Erica H Wojcik',
   'sameAs': ['https://osf.io/gqudi/']},
  {'additionalName': 'F.',
   'familyName': 'Werker',
   'givenName': 'Janet',
   'name': 'Janet F. Werker',
   'sameAs': ['https://osf.ioNone']},
  {'additionalName': 'Geoffrey',
   'familyName': 'Hall',
   'givenName': 'D.',
   'name': 'D. Geoffrey Hall',
   'sameAs': ['https://osf.ioNone']}],
 'highlight': {},
 'otherProperties': [{'name': 'parent_title',
   'properties': {'parent_title': 'The role of antonym relationships in toddlers’ lexical-semantic organization of adjectives'}},
  {'name': 'category', 'properties': {'category': 'registration'}},
  {'name': 'wiki_link', 'properties': {'wiki_link': '/4pdh7/wiki/'}},
  {'name': 'is_component', 'properties': {'is_component': True}},
  {'name': 'is_registration', 'properties': {'is_registration': True}},
  {'name': 'parent_url', 'properties': {'parent_url': '/jmdz3/'}}],
 

In [62]:
result.keys()

dict_keys(['highlight', 'contributors', 'otherProperties', 'providerUpdatedDateTime', 'shareProperties', 'uris', 'title'])

In [67]:
orcid_results.hits

[<Result(share_v2/crossref/10.1038/nchem.2284): {'description': '', 'highlight': {}, 'contributors': [{'give...}>, <Result(share_v2/crossref/10.1186/s40738-016-0017-6): {'highlight': {}, 'contributors': [{'familyName': 'Chow', 'g...}>, <Result(share_v2/crossref/10.1038/nn.4132): {'highlight': {}, 'contributors': [{'familyName': 'Asai', 'g...}>, <Result(share_v2/crossref/10.1186/s12866-015-0569-3): {'uris': {'canonicalUri': 'http://dx.doi.org/10.1186/s12866-...}>, <Result(share_v2/crossref/10.1007/s10140-015-1375-9): {'highlight': {}, 'contributors': [{'familyName': 'Buch', 'g...}>, <Result(share_v2/crossref/10.1128/iai.00732-15): {'description': '', 'highlight': {}, 'contributors': [{'give...}>, <Result(share_v2/crossref/10.1074/mcp.m116.058016): {'subjects': ['analytical chemistry', 'biochemistry', 'molec...}>, <Result(share_v2/crossref/10.2196/jmir.5239): {'highlight': {}, 'contributors': [{'familyName': 'Bickmore'...}>, <Result(share_v2/crossref/10.1038/ng.3383): {'uris': {'canonica

In [73]:
orcid_results.hits[1]

<Result(share_v2/crossref/10.1186/s40738-016-0017-6): {'highlight': {}, 'contributors': [{'familyName': 'Chow', 'g...}>

In [81]:
sb_url = 'https://pub.orcid.org/oauth/authorize'

step1_url = furl.furl(sb_url)
step1_url.args['client_id'] = 'APP-W2HCYJTEKPKTNV67'
step1_url.args['response_type'] = 'code'
step1_url.args['scope'] = '/read-public'
step1_url.args['redirect_uri'] ='https://developers.google.com/oauthplayground'
step1_url.url

'https://pub.orcid.org/oauth/authorize?client_id=APP-W2HCYJTEKPKTNV67&response_type=code&scope=/read-public&redirect_uri=https://developers.google.com/oauthplayground'

In [82]:
requests.get(step1_url.url)

<Response [500]>

In [116]:
values = {}
values['client_id']='APP-W2HCYJTEKPKTNV67'
values['client_secret'] = 'd519de68-004c-41f5-90af-95a77c23575c' 
values['scope'] = '/read-public'
values['response_type'] = 'code'

In [117]:
data = urlencode(values).encode('utf-8')

In [118]:
req = Request('https://pub.orcid.org/oauth/authorize',data)

In [126]:
req.add_header('Accept','application/json')

In [130]:
urlopen(req).read()

HTTPError: HTTP Error 500: Internal Server Error

In [128]:
req.get_header('Accept')

'application/json'

In [135]:
from io import BytesIO
from pycurl import Curl
buffer = BytesIO()
c = Curl()
c.setopt(c.URL, 'http://pycurl.io/')
c.setopt(c.WRITEDATA, buffer)
c.perform()
c.close()


body = buffer.getvalue()
# Body is a byte string.
# We have to know the encoding in order to print it to a text file
# such as standard output.
print(body.decode('iso-8859-1'))






<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<html xmlns="http://www.w3.org/1999/xhtml">

<head>
  <title>PycURL Home Page</title>
  <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
  <meta name="author" content="Kjetil Jacobsen, Markus F.X.J. Oberhumer" />
  <meta name="description" content="PycURL Homepage" />
  <meta name="keywords" content="pycurl, curl, libcurl, python, wget, file transfer, urllib" />
  <meta name="revisit-after" content="30 days" />
  <meta name="robots" content="archive, index, follow" />
</head>


<body text="#000000" bgcolor="#ffffff" link="#0000ee" vlink="#551a8b" alink="#0000ee">

<center>
  <a href="http://curl.haxx.se/libcurl/"><img src="http://curl.haxx.se/ds-libcurl.jpg" width="466" height="181" border="0" alt="libcurl"></img></a>
</center>

<center>
  <br />
  <b><font size="+3" face="Courier New, Courier, mono">PYCURL 7.43.0</font></b><br />

In [136]:

import pycurl
try:
    # python 3
    from urllib.parse import urlencode
except ImportError:
    # python 2
    from urllib import urlencode

c = pycurl.Curl()
c.setopt(c.URL, 'http://pycurl.io/tests/testpostvars.php')

post_data = {'field': 'value'}
# Form data must be provided already urlencoded.
postfields = urlencode(post_data)
# Sets request method to POST,
# Content-Type header to application/x-www-form-urlencoded
# and data to send in request body.
c.setopt(c.POSTFIELDS, postfields)

c.perform()
c.close()

In [154]:
headers = {}
header['Content-Type'] = 'application/orcid+xml'
header['Authorization'] ='Bearer 988eb2a6-ae72-4717-b039-31039b9317c3'

In [173]:
url = 'https://pub.orcid.org/v1.2/{{orcid_id}}/orcid-profile/'.replace('{{orcid_id}}','0000-0001-5019-0242')
req = Request(url)

In [174]:
req.add_header('Content-Type','application/orcid+xml')
req.add_header('Authorization', 'Bearer 988eb2a6-ae72-4717-b039-31039b9317c3')

req.get_header('Content-Type')

In [175]:
req.get_header('Authorization')

'Bearer 988eb2a6-ae72-4717-b039-31039b9317c3'

In [176]:
resp = urlopen(req)

In [196]:
resp.read().decode()

''

In [233]:
records = ET.Element('records')
length = orcids_df.shape[0]
print(length)
for i in range(1500,length):
    time.sleep(1)
    orcid_id = orcids_df.ix[i]['orcids']
    #print(orcid_id)
    url = 'https://pub.orcid.org/v1.2/{{orcid_id}}/orcid-profile/'.replace('{{orcid_id}}',orcid_id)
    req = Request(url)
    req.add_header('Content-Type','application/orcid+xml')
    req.add_header('Authorization', 'Bearer 4875c695-3883-42da-9157-c47bf3ce343e')
    try:
        resp_xml = urlopen(req).read().decode()
    except Exception as e:
        print(str(e))
        continue
    #root = ET.fromstring(resp_xml)
    #kids = root.getchildren()
    profile = root.find('{http://www.orcid.org/ns/orcid}orcid-profile')
    records.append(profile)
    print(str(i),orcid_id)
    #elements = profile.getchildren()
    #for element in elements:
    #    print('\t',element.tag)
    #    for el in element.getchildren():
    #        print('\t\t',el.tag,el.attrib,el.text)
    #        for e in el.getchildren():
    #            print('\t\t\t',e.tag,e.attrib,e.text)
    #            for x in e.getchildren():
    #                print('\t\t\t\t',x.tag,x.attrib,x.text)
    #                for y in x.getchildren():
    #                    print('\t\t\t\t',y.tag,y.attrib,y.text)
    #print('get the kids')

#print(records)
#ET.tostring(records)
    
out_xml = ET.tostring(records)
out = io.open('/Volumes/jwa_drive1/git/Share/orcid_recs.xml','wb')
out.write('<?xml version="1.0" encoding="UTF-8" standalone="yes"?>'.encode())
out.write(out_xml)
out.close()
    

1925
1500 0000-0003-0420-0766
HTTP Error 409: Conflict
HTTP Error 409: Conflict
1503 0000-0003-0438-4382
1504 0000-0003-0439-381X
1505 0000-0003-0494-2516
1506 0000-0003-0521-7375
1507 0000-0003-0536-2986
1508 0000-0003-0542-3291
1509 0000-0003-0548-4502
1510 0000-0003-0555-0418
1511 0000-0003-0571-921X
1512 0000-0003-0582-7499
1513 0000-0003-0625-7256
1514 0000-0003-0628-9960
1515 0000-0003-0643-2421
1516 0000-0003-0674-2454
HTTP Error 409: Conflict
1518 0000-0003-0702-6716
1519 0000-0003-0725-2975
1520 0000-0003-0726-7149
1521 0000-0003-0733-927X
1522 0000-0003-0740-4481
1523 0000-0003-0752-5718
1524 0000-0003-0760-191X
1525 0000-0003-0760-386X
1526 0000-0003-0778-2159
1527 0000-0003-0783-7498
1528 0000-0003-0794-8964
1529 0000-0003-0808-8273
1530 0000-0003-0816-1717
1531 0000-0003-0826-8477
1532 0000-0003-0831-4023
1533 0000-0003-0839-2611
1534 0000-0003-0851-5837
1535 0000-0003-0877-5399
1536 0000-0003-0886-6688
1537 0000-0003-0889-6334
1538 0000-0003-0890-6440
1539 0000-0003-0905-

In [232]:
out_xml = ET.tostring(records)
out = io.open('/Volumes/jwa_drive1/git/Share/orcid_recs2.xml','wb')
out.write('<?xml version="1.0" encoding="UTF-8" standalone="yes"?>'.encode())
out.write(out_xml)
out.close()