In [None]:
# Import modules
import pandas as pd
import requests
from time import sleep

# Configuration
# Media type sent as the Accept header of every request
accept_media_type = 'application/json'
# URL that SPARQL queries are POSTed to
endpoint = 'https://query.wikidata.org/sparql'
# Value sent as the User-Agent header of every request
user_agent_header = 'author_disambiguation/0.1 (mailto:steve.baskauf@vanderbilt.edu)'
# Delay (seconds, passed to time.sleep) after each query to avoid hitting the Query Service too fast
sparql_sleep = 0.1

# Function definitions
def extract_local_name(iri):
    """Return the part of an IRI that follows its last '/'.

    Example: 'http://www.wikidata.org/entity/Q6386232' gives 'Q6386232'.
    A string with no '/' is returned unchanged; a string ending in '/' gives ''.
    Input: string. Returns: string.
    """
    # Splitting once from the right leaves the local name as the final element
    return iri.rsplit('/', 1)[-1]

def generate_sparql_header_dictionary(accept_media_type,user_agent_header):
    """Build the HTTP request headers used for SPARQL queries.

    Input: accept_media_type (string, value of the Accept header) and
    user_agent_header (string, value of the User-Agent header).
    Returns: dictionary with the keys 'Accept', 'Content-Type' and 'User-Agent'.
    """
    headers = {}
    headers['Accept'] = accept_media_type
    # The body is declared as URL-encoded form data
    # ('application/sparql-query' was the alternative previously tried here).
    headers['Content-Type'] = 'application/x-www-form-urlencoded'
    headers['User-Agent'] = user_agent_header
    return headers

# Build the request header once at module level; send_sparql_query() below reads it as a global
sparql_request_header = generate_sparql_header_dictionary(accept_media_type, user_agent_header)

# Sends a query to the query service endpoint.
# NOTE: sparql_request_header, endpoint and sparql_sleep are global variables defined earlier in the script
def send_sparql_query(query_string):
    """Send a SPARQL query to the endpoint and return the result bindings.

    Input: query_string, the SPARQL query text (string); it is POSTed as the 'query' form field.
    Returns: the value found at results -> bindings in the JSON response
    (callers index it as a list of bindings).
    Raises: requests.HTTPError if the endpoint answers with a 4xx/5xx status.
    """
    response = requests.post(endpoint, data=dict(query=query_string), headers=sparql_request_header)
    # Report the HTTP status instead of letting an error response fail later
    # as an opaque JSON decoding error or KeyError.
    response.raise_for_status()
    #print(response.text) # uncomment to view the raw response, e.g. if you are getting an error
    data = response.json()

    # Extract the values from the response JSON
    results = data['results']['bindings']

    sleep(sparql_sleep) # delay to avoid hitting the Query Service too fast
    return results

# ----------------------------
# Begin main script
# ----------------------------

# Load every document/author row; all values are kept as strings and empty cells stay ''
all_authors_frame = pd.read_csv('docs-authors.csv', na_filter=False, dtype = str)

# Create a non-redundant list of author IRIs.
# drop_duplicates() keeps the IRIs in order of first appearance, so the list (and any file
# built from it) comes out in the same order on every run; list(set(...)) gives no such guarantee.
author_iris = all_authors_frame['contributor_iri'].drop_duplicates().tolist()
author_iris

In [None]:
# Build authors.csv: one row per unique author IRI, with the IRI's local name placed in the
# column that matches its identifier scheme (orcid, viaf, qid) or the whole IRI in 'other'.
# NOTE: this overwrites authors.csv, including any columns that were added to it by hand.

# Create lists to hold unique values
ids = []
names = []
orcids = []
viafs = []
qids = []
other_ids = []
affiliation_qids = []

# Index the first row seen for each IRI so every author is found with a single lookup
# instead of re-scanning the whole frame once per author.
first_rows = all_authors_frame.drop_duplicates(subset='contributor_iri', keep='first').set_index('contributor_iri')

# Recognized identifier schemes: (substring that marks the scheme, list collecting its local names)
id_schemes = [
    ('https://orcid.org/', orcids),
    ('http://viaf.org/viaf/', viafs),
    ('http://www.wikidata.org/entity/', qids),
]

# Extract information for each author
for author_iri in author_iris:
    if author_iri not in first_rows.index:
        continue # no matching row; nothing is recorded for this IRI
    author = first_rows.loc[author_iri]

    ids.append(author_iri)
    names.append(author['contributor_literal'])
    affiliation_qids.append(extract_local_name(author['affiliation_uri']))

    # Every list receives exactly one entry per author so the output columns stay aligned
    none_of_the_above = True
    for iri_pattern, id_list in id_schemes:
        if iri_pattern in author_iri:
            none_of_the_above = False
            id_list.append(extract_local_name(author_iri))
        else:
            id_list.append('')

    # IRIs that match no known scheme are kept whole in the 'other' column
    other_ids.append(author_iri if none_of_the_above else '')

out_frame = pd.DataFrame({'qid': qids, 'id': ids, 'name': names, 'affiliation': affiliation_qids, 'orcid': orcids, 'viaf': viafs, 'other': other_ids})
out_frame.to_csv('authors.csv', index = False)
print('done')


Before running the next cell, the huh ID numbers were added manually to `authors.csv` in a `huh` column.

In [None]:
# Send SPARQL queries to match people to Q IDs by their various identifiers

# (authors.csv column, Wikidata property) pairs, tried in this order; the first column
# with a non-empty value decides which query is sent for an author.
# The 'huh' column must already be present in authors.csv.
identifier_properties = [
    ('orcid', 'P496'),
    ('viaf', 'P214'),
    ('huh', 'P6264'),
]

def build_identifier_query(property_id, identifier):
    """Return a SPARQL query selecting items whose property_id value equals identifier, with their English labels.

    Input: property_id (string such as 'P496') and identifier (string). Returns: string.
    """
    return '''
select distinct ?qid ?name
where {
  ?qid wdt:''' + property_id + ''' "''' + identifier + '''".
  ?qid rdfs:label ?name.
  filter(lang(?name)="en")
  }
'''

authors_frame = pd.read_csv('authors.csv', na_filter=False, dtype = str)

for index, author in authors_frame.iterrows():
    # Pick the query for the first identifier this author has; None means nothing to look up
    query_string = None
    for column, property_id in identifier_properties:
        if author[column] != '':
            query_string = build_identifier_query(property_id, author[column])
            break
    if query_string is None:
        continue

    #print(query_string)
    results = send_sparql_query(query_string)
    if len(results) > 0:
        # Only the first binding is used
        qid = extract_local_name(results[0]['qid']['value'])
        print(qid)
        # Write to the frame itself: the row object yielded by iterrows() may be a copy,
        # in which case assigning to it would never reach authors_qids.csv.
        authors_frame.at[index, 'qid'] = qid
        print(results[0]['name']['value'])

authors_frame.to_csv('authors_qids.csv', index = False)
print('done')



## Documents data

In [None]:
# Fill the partOf column of each document with the Q ID of the standard whose website
# equals the document's part_of_iri, then save the result.
out_frame = pd.read_csv('documents/documents.csv', na_filter=False, dtype = str)
standards_frame = pd.read_csv('standards/standards.csv', na_filter=False, dtype = str)
source_docs_frame = pd.read_csv('documents/docs.csv', na_filter=False, dtype = str)

for index, document in out_frame.iterrows():
    print(document['part_of_iri'])
    # .values[0] takes the first matching standard and raises IndexError if there is none
    standard_qid = standards_frame.loc[standards_frame.website == document['part_of_iri'], 'qid'].values[0]
    # Write to the frame itself: the row object yielded by iterrows() may be a copy,
    # in which case assigning to it would never reach the output file.
    out_frame.at[index, 'partOf'] = standard_qid

out_frame.to_csv('documents/output.csv', index = False)
print('done')


## Match author items with documents

In [None]:
# For each document/author row, fill in the author's Q ID ('author' column) and the
# document's Q ID ('qid' column), then save the result.
author_qid_frame = pd.read_csv('authors_qids.csv', na_filter=False, dtype = str)
documents_frame = pd.read_csv('documents/documents.csv', na_filter=False, dtype = str)
author_frame = pd.read_csv('documents/authors/authors.csv', na_filter=False, dtype = str)
#author_frame = author_frame.head(5).copy()

for index, author in author_frame.iterrows():
    print(author['author_stated_as'])

    # Look up the author's Q ID by contributor IRI
    author_matches = author_qid_frame.loc[author_qid_frame.id == author['contributor_iri'], 'qid'].values
    if len(author_matches) > 0:
        # Write to the frame itself: the row object yielded by iterrows() may be a copy,
        # in which case assigning to it would never reach the output file.
        author_frame.at[index, 'author'] = author_matches[0]
    else:
        # No match: show the row for manual follow-up and leave its 'author' value untouched.
        # Only this case is tolerated; other errors are no longer silently swallowed.
        print(author)

    # Look up the document's Q ID by its reference URL; raises IndexError if no document matches
    document_qid = documents_frame.loc[documents_frame.fullWork == author['author_ref1_referenceUrl'], 'qid'].values[0]
    author_frame.at[index, 'qid'] = document_qid

author_frame.to_csv('documents/authors/output.csv', index = False)
print('done')
