# Define functions and classes

Methods of the `Query()` class send queries to Wikibase instances. The class has the following methods:

`.generic_query(query)` Sends a specified query to the endpoint and returns a list of item Q IDs, item labels, or literal values. The variable to be returned must be `?entity`.

`.single_property_values_for_item(qid)` Sends a subject Q ID to the endpoint and returns a list of item Q IDs, item labels, or literal values that are values of a specified property.

`.labels_descriptions(qids)` Sends a list of subject Q IDs to the endpoint and returns a list of dictionaries of the form `{'qid': qnumber, 'string': string}` where `string` is either a label, description, or alias.

`.search_statement(qids, reference_property_list)` Sends a list of Q IDs and a list of reference properties to the endpoint and returns information about statements using a specified property. If no value is specified, the information includes the values of the statements. For each statement, the reference hash and the values of the requested reference properties are returned. If the statement has more than one reference, there will be multiple results per subject. Results are in the form `{'qId': qnumber, 'statementUuid': statement_uuid, 'statementValue': statement_value, 'referenceHash': reference_hash, 'referenceValues': [reference_value, ...]}`

It has the following attributes:

| key | description | default value | applicable method |
|:-----|:-----|:-----|:-----|
| `endpoint` | endpoint URL of Wikibase | `https://query.wikidata.org/sparql` | all |
| `mediatype` | Internet media type | `application/json` | all |
| `useragent` | User-Agent string to send | `VanderBot/0.8` etc.| all |
| `requestheader` | request headers to send |(generated dict) | all |
| `sleep` | seconds to delay between queries | 0.25 | all |
| `isitem` | `True` if value is an item, `False` if value is a literal | `True` | `generic_query`, `single_property_values_for_item` |
| `uselabel` | `True` for label of item value , `False` for Q ID of item value | `True` | `generic_query`, `single_property_values_for_item` | 
| `lang` | language of label | `en` | `single_property_values_for_item`, `labels_descriptions`|
| `labeltype` | returns `label`, `description`, or `alias` | `label` | `labels_descriptions` |
| `labelscreen` | added triple pattern | empty string | `labels_descriptions` |
| `pid` | property P ID | `P31` | `single_property_values_for_item`, `search_statement` |
| `vid` | value Q ID | empty string | `search_statement` |

In [5]:
import requests   # best library to manage HTTP transactions
import json
import csv
from time import sleep
from fuzzywuzzy import fuzz # fuzzy logic matching
from fuzzywuzzy import process

# pull the Q ID out of a Wikidata entity IRI
def extract_qnumber(iri):
    """Return the fifth slash-separated piece (index 4) of an IRI.

    For an entity IRI such as http://www.wikidata.org/entity/Q6386232 that
    piece is the Q ID. An IRI with fewer than five pieces raises IndexError.
    """
    return iri.split('/')[4]

# pull one slash-separated piece out of an IRI; the caller says which piece
def extract_from_iri(iri, number_pieces):
    """Return the piece of `iri` at list index `number_pieces` after splitting on '/'.

    An IRI like http://www.wikidata.org/entity/Q6386232 splits into 5 pieces,
    with the Q ID at index 4.
    """
    segments = iri.split('/')
    local_name = segments[number_pieces]
    return local_name

# save a table (list of dictionaries) as a CSV file with a header row
def writeDictsToCsv(table, filename, fieldnames):
    """Write `table` to `filename` as UTF-8 CSV.

    `fieldnames` gives the column headers and their order; each dictionary in
    `table` becomes one row.
    """
    with open(filename, 'w', newline='', encoding='utf-8') as out_file:
        dict_writer = csv.DictWriter(out_file, fieldnames=fieldnames)
        dict_writer.writeheader()
        dict_writer.writerows(table)

# load a CSV file as a table (list of dictionaries keyed by the header row)
def readDict(filename):
    """Read the UTF-8 CSV file `filename` and return its rows as a list of dictionaries."""
    with open(filename, 'r', newline='', encoding='utf-8') as in_file:
        # list() consumes the reader while the file is still open
        rows = list(csv.DictReader(in_file))
    return rows

class Query:
    """Send SPARQL queries to a Wikibase query service and unpack the JSON results.

    Every setting is an optional keyword argument that becomes an attribute of the same name:

    endpoint    -- URL of the SPARQL endpoint (default 'https://query.wikidata.org/sparql')
    mediatype   -- value sent in the Accept header (default 'application/json')
    useragent   -- value sent in the User-Agent header
    sleep       -- seconds to pause after each query (default 0.25)
    lang        -- language code used to filter labels (default 'en')
    pid         -- property P ID (default 'P31')
    isitem      -- True if values are items, False if they are literals (default True)
    uselabel    -- True to report item values as returned, False to reduce them to Q IDs (default True)
    labeltype   -- 'label', 'description', or 'alias' (default 'label')
    labelscreen -- graph pattern used instead of an explicit list of subject Q IDs (default '')
    vid         -- Q ID of the statement value to search for (default '': report the value found)

    The requestheader attribute (dict of request headers) is always generated
    from mediatype and useragent.
    """

    def __init__(self, **kwargs):
        # attributes for all methods
        self.lang = kwargs.get('lang', 'en') # default to English
        self.mediatype = kwargs.get('mediatype', 'application/json') # default to JSON formatted query results
        self.endpoint = kwargs.get('endpoint', 'https://query.wikidata.org/sparql') # default to Wikidata endpoint
        self.useragent = kwargs.get('useragent', 'VanderBot/0.8 (https://github.com/HeardLibrary/linked-data/tree/master/publications; mailto:steve.baskauf@vanderbilt.edu)')
        self.requestheader = {
            'Accept' : self.mediatype,
            'User-Agent': self.useragent
        }
        self.pid = kwargs.get('pid', 'P31') # property's P ID; default to "instance of"
        self.sleep = kwargs.get('sleep', 0.25) # default throttling of 0.25 seconds

        # attributes for single property values method
        self.isitem = kwargs.get('isitem', True) # default to values are items rather than literals
        self.uselabel = kwargs.get('uselabel', True) # default is to show labels of items

        # attributes for labels and descriptions method
        self.labeltype = kwargs.get('labeltype', 'label') # other options: "description", "alias"
        self.labelscreen = kwargs.get('labelscreen', '') # instead of using a list of subject items, add this line to screen for items

        # attributes for search_statement method
        self.vid = kwargs.get('vid', '') # Q ID of the value of a statement; default to no value (the method returns the value of the statement)

    # reduce one result binding to the value reported by generic_query and single_property_values_for_item
    def _binding_value(self, statement, variable):
        value = statement[variable]['value']
        # only item values that are not reported as labels are cut down to the bare Q ID
        if self.isitem and not self.uselabel:
            return extract_qnumber(value)
        return value

    # send a generic query and return a list of the values bound to ?entity
    def generic_query(self, query):
        """Send `query` to the endpoint and return a list of the values bound to ?entity.

        The query must select a variable named ?entity. Values are reduced to
        Q IDs when isitem is True and uselabel is False; otherwise they are
        returned unchanged. If the response cannot be parsed as the expected
        JSON, the returned list holds only the raw response text.
        """
        r = requests.get(self.endpoint, params={'query' : query}, headers=self.requestheader)
        try:
            statements = r.json()['results']['bindings']
            # if there are no results, the list is empty
            results_list = [self._binding_value(statement, 'entity') for statement in statements]
        except (ValueError, KeyError, IndexError, TypeError):
            # not JSON, or JSON without the expected structure: hand back the raw text
            results_list = [r.text]

        # delay by some amount (quarter second default) to avoid hitting the SPARQL endpoint too rapidly
        sleep(self.sleep)
        return results_list

    # returns the value of a single property for an item by Q ID
    def single_property_values_for_item(self, qid):
        """Return a list of the values of property self.pid for the item `qid`.

        With isitem and uselabel both True the labels (in language self.lang)
        of the item values are returned; with isitem True and uselabel False
        the Q IDs are returned; with isitem False the values are returned
        unchanged. If the response cannot be parsed as the expected JSON, the
        returned list holds only the raw response text.
        """
        query = '''
select distinct ?object where {
    wd:'''+ qid + ''' wdt:''' + self.pid
        if self.uselabel and self.isitem:
            query += ''' ?objectItem.
    ?objectItem rdfs:label ?object.
    FILTER(lang(?object) = "''' + self.lang +'")'
        else:
            query += ''' ?object.'''
        query +=  '''
    }'''
        #print(query)
        r = requests.get(self.endpoint, params={'query' : query}, headers=self.requestheader)
        try:
            statements = r.json()['results']['bindings']
            # if there are no results, the list is empty
            results_list = [self._binding_value(statement, 'object') for statement in statements]
        except (ValueError, KeyError, IndexError, TypeError):
            # not JSON, or JSON without the expected structure: hand back the raw text
            results_list = [r.text]

        # delay by some amount (quarter second default) to avoid hitting the SPARQL endpoint too rapidly
        sleep(self.sleep)
        return results_list

    # search for any of the "label" types: label, alias, description. qids is a list of Q IDs without namespaces
    def labels_descriptions(self, qids):
        """Return a list of {'qid': qnumber, 'string': string} dictionaries.

        `string` is the label, alias, or description (chosen by self.labeltype;
        anything else is treated as 'label') in language self.lang. Subjects
        are the items in `qids` unless self.labelscreen is set, in which case
        that graph pattern selects the subjects and `qids` is ignored.
        """
        predicates = {
            'label': 'rdfs:label',
            'alias': 'skos:altLabel',
            'description': 'schema:description'
        }
        predicate = predicates.get(self.labeltype, 'rdfs:label')

        # create a string for the query
        query = '''
select distinct ?id ?string where {'''

        if self.labelscreen == '':
            # option to explicitly list subject Q IDs:
            # create a string for all of the Wikidata item IDs to be used as subjects in the query
            alternatives = ''.join('wd:' + qid + '\n' for qid in qids)
            query += '''
      VALUES ?id
    {
''' + alternatives + '''
    }'''
        else:
            # option to screen for Q IDs by triple pattern
            query += '''
    ''' + self.labelscreen

        query += '''
    ?id '''+ predicate + ''' ?string.
    filter(lang(?string)="''' + self.lang + '''")
    }'''
        #print(query)

        r = requests.get(self.endpoint, params={'query' : query}, headers=self.requestheader)
        data = r.json()
        results = data['results']['bindings']
        # extract_qnumber removes wd: 'http://www.wikidata.org/entity/'
        results_list = [
            {'qid': extract_qnumber(result['id']['value']), 'string': result['string']['value']}
            for result in results
        ]

        # delay by some amount (quarter second default) to avoid hitting the SPARQL endpoint too rapidly
        sleep(self.sleep)
        return results_list

    # Searches for statements using a particular property. If no value is set, the value will be returned.
    def search_statement(self, qids, reference_property_list):
        """Return information about the self.pid statements of the items in `qids`.

        Each result is a dictionary with 'qId' and 'statementUuid'. When
        self.vid is '' the statement's value is added as 'statementValue';
        otherwise only statements whose value is the item self.vid are found.
        When `reference_property_list` is not empty, 'referenceHash' and
        'referenceValues' (one value per listed reference property, '' where
        missing) are added. A statement with more than one reference produces
        more than one result for the same subject.
        """
        # create a string for all of the Wikidata item IDs to be used as subjects in the query
        alternatives = ''.join('wd:' + qid + '\n' for qid in qids)
        has_reference_properties = len(reference_property_list) != 0

        # create a string for the query
        query = '''
select distinct ?id ?statement '''
        # if no value was specified, find the value
        if self.vid == '':
            query += '?statementValue '
        if has_reference_properties:
            query += '?reference '
        for ref_prop_index in range(len(reference_property_list)):
            query += '?refVal' + str(ref_prop_index) + ' '
        query += '''
    where {
        VALUES ?id
    {
''' + alternatives + '''
    }
    ?id p:'''+ self.pid + ''' ?statement.
    ?statement ps:'''+ self.pid

        if self.vid == '': # return the value of the statement if no particular value is specified
            query += ' ?statementValue.'
        else:
            query += ' wd:' + self.vid + '.' # specify the value to be searched for

        if has_reference_properties:
            query += '''
    optional {
        ?statement prov:wasDerivedFrom ?reference.''' # search for references if there are any
            for ref_prop_index, reference_property in enumerate(reference_property_list):
                query +='''
        ?reference pr:''' + reference_property + ' ?refVal' + str(ref_prop_index) + '.'
            query +='''
            }'''
        query +='''
      }'''
        #print(query)

        results_list = []
        r = requests.get(self.endpoint, params={'query' : query}, headers=self.requestheader)
        data = r.json()
        results = data['results']['bindings']
        # ********** NOTE: need to deal with case where there are more than one reference per result
        # This will result in several results with the same subject qNumber
        for result in results:
            # remove wd: 'http://www.wikidata.org/entity/'
            qnumber = extract_qnumber(result['id']['value'])
            # remove wds: 'http://www.wikidata.org/entity/statement/'
            no_domain = extract_from_iri(result['statement']['value'], 5)
            # need to remove the qNumber that's appended in front of the UUID
            statement_uuid = '-'.join(no_domain.split('-')[1:])

            results_dict = {'qId': qnumber, 'statementUuid': statement_uuid}
            # if no value was specified, get the value that was found in the search
            if self.vid == '':
                results_dict['statementValue'] = result['statementValue']['value']
            # extract the reference property data if any reference properties were specified
            if has_reference_properties:
                if 'reference' in result:
                    # remove wdref: 'http://www.wikidata.org/reference/'
                    reference_hash = extract_qnumber(result['reference']['value'])
                else:
                    reference_hash = ''
                reference_values = []
                for ref_prop_index in range(len(reference_property_list)):
                    variable = 'refVal' + str(ref_prop_index)
                    # a date comes down as 2019-12-05T00:00:00Z and is passed through unchanged
                    if variable in result:
                        reference_values.append(result[variable]['value'])
                    else:
                        reference_values.append('')
                results_dict['referenceHash'] = reference_hash
                results_dict['referenceValues'] = reference_values
            results_list.append(results_dict)

        # delay by some amount (quarter second default) to avoid hitting the SPARQL endpoint too rapidly
        sleep(self.sleep)
        return results_list

In [None]:
# Sample inputs for trying out the Query methods by hand.
person = 'Q40670042'
orcid = '0000-0003-4365-3135'
qids = ['Q40670042', 'Q57082956', 'Q75060085']
reference_properties = ['P854', 'P813']

# Each commented-out group below exercises one Query method; uncomment a group to run it.

# literal value of property P496 for one item
#get_orcid = Query(pid='P496', isitem=False)
#print(get_orcid.single_property_values_for_item(person) )
#print()

# Q IDs of the P31 values for one item
#get_class = Query(pid='P31', uselabel=False)
#print(get_class.single_property_values_for_item(person) )
#print()

# labels of the P108 values for one item
#get_employer_label = Query(pid='P108')
#print(get_employer_label.single_property_values_for_item(person) )
#print()

# labels, descriptions, and aliases for a list of items
#get_labels = Query(labeltype='label')
#print(get_labels.labels_descriptions(qids))
#print()
#get_descriptions = Query(labeltype='description')
#print(get_descriptions.labels_descriptions(qids))
#print()
#get_aliases = Query(labeltype='alias')
#print(get_aliases.labels_descriptions(qids))

# P108 statements for the listed items, with the values of the listed reference properties
get_employer = Query(pid='P108')
print(get_employer.search_statement(qids, reference_properties))

## Determine CETAF classes

Determine what classes are being used by CETAF institutions, then query to generate a list of items that are members of those classes. 

In [6]:
# Find out all of the classes of which CETAF institutions are instances
# The selected variable is named ?entity because generic_query reads the 'entity' binding.
query = '''
select distinct ?entity where {
    ?institution wdt:P463 wd:Q5163385.
    ?institution wdt:P31 ?entity.
}'''
# uselabel=False makes generic_query return bare Q IDs rather than full IRIs
gen_query_qids = Query(uselabel=False)
cetaf_classlist = gen_query_qids.generic_query(query)

# look up the label of each class; the result is a list of {'qid': ..., 'string': ...} dictionaries
cetaf_labels = Query(labeltype='label')
cetaf_class_dict = cetaf_labels.labels_descriptions(cetaf_classlist)
print(json.dumps(cetaf_class_dict, indent = 2))

[
  {
    "qid": "Q31855",
    "string": "research institute"
  },
  {
    "qid": "Q3918",
    "string": "university"
  },
  {
    "qid": "Q33506",
    "string": "museum"
  },
  {
    "qid": "Q43229",
    "string": "organization"
  },
  {
    "qid": "Q167346",
    "string": "botanical garden"
  },
  {
    "qid": "Q414147",
    "string": "academy of sciences"
  },
  {
    "qid": "Q875538",
    "string": "public university"
  },
  {
    "qid": "Q866133",
    "string": "university museum"
  },
  {
    "qid": "Q588140",
    "string": "science museum"
  },
  {
    "qid": "Q570116",
    "string": "tourist attraction"
  },
  {
    "qid": "Q327333",
    "string": "government agency"
  },
  {
    "qid": "Q181916",
    "string": "herbarium"
  },
  {
    "qid": "Q1497375",
    "string": "ensemble"
  },
  {
    "qid": "Q1664720",
    "string": "institute"
  },
  {
    "qid": "Q1497649",
    "string": "memory institution"
  },
  {
    "qid": "Q1970365",
    "string": "natural history museum"
  },
 

In [9]:
# send a query for each class to retrieve items that are members
# test_results collects one dictionary per class: its Q ID, its label, and the labels of its instances
test_results = []
for wikidata_class in cetaf_class_dict:
    print(wikidata_class['qid'], ' ', wikidata_class['string'])
    
    # screen for subjects by graph pattern (instances of this class) instead of listing Q IDs
    graph_pattern = '''
    ?id wdt:P31 wd:''' + wikidata_class['qid'] + '.'
    get_labels = Query(labeltype='label', labelscreen=graph_pattern)
    
    # Get the labels for all of the hits
    # the list of Q IDs is empty because labels_descriptions ignores it when labelscreen is set
    test_labels = get_labels.labels_descriptions([])
    print(len(test_labels))
    print()
    test_results.append({'class_id': wikidata_class['qid'], 'class_name': wikidata_class['string'], 'labels': test_labels})
print('done')

Q31855   research institute
7589

Q3918   university
12659

Q33506   museum
16538

Q43229   organization
47557

Q167346   botanical garden
1959

Q414147   academy of sciences
227

Q875538   public university
835

Q866133   university museum
92

Q588140   science museum
247

Q570116   tourist attraction
1570

Q327333   government agency
10663

Q181916   herbarium
284

Q1497375   ensemble
2432

Q1664720   institute
1279

Q1497649   memory institution
769

Q1970365   natural history museum
235

Q1899015   conservation organization
80

Q1966910   national academy
97

Q2065736   cultural property
8478

Q2385804   educational institution
5298

Q2668072   collection
285701

Q2288140   Federal Scientific Institute
13

Q3343298   non-departmental public body
103

Q5341295   educational organization
129

Q5193377   cultural institution
1220

Q7315155   research center
797

Q13226383   facility
5017

Q17431399   national museum
544

Q16735822   history museum
320

Q18233199   country museum
7

Q2

In [10]:
# save the per-class label lists as JSON so they can be reloaded without repeating the queries
json_string = json.dumps(test_results, indent = 2)
with open('test-output.json', 'wt', encoding='utf-8') as fileObject:
    fileObject.write(json_string)

## Retrieve data about herbaria from Index Herbariorum

Create a table with the useful information about each herbarium listed.

In [11]:
# builds the request header dictionary to pass to a requests GET call
def generateHeaderDictionary(acceptMediaType):
    """Return request headers: the given Accept media type plus the VanderBot User-Agent string."""
    return {
        'Accept' : acceptMediaType,
        'User-Agent': 'VanderBot/0.8 (https://github.com/HeardLibrary/linked-data/tree/master/publications; mailto:steve.baskauf@vanderbilt.edu)'
    }

# retrieve JSON data from the API
acceptMediaType = 'application/json'
endpointUrl = 'http://sweetgum.nybg.org/science/api/v1/institutions'
r = requests.get(endpointUrl, headers=generateHeaderDictionary(acceptMediaType))
dataFull = r.json()
# the institution records are the list under the 'data' key of the response
data = dataFull['data']
print('done')

done


In [12]:
print(json.dumps(data[0:10], indent=2))

[
  {
    "irn": 124564,
    "organization": "Forest Service, USDA",
    "code": "POFS",
    "division": "Division of Range Management",
    "department": "",
    "specimenTotal": 0,
    "currentStatus": "Inactive",
    "dateFounded": "",
    "taxonomicCoverage": "",
    "geography": "",
    "address": {
      "physicalStreet": "",
      "physicalCity": "Portland",
      "physicalState": "Oregon",
      "physicalZipCode": "",
      "physicalCountry": "U.S.A.",
      "postalStreet": "",
      "postalCity": "Portland",
      "postalState": "Oregon",
      "postalZipCode": "97208",
      "postalCountry": "U.S.A."
    },
    "contact": {
      "phone": "",
      "email": "",
      "webUrl": ""
    },
    "location": {
      "lat": 0,
      "lon": 0
    },
    "collectionsSummary": {
      "numAlgae": 0,
      "numAlgaeDatabased": 0,
      "numAlgaeImaged": 0,
      "numBryos": 0,
      "numBryosDatabased": 0,
      "numBryosImaged": 0,
      "numFungi": 0,
      "numFungiDatabased": 0,
   

In [13]:

# flatten each institution record from the JSON into one table row and save the table as a CSV file
fieldnames = ['wikidata_id', 'orgid', 'organization', 'code', 'division', 'department', 'email']
output_table = []

for herb in data:
    # wikidata_id is left blank here
    results_dict = {
        'wikidata_id': '',
        'orgid': herb['irn'],
        'organization': herb['organization'],
        'code': herb['code'],
        'division': herb['division'],
        'department': herb['department'],
        'email': herb['contact']['email']
    }
    output_table.append(results_dict)

filename = 'herb-basic.csv'
writeDictsToCsv(output_table, filename, fieldnames)
print('done')

done


## Match candidate items with IH collections

For each of the retrieved items, do fuzzy matching with IH collection names and see which is most effective.

In [None]:
# reload the herbarium table saved as CSV
filename = 'herb-basic.csv'
output_table = readDict(filename)

# reload the per-class label lists saved as JSON
# (the file is JSON, so it is read as text and parsed with json.loads rather than with the CSV reader)
filename = 'test-output.json'
with open(filename, 'rt', encoding='utf-8') as fileObject:
    json_string = fileObject.read()
test_results = json.loads(json_string)

In [None]:
# For each class, compare every item label with every herbarium organization name
# and keep the pairs whose fuzzy match score is at least 50.
match_results = []
for result in test_results[0:1]:
    herbarium_matches = []
    print('Class:', result['class_name'])
    for item in result['labels']:
        for row in output_table:
            nameTestRatio = fuzz.token_set_ratio(item['string'], row['organization'])
            if nameTestRatio >= 50:
                herbarium_matches.append({'qid': item['qid'], 'orgid': row['orgid'], 'name': row['organization']})
    # the saved results identify the class by 'class_id' and 'class_name'; there is no 'class' key
    match_results.append({'class': result['class_id'], 'matches': herbarium_matches})
    print('Matches: ', len(herbarium_matches))
    # a class with no labels has no meaningful fraction (and would divide by zero)
    if result['labels']:
        print('Fraction:' , len(herbarium_matches)/len(result['labels']))
    else:
        print('Fraction:' , 'n/a')
    print()

In [None]:
# query for a single variable that's an item named 'item'
# returns a list of results
def searchWikidataForQIdByIhCode(herb_code):
    """Return a list of Q IDs of the items whose P5858 value is the string `herb_code`.

    If the response cannot be parsed as the expected JSON, the returned list
    holds only the raw response text.

    NOTE: relies on wikidataEndpointUrl, generateHeaderDictionary, and
    extractQNumber, which are defined in another cell that must be run first.
    """
    query = '''
select distinct ?item where {
  ?item wdt:P5858 "''' + herb_code + '''".
  }
'''
    results = []
    acceptMediaType = 'application/json'
    r = requests.get(wikidataEndpointUrl, params={'query' : query}, headers = generateHeaderDictionary(acceptMediaType))
    try:
        data = r.json()
        statements = data['results']['bindings']
        for statement in statements:
            wikidataIri = statement['item']['value']
            qNumber = extractQNumber(wikidataIri)
            results.append(qNumber)
    except (ValueError, KeyError, IndexError, TypeError):
        # not JSON, or JSON without the expected structure: hand back the raw text
        results = [r.text]
    # delay a quarter second to avoid hitting the SPARQL endpoint too rapidly
    sleep(0.25)
    return results

# Look up the institution using the IH code
filename = deptShortName + '-employees-with-wikidata.csv'
people = readDict(filename)

#for personIndex in range(0, len(people)):
# trial run over rows 1-99 only; the upper bound is capped so a shorter table does not raise IndexError
for personIndex in range(1, min(100, len(people))):
    ihCodes = searchWikidataForQIdByIhCode(people[personIndex]['herb_code'])
    print(people[personIndex]['herb_code'], ihCodes)
    

# Old code

In [None]:
import requests   # best library to manage HTTP transactions
#from bs4 import BeautifulSoup # web-scraping library
import json
from time import sleep
import csv
import math
from fuzzywuzzy import fuzz # fuzzy logic matching
from fuzzywuzzy import process
import xml.etree.ElementTree as et # library to traverse XML tree
import urllib
import datetime

# Constants for a particular processing round.
# (The department short name itself is read from department-configuration.json further down.)
testEmployer = 'Vanderbilt University' # to test against Wikidata employer property
employerQId = 'Q29052' # Vanderbilt University
deathDateLimit = '2010' # any date deaths before this date will be assumed to not be a match

# NOTE: eventually need to test against all affiliations in cases of faculty with multiple appointments

# ***********************************
# NOTE: the script fails if there is a current item in Wikidata that has the same values for both label and description. 
# A check needs to be run for this !!!
# ***********************************

# Here is some example JSON from a departmental configuration file (department-configuration.json):

'''
{
  "deptShortName": "anthropology",
  "aads": {
    "categories": [
      ""
    ],
    "baseUrl": "https://as.vanderbilt.edu/aads/people/",
    "nTables": 1,
    "departmentSearchString": "African American and Diaspora Studies",
    "departmentQId": "Q79117444",
    "testAuthorAffiliation": "African American Diaspora Studies Vanderbilt",
    "labels": {
      "source": "column",
      "value": "name"
    },
    "descriptions": {
      "source": "constant",
      "value": "African American and Diaspora Studies scholar"
    }
  },
  "bsci": {
    "categories": [
      "primary-training-faculty",
      "research-and-teaching-faculty",
      "secondary-faculty",
      "postdoc-fellows",
      "emeriti"
    ],
    "baseUrl": "https://as.vanderbilt.edu/biosci/people/index.php?group=",
    "nTables": 1,
    "departmentSearchString": "Biological Sciences",
    "departmentQId": "Q78041310",
    "testAuthorAffiliation": "Biological Sciences Vanderbilt",
    "labels": {
      "source": "column",
      "value": "name"
    },
    "descriptions": {
      "source": "constant",
      "value": "biology researcher"
    }
  }
}
'''
# Note that the first key: value pair sets the department to be processed.

# The default labels and descriptions can either be a column in the table or set as a constant. 
# If it's a column, the value is the column header.  If it's a constant, the value is the string to assign as the value.

# The nTables value is the number of HTML tables in the page to be searched.  Currently (2020-01-19) it isn't used
# and the script just checks all of the tables, but it could be implemented if there are tables at the end that don't 
# include employee names.

# Load the departmental configuration file; its 'deptShortName' value selects the department to process.
with open('department-configuration.json', 'rt', encoding='utf-8') as fileObject:
    text = fileObject.read()
deptSettings = json.loads(text)
deptShortName = deptSettings['deptShortName']
print('Department currently set for', deptShortName)

# SPARQL endpoint used by the searchWikidata... functions
wikidataEndpointUrl = 'https://query.wikidata.org/sparql'
# pairs a degree string with the value recorded for it; note that 'PhD' and 'Ph.D.' share one value
degreeList = [
    {'string': 'Ph.D.', 'value': 'Ph.D.'},
    {'string': 'PhD', 'value': 'Ph.D.'},
    {'string': 'D.Phil.', 'value': 'D.Phil.'},
    {'string': 'J.D.', 'value': 'J.D.'}
     ]

# NCBI identification requirements:
# tool name and email address should be sent with all requests
# see https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
emailAddress = 'steve.baskauf@vanderbilt.edu' # put your email address here
toolName = 'VanderBot' # give your application a name here

# builds the request header dictionary to pass to a requests GET call
def generateHeaderDictionary(acceptMediaType):
    """Return request headers: the given Accept media type plus the VanderBot User-Agent string."""
    headers = {}
    headers['Accept'] = acceptMediaType
    headers['User-Agent'] = 'VanderBot/0.8 (https://github.com/HeardLibrary/linked-data/tree/master/publications; mailto:steve.baskauf@vanderbilt.edu)'
    return headers

# write a list of lists to a CSV file
def writeCsv(fileName, array):
    """Write `array` (a list of rows, each a list of cell values) to `fileName` as UTF-8 CSV.

    The file is opened in a with block so it is closed even if writing a row fails.
    """
    with open(fileName, 'w', newline='', encoding='utf-8') as fileObject:
        writerObject = csv.writer(fileObject)
        writerObject.writerows(array)

# save a table (list of dictionaries) as a CSV file with a header row
def writeDictsToCsv(table, filename, fieldnames):
    """Write `table` to `filename` as UTF-8 CSV.

    `fieldnames` gives the column headers and their order; each dictionary in
    `table` becomes one row.
    """
    with open(filename, 'w', newline='', encoding='utf-8') as outFile:
        dictWriter = csv.DictWriter(outFile, fieldnames=fieldnames)
        dictWriter.writeheader()
        dictWriter.writerows(table)

# read from a CSV file into a list of dictionaries
def readDict(filename):
    """Read the UTF-8 CSV file `filename` and return its rows as a list of dictionaries.

    The file is opened in a with block so it is closed even if parsing fails.
    """
    with open(filename, 'r', newline='', encoding='utf-8') as fileObject:
        # list() consumes the reader while the file is still open
        return list(csv.DictReader(fileObject))

# pull the Q ID out of a Wikidata entity IRI
def extractQNumber(iri):
    """Return the fifth slash-separated piece (index 4) of an IRI.

    For an entity IRI such as http://www.wikidata.org/entity/Q6386232 that
    piece is the Q ID. An IRI with fewer than five pieces raises IndexError.
    """
    return iri.split('/')[4]

# pull one slash-separated piece out of an IRI; the caller says which piece
def extractFromIri(iri, numberPieces):
    """Return the piece of `iri` at list index `numberPieces` after splitting on '/'.

    An IRI like http://www.wikidata.org/entity/Q6386232 splits into 5 pieces,
    with the Q ID at index 4.
    """
    segments = iri.split('/')
    localName = segments[numberPieces]
    return localName

# see https://www.wikidata.org/wiki/Property:P21 for values
def decodeSexOrGender(code):
    """Translate a sex-or-gender code (case-insensitive) into a Q ID.

    Recognized codes are 'm', 'f', 'i', 'tf', and 'tm'; any other code gives
    an empty string.
    """
    qIdByCode = {
        'm': 'Q6581097',
        'f': 'Q6581072',
        'i': 'Q1097630',
        'tf': 'Q1052281',
        'tm': 'Q2449503'
    }
    return qIdByCode.get(code.lower(), '')

# query for a single variable that's an item named 'item'
# returns a list of results
def searchWikidataForQIdByOrcid(orcid):
    """Return a list of Q IDs of the items whose P496 value is the string `orcid`.

    The query is built from the `orcid` argument rather than from global
    state, so the function can be called for any identifier. If the response
    cannot be parsed as the expected JSON, the returned list holds only the
    raw response text.
    """
    query = '''
select distinct ?item where {
  ?item wdt:P496 "''' + orcid + '''".
  }
'''
    results = []
    acceptMediaType = 'application/json'
    r = requests.get(wikidataEndpointUrl, params={'query' : query}, headers = generateHeaderDictionary(acceptMediaType))
    try:
        data = r.json()
        statements = data['results']['bindings']
        for statement in statements:
            wikidataIri = statement['item']['value']
            qNumber = extractQNumber(wikidataIri)
            results.append(qNumber)
    except (ValueError, KeyError, IndexError, TypeError):
        # not JSON, or JSON without the expected structure: hand back the raw text
        results = [r.text]
    # delay a quarter second to avoid hitting the SPARQL endpoint too rapidly
    sleep(0.25)
    return results

# returns a list of employer strings for the item with Wikidata ID qId; P108 is employer
def searchWikidataEmployer(qId):
    """Return a list of the English labels of the P108 values of the item `qId`.

    If the response cannot be parsed as the expected JSON, the returned list
    holds only the raw response text.
    """
    resultsList = []
    query = '''select distinct ?employer where {
        wd:'''+ qId + ''' wdt:P108 ?employerId.
        ?employerId rdfs:label ?employer.
        FILTER(lang(?employer) = 'en')
      }'''
    #print(query)
    # build the headers the same way the other search functions do
    acceptMediaType = 'application/json'
    r = requests.get(wikidataEndpointUrl, params={'query' : query}, headers=generateHeaderDictionary(acceptMediaType))
    try:
        data = r.json()
        statements = data['results']['bindings']
        # if no results, the list remains empty
        for statement in statements:
            resultsList.append(statement['employer']['value'])
    except (ValueError, KeyError, TypeError):
        # not JSON, or JSON without the expected structure: hand back the raw text
        resultsList = [r.text]
    # delay a quarter second to avoid hitting the SPARQL endpoint too rapidly
    sleep(0.25)
    return resultsList

# returns a list of value Q IDs of the property propertyId for the item with Wikidata ID qId
def searchWikidataSingleProperty(qId, propertyId, valueType):
    """Fetch the values of property `propertyId` for the Wikidata item `qId`.

    When `valueType` is 'item' every value is passed through `extractQNumber`
    before being returned; for any other `valueType` the value is returned as
    received. The result is a list, empty when the query finds nothing. If the
    response cannot be processed, the list holds the raw response text as its
    only element.
    """
    sparql = '''select distinct ?object where {
        wd:'''+ qId + ''' wdt:''' + propertyId + ''' ?object.
      }'''
    response = requests.get(wikidataEndpointUrl, params={'query' : sparql}, headers=requestHeaderDictionary)
    try:
        values = []
        for binding in response.json()['results']['bindings']:
            rawValue = binding['object']['value']
            values.append(extractQNumber(rawValue) if valueType == 'item' else rawValue)
    except:
        values = [response.text]
    # pause a quarter second so successive calls don't hit the SPARQL endpoint too rapidly
    sleep(0.25)
    return values

def checkOrcid(orcid):
    """Dereference an ORCID iD and return the retrieval date.

    Requests https://orcid.org/<orcid>. If the HTTP status code is 200, the
    current UTC date is returned in the form +2019-12-05T00:00:00Z (the form
    provided by Wikidata). For any other status code a message is printed
    and an empty string is returned.
    """
    namespace = 'https://orcid.org/'
    endpointUrl = namespace + orcid
    acceptMediaType = 'application/ld+json'
    r = requests.get(endpointUrl, headers=generateHeaderDictionary(acceptMediaType))
    code = r.status_code
    # The body is deliberately not parsed: only the status code decides the
    # result, and decoding a non-JSON error page could raise before the
    # failure was reported.
    if code != 200:
        print('Attempt to dereference ORCID resulted in HTTP response code ', code)
        # This path used to leave wholeDateZ unassigned, so the return below
        # raised UnboundLocalError.
        wholeDateZ = ''
    else:
        print('Successfully retrieved')
        wholeTimeStringZ = datetime.datetime.utcnow().isoformat() # form: 2019-12-05T15:35:04.959311
        dateZ = wholeTimeStringZ.split('T')[0] # form 2019-12-05
        wholeDateZ = '+' + dateZ + 'T00:00:00Z' # form +2019-12-05T00:00:00Z as provided by Wikidata
    # delay a quarter second to avoid hitting the API too rapidly
    sleep(0.25)
    return wholeDateZ

# if the value passed is '' then the value will be retrieved.  Otherwise, the value is used to screen.
def searchStatementAtWikidata(qIds, prop, value, refPropList):
    """Query Wikidata for statements that use property `prop` on the items in `qIds`.

    If `value` is '' the value of each statement is retrieved and reported;
    otherwise only statements whose value is the item `value` are matched.
    If `refPropList` is not empty, the reference of each statement (when it has
    one) and the values of the listed reference properties are retrieved too.

    Returns a list with one dictionary per query result. Each dictionary has the
    keys 'qId' and 'statementUuid', plus 'statementValue' when `value` is '',
    plus 'referenceHash' and 'referenceValues' when `refPropList` is not empty.
    NOTE: a statement with more than one reference produces several results
    that share the same 'qId' and 'statementUuid'.
    """
    # one "wd:Q..." line per subject item, to be placed in the VALUES clause
    subjects = ''.join('wd:' + qId + '\n' for qId in qIds)

    findValue = value == ''
    refCount = len(refPropList)
    # names of the query variables that receive the reference property values
    refVariables = ['refVal' + str(position) for position in range(refCount)]

    # variables to select
    query = 'select distinct ?id ?statement '
    if findValue:
        query += '?statementValue '
    if refCount != 0:
        query += '?reference '
    query += ''.join('?' + variable + ' ' for variable in refVariables)

    # graph pattern for the statement itself
    query += '''where {
  VALUES ?id
{
''' + subjects + '''}
  ?id p:'''+ prop + ''' ?statement.
  ?statement ps:'''+ prop
    query += ' ?statementValue.' if findValue else ' wd:' + value + '.'

    # optional graph pattern for the reference and its properties
    if refCount != 0:
        query += '''
  optional {
    ?statement prov:wasDerivedFrom ?reference.'''
        for refProp, variable in zip(refPropList, refVariables):
            query += '''
    ?reference pr:''' + refProp + ''' ?''' + variable + '''.'''
        query += '''
        }'''
    query += '''
  }'''

    acceptMediaType = 'application/json'
    response = requests.get(wikidataEndpointUrl, params={'query' : query}, headers=generateHeaderDictionary(acceptMediaType))
    bindings = response.json()['results']['bindings']

    found = []
    for binding in bindings:
        # remove wd: 'http://www.wikidata.org/entity/'
        record = {'qId': extractFromIri(binding['id']['value'], 4)}
        # remove wds: 'http://www.wikidata.org/entity/statement/'
        statementId = extractFromIri(binding['statement']['value'], 5)
        # The Q number is prepended to the UUID with a hyphen. Keep everything after
        # the first hyphen instead of matching the Q number text, because in at
        # least one case the prepended Q number had a lower case q and did not match.
        record['statementUuid'] = statementId.partition('-')[2]

        # if no value was specified, report the value that was found in the search
        if findValue:
            record['statementValue'] = binding['statementValue']['value']

        if refCount != 0:
            if 'reference' in binding:
                # remove wdref: 'http://www.wikidata.org/reference/'
                record['referenceHash'] = extractFromIri(binding['reference']['value'], 4)
            else:
                record['referenceHash'] = ''
            # a reference property without a value in this result is reported as ''
            record['referenceValues'] = [
                binding[variable]['value'] if variable in binding else ''
                for variable in refVariables
            ]
        found.append(record)

    # delay a quarter second to avoid hitting the SPARQL endpoint too rapidly
    sleep(0.25)

    return found

# search for any of the "label" types: label, alias, description
def searchLabelsDescriptionsAtWikidata(qIds, labelType, language):
    """Retrieve labels, aliases or descriptions in one language for the items in `qIds`.

    `labelType` selects what is searched: 'label' (rdfs:label), 'alias'
    (skos:altLabel) or 'description' (schema:description); any other value is
    treated like 'label'. `language` is the language tag used to filter the strings.

    Returns a list of dictionaries of the form {'qId': ..., 'string': ...},
    one per query result.
    """
    # one "wd:Q..." line per subject item, to be placed in the VALUES clause
    subjects = ''.join('wd:' + qId + '\n' for qId in qIds)

    predicates = {
        'label': 'rdfs:label',
        'alias': 'skos:altLabel',
        'description': 'schema:description',
    }
    predicate = predicates.get(labelType, 'rdfs:label')

    query = 'select distinct ?id ?string ' + '''where {
  VALUES ?id
{
''' + subjects + '''}
  ?id '''+ predicate + ''' ?string.
  filter(lang(?string)="''' + language + '''")
  }'''

    acceptMediaType = 'application/json'
    response = requests.get(wikidataEndpointUrl, params={'query' : query}, headers=generateHeaderDictionary(acceptMediaType))
    bindings = response.json()['results']['bindings']
    # extractFromIri removes wd: 'http://www.wikidata.org/entity/' from the subject IRI
    found = [
        {'qId': extractFromIri(binding['id']['value'], 4), 'string': binding['string']['value']}
        for binding in bindings
    ]

    # delay a quarter second to avoid hitting the SPARQL endpoint too rapidly
    sleep(0.25)

    return found

# Match employees to Wikidata

Script developed at https://github.com/HeardLibrary/linked-data/blob/master/publications/wikidata/match_bsci_wikidata.ipynb

Attempts to match records of people Wikidata knows to work at Vanderbilt with departmental people by matching their ORCIDs, then name strings. If there isn't a match with the downloaded Wikidata records, for employees with ORCIDs, the script attempts to find them in Wikidata by directly doing a SPARQL search for their ORCID.

As people are matched (or determined to not have a match), a code is recorded with information about how the match was made.  Here are the values:

```
0=unmatched
1=matched with ORCID in both sources
2=ORCID from match to ORCID records but name match to Wikidata (no ORCID)
3=no ORCID from match to ORCID records but name match to Wikidata (with ORCID); could happen if affiliation isn't matched in ORCID
4=no ORCID from match to ORCID records but name match to Wikidata (no ORCID)
5=ORCID from match to ORCID records and found via SPARQL ORCID search (likely non-VU affiliated in Wikidata)
6=ORCID from match to ORCID records and found via SPARQL name search (non-VU affiliated without ORCID)
7=no name match
8=ORCID from match to ORCID records, error in SPARQL ORCID search
9=no ORCID from match to ORCID records, error in SPARQL name search
10=affiliation match in article
11=match by human choice after looking at entity data
12=no matching entities were possible matches
```

# Crosscheck people against publications

Developed at https://github.com/HeardLibrary/linked-data/blob/master/publications/crosscheck-publications.ipynb

Checks possible Wikidata records against publications in CrossRef and PubMed to see if the author metadata will disambiguate the Wikidata record.


In [None]:
# Header dictionary generated once for the 'application/json' media type.
# It is shared by the query functions in this file that pass
# headers=requestHeaderDictionary to requests.get().
acceptMediaType = 'application/json'
requestHeaderDictionary = generateHeaderDictionary(acceptMediaType)

def generateNameAlternatives(name):
    """Generate spelling variants of a personal name for use in name searches.

    Periods are removed and the name is split on whitespace. For a name with
    at least two parts the variants are: the full name; first and last name
    with middle initials (with and without periods); first and last name
    only; first initial and last name (with and without period); all initials
    followed by the last name (separated, with periods, and concatenated).

    Returns a list of distinct variants in the order listed above. A name
    consisting of a single word yields only that word, and a name with no
    words at all yields an empty list.

    NOTE: currently doesn't handle ", Jr.", "III", etc.

    Example: 'John Q. Public' gives
    ['John Q Public', 'John Q. Public', 'John Public', 'J Public',
     'J. Public', 'J Q Public', 'J. Q. Public', 'JQ Public']
    """
    # get rid of periods; split() without an argument also ignores leading,
    # trailing and repeated whitespace, so no empty name parts are produced
    pieces = name.replace('.', '').split()

    # With fewer than two parts there is no separate first and last name.
    # Building the variants anyway would produce strings such as
    # 'Madonna Madonna' or ' Madonna'.
    if len(pieces) < 2:
        return list(pieces)

    first = pieces[0]
    middle = pieces[1:-1]
    last = pieces[-1]
    firstInitial = first[0]
    middleInitials = [piece[0] for piece in middle]
    allInitials = [firstInitial] + middleInitials

    alternatives = [
        # full name
        ' '.join(pieces),
        # first and last name with initials
        ' '.join([first] + middleInitials + [last]),
        # first and last name with initials and periods
        ' '.join([first] + [initial + '.' for initial in middleInitials] + [last]),
        # first and last name only
        first + ' ' + last,
        # first initial and last name only
        firstInitial + ' ' + last,
        # first initial with period and last name only
        firstInitial + '. ' + last,
        # all name initials with last name
        ' '.join(allInitials + [last]),
        # all name initials with periods with last name
        ' '.join([initial + '.' for initial in allInitials] + [last]),
        # all name initials concatenated with last name
        ''.join(allInitials) + ' ' + last,
    ]

    # remove duplicates while keeping the order above, so the result is deterministic
    dedupe = []
    for alternative in alternatives:
        if alternative not in dedupe:
            dedupe.append(alternative)

    return dedupe

def searchNameAtWikidata(name):
    """Search Wikidata for items whose English label or alias is a variant of `name`.

    The variants come from `generateNameAlternatives`. Returns a list of
    dictionaries of the form {'qId': ..., 'name': ...}, where 'name' is the
    English label of the item. If the response cannot be processed, the list
    holds a single dictionary {'error': <raw response text>}.
    """
    nameList = generateNameAlternatives(name)
    alternatives = ''
    for alternative in nameList:
        # Escape backslashes and double quotes so that a name containing them
        # stays inside the SPARQL string literal instead of breaking the query.
        escaped = alternative.replace('\\', '\\\\').replace('"', '\\"')
        alternatives += '"' + escaped + '"@en\n'
    query = '''
select distinct ?item ?label where {
  VALUES ?value
  {
  ''' + alternatives + '''}
?item rdfs:label|skos:altLabel ?value.
?item rdfs:label ?label.
FILTER(lang(?label)='en')
  }
'''
    results = []
    r = requests.get(wikidataEndpointUrl, params={'query' : query}, headers=requestHeaderDictionary)
    try:
        data = r.json()
        statements = data['results']['bindings']
        for statement in statements:
            wikidataIri = statement['item']['value']
            # a separate local is used so the `name` argument is not overwritten
            label = statement['label']['value'] if 'label' in statement else ''
            results.append({'qId': extractQNumber(wikidataIri), 'name': label})
    except Exception:
        # not JSON, or not shaped like a SPARQL result: report the raw text as an error
        results = [{'error': r.text}]
    # delay a quarter second to avoid hitting the SPARQL endpoint too rapidly
    sleep(0.25)
    return results

# returns a dictionary of various descriptors of the item with Wikidata ID qId
# P106 is occupation, schema:description is filtered to be the English description
def searchWikidataDescription(qId):
    """Collect descriptors of the Wikidata item `qId`.

    Queries the English description, the English labels of the occupations
    (P106) and the ORCID (P496) of the item. Returns a dictionary with the keys
    'description' (string, '' if absent), 'orcid' (string, '' if absent) and
    'occupation' (list of strings). The dictionary is empty when the query
    gives no results, and is {'error': <raw response text>} when the response
    cannot be processed.
    """
    sparql = '''select distinct ?description ?orcid ?occupation where {
        optional {
            wd:'''+ qId + ''' schema:description ?description.
            FILTER(lang(?description) = 'en')
            }
        optional {
            wd:'''+ qId + ''' wdt:P106 ?occupationId.
            ?occupationId rdfs:label ?occupation.
            FILTER(lang(?occupation) = 'en')            
            }
        optional {wd:'''+ qId + ''' wdt:P496 ?orcid.}
      }'''
    response = requests.get(wikidataEndpointUrl, params={'query' : sparql}, headers=requestHeaderDictionary)
    details = {}
    try:
        rows = response.json()['results']['bindings']
        if len(rows) > 0:
            # Only one description per language and only one ORCID are allowed,
            # so both are read from the first row.
            firstRow = rows[0]
            details['description'] = firstRow['description']['value'] if 'description' in firstRow else ''
            details['orcid'] = firstRow['orcid']['value'] if 'orcid' in firstRow else ''
            # more than one row means that there is more than one occupation
            details['occupation'] = [row['occupation']['value'] for row in rows if 'occupation' in row]
    except:
        details = {'error': response.text}
    # pause a quarter second so successive calls don't hit the SPARQL endpoint too rapidly
    sleep(0.25)
    return details

# returns a list of results of articles by person with Wikidata ID qId
def searchWikidataArticle(qId):
    """List the articles that name the Wikidata item `qId` as an author.

    Returns a list of dictionaries with the keys 'title' (English label),
    'pmid' and 'doi'; a value that is missing for an article is ''. If the
    response cannot be processed, the list holds the raw response text as its
    only element.
    """
    # P50 is "author"; P698 is the PubMed ID of the article; P356 is the DOI of the article
    sparql = '''select distinct ?title ?doi ?pmid where {
      ?article wdt:P50 wd:''' + qId + '''.
      optional {
          ?article rdfs:label ?title.
          FILTER(lang(?title) = 'en')
          }
      optional {?article wdt:P698 ?pmid.}
      optional {?article wdt:P356 ?doi.}
      }'''
    response = requests.get(wikidataEndpointUrl, params={'query' : sparql}, headers=requestHeaderDictionary)
    articles = []
    try:
        for row in response.json()['results']['bindings']:
            article = {}
            for field in ('title', 'pmid', 'doi'):
                article[field] = row[field]['value'] if field in row else ''
            articles.append(article)
    except:
        articles = [response.text]
    # pause a quarter second so successive calls don't hit the SPARQL endpoint too rapidly
    sleep(0.25)
    return articles

def _pubMedChildText(element, path):
    """Return the text of the first child of `element` matching `path`, or '' if it is absent or empty."""
    child = element.find(path)
    if child is None:
        return ''
    return child.text or ''

def retrievePubMedData(pmid):
    """Retrieve the author metadata of the PubMed article with ID `pmid`.

    Returns a list with one dictionary per Author element of the record, with
    the keys 'affiliation', 'surname', 'forename' and 'orcid'; a value that is
    missing from the record is ''. An empty list is returned when the request
    gives a 404 response or when the response body cannot be parsed as XML.
    """
    fetchUrl = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
    paramDict = {
        'tool': toolName, 
        'email': emailAddress,
        'db': 'pubmed', 
        'rettype': 'abstract', 
        'id': pmid
    }
    response = requests.get(fetchUrl, params=paramDict)

    affiliations = [] # stays empty if the constructed URL won't dereference
    if response.status_code != 404:
        # the response text is expected to be XML
        try:
            root = et.fromstring(response.text)
        except et.ParseError:
            # An error response other than 404 may carry a body that is not XML.
            # Treat it like an unusable record instead of crashing the whole run.
            root = None

        if root is not None:
            for author in root.findall('.//Author'):
                # Look at every Identifier of the author, not only the first,
                # so an ORCID is still found when another identifier precedes it.
                orcid = ''
                for identifier in author.findall('./Identifier'):
                    if identifier.get('Source') == 'ORCID':
                        orcid = identifier.text or ''
                        break

                affiliations.append({
                    'affiliation': _pubMedChildText(author, './AffiliationInfo/Affiliation'),
                    'surname': _pubMedChildText(author, './LastName'),
                    'forename': _pubMedChildText(author, './ForeName'),
                    'orcid': orcid
                })

    # See https://www.ncbi.nlm.nih.gov/books/NBK25497/ for usage guidelines. 
    # An API key is required for more than 3 requests per second.
    sleep(0.5) # wait half a second before hitting the API again to avoid getting blocked
    return affiliations

def retrieveCrossRefDoi(doi):
    """Retrieve the author metadata that CrossRef holds for the work with DOI `doi`.

    Returns a list with one dictionary per author, with the keys 'orcid',
    'givenName', 'familyName' (each '' when absent) and 'affiliation' (a list
    of affiliation names, empty when there are none). The list is empty when
    the DOI gives a 404 response or the record lists no authors.

    If the response cannot be processed, the list holds a single element: the
    decoded JSON payload when decoding succeeded, otherwise the raw response text.
    """
    authorList = []
    crossRefEndpointUrl = 'https://api.crossref.org/works/'
    encodedDoi = urllib.parse.quote(doi)
    searchUrl = crossRefEndpointUrl + encodedDoi
    acceptMediaType = 'application/json'
    response = requests.get(searchUrl, headers=generateHeaderDictionary(acceptMediaType))
    if response.status_code == 404:
        return authorList # an empty list if the DOI won't dereference at CrossRef

    # Bound before the try so the error path can tell whether decoding succeeded.
    # Previously a failure in response.json() left `data` undefined and the
    # except branch raised NameError.
    data = None
    try:
        data = response.json()
        if 'author' in data['message']:
            for author in data['message']['author']:
                authorList.append({
                    'orcid': author.get('ORCID', ''),
                    'givenName': author.get('given', ''),
                    'familyName': author.get('family', ''),
                    # if there aren't any affiliations, the list will be empty
                    'affiliation': [affiliation['name'] for affiliation in author.get('affiliation', [])]
                })
    except Exception:
        authorList = [data if data is not None else response.text]
    return authorList

# ***** BODY OF SEARCH
filename = deptShortName + '-employees-with-wikidata.csv'
employees = readDict(filename)

#for employeeIndex in range(0, len(employees)):
for employeeIndex in range(11, 50): # just do one person for testing
    # perform search only for people who weren't already matched
    if employees[employeeIndex]['wikidataStatus'] == '0':
        matchStatus = 0
        print('--------------------------')
        results = searchNameAtWikidata(employees[employeeIndex]['name'])
        print('Position: ', employees[employeeIndex]['position'], ', Specialities: ', employees[employeeIndex]['specialities'])
        print('Born: ', employees[employeeIndex]['birth_date'], ', Herb code: ', employees[employeeIndex]['herb_code'], ', Place: ', employees[employeeIndex]['city'], ', ', employees[employeeIndex]['state'])
        if len(results) == 0:
            print('No Wikidata name match: ', employees[employeeIndex]['name'])
            matchStatus = 7
            print()
        else:
            print('SPARQL name search: ', employees[employeeIndex]['name'])
            if len(results) == 1:
                if 'error' in results[0]:
                    matchStatus = 9
                    print('Error message in
                          name search:', results[0]['error'])
                    break # discontinue processing this person
            qIds = []
            nameVariants = []
            potentialOrcid = []
            for result in results:
                qIds.append(result['qId'])
                nameVariants.append(result['name'])
            
            testAuthor = employees[employeeIndex]['name']
            testOrcid = employees[employeeIndex]['orcid']

            if testOrcid == '':
                print('(no ORCID)')
            else:
                print('ORCID: ', testOrcid)
            print()
            
            foundMatch = False # start the flag with the person not being matched
            possibleMatch = False # start the flag with there not being a possibility that the person could match
            for qIdIndex in range(0, len(qIds)):
                potentialOrcid.append('') # default to no ORCID found for that person
                print()
                print(qIdIndex, 'Wikidata ID: ', qIds[qIdIndex], ' Name variant: ', nameVariants[qIdIndex], ' ', 'https://www.wikidata.org/wiki/' + qIds[qIdIndex])
                wdClassList = searchWikidataSingleProperty(qIds[qIdIndex], 'P31', 'item')
                # if there is a class property, check if it's a human
                if len(wdClassList) != 0:
                    # if it's not a human
                    if wdClassList[0] != 'Q5':
                        print('This item is not a human!')
                        break
                        
                # check for a death date
                deathDateList = searchWikidataSingleProperty(qIds[qIdIndex], 'P570', 'string')
                if len(deathDateList) == 0:
                    print('No death date given.')
                else:
                    deathDate = deathDateList[0][0:10] # all dates are converted to xsd:dateTime and will have a y-m-d date
                    if deathDate < deathDateLimit:
                        # if the person died a long time ago, don't retrieve other stuff
                        print('This person died in ', deathDate)
                        break
                    else:
                        # if the person died recently, we still might be interested in them so keep going
                        print('This person died in ', deathDate)

                # check for a birth date
                if employees[employeeIndex]['birth_date'] != '': # only check Wikidata if the person has a birthdate
                    birthDateList = searchWikidataSingleProperty(qIds[qIdIndex], 'P569', 'string')
                    if len(birthDateList) == 0: # do nothing if there are no birthdates retrieved from Wikidata
                        print('No birth date given.')
                    else:
                        birthDate = birthDateList[0][0:4] # get only the first four digits since only years are given
                        if birthDate != employees[employeeIndex]['birth_date']:
                            print('Wikidata birthdate ', birthDate, ' does not match ', employees[employeeIndex]['birth_date'])
                            break

                descriptors = searchWikidataDescription(qIds[qIdIndex])
                employers = searchWikidataEmployer(qIds[qIdIndex])
                #print(descriptors)
                if descriptors != {}:
                    if descriptors['description'] != '':
                        print('description: ', descriptors['description'])
                    for occupation in descriptors['occupation']:
                        print('occupation: ', occupation)
                    for employer in employers:
                        print('employer: ', employer)
                    if descriptors['orcid'] != '':
                        if testOrcid == '':
                            # **** NOTE: if the person has an ORCID, it may be possible to find articles via ORCID
                            # that aren't linked in Wikidata. Not sure if this happens often enough to handle it
                            print('ORCID: ', descriptors['orcid'])
                            potentialOrcid[qIdIndex] = descriptors['orcid']
                        else:
                            # This should always be true if the SPARQL query for ORCID was already done
                            if testOrcid != descriptors['orcid']:
                                print('*** NOT the same person; ORCID ' + descriptors['orcid'] + ' does not match.')
                                break # don't continue the loop (look up references) since it's definitely not a match
                            else:
                                print('*** An ORCID match! How did it get missed in the earlier SPARQL query?')
                                break
                else:
                    print('No description or occupation given.')

                result = searchWikidataArticle(qIds[qIdIndex])
                if len(result) == 0:
                    print('No articles authored by that person')
                else:
                    articleCount = 0
                    for article in result:
                        print()
                        print('Checking article: ', article['title'])
                        if article['pmid'] == '':
                            print('No PubMed ID')
                        else:
                            print('Checking authors in PubMed article: ', article['pmid'])
                            pubMedAuthors = retrievePubMedData(article['pmid'])
                            if pubMedAuthors == []:
                                print('PubMed ID does not seem to be valid.')
                            #print(pubMedAuthors)
                            for author in pubMedAuthors:
                                nameTestRatio = fuzz.token_set_ratio(author['surname'], testAuthor)
                                #print(nameTestRatio, author['surname'])
                                if nameTestRatio >= 90:
                                    # if the PubMed metadata gives an ORCID for the matched person, record it unless 
                                    # the ORCID has already been gotten from the Wikidata record
                                    if author['orcid'] != '':
                                        if testOrcid == '':
                                            print('ORCID from article: ', author['orcid'])
                                            if potentialOrcid[qIdIndex] == '':
                                                potentialOrcid[qIdIndex] = author['orcid']
                                        else:
                                            if testOrcid != author['orcid']:
                                                print('*** NOT the same person; ORCID ' + author['orcid'] + ' does not match.')
                                                break # don't continue the loop (look up authors) since it's definitely not a match
                                            else:
                                                print('*** An ORCID match!')
                                                foundMatch = True
                                                matchStatus = 6
                                                break # don't continue the loop (look up authors) since it's an ORCID match

                                    if author['affiliation'] != '': 
                                        setRatio = fuzz.token_set_ratio(deptSettings[deptShortName]['testAuthorAffiliation'], author['affiliation'])
                                        print('Affiliation test: ', setRatio, author['affiliation'])
                                        if setRatio >= 90:
                                            foundMatch = True
                                            matchStatus = 10
                                            break # don't continue the loop (look up authors) since it's an affiliation match
                                    else:
                                        break # give up on this article because no affiliation string
                        # Don't look up the DOI if it's already found a match with PubMed
                        if foundMatch:
                            break # stop checking articles after a PubMed one has matched
                        else:
                            if article['doi'] == '':
                                print('No DOI')
                            else:
                                print('Checking authors in DOI article: ', article['doi'])
                                doiAuthors = retrieveCrossRefDoi(article['doi'])
                                if doiAuthors == []:
                                    print('DOI does not dereference at CrossRef')
                                for author in doiAuthors:
                                    nameTestRatio = fuzz.token_set_ratio(author['familyName'], testAuthor)
                                    #print(nameTestRatio, author['familyName'])
                                    if nameTestRatio >= 90:
                                        if author['orcid'] != '':
                                            if testOrcid == '':
                                                # DOI records the entire ORCID URI, not just the ID number
                                                # so pull the last 19 characters from the string
                                                print('ORCID from article: ', author['orcid'][-19:])
                                                # only add the ORCID from article if there isn't already one,
                                                # for example, one gotten from the Wikidata record itself
                                                if potentialOrcid[qIdIndex] == '':
                                                    potentialOrcid[qIdIndex] = author['orcid'][-19:]
                                            else:
                                                if testOrcid != author['orcid']:
                                                    print('*** NOT the same person; ORCID ' + author['orcid'] + ' does not match.')
                                                    break # don't continue the loop (look up authors) since it's definitely not a match
                                                else:
                                                    print('*** An ORCID match!')
                                                    foundMatch = True
                                                    matchStatus = 6
                                                    break # don't continue the loop (look up authors) since it's an ORCID match


                                        if len(author['affiliation']) > 0:
                                            for affiliation in author['affiliation']:
                                                setRatio = fuzz.token_set_ratio(deptSettings[deptShortName]['testAuthorAffiliation'], affiliation)
                                                print('Affiliation test: ', setRatio, affiliation)
                                                if setRatio >= 90:
                                                    foundMatch = True
                                                    matchStatus = 10
                                                    break # don't continue the loop (look up authors) since it's an affiliation match
                                        else:
                                            break # give up on this article because no affiliation string
                            if foundMatch:
                                break # stop checking articles after a DOI one has matched
                        articleCount += 1
                        if articleCount > 10:
                            checkMore = input('There are more than 10 articles. Press Enter to skip the rest or enter anything to get the rest.')
                            if checkMore == '':
                                break
                    if foundMatch:
                        print('***', qIds[qIdIndex], ' is a match.')
                        print()
                        employees[employeeIndex]['wikidataId'] = qIds[qIdIndex]
                        employees[employeeIndex]['orcid'] = potentialOrcid[qIdIndex]
                        break # quit checking Q IDs since the person was matched
                    else:
                        print('No match found.')
                print('Employee: ', employees[employeeIndex]['name'], ' vs. name variant: ', nameVariants[qIdIndex])
                possibleMatch = True # made it all the way through the loop without hitting a break, so a match is possible
                print()
            if not foundMatch:
                if not possibleMatch:
                    matchStatus = 12
                else:
                    choiceString = input('Enter the number of the matched entity, or press Enter/return if none match: ')
                    if choiceString == '':
                        matchStatus = 7
                    else:
                        # NOTE: there is no error trapping here for mis-entry !!!
                        choice = int(choiceString)
                        matchStatus = 11
                        employees[employeeIndex]['wikidataId'] = qIds[choice]
                        # write a discovered ORCID only if the person didn't already have one
                        if (potentialOrcid[choice] != '') and (employees[employeeIndex]['orcid'] == ''):
                            employees[employeeIndex]['orcid'] = potentialOrcid[choice]
                    print()
                
        # record the final match status
        employees[employeeIndex]['wikidataStatus'] = str(matchStatus)
    
    # write the file after each person is checked in case the user crashes the script
    filename = deptShortName + '-employees-curated.csv'
    fieldnames = ['wikidataId', 'name', 'irn', 'herb_code', 'birth_date', 'position', 'specialities', 'city', 'state', 'wikidataStatus', 'orcid']
    writeDictsToCsv(employees, filename, fieldnames)

print()
print('Done')


# Download various statements and references, then generate write file

NOTE: Between the previous step and this one, you may optionally add a sex/gender column to the curated table; if the column exists, it will be processed. Column header: 'gender'. Allowed values (corresponding to Wikidata items): m=male, f=female, i=intersex, tf=transgender female, tm=transgender male

In [None]:
# Load the curated employee table for this department.
filename = deptShortName + '-employees-curated.csv'
employees = readDict(filename)

# Collect the Q IDs of the employees that already have a Wikidata item;
# rows whose wikidataId is an empty string are left out.
qIds = [person['wikidataId'] for person in employees if person['wikidataId'] != '']

# Retrieve every ORCID iD statement already recorded in Wikidata for the matched people.
prop = 'P496' # ORCID iD
value = '' # no value is passed, so the search returns whatever value each statement has
refProps = ['P813'] # retrieved
wikidataOrcidData = searchStatementAtWikidata(qIds, prop, value, refProps)
#print(json.dumps(wikidataOrcidData, indent=2))

# Reconcile each person's ORCID in the table with the ORCID data downloaded from Wikidata.
for person in employees:
    # Only the first query result for this person is considered.
    orcidStatement = next(
        (result for result in wikidataOrcidData if result['qId'] == person['wikidataId']),
        None,
    )

    if orcidStatement is None:
        # Nothing came back from Wikidata for this person. If the table has an ORCID,
        # check that it can be accessed; per the original notes, checkOrcid() returns the
        # current date (used as the retrieved date) when found, otherwise an empty string.
        if person['orcid'] != '':
            print('Checking ORCID for unmatched: ', person['name'])
            person['orcidReferenceValue'] = checkOrcid(person['orcid'])
        continue

    if person['orcid'] != orcidStatement['statementValue']:
        # The table and Wikidata disagree; report it and record nothing.
        print('Non-matching ORCID for ', person['name'])
        continue

    # The ORCIDs agree, so record whatever statement data was retrieved.
    person['orcidStatementUuid'] = orcidStatement['statementUuid']
    person['orcidReferenceHash'] = orcidStatement['referenceHash']
    if person['orcidReferenceHash'] == '':
        # No reference exists yet: try to dereference the ORCID to obtain a retrieved date.
        print('Checking ORCID for Wikidata matched: ', person['name'])
        person['orcidReferenceValue'] = checkOrcid(person['orcid'])
    else:
        # A reference already exists: keep the value of its (single) reference property.
        print('Already an ORCID reference for: ', person['name'])
        # the API needs a leading + on dateTime values for upload
        person['orcidReferenceValue'] = '+' + orcidStatement['referenceValues'][0]

# Get the employer (P108) statements already in Wikidata that link the matched people to employerQId.
prop = 'P108' # employer
refProps = ['P854', 'P813'] # source URL, retrieved
wikidataEmployerData = searchStatementAtWikidata(qIds, prop, employerQId, refProps)
#print(json.dumps(wikidataEmployerData, indent=2))

# "Retrieved" date for any reference generated below, computed once for the whole run.
# A timezone-aware "now" is used because datetime.utcnow() is deprecated since Python 3.12.
dateZ = datetime.datetime.now(datetime.timezone.utc).date().isoformat() # form 2019-12-05
wholeDateZ = '+' + dateZ + 'T00:00:00Z' # form +2019-12-05T00:00:00Z as provided by Wikidata

# Match people with the employment data downloaded from Wikidata.
for employee in employees:
    # the reference URL that this script would assert for this person
    ourSourceUrl = deptSettings[deptShortName]['baseUrl'] + employee['category']
    matchedReference = False
    for statement in wikidataEmployerData:
        if statement['qId'] != employee['wikidataId']:
            continue
        employee['employerStatementUuid'] = statement['statementUuid']
        employee['employerReferenceHash'] = statement['referenceHash']
        # If there is a referenceHash, record the values of the two reference properties.
        # They come back in refProps order: [0] = P854 source URL, [1] = P813 retrieved.
        if employee['employerReferenceHash'] != '':
            employee['employerReferenceSourceUrl'] = statement['referenceValues'][0]
            # the API needs a leading + on dateTime values for upload
            employee['employerReferenceRetrieved'] = '+' + statement['referenceValues'][1]
            # Stop at an exact match to our reference URL; otherwise keep looping so that a
            # later result that does match can overwrite the data recorded here.
            if statement['referenceValues'][0] == ourSourceUrl:
                matchedReference = True
                break

    # Everyone is assigned employerQId as a value: either they showed up in the search for
    # employerQId, or we are making the statement that they work for employerQId.
    employee['employer'] = employerQId
    if not matchedReference:
        # Our reference URL wasn't found, so generate the reference metadata.
        # NOTE(review): any employerReferenceHash recorded above from a different reference
        # is left in place while the URL and date are replaced -- confirm that is intended.
        employee['employerReferenceSourceUrl'] = ourSourceUrl
        employee['employerReferenceRetrieved'] = wholeDateZ

# *** This section parallels the employer section, adapted for affiliation (P1416).

# Get the affiliation statements already in Wikidata that link the matched people to the department.
prop = 'P1416' # affiliation
refProps = ['P854', 'P813'] # source URL, retrieved
# NOTE: the variable name is reused from the employer section; from here on it holds affiliation data.
wikidataEmployerData = searchStatementAtWikidata(qIds, prop, deptSettings[deptShortName]['departmentQId'], refProps)
#print(json.dumps(wikidataEmployerData, indent=2))

# "Retrieved" date for any reference generated below, computed once for the whole run.
# A timezone-aware "now" is used because datetime.utcnow() is deprecated since Python 3.12.
dateZ = datetime.datetime.now(datetime.timezone.utc).date().isoformat() # form 2019-12-05
wholeDateZ = '+' + dateZ + 'T00:00:00Z' # form +2019-12-05T00:00:00Z as provided by Wikidata

# Match people with the affiliation data downloaded from Wikidata.
for employee in employees:
    # the reference URL that this script would assert for this person
    ourSourceUrl = deptSettings[deptShortName]['baseUrl'] + employee['category']
    matchedReference = False
    for statement in wikidataEmployerData:
        if statement['qId'] != employee['wikidataId']:
            continue
        employee['affiliationStatementUuid'] = statement['statementUuid']
        employee['affiliationReferenceHash'] = statement['referenceHash']
        # If there is a referenceHash, record the values of the two reference properties.
        # They come back in refProps order: [0] = P854 source URL, [1] = P813 retrieved.
        if employee['affiliationReferenceHash'] != '':
            employee['affiliationReferenceSourceUrl'] = statement['referenceValues'][0]
            # the API needs a leading + on dateTime values for upload
            employee['affiliationReferenceRetrieved'] = '+' + statement['referenceValues'][1]
            # Stop at an exact match to our reference URL; otherwise keep looping so that a
            # later result that does match can overwrite the data recorded here.
            if statement['referenceValues'][0] == ourSourceUrl:
                matchedReference = True
                break

    # Everyone is assigned the department as a value: either they showed up in the search,
    # or we are making the statement that they are affiliated with the department.
    employee['affiliation'] = deptSettings[deptShortName]['departmentQId']
    if not matchedReference:
        # Our reference URL wasn't found, so generate the reference metadata.
        # NOTE(review): any affiliationReferenceHash recorded above from a different reference
        # is left in place while the URL and date are replaced -- confirm that is intended.
        employee['affiliationReferenceSourceUrl'] = ourSourceUrl
        employee['affiliationReferenceRetrieved'] = wholeDateZ

# Retrieve the existing "instance of: human" statements for the matched people.
prop = 'P31' # instance of
value = 'Q5' # human
refProps = [] # no ref property needed
wikidataHumanData = searchStatementAtWikidata(qIds, prop, value, refProps)

# Index the results by subject Q ID. When a Q ID occurs more than once the last
# result is kept, which is the one a full scan without a break would end up with.
humanStatementByQId = {result['qId']: result for result in wikidataHumanData}

# Record the statement ID for people already asserted to be human;
# the 'human' value itself is assigned to every row.
for person in employees:
    if person['wikidataId'] in humanStatementByQId:
        person['instanceOfUuid'] = humanStatementByQId[person['wikidataId']]['statementUuid']
    person['instanceOf'] = 'Q5'

# Adapted from the "instance of" section above.

# Retrieve whatever sex or gender statements already exist in Wikidata for the researchers.
prop = 'P21' # sex or gender
value = '' # no value is provided, so the search returns whatever value it finds
refProps = [] # no ref property needed
wikidataHumanData = searchStatementAtWikidata(qIds, prop, value, refProps)

# People with a sex/gender assertion in Wikidata get its statement ID and value recorded.
# Everyone else gets the value from the table's 'gender' column, if that column exists.
# NOTE: Wikidata doesn't seem to care a lot about references for this property and we don't really have one anyway
for person in employees:
    foundInWikidata = False
    for genderStatement in wikidataHumanData:
        if genderStatement['qId'] != person['wikidataId']:
            continue
        foundInWikidata = True
        person['sexOrGenderUuid'] = genderStatement['statementUuid']
        # The Wikidata value wins over the 'gender' column of the table.
        # extractFromIri() strips the namespace from the qId.
        person['sexOrGenderQId'] = extractFromIri(genderStatement['statementValue'], 4)

    if foundInWikidata:
        continue
    # Not in Wikidata: fall back to the 'gender' column, or to an empty string without one.
    if 'gender' in person:
        person['sexOrGenderQId'] = decodeSexOrGender(person['gender'])
    else:
        person['sexOrGenderQId'] = ''

# Retrieve the English labels that the employees already have in Wikidata.
labelType = 'label'
language = 'en'
wikidataLabels = searchLabelsDescriptionsAtWikidata(qIds, labelType, language)

# Index the results by Q ID; if a Q ID occurs more than once the last result is kept.
labelResultByQId = {result['qId']: result for result in wikidataLabels}

# Give every person a label: the Wikidata one when it exists, otherwise a default.
for person in employees:
    if person['wikidataId'] in labelResultByQId:
        person['labelEn'] = labelResultByQId[person['wikidataId']]['string']
    else:
        labelSettings = deptSettings[deptShortName]['labels']
        if labelSettings['source'] == 'column':
            # 'value' names the table column that holds the default label
            person['labelEn'] = person[labelSettings['value']]
        else:
            # 'value' is itself the default label
            person['labelEn'] = labelSettings['value']

# Retrieve the English descriptions that the employees already have in Wikidata.
labelType = 'description'
language = 'en'
wikidataDescriptions = searchLabelsDescriptionsAtWikidata(qIds, labelType, language)

# Index the results by Q ID; if a Q ID occurs more than once the last result is kept.
descriptionResultByQId = {result['qId']: result for result in wikidataDescriptions}

# Give every person a description: the Wikidata one when it exists, otherwise a default.
for person in employees:
    if person['wikidataId'] in descriptionResultByQId:
        person['description'] = descriptionResultByQId[person['wikidataId']]['string']
    else:
        descriptionSettings = deptSettings[deptShortName]['descriptions']
        if descriptionSettings['source'] == 'column':
            # 'value' names the table column that holds the default description
            person['description'] = person[descriptionSettings['value']]
        else:
            # 'value' is itself the default description
            person['description'] = descriptionSettings['value']

# Retrieve the aliases that the employees already have at Wikidata.
# A person can have several aliases, so they are kept as a list and stored as a JSON string.
# The writing script can handle multiple languages, but only English is dealt with here.

labelType = 'alias'
language = 'en'
aliasesAtWikidata = searchLabelsDescriptionsAtWikidata(qIds, labelType, language)
for person in employees:
    if person['wikidataId'] == '':
        # the item doesn't exist yet, so there is nothing to look for at Wikidata
        personAliasList = []
    else:
        # every alias whose subject is this person, in the order returned;
        # the list is simply empty when none are found
        personAliasList = [
            wikiLabel['string']
            for wikiLabel in aliasesAtWikidata
            if person['wikidataId'] == wikiLabel['qId']
        ]
    person['alias'] = json.dumps(personAliasList)

# Tag every row with the departmental short name.
for person in employees:
    person['department'] = deptShortName

# Write the table that the upload script will read. The list fixes the column order.
filename = deptShortName + '-employees-to-write.csv'
fieldnames = [
    'department', 'wikidataId', 'name', 'labelEn', 'alias', 'description',
    'orcidStatementUuid', 'orcid', 'orcidReferenceHash', 'orcidReferenceValue',
    'employerStatementUuid', 'employer', 'employerReferenceHash',
    'employerReferenceSourceUrl', 'employerReferenceRetrieved',
    'affiliationStatementUuid', 'affiliation', 'affiliationReferenceHash',
    'affiliationReferenceSourceUrl', 'affiliationReferenceRetrieved',
    'instanceOfUuid', 'instanceOf',
    'sexOrGenderUuid', 'sexOrGenderQId', 'gender',
    'degree', 'category', 'wikidataStatus', 'role',
]
writeDictsToCsv(employees, filename, fieldnames)

print()
print('Done')

# Set file name in CSV metadata file

Prior to writing the data to Wikidata using the `process_csv_metadata_full.py` script, the input file name needs to be changed in the `csv-metadata.json` file to have the correct `deptShortName` for the department. 


In [None]:
# Point the first table in the CSV metadata file at this department's output file.
with open('csv-metadata.json', 'rt', encoding='utf-8') as metadataIn:
    schema = json.load(metadataIn)

schema['tables'][0]['url'] = deptShortName + '-employees-to-write.csv'

# Serialize before reopening the file so that it is only truncated once the new text is ready.
updatedText = json.dumps(schema, indent = 2)
with open('csv-metadata.json', 'wt', encoding='utf-8') as metadataOut:
    metadataOut.write(updatedText)
print('Department to be written:', deptShortName)