# Retrieve IDs from AK database

<b>Academic Knowledge API overview:</b>  
Microsoft's Academic Knowledge (AK) API provides a free database of academic publications. It enables searches for specific articles given a set of parameters (e.g., author's name, article title, year of publication, field of study etc.).  

For more information, see for example:  
Overview of parameters: https://westus.dev.cognitive.microsoft.com/docs/services/56332331778daf02acc0a50b/operations/565d753be597ed16ac3ffc03  
Overview of entity attributes: https://docs.microsoft.com/en-us/azure/cognitive-services/academic-knowledge/entityattributes  
Overview of query syntax: https://docs.microsoft.com/en-us/azure/cognitive-services/academic-knowledge/queryexpressionsyntax

We needed a unique identifier for each fellow to use as a means of searching for their publications records.

<b> Automated method:</b>  
We used each fellow's name and current university affiliation (taken from the turing.ac.uk website and internal sources, see turing_fellows_complete.csv) to find their ID. This only worked for a subset of researchers. 
    
<b>Manual method:</b>  
We visited each researcher's personal web page (or searched Google for it in instances where it was not provided) and manually copied 2-3 publication titles for each Turing fellow (see example_article_titles.csv). These titles were used to retrieve an ID from the database.    

<b>Notes:</b>  
To use the AK database, an access key needs to be obtained. Here we load it from api_key.txt.  

There are multiple reasons for the failure to find an ID: often there is a difference between a fellow's name on the Turing website (e.g., shortened versions of first names) and their name in the AK database, which stores names as they appear on their publications (which can include middle name initials etc.); not all articles are stored in the AK database; in some instances the AK database wrongly records recent articles as published at the fellow's previous institution, so they do not have any articles recorded as published at their current university; and the database is prone to internal errors - in some instances we had to repeat a call to the database to retrieve a match.

Some researchers are stored under multiple IDs.  

The IDs retrieved using the automated method matched those we retrieved manually. Further, the automated method retrieved a few additional IDs which were checked and found to be correct.  

# 0: Set up

### Required Packages

In [None]:
import http.client, urllib.parse, json, re, unicodedata, string
import pandas as pd
from difflib import SequenceMatcher
from bs4 import BeautifulSoup
from urllib.request import urlopen

### Load data
(i) key for AK API from text file  
(ii) information about Turing Fellows

In [None]:
#read the access key for the academic knowledge api from api_key.txt,
#discarding every newline character found in the file
with open('api_key.txt', 'r') as myfile:
    key = "".join(myfile.read().split('\n'))

In [None]:
#load the table of Turing Fellows
fellows = pd.read_csv('data_files/turing_fellows.csv')

#normalised name: drop the first space-separated token of 'Name'
#(presumably a title such as Dr/Prof - confirm against the csv),
#lower-case the rest and replace hyphens with spaces
fellows['full_name'] = [
    " ".join(raw_name.split(" ")[1:]).lower().replace("-", " ")
    for raw_name in fellows['Name']
]

#dictionary for storing IDs: normalised name -> list of IDs (empty to start with)
turing_fellows = {full_name: [] for full_name in fellows['full_name']}

### Helper functions

In [None]:
def save_to_file(my_list, file_name):
    """
    function for saving data to file
    input is list which is appended as one row to data_files/<file_name>

    my_list: iterable of values; each one is converted with str() and
             followed by a comma (including the last one)
    file_name: name of the target file inside the data_files directory

    The row is terminated with a newline. Values are written as they are,
    without any quoting or escaping.
    """
    row = ''.join(str(i) + ',' for i in my_list) + '\n'
    # 'with' closes the file even when the write fails part-way through
    with open('data_files/' + file_name, 'a') as f:
        f.write(row)
    
def clean_title(title):
    """
    return title with every "æ" and "Ê" character removed
    """
    for unwanted in ("æ", "Ê"):
        title = title.replace(unwanted, "")
    return title
    
#function for removing accents from strings
def strip_accents(s):
    """
    decompose s (unicode NFD form) and drop every character whose
    unicode category is 'Mn'; the remaining characters are returned
    in their decomposed form
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

def score_similarity(string_1, string_2):
    """
    compare two strings case-insensitively and return the
    difflib SequenceMatcher ratio between them
    """
    first = string_1.lower()
    second = string_2.lower()
    matcher = SequenceMatcher(None, first, second)
    return matcher.ratio()

### Functions for making a call to AK API to extract author information (including ID)

In [None]:
def format_query(info, method):
    """
    build the query expression sent to the AK API

    info = [researcher name] or [researcher name, researcher affiliation]
           when method is 'name'; [paper title] when method is 'title'
    method : 'name' for name (and affiliation), 'title' for title

    returns the expression as a string
    raises ValueError if method is neither 'name' nor 'title'
    """
    if method == 'name':
        #remove any accents and hyphens in name; the same cleaned name is
        #used whether or not an affiliation is supplied
        name = strip_accents(info[0]).replace("-","").lower()
        if len(info) == 2:
            return "Composite(AND(AA.AuN=='" + name + "', AA.AfN='" + info[1].lower() + "'))"
        return "Composite(AA.AuN=='" + name + "')"

    if method == 'title':
        #each [..] group, (..) group or run of non-word characters is
        #replaced by a space before the title is trimmed
        title = re.sub(r'\[.*?\]|\(.*?\)|\W+', ' ',  info[0].lower())
        return "Ti='" + title.strip() + "'"

    #previously an unknown method fell through to an unbound local variable
    raise ValueError("method must be 'name' or 'title', got %r" % (method,))

def get_parameters(expression):
    """
    url-encode the query-string parameters for a call to the AK API

    expression = query expression, sent as the 'expr' parameter
    returns the encoded parameter string
    """
    query = [
        ('expr', expression),
        ('model', 'latest'),
        ('count', "1"), #number of results to return
        ('offset', '0'), #index of the first result to return
        ('orderby', ''), #name of attribute used for sorting (left empty)
        ('attributes', 'AA.AuN,AA.AuId,AA.AfN'), #author name, author ID and affiliation fields
    ]
    return urllib.parse.urlencode(query)

def get_id(info, key, method):
    """
    info = [researcher name, researcher affiliation] OR [paper title] 
    key = access key for AK API
    method : 'name' for name (and affiliation), 'title' for title
    
    extract list of authors: the "AA" entry of the first entity returned by
    the API. An empty list is returned when the response holds no entities,
    when the first entity has no "AA" entry, or when the call or the parsing
    of the response fails (the error is printed in that case).
    """
    headers = {'Ocp-Apim-Subscription-Key': key,}
    expression = format_query(info, method)
    params = get_parameters(expression)

    conn = None
    try:
        conn = http.client.HTTPSConnection('westus.api.cognitive.microsoft.com')
        conn.request("GET", "/academic/v1.0/evaluate?%s" % params, "{body}", headers)
        response = conn.getresponse()

        #response.read() returns a bytes object, which json.loads accepts
        data_dict = json.loads(response.read())

        if 'entities' in data_dict and len(data_dict['entities']) != 0:
            first_entity = data_dict['entities'][0]
            if "AA" in first_entity:
                return first_entity["AA"]
        return []

    except Exception as e:
        #best effort: report the problem and hand back an empty list so
        #that callers can still iterate over the result
        print(e)
        return []

    finally:
        #runs on every path, so the connection is always closed
        if conn is not None:
            conn.close()

## 1: Get ID using name and affiliation

In [None]:
#query the AK API with each fellow's name and university and store the ID of
#every returned author whose name is identical to the fellow's name
for name in turing_fellows:

    uni = fellows.loc[fellows['full_name']==name]['University'].values[0]
    #get_id may return None when the API call fails - treat that as no authors
    authors = get_id([name, uni], key, 'name') or []

    #loop through retrieved author list
    for i in authors:
        author = i['AuN']

        #if have match - save ID
        if author == name:
            turing_fellows[author].append(i['AuId'])

## 2: Use manually retrieved publication titles to get ID (might require multiple runs)

In [None]:
all_titles = pd.read_csv('data_files/example_article_titles.csv', encoding = "ISO-8859-1")
#drop rows in which every column is empty; axis is given by keyword because
#pandas 2.x no longer accepts it as a positional argument
all_titles = all_titles.dropna(axis='index', how='all')
#note: final_IDs refers to the same dictionary object as turing_fellows (not a copy)
final_IDs = turing_fellows

In [None]:
for index, row in all_titles.iterrows():

    researcher = row['Name'].lower()
    titles = [row['title_1'], row['title_2'], row['title_3']]

    #only search for researchers that have no ID yet
    if final_IDs[researcher] != []:
        continue

    for title in titles:

        #a missing title is read in as NaN (a float) - skip it
        if isinstance(title, float):
            continue

        title = clean_title(title)
        #get_id may return None when the API call fails - treat that as no authors
        authors = get_id([title], key, 'title') or []

        for author in authors:

            if author['AuN'] == researcher: #if have exact name match - add
                final_IDs[researcher].append(author['AuId'])

            #if have a surname match - check degree of match
            #print results to monitor
            elif author['AuN'].split(" ")[-1].lower() == researcher.split(" ")[-1].lower() and score_similarity(author['AuN'], researcher) >= .5:
                final_IDs[researcher].append(author['AuId'])
                print('accepted:', researcher, " = ", author['AuN'], " : ", author['AuId'])

## 3: Remove repeats and save to csv

In [None]:
#replace each list of IDs by a set to drop repeated IDs; the dictionary is
#updated in place, so every other reference to it sees the sets as well
final_IDs.update({fellow_name: set(id_list) for fellow_name, id_list in final_IDs.items()})

In [None]:
#header row
save_to_file(['name', 'uni', 'id_1', 'id_2', 'id_3'], 'turing_AK_IDs.csv')

#NOTE(review): fellows_all is not defined in any of the cells shown in this
#notebook - confirm where it is loaded and that its 'Name' values match the
#keys of final_IDs
for index, row in fellows_all.iterrows():
    name = row['Name'].lower()
    uni = row['Affiliation']
    
    #the IDs are held in a set after the repeats were removed; a set cannot be
    #concatenated to a list, so turn it into a sorted list first
    ids = sorted(final_IDs[name])
    
    save_to_file([name, uni] + ids, 'turing_AK_IDs.csv')