# Extract publication information from the AK API

Microsoft's Academic Knowledge (AK) API provides a free database of academic publications. In a previous step we collected a list of the IDs under which the Turing fellows are stored in the AK database (turing_AK_IDs.csv). Here we use the collected IDs to extract up to 50 articles published by each fellow in 2012 or later.  

To use the AK database, an access key needs to be obtained. Here we load it from api_key.txt.  

Results are stored in data_files/publications_eng.csv.

## 0: Functions and packages

### Required packages

In [None]:
import http.client, urllib.parse, json, re, unicodedata, string
import pandas as pd
from urllib.request import urlopen
import numpy as np

### Load data
(i) read key for AK API from text file  
(ii) load information about AK API ID for each fellow

In [None]:
#read the access key for the academic knowledge api; newline characters in the file are dropped
with open('api_key.txt', 'r') as myfile:
    key = ''.join(myfile.read().split('\n'))

#table with the AK API IDs stored for each fellow
ids_list = pd.read_csv('data_files/turing_AK_IDs.csv')

### General functions

In [None]:
def strip_accents(s):
    """
    function for removing accents from strings
    the string is decomposed (unicode form NFD) and every character whose
    unicode category is 'Mn' is dropped; all other characters are kept as they are
    returns the resulting string
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [char for char in decomposed if unicodedata.category(char) != 'Mn']
    return ''.join(kept)

def clean_text(text):
    """
    function for cleaning a string before it is written to the comma separated file
    removes every comma and strips accents (see strip_accents)
    returns the cleaned string
    """
    without_commas = "".join(text.split(","))
    return strip_accents(without_commas)

def save_to_file(my_list, file_name):
    """
    function for saving data to file
    input is list which is saved as one row in file called file_name

    my_list = values of one row; each value is converted with str() and followed by a comma,
              so every row ends with a trailing comma
    file_name = name of the file inside the data_files/ folder; the row is appended to it
    """
    #build the whole row first so that a failing str() conversion cannot leave half a row in the file
    row = ''.join(str(value) + ',' for value in my_list) + '\n'

    #the with block closes the file even when writing raises an error
    with open('data_files/' + file_name, 'a') as f:
        f.write(row)

def save_info(data, file_name):
    """
    write the articles returned by the id_to_paper function to file, one row per article
    data = list of dictionaries (one dictionary per article)
    file_name = where want to save extracted info

    the values under 'keywords' and 'urls' are lists and are added to the row element by element,
    every other value is converted to a string and cleaned with clean_text
    """
    list_fields = ('keywords', 'urls')

    for record in data:
        row = []
        for field, value in record.items():
            if field in list_fields:
                row.extend(value)
            else:
                row.append(clean_text(str(value)))

        #one row with all article related information
        save_to_file(row, file_name)

### Helper functions for extracting data from AK API

In [None]:
def generate_abstract(IA):
    """
    Function for creating abstract out of abstract indexes provided by academic knowledge API
    IA is a dictionary with keywords IndexLength and InvertedIndex, IA.IndexLength = length of abstract
    IA.InvertedIndex is a dictionary where each word is a key and it provides an associated list of indexes
    of where in the abstract it appears
    returns abstract as string: words in their original order, every run of characters other than
    letters and digits replaced by a single space, no leading or trailing whitespace
    """
    #one slot per position in the abstract
    #slots that no word of the index points to stay empty instead of adding a made-up word to the text
    words = [""] * IA['IndexLength']

    #place each unique word at every position/index it appears at (can be 1 position or multiple)
    for word, positions in IA['InvertedIndex'].items():
        for position in positions:
            words[position] = word

    #join all abstract words, then reduce anything that is not a letter or digit to a single space
    abstract = re.sub('[^A-Za-z0-9]+', ' ', ' '.join(words))

    #strip last: the substitution itself can produce a space at either end (e.g. from a final full stop)
    return abstract.strip()

def get_parameters(author_id, year, count):
    """
    build the url-encoded query string for a request to the AK API
    author_id = AK ID of the author whose papers are requested
    year = cut-off year, only papers with a year greater than this are requested
    count = number of results to return
    returns the query string
    """
    #attributes requested for each paper
    attributes = ",".join(('Id', 'L', 'AA.AuN', 'AA.AuId', 'AA.AfN', 'Ti', 'F.FN', 'J.JN', 'E'))

    #papers by this author published after the cut-off year
    expression = "".join(["AND(Composite(AA.AuId=", str(author_id), "),Y>", str(year), ")"])

    query = [
        ('expr', expression),
        ('model', 'latest'),      #unless want to use a different version of the AK API
        ('count', str(count)),    #number of results to return
        ('offset', '0'),          #index of the first result to return
        ('orderby', 'Y:asc'),     #name of attribute used for sorting
        ('attributes', attributes),
    ]

    return urllib.parse.urlencode(query)

def get_author_info(authors, author_id):
    """
    find the name and affiliation stored for one author in the author list of a paper
    authors = list of dictionaries, one per author, with keys 'AuId', 'AuN' and (not always) 'AfN'
    author_id = ID of the author of interest
    returns (name, affiliation); both are empty strings if the author is not in the list,
    affiliation stays empty if no matching entry has one
    """
    found_name = ""
    found_affiliation = ""

    #all entries are checked, so if the ID appears more than once the later entry wins
    matching = (entry for entry in authors if entry['AuId'] == author_id)
    for entry in matching:
        found_name = entry['AuN'].replace("ł", "l")
        found_affiliation = entry.get('AfN', found_affiliation)

    return found_name, found_affiliation

def get_urls(source, max_urls=20):
    """
    collect the URLs of the sources of a paper
    source = list of dictionaries, the URL of each source is stored under the key 'U'
    max_urls = number of URLs to return (default 20)
    returns a list of exactly max_urls strings: URLs beyond max_urls are dropped and
    missing ones are filled with empty strings, so the URLs always take up the same
    number of columns in the output file
    """
    urls = [entry['U'] for entry in source][:max_urls]

    #make sure the array is always max_urls in length
    urls.extend([""] * (max_urls - len(urls)))

    return urls

def get_data(author_id, author_name, data_dict):
    """
    extract the information of interest from the response of the AK API
    author_id = AK ID of the author the request was made for
    author_name = name of the author as used in our own data (saved as 'standard_name')
    data_dict = response of the API as a dictionary; data_dict['entities'] is a list with
                one dictionary per article

    only articles whose language ('L') is 'en' are kept
    returns list of dictionaries, one per kept article, with the keys author_id, standard_name,
    ak_name, affiliation, paper_id, title, abstract, journal_name, doi, urls, keywords
    (information that is missing is returned as an empty string / list of empty strings)
    """
    #number of url columns in the output file, same length that get_urls pads its result to
    num_url_columns = 20

    final_articles_data = []

    for entities in data_dict["entities"]:

        #make sure paper language is english (articles without language information are skipped too)
        if entities.get('L') != 'en':
            continue

        #author information
        #reset for every article so that an article without author list neither fails
        #nor reuses the values of the previous article
        name, affiliation = "", ""
        if 'AA' in entities:
            name, affiliation = get_author_info(entities['AA'], author_id)

        #paper id and title
        paper_id = entities.get('Id', "")
        title = entities.get('Ti', "")

        #journal name: accept the flat key 'J.JN' as well as the nested form {'J': {'JN': ...}},
        #which is how the other composite attributes (AA, F) are read in this function
        journal_name = entities.get('J.JN', "")
        journal = entities.get('J')
        if not journal_name and isinstance(journal, dict):
            journal_name = journal.get('JN', "")

        #fields of study (F) i.e. AK assigned keywords
        field_of_study = [field["FN"] for field in entities.get('F', [])]

        #entities["E"] returns a string of a dictionary with "extended attributes"
        #abstract, doi, source URLs
        abstract, doi = "", ""
        #padded even when there are no sources, otherwise the keywords end up in the url columns
        urls = [""] * num_url_columns
        if "E" in entities:
            extended = json.loads(entities["E"])

            if 'IA' in extended:
                abstract = generate_abstract(extended['IA'])

            if 'DOI' in extended:
                doi = extended['DOI']

            if 'S' in extended:
                urls = get_urls(extended['S'])

        final_articles_data.append({
            'author_id': author_id,
            'standard_name': author_name,
            'ak_name': name,
            'affiliation': affiliation,
            'paper_id': paper_id,
            'title': title,
            'abstract': abstract,
            'journal_name': journal_name,
            'doi': doi,
            'urls': urls,
            'keywords': field_of_study,
        })

    return final_articles_data

### Function for making call to AK API

In [None]:
def id_to_paper(author_id, name, key, year = 2011, count = 20):
    """
    Function for extracting publication/article information
    
    Need to provide:
    - author ID
    - name = name of the author, passed on to get_data and saved with every article
    - key = access key for the AK API (sent as the 'Ocp-Apim-Subscription-Key' header)
    - cut-off year for publications that interested in (this year is NOT included) i.e. for 2017 put 2016
    - count = how many articles want to return
    
    returns list of dictionaries, one dictionary per article with all info specified in attributes list + supplied info
    the list is empty if nothing was returned, the response could not be read as json
    or the request failed (the error is printed in that case)
    """
    
    headers = {'Ocp-Apim-Subscription-Key': key}
    params = get_parameters(author_id, year, count)

    #always return a list, also after an error, so callers can loop over the result or retry
    articles_data = []
    conn = None

    try:
        conn = http.client.HTTPSConnection('westus.api.cognitive.microsoft.com')
        conn.request("GET", "/academic/v1.0/evaluate?%s" % params, "{body}", headers)
        response = conn.getresponse()
        data = response.read()

        try:
            data_dict = json.loads(data)
        except ValueError:
            data_dict = {}

        #check that something has been returned, if not - ignore
        if isinstance(data_dict, dict) and 'entities' in data_dict:
            articles_data = get_data(author_id, name, data_dict)

    except Exception as e:
        #best effort: report the problem and carry on with an empty result
        print(e)

    finally:
        #close the connection also when the request or the processing failed
        if conn is not None:
            conn.close()

    return articles_data

## 1: Extract publications information

### Set up - file headings

In [None]:
#name of the file (in data_files/) that the publications of each author are saved to
file_name = 'publications_eng.csv'

#anticipate up to 20 URLs and 50 keywords (this is excessive but avoids possible errors - will delete empty columns later)
num_urls = 20
num_keywords = 50

#column headings: article information first, then the url columns, then the keyword columns
attributes = ['author_id', 'standard_name', 'ak_name','affiliation','paper_id','title','abstract', 'journal_name', 'doi']
attributes.extend('url_' + str(n) for n in range(num_urls))
attributes.extend('keyword_' + str(n) for n in range(num_keywords))

#append the row of column headings to the output file
save_to_file(attributes, file_name)  

### Retrieve and save AK data

In [None]:
#number of publications to retrieve
num_publications = 50

#how many times a request is repeated when nothing was returned
max_tries = 10

num_fellows = ids_list.shape[0]
#print progress about every 10% of the fellows
#at least 1, otherwise a list with fewer than 10 fellows causes a division by zero
progress_step = max(1, num_fellows // 10)

for index, row in ids_list.iterrows():
    
    #track progress (formatted as whole percent, avoids output like 28.999999999999996%)
    if index % progress_step == 0:
        print("{:.0%}".format(index / num_fellows))
        
    #some fellows have multiple IDs
    for author_id in row['id_1':'id_3']:
        if pd.isnull(author_id):
            continue

        #repeat the request until articles are returned or the maximum number of tries is reached
        #an empty list and a missing result (None) both count as nothing found
        data = []
        for attempt in range(max_tries):
            data = id_to_paper(int(author_id), row['name'], key,  count = num_publications)
            if data:
                break

        save_info(data or [], file_name)
print('end')