# Extract publication information from the AK API

Microsoft's Academic Knowledge (AK) API provides a free database of academic publications. In a previous step we collected a list of IDs under which the Turing fellows are stored in the AK database (see turing_AK_IDs.csv). Here we use the collected IDs to extract up to 50 articles published by each fellow in 2012 or later.  

To use the AK database, an access key needs to be obtained. Here we load it from api_key.txt.  

Results are stored in data_files/publications_eng.csv.

## 0: Set up

### Required packages

In [None]:
import http.client, urllib.parse, json, re, unicodedata, string
import pandas as pd
from urllib.request import urlopen
import numpy as np

### Load data
(i) read key for AK API from text file  
(ii) load information about AK API ID for each fellow

In [None]:
#read the access key for the academic knowledge api, dropping any newline characters
with open('api_key.txt', 'r') as key_file:
    key = ''.join(key_file.read().split('\n'))

#table with the AK author IDs collected for each fellow
ids_list = pd.read_csv('data_files/turing_AK_IDs.csv')

### Helper functions

In [None]:
#function for saving generated data to csv file
directory = 'data_files/' 

def save_to_file(my_list, file_name):
    """
    Append one comma-separated row to the file directory + file_name.

    my_list   = iterable of values; each value is converted with str() and
                followed by a comma, so every non-empty row ends with a
                trailing comma and an empty list produces an empty line
    file_name = name of the file inside `directory` (created if missing)

    Values are written as they are: commas or line breaks inside a value are
    NOT escaped, so callers have to remove them beforehand.
    """
    #build the whole row first so that a failing str() cannot leave a half-written line
    row = ''.join(str(item) + ',' for item in my_list) + '\n'

    #'with' closes the file even if the write fails
    #explicit utf-8 so non-ASCII text does not depend on the platform default encoding
    with open(directory + file_name, 'a', encoding='utf-8') as f:
        f.write(row)
      
def save_info(data, file_name):
    """
    save data (information) returned by the id_to_paper function

    data      = list of dictionaries, one per article; None or an empty list
                means "nothing to save" and is silently accepted
    file_name = where want to save extracted info

    Each dictionary becomes one row, written with save_to_file. Values are
    emitted in the dictionary's own key order:
    - 'urls' and 'keywords' hold lists and contribute one cell per element
    - every other value is converted to a string, commas are removed and
      accents are stripped
    """
    #id_to_paper can hand back None after a failed request - treat it like an empty result
    for record in data or []:
        article = []

        for field, value in record.items():

            if field == 'urls':
                #the row is comma separated, so a literal comma would split the url
                #over two cells and shift all following columns; percent-encode it instead
                article.extend(str(url).replace(",", "%2C") for url in value)

            elif field == 'keywords':
                #same reason as above; keywords are plain text so the comma is simply dropped
                article.extend(str(keyword).replace(",", "") for keyword in value)

            else:
                #remove commas, then remove accents
                article.append(strip_accents(str(value).replace(",", "")))

        #save all article related information
        save_to_file(article, file_name)
            
#function for removing accents from strings
def strip_accents(s):
    """
    Return s without combining marks.

    The string is decomposed (NFD) so that accented letters become a base
    letter followed by combining marks; characters whose Unicode category is
    'Mn' are then dropped. Letters that do not decompose are left unchanged.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [char for char in decomposed if unicodedata.category(char) != 'Mn']
    return ''.join(kept)

def count_found(turing_fellows):
    """
    Count the keys of turing_fellows whose value is not equal to an empty list.
    """
    return sum(1 for fellow in turing_fellows if turing_fellows[fellow] != [])

### Functions for extracting publications data from AK API

In [None]:
def generate_abstract(IA):
    """
    Rebuild an abstract from the inverted index provided by the academic knowledge API

    IA is a dictionary with the keys IndexLength and InvertedIndex:
    - IA['IndexLength'] = number of word positions in the abstract
    - IA['InvertedIndex'] = dictionary mapping each unique word to the list of
      positions at which it appears in the abstract

    Positions that no word claims keep the placeholder "word".

    returns the abstract as a single space-separated string
    """
    n_positions = IA['IndexLength']
    inverted_index = IA['InvertedIndex']

    #one slot per position, pre-filled with the placeholder
    slots = ["word" for _ in range(n_positions)]

    #drop every word into each of the positions listed for it
    for token, positions in inverted_index.items():
        for position in positions:
            slots[position] = token

    return ' '.join(slots)

def get_parameters(author_id, year, count):
    """
    Build the url-encoded query string for a request to the AK API

    author_id = author ID the publications must belong to
    year      = only publications with a year greater than this are matched
    count     = number of results to return

    returns the encoded parameters as a string
    """
    #attributes that want to extract
    wanted = ('Id', 'L', 'AA.AuN', 'AA.AuId', 'AA.AfN', 'Ti', 'F.FN', 'J.JN', 'E')

    #query expression: this author AND published after the given year
    author_clause = "Composite(AA.AuId=" + str(author_id) + ")"
    year_clause = "Y>" + str(year)

    query = [
        ('expr', "AND(" + author_clause + "," + year_clause + ")"),
        ('model', 'latest'),              #unless want to use a different version of the AK API
        ('count', str(count)),            #number of results to return
        ('offset', '0'),                  #index of the first result to return
        ('orderby', 'Y:asc'),             #name of attribute used for sorting
        ('attributes', ",".join(wanted)),
    ]

    return urllib.parse.urlencode(query)

def get_data(author_id, data_dict, num_urls=20):
    """
    Turn the parsed response of the AK API into a list of article dictionaries

    author_id = ID of the author the query was made for; used to pick the
                matching entry out of each article's author list
    data_dict = parsed JSON response; data_dict['entities'] is a list with one
                dictionary per article
    num_urls  = fixed length of the 'urls' list of every article (shorter lists
                are padded with "", longer ones are cut) so that the url and
                keyword columns line up when rows are written to file

    returns a list with one dictionary per English ('en') article, with the keys
    author_id, name, affiliation, paper_id, title, abstract, journal_name, doi,
    urls, keywords (in this order). Missing information is returned as "".
    """
    final_articles_data = []

    for entity in data_dict["entities"]:

        #only articles in English are kept, so skip the others before parsing anything
        if entity.get('L', "") != 'en':
            continue

        #author information: find right author in list of paper authors
        name, affiliation = "", ""
        for author in entity.get("AA", []):
            if author.get('AuId') == author_id:
                name = author.get('AuN', "").replace("ł", "l")
                #sometimes affiliation info is missing - keep what we already have
                affiliation = author.get('AfN', affiliation)

        #journal name: requested as 'J.JN' but, like AA and F, expected to come back
        #as a nested object {"JN": ...}; the flat key is still accepted as a fallback
        journal = entity.get('J')
        if isinstance(journal, dict):
            journal_name = journal.get('JN', "")
        else:
            journal_name = entity.get('J.JN', "")

        #fields of study (F) is a list of dictionaries of the form {"FN": name of field of study}
        #each article has different number of field keywords associated with it
        field_of_study = [field["FN"] for field in entity.get("F", [])]

        #entity["E"] is a string holding a JSON object with extended information
        abstract, doi, urls = "", "", []
        if "E" in entity:
            extended = json.loads(entity["E"])

            #IA is the abstract in "inverted form" so need to reconstruct it,
            #then reduce it to letters and digits separated by single spaces
            if 'IA' in extended:
                abstract = generate_abstract(extended["IA"]).rstrip()
                abstract = re.sub('[^A-Za-z0-9]+', ' ', abstract)

            #urls to source sites - can use to try to get full pdf
            urls = [source['U'] for source in extended.get('S', [])]

            doi = extended.get('DOI', "")

        #make sure the url list is always num_urls long, also for articles without sources
        urls = urls[:num_urls]
        urls.extend([""] * (num_urls - len(urls)))

        final_articles_data.append({'author_id':author_id, 'name':name, 'affiliation':affiliation,
                'paper_id': entity.get('Id', ""), 'title': entity.get('Ti', ""), 'abstract': abstract,
                'journal_name':journal_name, 'doi':doi, 'urls':urls,
                'keywords':field_of_study})

    return final_articles_data

In [None]:
def id_to_paper(author_id, key, year = 2011, count = 20):
    """
    Function for extracting publication/article information
    
    Need to provide:
    - author ID
    - key = subscription key for the AK API (sent in the Ocp-Apim-Subscription-Key header)
    - cut-off year for publications that interested in (this year is NOT included) i.e. for 2017 put 2016
    - count = how many articles want to return
    
    Accepted attributes to return:
    - Id = unique article ID (can use to catch possible repeats)
    - Ti = article title
    - F.FN = Field of study keywords, from Microsoft:
        Our fields of study are formal research topics that should be recognized by the broader research community. 
        We currently have around 50k of them, and are looking to expand them.
        We start with author and publisher supplied keywords and attempt to conflate them with our fields of study. 
        If the conflation is of sufficient quality we expose the labels via our API.
        We also leverage machine learning techniques to do additional labeling.    
    - J.JN = journal name
    - L = paper language (english = en)
    - E = returns a JSON object with further information such as the inverted abstract.
        Attributes of interest in the E object: IA = inverted abstract, IA.IndexLength - abstract word count, 
        IA.InvertedIndex = Dictionary of unique abstract words and their corresponding positions in the original abstract (saved as list)
        S = Sources - list of web sources of the paper; S.Ty = Source Type (1:HTML, 2:Text, 3:PDF, 4:DOC, 5:PPT, 6:XLS, 7:PS); S.U = source url
    - AA = further author information, to confirm name and affiliation
    
    returns list of dictionaries, one dictionary per article with all info specified in attributes list + supplied info.
    An empty list is returned when nothing was found, when the response could not be parsed,
    or when the request failed (the error is printed) - the function never returns None.
    """
    headers = {'Ocp-Apim-Subscription-Key': key}

    #set parameters
    params = get_parameters(author_id, year, count)

    conn = None
    try:
        conn = http.client.HTTPSConnection('westus.api.cognitive.microsoft.com')
        conn.request("GET", "/academic/v1.0/evaluate?%s" % params, "{body}", headers)

        #save response from query, this returns bytes object (json.loads accepts bytes)
        data = conn.getresponse().read()

        try:
            data_dict = json.loads(data)
        except ValueError:
            #response was not valid JSON - handle like an empty answer
            data_dict = {}

        #check that something has been returned, if not - ignore
        if 'entities' in data_dict:
            return get_data(author_id, data_dict)
        return []

    except Exception as e:
        #best effort: report the problem and let the caller retry,
        #returning [] so that callers comparing against [] see "nothing found"
        print(e)
        return []

    finally:
        #close connection whether or not the request succeeded
        if conn is not None:
            conn.close()

## 1: Extract publications information

In [None]:
#file where extract publications per author
file_name = 'publications_eng.csv'
#number of publications to retrieve
num_publications = 50

#create headings in file
#anticipate up to 20 URLs and 50 keywords (this is excessive but avoids possible errors - will delete empty columns later)
num_urls = 20
num_keywords = 50
attributes = ['author_id', 'name','affiliation','paper_id','title','abstract', 'journal_name', 'doi']
attributes.extend('url_' + str(i) for i in range(num_urls))
attributes.extend('keyword_' + str(i) for i in range(num_keywords))

#rows are appended to the file, so the heading row is left disabled here;
#uncomment when starting a fresh output file
# save_to_file(attributes, file_name)    

failed, tries = [], []
count = ids_list.shape[0]

#report progress roughly every 10% of the rows
#step is at least 1 so that lists with fewer than 10 rows do not divide by zero
progress_step = max(1, count // 10)

#maximum number of requests per author ID before giving up
max_tries = 10

#enumerate gives the row position even if the table index is not 0..n-1
for position, (_, row) in enumerate(ids_list.iterrows()):

    if position % progress_step == 0:
        print(str(round(100 * position / count)) + "%")

    #some fellows have multiple IDs
    for author_id in row['id_1':'id_3']:
        if pd.isnull(author_id):
            continue

        #repeat the request until something is returned or max_tries is reached
        data, num_tries = [], 0
        while num_tries < max_tries and not data:
            num_tries += 1
            #a failed request may come back as None - treat it as "nothing found"
            data = id_to_paper(int(author_id), key, count = num_publications) or []

        tries.append(num_tries)

        if data:
            save_info(data, file_name)
        else:
            failed.append(author_id)

print('end')
print(" ")
print('Failed to find authors: ' + str(failed))