In [None]:
# @title ##### Get Conference Data for a specific author

"""
MAG - Microsoft Academic Graph
==============================
MAG provides metadata on research papers (titles, authors, affiliations, years).
""" 
import json
import requests
from pprint import pprint
import csv
import pandas as pd
from datetime import date
import ast

class MAG:
  """
  Harvest one author's papers from the Microsoft Academic Graph (MAG)
  evaluate API and attach a country to every affiliation.

  Arguments:
  ---------
    - author : Author name, matched against the 'Author' column of the author workbook
    - start_date : 'YYYY-MM-DD' string; only the year part is sent to MAG
    - end_date : 'YYYY-MM-DD' string; only the year part is sent to MAG (defaults to today)
    - retry : Number of attempts to connect to MAG (defaults to 5)
    - author_file : Optional path of the workbook holding 'Author' / 'Author ID' columns
    - grid_file : Optional path of the CSV holding 'Name' / 'Country' columns
    - api_key : Optional MAG subscription key (else the MAG_API_KEY environment variable)

  The pipeline uses two scratch files in the current working directory:
  'mag.json' (raw response) and 'data_file.csv' (rewritten by each stage).
  """
  # Defaults live on the class so they can be overridden per instance.
  author_file = "/content/drive/My Drive/AI Index Visualization Project/DATA HARVESTORS/MAG/Lifecycle/Author.xlsx"
  grid_file = '/content/drive/My Drive/AI Index Visualization Project/DATASETS/Conference data/Country data/Grid/grid.csv'
  api_key = None
  retry = 5
  request_timeout = 300  # seconds; generous because up to 100000 entities are requested
  retry_wait = 60        # seconds to wait between two attempts
  # NOTE(review): this key is committed with the notebook. Rotate it and pass the
  # new one through `api_key=` or the MAG_API_KEY environment variable, then drop
  # this fallback.
  _LEGACY_API_KEY = 'd9b12d4dac1f4ccea5e60d6f562b5cc1'

  def __init__(self, **kwargs):
    self.start_date = kwargs.get('start_date')
    if not self.start_date:
      raise ValueError("start_date is required, formatted as 'YYYY-MM-DD'")
    # A missing end date means "up to today".
    self.end_date = kwargs.get('end_date') or date.today().strftime("%Y-%m-%d")
    # MAG is queried by publication year only.
    self.s_d = int(self.start_date.split('-')[0])
    self.e_d = int(self.end_date.split('-')[0])
    self.author = kwargs.get('author')
    if kwargs.get('retry') is not None:
      self.retry = int(kwargs.get('retry'))
    if kwargs.get('author_file'):
      self.author_file = kwargs.get('author_file')
    if kwargs.get('grid_file'):
      self.grid_file = kwargs.get('grid_file')
    if kwargs.get('api_key'):
      self.api_key = kwargs.get('api_key')

    authors = pd.read_excel(self.author_file)
    author_ids = authors.loc[authors['Author'] == self.author, 'Author ID'].dropna().unique()
    # int() on a whole Series is deprecated and fails obscurely on 0 or >1 rows,
    # so resolve the id explicitly and fail with a readable message.
    if len(author_ids) != 1:
      raise ValueError(
          "Expected exactly one 'Author ID' for author {!r} in {}, found {}".format(
              self.author, self.author_file, len(author_ids)))
    self.author_id = int(author_ids[0])

  def basicPreprocess(self):
    """
    Performs basic preprocessing steps.

    Reads 'data_file.csv', expands the 'AA' author list into one row per
    (paper, author), renames the MAG attribute codes to readable column names,
    keeps only the rows of `self.author_id` and writes the result back to
    'data_file.csv'.

    RETURNS
    -------
    - Filtered dataframe (Title, Date, Author, Author_id, Affiliation)
    """
    papers = pd.read_csv('data_file.csv')
    # The CSV stores each author list as its Python repr; parse it back safely.
    papers['AA'] = papers['AA'].apply(ast.literal_eval)
    authors = pd.concat({row: pd.DataFrame(entries) for row, entries in papers['AA'].items()})
    # Dropping the inner level leaves the paper's row label, so the join
    # repeats each paper once per author.
    papers = papers.join(authors.reset_index(level=1, drop=True)).reset_index(drop=True)
    papers = papers.drop(['logprob','prob','AA'],axis = 1)
    papers = papers.rename(columns={'AuId':'Author_id','Ti':'Title','AuN':'Author','DAfN':'Affiliation','Y':'Date'})
    papers = papers[papers['Author_id']==self.author_id]
    # index=False: otherwise every write/read cycle adds an 'Unnamed: 0' column.
    papers.to_csv("data_file.csv", index=False)
    return papers

  def json_to_csv(self):
    """
    Dumps the raw response (`self.query`) to 'mag.json' and writes its
    'entities' list to 'data_file.csv', one row per entity.

    RAISES
    ------
    - KeyError when the response carries no 'entities' field
    """
    with open("mag.json", "w") as outfile:
      json.dump(self.query, outfile, indent = 6)

    if 'entities' not in self.query:
      raise KeyError("MAG response has no 'entities' field; got keys: {}".format(sorted(self.query)))
    data_entities = self.query['entities']

    # Collect the columns over all entities (first-seen order) so an entity
    # with a missing or extra attribute cannot shift values under the wrong header.
    header = []
    for ent in data_entities:
      for key in ent:
        if key not in header:
          header.append(key)

    # newline='' is required by the csv module to avoid blank lines on Windows.
    with open('data_file.csv', 'w', newline='') as data_file:
      csv_writer = csv.DictWriter(data_file, fieldnames=header)
      if header:
        csv_writer.writeheader()
      csv_writer.writerows(data_entities)

  def grid(self):
    """
    Adds a 'Country' column to 'data_file.csv' by left-joining the grid
    institution table on the affiliation name, blanks missing values and
    writes the result back to 'data_file.csv'.

    RETURNS
    -------
    - Dataframe with the added 'Country' column
    """
    papers = pd.read_csv("data_file.csv")
    countries = pd.read_csv(self.grid_file)
    countries = countries[['Name','Country']].rename(columns={'Name':'Affiliation'})
    # NOTE(review): a name listed under several countries in the grid file
    # yields one output row per country, i.e. the paper is duplicated.
    da_country = pd.merge(papers,countries,how = 'left', on = ['Affiliation'])
    da_country = da_country.fillna('')
    # index=False: keep the scratch file free of 'Unnamed: 0' index columns.
    da_country.to_csv('data_file.csv', index=False)
    return da_country

  def getData(self):
    """
    Data from MAG.

    Queries the evaluate endpoint for the author's papers between the start
    and end year, then runs json_to_csv, basicPreprocess and grid. Any
    exception in that pipeline is printed and the whole attempt is repeated,
    up to `self.retry` times, waiting `self.retry_wait` seconds in between.

    RETURNS
    -------
    - Return dataframe containing MAG data, or None when every attempt failed
    """
    import os
    from time import sleep

    api_key = self.api_key or os.environ.get('MAG_API_KEY') or self._LEGACY_API_KEY
    url = 'https://api.labs.cognitive.microsoft.com/academic/v1.0/evaluate'
    headers = {'Ocp-Apim-Subscription-Key': api_key}
    # Passed as `params` so requests URL-encodes the expression.
    params = {
        'expr': "And(Composite(AA.AuId={}),Y=[{},{}])".format(self.author_id,self.s_d,self.e_d),
        'attributes': 'Ti,AA.AuN,AA.DAfN,AA.AuId,Y',
        'count': 100000,
        'orderby': 'Y:asc',
    }

    attempts = int(self.retry)
    for attempt in range(1, attempts + 1):
      try:
        response = requests.get(url, headers=headers, params=params, timeout=self.request_timeout)
        # Surface HTTP errors (bad key, throttling) instead of a later KeyError.
        response.raise_for_status()
        self.query = response.json()
        self.json_to_csv()
        self.data = self.basicPreprocess()
        self.data = self.grid()
        pprint(self.query)
        return self.data
      except Exception as e:
        print("MAG Exception: ", e)
        # No point in waiting once the last attempt has failed.
        if attempt < attempts:
          print("Reattempting the server: ", attempt)
          sleep(self.retry_wait)

    print("Failed to connect to MAG, please recheck the connection")
    return None

def main():
  """
  Harvest the papers of the author selected in the Colab form and save them
  as '<Author>.csv' in the project's Lifecycle folder on Drive.

  Nothing is written when MAG could not be reached (getData returned None).
  """
  Author = "andrew y ng" #@param {type : "string"}
  Start_date = "1980-01-01" #@param {type : "string"}
  End_date = "2021-01-01" #@param {type : "string"}  
  a = MAG(author = Author,start_date = Start_date,end_date = End_date,retry=5)
  q = a.getData()
  # getData returns None once every attempt has failed; calling .to_csv on it
  # would only raise an opaque AttributeError.
  if q is None:
    print(f"No data retrieved for {Author}; nothing written.")
    return
  q.to_csv(f"/content/drive/My Drive/AI Index Visualization Project/DATASETS/Conference data/Author/Lifecycle/{Author}.csv")

# Run the harvest when this cell/script is executed directly (not on import).
if __name__ == "__main__":
    main()
    

{'entities': [{'AA': [{'AuId': 2118586410,
                       'AuN': 'michael kearns',
                       'DAfN': 'Bell Labs'},
                      {'AuId': 2078633956,
                       'AuN': 'yishay mansour',
                       'DAfN': 'Tel Aviv University'},
                      {'AuId': 2104401652,
                       'AuN': 'andrew y ng',
                       'DAfN': 'Carnegie Mellon University'},
                      {'AuId': 2146939727,
                       'AuN': 'dana ron',
                       'DAfN': 'Hebrew University of Jerusalem'}],
               'Ti': 'an experimental and theoretical comparison of model '
                     'selection methods',
               'Y': 1995,
               'logprob': -18.878,
               'prob': 6.3297823e-09},
              {'AA': [{'AuId': 2104401652, 'AuN': 'andrew y ng', 'DAfN': ''}],
               'Ti': 'preventing overfitting of cross validation data',
               'Y': 1997,
               'logpr

In [None]:
# Inspect the scratch CSV left behind by the MAG pipeline (last written by MAG.grid).
df = pd.read_csv("data_file.csv")
df

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Title,Date,Author,Author_id,Affiliation,Country
0,0,2,an experimental and theoretical comparison of ...,1995,andrew y ng,2104401652,Carnegie Mellon University,United States
1,1,2,an experimental and theoretical comparison of ...,1995,andrew y ng,2104401652,Carnegie Mellon University,Australia
2,2,4,preventing overfitting of cross validation data,1997,andrew y ng,2104401652,,
3,3,7,an information theoretic analysis of hard and ...,1997,andrew y ng,2104401652,Carnegie Mellon University,United States
4,4,7,an information theoretic analysis of hard and ...,1997,andrew y ng,2104401652,Carnegie Mellon University,Australia
...,...,...,...,...,...,...,...,...
271,271,1414,appendixnet deep learning for diagnosis of app...,2020,andrew y ng,2104401652,Stanford University,United States
272,272,1431,penet a scalable deep learning model for autom...,2020,andrew y ng,2104401652,Stanford University,United States
273,273,1437,effective data fusion with generalized vegetat...,2020,andrew y ng,2104401652,,
274,274,1446,incorporating machine learning and social dete...,2020,andrew y ng,2104401652,Stanford University,United States


In [None]:
# Load the grid institution table and keep only name -> country; 'Name' is
# renamed to 'Affiliation', the column MAG.grid merges on.
grid = pd.read_csv('/content/drive/My Drive/AI Index Visualization Project/DATASETS/Conference data/Country data/Grid/grid.csv')
grid = grid[['Name','Country']].rename(columns={'Name':'Affiliation'})
grid

Unnamed: 0,Affiliation,Country
0,Australian National University,Australia
1,Monash University,Australia
2,University of Queensland,Australia
3,Macquarie University,Australia
4,UNSW Sydney,Australia
...,...,...
97790,Advanced Analysis Center,Japan
97791,Genetic Resources Center,Japan
97792,Research Center for Agricultural Information T...,Japan
97793,Center for Seeds and Seedlings,Japan


In [None]:
# Show the first 40 institutions the grid table lists under 'United States'.
grid[grid['Country']=='United States'].head(40)

Unnamed: 0,Affiliation,Country
43,Ludwig Cancer Research,United States
53,Naval Postgraduate School,United States
56,Smithsonian Institution,United States
62,Arup,United States
73,Washington State Department of Health,United States
79,Luther College,United States
81,Boston Children's Hospital,United States
84,United States Geological Survey,United States
89,Hypres (United States),United States
91,National Oceanic and Atmospheric Administration,United States
