In [None]:
import csv
import time
import requests

In [None]:

# Look up each author's Semantic Scholar ID by searching for the author's name.
# Some names match several different people with the same name; further down
# only the first ID returned for a name is kept.

# URL of the search endpoint. When searching by name, only one author can be
# requested at a time.
BASE_URL = "https://api.semanticscholar.org/graph/"
VERSION = "v1/"
RESOURCE = "author/search"
my_url = BASE_URL + VERSION + RESOURCE

# Load the previously collected author names into a list.
# NOTE: every entry of `authors` is a parsed CSV row (a list of fields), not a
# bare string.
# newline="" is what the csv module requires for file objects so that quoted
# fields containing line breaks are parsed correctly. Empty rows (blank lines)
# are dropped because they carry no name to search for.
with open("data/initialNames.csv", "r", newline="") as f:
    authors = [row for row in csv.reader(f) if row]


# Query the search endpoint once per name and persist the outcome as CSV.
# Every searched name yields exactly one row: [authorId, name] for a hit, or
# [<searched name>, "faulty"] when no usable result came back.

# Containers for the results.
authorIDs = {}     # authorId -> name of the first search hit
faultyNames = []   # (searched name, response payload or error text) of failed lookups

# Only 100 requests per 5 minutes are allowed, so the names are processed in
# chunks of `chunkSize`, pausing `sleepTIme` seconds after every chunk.
chunkSize = 100
sleepTIme = 60 * 5
authorChunks = [authors[i:i + chunkSize] for i in range(0, len(authors), chunkSize)]

# The file goes into data/ because that is the path it is read back from later
# in this notebook. The context manager guarantees the rows written so far are
# flushed and the handle is closed even if a request raises half-way through.
with open('data/authorIDs.csv', "w") as f:
    writer = csv.writer(f, lineterminator='\n')

    for chunkNum, chunk in enumerate(authorChunks):
        for author in chunk:
            params = {'query': author}
            try:
                # A timeout keeps one stalled connection from blocking the run.
                r = requests.get(my_url, params=params, timeout=30)
            except requests.RequestException as exc:
                # Connection problems are recorded like any other failed name.
                faultyNames.append((author, str(exc)))
                writer.writerow([author, "faulty"])
                print(author)
                continue

            # Parse the body once. An error response is not guaranteed to be
            # JSON, so fall back to the raw text instead of raising.
            try:
                payload = r.json()
            except ValueError:
                payload = r.text

            hits = None
            if r.status_code == 200 and isinstance(payload, dict):
                hits = payload.get("data")

            if hits:
                # We assume the first person returned for a name is enough.
                firstHit = hits[0]
                authorIDs[firstHit["authorId"]] = firstHit["name"]
                writer.writerow([firstHit["authorId"], firstHit["name"]])
            else:
                # Bad status, missing/empty "data": remember the offending name.
                faultyNames.append((author, payload))
                writer.writerow([author, "faulty"])
                print(author)

        print(f"I have requested {chunkNum+1} chunks out of {len(authorChunks)}")
        # The pause is applied after every chunk, including the last one.
        time.sleep(sleepTIme)


In [None]:
# Reload the IDs of the "original" authors from the CSV file.
# Column 0 holds the author ID and column 1 the name; rows whose second column
# is "faulty" mark failed lookups and are left out.
authorIDs = {}
with open("data/authorIDs.csv", 'r') as f:
    usableRows = (row for row in csv.reader(f) if row[1] != "faulty")
    authorIDs.update((row[0], row[1]) for row in usableRows)

In [None]:
# Next, for each known author we go through their published papers and
# collect the IDs of all their collaborators.
# NOTE: the IDs are sent in batches of `batchSize` (100) authors per request,
# and the batches are grouped into rounds of `numRequests` requests.


def _chunked(items, size):
    """Split *items* into consecutive slices of at most *size* elements."""
    return [items[start:start + size] for start in range(0, len(items), size)]


collaboratorsIDs = {}
numRequests = 100
batchSize = 100
sleepTIme = 60 * 5

authorList = list(authorIDs)
authorBathces = _chunked(authorList, batchSize)
requestList = _chunked(authorBathces, numRequests)

# Endpoint and query parameters for the batch lookup.
BASE_URL = "https://api.semanticscholar.org/graph/"
VERSION = "v1/"
RESOURCE = "author/batch"
my_url = f"{BASE_URL}{VERSION}{RESOURCE}"
params = {"fields": "papers.authors"}

In [None]:
# Run the batch requests and store every collaborator that is found, both in
# `collaboratorsIDs` and as [authorId, name] rows in a CSV file.
#
# Structure of a response: a list with one element per requested author. Each
# element is a dict whose "papers" key holds a list of dicts (one per paper);
# in each of those, the "authors" key is a list of dicts with the keys
# "authorId" and "name". These two values are what we are after.

orgAuthorLookedUp = 0   # requested authors that came back with at least one paper

# The context manager makes sure the CSV is flushed and closed, also when a
# request fails half-way through.
with open('data/collaboratorIDs.csv', "w") as f:
    writer = csv.writer(f, lineterminator='\n')

    for request in requestList:
        for batch in request:
            json_data = {"ids": batch}
            r = requests.post(my_url, json=json_data, params=params)
            # Surface HTTP errors (e.g. rate limiting) as a clear exception
            # instead of failing later while indexing an error payload.
            r.raise_for_status()

            for orgAuthor in r.json():
                # An entry can be empty or lack papers (presumably null for an
                # ID the API cannot resolve -- TODO confirm); report and skip.
                if not orgAuthor or not orgAuthor.get("papers"):
                    print(f"author {orgAuthor} has no papers(?)")
                    continue

                orgAuthorLookedUp += 1
                for paper in orgAuthor["papers"]:
                    for collaborator in paper["authors"]:
                        collaboratorId = collaborator["authorId"]
                        if collaboratorId is None:
                            # Without an ID the author cannot be told apart or
                            # looked up later, so it is not recorded.
                            continue
                        collaboratorName = collaborator["name"]
                        collaboratorsIDs[collaboratorId] = collaboratorName
                        writer.writerow([collaboratorId, collaboratorName])
        # NOTE: no pause between rounds here; call time.sleep(sleepTIme) at
        # this point if the rate limit is hit.

In [None]:
# Finally, merge the "original" author dict and the "collaborator" author dict
# into one and save it to CSV as [authorId, name] rows. For an ID present in
# both dicts the collaborator entry wins (right-hand operand of `|`).
allAuthors = authorIDs | collaboratorsIDs

# Use a context manager so the rows are flushed to disk and the handle is
# closed; a bare open() without close() can leave the file incomplete.
with open('data/allAuthorIDs.csv', "w") as f:
    writer = csv.writer(f, lineterminator='\n')
    writer.writerows(allAuthors.items())

In [None]:
# Summary of the run: how many of the original authors came back without any
# papers, and how many distinct authors we hold once collaborators are included.
numOriginal = len(authorIDs)
numWithoutPapers = numOriginal - orgAuthorLookedUp
print(f"Out of {numOriginal} authors, {numWithoutPapers} didn't publish a paper  ")

numUnique = len(allAuthors)
print(f'We found a total of {numUnique} unique authors in total when including the collaborators')