# Collect metadata from institutions for non-traditional outputs

This notebook collects metadata from records in institutional repositories that use Figshare as the repository platform, in order to create a comparison dataset of non-traditional research outputs.

The assumption is that these records go through some type of vetting. Seven institutions are sampled:
- University of Cape Town
- Monash University
- University of Sheffield
- 4TU
- University of Arizona
- Loughborough
- University of Illinois at Chicago

50 recent records of each of the following item types are collected from each institution:
- figure
- media
- dataset
- presentations
- posters
- software


In [None]:
import csv
import requests
import json as json
import datetime
import pandas as pd

In [None]:
#List of Institution ids (283-Uni of Cape Town,21-Monash,54-sheffield,898-4tu,797-u Az)
#NOTE(review): ids 2 and 693 are not labelled above; presumably they are Loughborough and
#University of Illinois at Chicago, the two remaining institutions named in the introduction - TODO confirm which is which
INST_LIST = [283,21,54,898,797,2,693]

In [None]:
#Get 50 Figures (item_type 1) from each institution
figures = []

for inst_id in INST_LIST:
    #Search parameters for the Figshare API; add more filters as extra dict entries
    search_params = {"item_type": 1, "institution": inst_id, "page": 1, "page_size": 50}
    response = requests.post('https://api.figshare.com/v2/articles/search',
                             params=search_params, timeout=60)
    #Stop on an HTTP error: an error payload that is a JSON object would otherwise
    #be silently merged into the list as bare key strings
    response.raise_for_status()
    #The parsed response is a list of records; add each one to the master list
    figures.extend(response.json())

print(f"{len(figures)} records collected")

In [None]:
#Get 50 Media (item_type 2) from each institution
media = []

for inst_id in INST_LIST:
    #Search parameters for the Figshare API; add more filters as extra dict entries
    search_params = {"item_type": 2, "institution": inst_id, "page": 1, "page_size": 50}
    response = requests.post('https://api.figshare.com/v2/articles/search',
                             params=search_params, timeout=60)
    #Stop on an HTTP error: an error payload that is a JSON object would otherwise
    #be silently merged into the list as bare key strings
    response.raise_for_status()
    #The parsed response is a list of records; add each one to the master list
    media.extend(response.json())

print(f"{len(media)} records collected")

In [None]:
#Get 50 Datasets (item_type 3) from each institution
datasets = []

for inst_id in INST_LIST:
    #Search parameters for the Figshare API; add more filters as extra dict entries
    search_params = {"item_type": 3, "institution": inst_id, "page": 1, "page_size": 50}
    response = requests.post('https://api.figshare.com/v2/articles/search',
                             params=search_params, timeout=60)
    #Stop on an HTTP error: an error payload that is a JSON object would otherwise
    #be silently merged into the list as bare key strings
    response.raise_for_status()
    #The parsed response is a list of records; add each one to the master list
    datasets.extend(response.json())

print(f"{len(datasets)} records collected")

In [None]:
#Get 50 Presentations (item_type 7) from each institution
presentations = []

for inst_id in INST_LIST:
    #Search parameters for the Figshare API; add more filters as extra dict entries
    search_params = {"item_type": 7, "institution": inst_id, "page": 1, "page_size": 50}
    response = requests.post('https://api.figshare.com/v2/articles/search',
                             params=search_params, timeout=60)
    #Stop on an HTTP error: an error payload that is a JSON object would otherwise
    #be silently merged into the list as bare key strings
    response.raise_for_status()
    #The parsed response is a list of records; add each one to the master list
    presentations.extend(response.json())

print(f"{len(presentations)} records collected")

In [None]:
#Get 50 Posters (item_type 5) from each institution
posters = []

for inst_id in INST_LIST:
    #Search parameters for the Figshare API; add more filters as extra dict entries
    search_params = {"item_type": 5, "institution": inst_id, "page": 1, "page_size": 50}
    response = requests.post('https://api.figshare.com/v2/articles/search',
                             params=search_params, timeout=60)
    #Stop on an HTTP error: an error payload that is a JSON object would otherwise
    #be silently merged into the list as bare key strings
    response.raise_for_status()
    #The parsed response is a list of records; add each one to the master list
    posters.extend(response.json())

print(f"{len(posters)} records collected")

In [None]:
#Get 50 Software (item_type 9) from each institution
software = []

for inst_id in INST_LIST:
    #Search parameters for the Figshare API; add more filters as extra dict entries
    search_params = {"item_type": 9, "institution": inst_id, "page": 1, "page_size": 50}
    response = requests.post('https://api.figshare.com/v2/articles/search',
                             params=search_params, timeout=60)
    #Stop on an HTTP error: an error payload that is a JSON object would otherwise
    #be silently merged into the list as bare key strings
    response.raise_for_status()
    #The parsed response is a list of records; add each one to the master list
    software.extend(response.json())

print(f"{len(software)} records collected")

In [None]:
#Combine every item type into one master list.
#A new list is built instead of aliasing `figures`, so `figures` keeps only the
#figure records and re-running this cell does not duplicate anything.
sample = [*figures, *media, *datasets, *presentations, *posters, *software]

len(sample)

In [None]:
#Write the combined sample to disk as the master JSON file
with open('raw-inst-basic-metadata.json', "w") as master_file:
    master_file.write(json.dumps(sample))

In [None]:
#Create a dataframe from the JSON formatted data
#`sample` is the combined list of search results built in the earlier cells;
#`df` is merged with the per-item details further down in this notebook.
df = pd.DataFrame(sample)

In [None]:
#Save a file of all the metadata
#save_file = df.to_excel("xxxxxxxxx.xlsx")

In [None]:
#Reload the saved master file so the following cells work from the copy on disk
#(replace the filename below with the file of your choice)
with open("raw-inst-basic-metadata.json", "r", encoding='utf8') as master_file:
    sample = json.loads(master_file.read())

print(f"{len(sample)} records")

In [None]:
#Collect the id of every record in the sample
article_ids = [record['id'] for record in sample]
print(len(article_ids))

In [None]:
#Visit each item and extract the size for each file and collect some publication dates - Comparison Records
#use rpartition on '/'

def exists(obj, chain):
    """Return the value reached by following the keys in ``chain`` through ``obj``.

    Args:
        obj: Nested container (for example a dict parsed from JSON) to search.
        chain: Sequence of keys to follow, outermost first. It is only read,
            never modified, so the same list can be reused across calls.

    Returns:
        The value at the end of the chain, or ``None`` when a key along the
        way is missing or an intermediate value is ``None``. An empty chain
        returns ``obj`` itself.
    """
    current = obj
    for key in chain:
        # A None intermediate cannot be searched; treat it like a missing key.
        if current is None or key not in current:
            return None
        current = current[key]
    return current

#-----------------------------Create csv files---------------------------------------
#Compute the date stamp once so the files written here are exactly the files read
#back at the end of the cell, even if the run crosses midnight.
date_stamp = datetime.datetime.now().strftime("%Y-%m-%d")
author_csv_path = 'institution-sample-full-metadata-authors' + date_stamp + '.csv'
file_csv_path = 'institution-sample-full-metadata-files' + date_stamp + '.csv'
full_csv_path = 'institution-sample-full-metadata' + date_stamp + '.csv'

#Master lists filled while looping; they are saved by a later cell
funderMaster = []
categoryMaster = []
embargoed = []


def _fetch_json(url):
    """Return the parsed JSON body of a GET request to ``url``; raise on an HTTP error."""
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    return response.json()


def _count_of(record, key):
    """Return the length of ``record[key]``, or 0 when the key is missing or None."""
    values = exists(record, [key])
    return len(values) if values is not None else 0


#The context managers close all three files even if a request or lookup fails part-way
with open(author_csv_path, 'w', encoding='utf-8', newline='') as author_file, \
        open(file_csv_path, 'w', encoding='utf-8', newline='') as file_file, \
        open(full_csv_path, 'w', encoding='utf-8', newline='') as full_file:
    #One writer per file, reused for every row
    author_writer = csv.writer(author_file)
    file_writer = csv.writer(file_file)
    full_writer = csv.writer(full_file)

    #Write header row to csv
    author_writer.writerow(['item_id','author_id','full_name','orcid_id'])
    file_writer.writerow(['item_id','file_name','file_size_bytes','link_only'])
    full_writer.writerow(['id','posted_date','doi','title','description','type','categories','funders',
                          'count_categories','count_references','count_tags','views','firstOnline','version'])

    #Visit every item id collected from the sample
    for article_id in article_ids:
        record = _fetch_json('https://api.figshare.com/v2/articles/' + str(article_id))
        views = _fetch_json('https://stats.figshare.com/total/views/article/' + str(article_id))
        posted_date = record['timeline']['posted']

        #Category names for this item, also added to the master list
        cats = [c['title'] for c in record['categories']]
        categoryMaster.extend(cats)

        #One entry per grant; a funder name may repeat across grants and is deduplicated later
        funders = [
            {"item_id": article_id, "grant_title": f['title'],
             "funder_name": f['funder_name'], "postDate": posted_date}
            for f in record['funding_list']
        ]
        funderMaster.extend(funders)

        #Then write to csv
        full_writer.writerow([
            article_id,
            posted_date,
            record['doi'],
            record['title'],
            record['description'],
            record['defined_type_name'],
            cats,
            funders,
            _count_of(record, 'categories'),
            _count_of(record, 'references'),
            _count_of(record, 'tags'),
            views['totals'],
            exists(record['timeline'], ['firstOnline']),
            exists(record, ['version']),
        ])

        #Get file names and sizes
        if record['is_embargoed'] == 0: #If the record is not embargoed
            for z in record['files']:
                file_writer.writerow([
                    article_id, #item id
                    z['name'],
                    z['size'],
                    z['is_link_only']])
        else:
            embargoed.append(record['figshare_url']) #if embargoed, add url to a list

        #Get author info
        for a in record['authors']:
            author_writer.writerow([
                article_id, #item id
                a['id'], #author id
                a['full_name'],
                a['orcid_id']])

#Load the files just created as dataframes
df_author_info = pd.read_csv(author_csv_path)
df_file_info = pd.read_csv(file_csv_path)
dffull = pd.read_csv(full_csv_path)

print(len(dffull),"records gathered and",len(df_file_info),"file info records gathered")


In [None]:
#Combine the original metadata records and the item info
#First keep only the detail columns; title and doi are left out of the detail table
#so the merged table carries a single copy of each, taken from `df`
detail_columns = ['id','posted_date','description','type','categories','funders',
                  'count_categories','count_references','count_tags','views','firstOnline','version']
dffull = dffull[detail_columns].copy()


#Merge the basic metadata with the detailed metadata
df_combo = df.merge(dffull, how='inner', on='id')
print(len(df_combo))

#change col name
df_combo = df_combo.rename(columns={"url_public_html": "public_url"})
#add needed cols
comparison = 'Comparison Records'
df_combo['origin'] = comparison
df_combo['school'] = ""
#extract relevant columns from the merged table
df_master = df_combo[['id','public_url','posted_date','doi',
                      'title', 'description', 'type','categories','funders',
                      'count_categories','count_references','count_tags', 'origin',
                      'views','firstOnline','version']]


#Save a CSV file of all the metadata.
save_file = df_master.to_csv('institution-sample-MASTER-metadata-'+str(datetime.datetime.now().strftime("%Y-%m-%d"))+'.csv',encoding='utf-8')
df_master.head()

In [None]:
#Persist the category and funder lists gathered for the comparison records

#Categories: turn the list of titles into a one-column table and write it as CSV
categorydf = pd.DataFrame(categoryMaster, columns=["title"])
save_file = categorydf.to_csv(path_or_buf='institution-sample-categorylist.csv', encoding='utf-8')

#Funders: write the raw list of entries as JSON
with open('institution-sample-funderlist.json', "w") as funder_json:
    funder_json.write(json.dumps(funderMaster))

#Funders again: flatten the entries into a table and write it as CSV
funderdf = pd.json_normalize(funderMaster)
save_file = funderdf.to_csv(path_or_buf='institution-sample-funderlist.csv', encoding='utf-8')