# Collect metadata from institutions for non-traditional outputs

This notebook collects metadata from records in institutional repositories that use Figshare as their repository platform, in order to create a comparison dataset of non-traditional research outputs.

The assumption is that these records go through some type of vetting. Five institutions are sampled:
- University of Cape Town
- Monash University
- University of Sheffield
- 4TU
- University of Arizona

Up to 50 recent records of each of the following item types are collected from each institution:
- figure
- media
- dataset


In [1]:
import csv
import requests
import json as json
import datetime
import pandas as pd

In [37]:
# Figshare institution ids that are sampled, in query order:
#   283 - Uni of Cape Town
#   21  - Monash
#   54  - Sheffield
#   898 - 4TU
#   797 - U Az
INST_LIST = [
    283,
    21,
    54,
    898,
    797,
]

In [77]:
#Get 50 Figures from each institution
figures = []

for inst_id in INST_LIST:
    #Search parameters as a plain dict (item_type 1 = figure); add more filters as extra keys
    search_params = {"item_type": 1, "institution": inst_id, "page": 1, "page_size": 50}
    response = requests.post('https://api.figshare.com/v2/articles/search', params=search_params, timeout=30)
    #Stop on an HTTP error instead of extending the list with the keys of an error payload
    response.raise_for_status()
    #The search endpoint returns a JSON list of records; add them to the master list
    figures.extend(response.json())

print(len(figures),'records collected')

156 records collected


In [78]:
#Get 50 Media from each institution
media = []

for inst_id in INST_LIST:
    #Search parameters as a plain dict (item_type 2 = media); add more filters as extra keys
    search_params = {"item_type": 2, "institution": inst_id, "page": 1, "page_size": 50}
    response = requests.post('https://api.figshare.com/v2/articles/search', params=search_params, timeout=30)
    #Stop on an HTTP error instead of extending the list with the keys of an error payload
    response.raise_for_status()
    #The search endpoint returns a JSON list of records; add them to the master list
    media.extend(response.json())

print(len(media),'records collected')

129 records collected


In [79]:
#Get 50 Datasets from each institution
datasets = []

for inst_id in INST_LIST:
    #Search parameters as a plain dict (item_type 3 = dataset); add more filters as extra keys
    search_params = {"item_type": 3, "institution": inst_id, "page": 1, "page_size": 50}
    response = requests.post('https://api.figshare.com/v2/articles/search', params=search_params, timeout=30)
    #Stop on an HTTP error instead of extending the list with the keys of an error payload
    response.raise_for_status()
    #The search endpoint returns a JSON list of records; add them to the master list
    datasets.extend(response.json())

print(len(datasets),'records collected')

250 records collected


In [81]:
#Combine the three per-type lists into one new list.
#Concatenating (instead of aliasing figures and extending it) leaves figures untouched,
#so re-running this cell does not append media and datasets a second time.
sample = figures + media + datasets

len(sample)

535

In [84]:
#Save Master File: serialise the combined sample as JSON text and write it out
with open('raw-inst-basic-metadata.json', mode="w") as out_file:
    out_file.write(json.dumps(sample))

In [9]:
#Build the basic-metadata dataframe from the list of JSON records held in sample
df = pd.DataFrame(data=sample)

In [None]:
#Save a file of all the metadata
#save_file = df.to_excel("xxxxxxxxx.xlsx")

In [2]:
#Read the saved master file back into sample
with open("raw-inst-basic-metadata.json", mode="r", encoding='utf8') as saved_file: #Replace this with the filename of your choice
    sample = json.loads(saved_file.read())

print(len(sample),"records")

535 records


In [3]:
#Create list of ids: one 'id' value per record in sample
article_ids = [record['id'] for record in sample]
print(len(article_ids))

535


In [8]:
# Create a csv file, use the API to gather data, reopen the csv as a dataframe


#This function deals with multiply nested json elements in the csv write line below
#https://stackoverflow.com/questions/43491287/elegant-way-to-check-if-a-nested-key-exists-in-a-python-dict
def exists(obj, chain):
    """Follow the keys in `chain` through the nested object `obj`.

    Parameters:
        obj: the (possibly nested) container to look into, e.g. a parsed JSON dict.
        chain: sequence of keys to follow, outermost first. It is only read,
            never modified, so the caller's list can be reused.

    Returns:
        The value reached after following every key, or None as soon as a key
        is missing or an intermediate value cannot be looked into (for example
        it is None or a number). An empty chain returns `obj` itself.
    """
    current = obj
    for key in chain:
        try:
            if key not in current:
                return None
            current = current[key]
        except TypeError:
            #current does not support membership tests or lookup with this key
            return None
    return current

#Build the dated filename once so the write and the read below always use the same file
#(calling datetime.now() separately for each could give two names if the run crosses midnight)
full_metadata_path = 'institution-sample-full-metadata'+datetime.datetime.now().strftime("%Y-%m-%d")+'.csv'

#Create csv file; the with-block closes it even if a request fails part-way through
with open(full_metadata_path, 'w', encoding='utf-8', newline='') as metadata:
    writer = csv.writer(metadata)
    #Write header row to csv
    writer.writerow(['id','description','is_embargoed','funding_list','is_metadata_record','resource_doi','count_categories','count_references','count_tags'])

    for article_id in article_ids:
        response = requests.get('https://api.figshare.com/v2/articles/'+str(article_id), timeout=30)
        #Fail with a clear HTTP error rather than a KeyError on an error payload
        response.raise_for_status()
        r = response.json()

        #Look each optional field up once; None means the key is absent from the record
        funding_list = exists(r,['funding_list'])
        resource_doi = exists(r,['resource_doi'])
        categories = exists(r,['categories'])
        references = exists(r,['references'])
        tags = exists(r,['tags'])

        #write the values to the csv file.
        writer.writerow([
            r['id'],
            r.get('description'), #For any of these .get(), adding ",'N/A'" will fill the null cells with 'N/A'. However, metadata assessment counts non nulls
            r.get('is_embargoed'),
            funding_list if funding_list is not None else 0,
            r.get('is_metadata_record'),
            resource_doi if resource_doi is not None else 0,
            len(categories) if categories is not None else 0,
            len(references) if references is not None else 0,
            len(tags) if tags is not None else 0]) #write one line to csv file

#Open up the same file as a dataframe, using the same utf8 encoding it was written with.
dffull = pd.read_csv(full_metadata_path,encoding='utf8')

print('The resulting dataframe has',len(dffull),' rows')

The resulting dataframe has 535  rows


In [12]:
#Inner-join the basic metadata (df) with the detailed metadata (dffull) on the shared 'id' column
df_master = pd.merge(df, dffull, how='inner', on='id')
#Save a CSV file of all the metadata, named with today's date.
master_csv_name = 'institution-sample-MASTER-metadata-' + datetime.datetime.now().strftime("%Y-%m-%d") + '.csv'
save_file = df_master.to_csv(master_csv_name, encoding='utf-8')
df_master.head()

Unnamed: 0,project_id,id,title,doi,handle,url,published_date,thumb,defined_type,defined_type_name,...,resource_title,resource_doi_x,description,is_embargoed,funding_list,is_metadata_record,resource_doi_y,count_categories,count_references,count_tags
0,,19630440,Acoustic Flight Path Example,10.25375/uct.19630440.v1,,https://api.figshare.com/v2/articles/19630440,2022-05-05T13:06:43Z,https://s3-eu-west-1.amazonaws.com/ppreviews-u...,1,figure,...,,,<p>Example of all the calls recorded from<em> ...,False,[],False,0,0,0,2
1,,16866583,A Comparison of Fluorescent Microscopy Methods...,10.25375/uct.16866583.v1,,https://api.figshare.com/v2/articles/16866583,2022-04-08T06:28:09Z,https://s3-eu-west-1.amazonaws.com/ppreviews-u...,1,figure,...,,,<p><a></a><i>Chlamydia trachomatis</i> (<i>C. ...,False,"[{'id': 21352057, 'title': 'MSC', 'grant_code'...",False,0,0,0,5
2,128348.0,19227846,A note on the extent of autocorrelation of res...,10.25375/uct.19227846.v1,,https://api.figshare.com/v2/articles/19227846,2022-04-07T13:35:46Z,https://s3-eu-west-1.amazonaws.com/ppreviews-u...,1,figure,...,,,<div>One of the observations by the December 2...,False,[],False,0,0,0,8
3,129353.0,19396754,Comparison of t values for different sets of s...,10.25375/uct.19396754.v1,,https://api.figshare.com/v2/articles/19396754,2022-04-04T11:25:54Z,https://s3-eu-west-1.amazonaws.com/ppreviews-u...,1,figure,...,,,Annex E of Report of the ‘Second’ Internationa...,False,[],False,0,0,1,7
4,128348.0,19352099,A note on comparisons of recent west coast roc...,10.25375/uct.19352099.v1,,https://api.figshare.com/v2/articles/19352099,2022-04-04T09:57:48Z,https://s3-eu-west-1.amazonaws.com/ppreviews-u...,1,figure,...,,,Recent assessment results in FISHERIES/2015/JA...,False,[],False,0,0,0,6
