## This retrieves all metadata from a portal and sorts by item size


## Import libraries

In [1]:
import json
import requests
import pandas as pd
import csv
import datetime

## Set base URL

In [2]:
#Set the base URL
BASE_URL = 'https://api.figshare.com/v2'


## Retrieve Metadata by Author Name (Note this does not disambiguate people with the same name)

In [3]:
#author name
#name = "ENTER NAME BETWEEN QUOTES"

#Institution id
INST_ID = "969" #Example INST_ID = "658"

In [4]:
#Retrieve list of metadata
#SET THE PAGE SIZE to make sure you get all the records

#Gather basic metadata for items (articles) that meet your search criteria

query = '{"search_for":"", "institution":' + INST_ID + ', "page_size":100}' #Set up string
y = json.loads(query) #Convert the string to a dictionary (JSON)
#y['search_for'] = ':author: \"'+ name + '\"' #This add in the name you are searching for in quotes for an exact match

#y = json.loads(query) #Figshare API requires json paramaters
r=requests.post(BASE_URL + "/articles/search", params=y)
articles = json.loads(r.text) 

if r.status_code != 200:
    print('Something is wrong:',r.content)
else:
    print('Collected',len(articles),'metadata records')

Collected 85 metadata records


In [5]:
#Create a list of all the article ids
article_ids = [item['id'] for item in articles]

In [10]:
#For each id in the article id list, retrieve all the metadata for the article by visiting the Figshare article API endpoint 
#This may take a while- for example, 6,000 records takes about 1.5 hours
full_articles = []
for art_id in article_ids:
    article = json.loads(requests.get(BASE_URL + "/articles/{}".format(art_id)).content)
    article['item_id'] = article.pop('id')
    full_articles.append(article)

In [None]:
#Get file info
files = pd.json_normalize(
    full_articles, 
    record_path =['files'], 
    meta=['item_id']
)
#This reshapes the data so that metadata field names are columns and each row is an id.
#files = files.pivot(index="id", columns="name", values="value")

In [16]:
test = files.groupby(by=['item_id'])['size'].sum()
test.to_frame()
test.sort_by(by=['size'])
test.head(10)

item_id
14980698     83231784982
15026043       542915422
15041316     14432019729
15050289      2428895453
16510584     79333676922
16821412    431229976478
16850653    486420963506
16867417      1745488589
16989889         5299603
17113883    208139751689
Name: size, dtype: int64

In [22]:
testdf = test.to_frame()
testdf.rename(index={1: "size"}, inplace=True)
testdf.head()

Unnamed: 0_level_0,size
item_id,Unnamed: 1_level_1
14980698,83231784982
15026043,542915422
15041316,14432019729
15050289,2428895453
16510584,79333676922


In [25]:
testdf.sort_values(by=['size'], ascending=False)

Unnamed: 0_level_0,size
item_id,Unnamed: 1_level_1
21375565,1530541492772
22202866,945730673646
19310870,510416713884
16850653,486420963506
19607172,446685353425
...,...
18143735,123663
21711245,68050
20492757,51460
21675005,15832
