# Retrieving data by searching the 4TU.ResearchData API

This script runs through the following steps:
1. Define search parameters
2. Send request to 4TU (figshare) API
3. Converts response into pandas dataframe
4. Stores list of article IDs
5. Extracts file IDs from articles
6. Downloads all files associated with all articles

In [9]:
import requests
import json
import pandas as pd

In [10]:
#store token and API url
URL='https://api.figshare.com/v2/articles/search'
api_token = 'ac7d7197c0e1d4b51da5d374139d1db611d0f79fb0ad1584899030f1cb026dc3d6dcfe8c8ecd5f5a586c868cb85c69b13444e65c7ebafcf562aecab9f30d138d'

#define search terms
params={
    "search_for":":keyword: carbon",
    # "search_for": "":description: " # w/o quotes will match any of the words included in the description field
    "institution": 898, #unique 4tu code
    "item_type": 3, #item type is dataset
    "page": 1,
    "page_size": 1000 #adjust to number larger than anticipated search results
}

# query in blocks of 10, while loop that continues until you get less than 10, so you know it's the last

#full list of search terms available at https://docs.figshare.com/#articles_search

In [11]:
#request articles based on search parameters set above
response = requests.post(
    url = URL,
    json = params,
    headers = {"Authorization": f"token {api_token}"} 
)

#store response as a json object
j = response.json()

In [12]:
#convert json object to pandas df (optional step, for analysis)
df = pd.DataFrame.from_dict(j)[['id', 'title', 'doi', 'published_date', 'defined_type_name', 'resource_doi']]

#show first few rows of df
df

Unnamed: 0,id,title,doi,published_date,defined_type_name,resource_doi
0,14747862,Bulk isotope data set supporting the manuscrip...,10.4121/14747862.v2,2021-08-26T14:38:00Z,dataset,10.1111/oik.08450
1,12719012,"Data underlying the publication: Strontium, ox...",10.4121/uuid:f6dc4f20-a6e0-4b2f-b2f8-b79a4f9061c3,2020-03-31T00:00:00Z,dataset,


In [13]:
#extract list of ids to use for download
ids = df['id'].tolist()
len(ids)
ids

[14747862, 12719012]

In [14]:
# for all the articles, retrieve the details of the article
art = []
for art_id in ids:
    response = requests.get(
        url = "https://api.figshare.com/v2/articles/"+str(art_id),
        headers = {"Authorization": f"token {api_token}"})
    art.append(pd.json_normalize(response.json()))

art = pd.concat(art)[['id','files','tags','categories','custom_fields','authors','description','license.name', 'title', 'doi', 'published_date', 'defined_type_name', 'resource_doi']]   


In [15]:
# retrieve data from dictionaries nested in the art dataframe
file_names=[]
files=[]
custom_fields_df = []
author_names =[]
categories=[]

for art_id in ids:  
    #files
    tmp_files = pd.json_normalize(art[art['id']==art_id].iloc[0]['files'])[['id', 'name', 'is_link_only', 'download_url']]
    tmp_files = tmp_files.assign(article_id = art_id)
    files.append(tmp_files)
    tmp_filename = ', '.join(tmp_files["name"])
    file_names.append(tmp_filename)
    
    #custom fields
    custom_fields = ['Geolocation', 'Time coverage', 'Geolocation Longitude', 'Geolocation Latitude']
    tmp_custom_fields_df = pd.json_normalize(art[art['id']==art_id].iloc[0]['custom_fields'])
    tmp_custom_fields_df = tmp_custom_fields_df[tmp_custom_fields_df['name'].isin(custom_fields)]
    tmp_custom_fields_df = tmp_custom_fields_df.set_index('name').T
    tmp_custom_fields_df = tmp_custom_fields_df.assign(id = art_id)
    custom_fields_df.append(tmp_custom_fields_df)

    #authors 
    tmp_authors = pd.json_normalize(art[art['id']==art_id].iloc[0]['authors'])[['full_name']]
    tmp_authorname = ', '.join(tmp_authors["full_name"])
    author_names.append(tmp_authorname)
    
    #categories 
    tmp_categories = pd.json_normalize(art[art['id']==art_id].iloc[0]['categories'])[['title']]
    tmp_categoryname = ', '.join(tmp_categories["title"])
    categories.append(tmp_categoryname)
    
    
    
art = art.assign(files = file_names)   
files = pd.concat(files)
custom_fields_df = pd.concat(custom_fields_df)
art = pd.merge(art, custom_fields_df, how="inner", on=["id"])
art = art.assign(authors = author_names)  
art = art.assign(categories = categories)  

# Convert tags list into coma separated string
art['keywords'] = [', '.join(map(str, l)) for l in art['tags']]

In [16]:
art.columns.values.tolist()
art = art[['title',
    'id',
     'published_date',
    'files',
    'categories',
    'authors',
    'description',
    'license.name',
    'doi',
 'Time coverage',
 'Geolocation',
 'Geolocation Longitude',
 'Geolocation Latitude']]

In [17]:
art

Unnamed: 0,title,id,published_date,files,categories,authors,description,license.name,doi,Time coverage,Geolocation,Geolocation Longitude,Geolocation Latitude
0,Bulk isotope data set supporting the manuscrip...,14747862,2021-08-26T14:38:00Z,"Riekenberg_Joling_Isobank_4.csv, README.txt",Ecological Applications,"Philip Riekenberg, D.W. (David) Thieltges, Lon...",Bulk carbon and nitrogen stable isotope values...,CC0,10.4121/14747862.v2,,"Wadden Sea, North Sea",,
1,"Data underlying the publication: Strontium, ox...",12719012,2020-03-31T00:00:00Z,"Aruba, Bonaire, Colombia, Curacao, Dominican R...","Physiology, Geochemistry, Anthropology","Esther Plomp, S.H.M. (Suzan) Verdegaal-Warmerd...",This dataset contains the isotopic results (st...,CC BY 4.0,10.4121/uuid:f6dc4f20-a6e0-4b2f-b2f8-b79a4f9061c3,1960/1995,"Aruba, Bonaire, Colombia, Curacao, Dominican R...",,


In [18]:
files

Unnamed: 0,id,name,is_link_only,download_url,article_id
0,28336731,Riekenberg_Joling_Isobank_4.csv,False,https://ndownloader.figshare.com/files/28336731,14747862
1,30469620,README.txt,False,https://ndownloader.figshare.com/files/30469620,14747862
0,24076859,"Aruba, Bonaire, Colombia, Curacao, Dominican R...",False,https://ndownloader.figshare.com/files/24076859,12719012
1,24076862,data.zip,False,https://ndownloader.figshare.com/files/24076862,12719012


In [19]:
file_ids = files['id'].tolist()

In [20]:
for file_id in file_ids:
        response = requests.get(
            url = f"https://api.figshare.com/v2/file/download/" + str(file_id),
            headers = {"Authorization": f"token {api_token}"})
        file_name = files[files['id']== file_id].loc[:,'name']
        open(file_name, "wb").write(response.content)

        
   

In [42]:
files[files['id']== file_id].loc[:,'name']

1    data.zip
Name: name, dtype: object

In [27]:
open("data.zip", "wb").write(response.content)

86796

In [43]:
with open('example.txt', 'w') as outfile:
    outfile.write(file_data)

TypeError: write() argument must be str, not bytes

In [34]:
response.content

with open("sample.json", "w") as outfile:
    outfile.write(response.content)

TypeError: write() argument must be str, not bytes

In [None]:
#ToDo:

# -> adapt to reflect the keyword structure of GEF files ( anchor test etc.)
# -> work on downloader (choose files to retrieve files, create a loop)
# -> generalise into functions and incorporate into the main programme