## Collect all item metadata and output tables linked by item id

Tables: metadata (custom fields are added to this table as extra columns), authors, funding, tags, categories, files



## Import libraries

In [1]:
import json
import requests
import pandas as pd
import csv
import datetime

## Set token, admin id, and base URL

In [30]:
#Read the API token from a text file.
#Paste your token in a text file and save it where this notebook is
with open("././././testing-token.txt", "r") as text_file: #the context manager closes the file even if the read fails
    #str.strip() returns a new string, so its result has to be kept;
    #otherwise a trailing newline from the file ends up inside the header value
    TOKEN = text_file.read().strip()


#Set the token in the header.
api_call_headers = {'Authorization': 'token ' + TOKEN}

#Set the base URL
BASE_URL = 'https://api.figsh.com/v2' #Change this if you want the production environment

#Set file name descriptor (used as the prefix of every CSV file this notebook writes)
descriptor = 'metadata'


## Retrieve Metadata
1. Get basic metadata by impersonating accounts (need a list of account ids with the items they own)
2. Get a list of all public metadata and get a list of ids
3. Subtract public articles from the list of private metadata ids (separating only draft items or fully embargoed items)
4. Use your choice of list of item ids to retrieve all metadata fields for each article
5. Convert the resulting JSON to a dataframe
6. Save the dataframe to CSV or Excel

In [3]:
#Collect the basic metadata record of every item in the institution, one page at a time.
PAGE_SIZE = 1000 #number of records requested per page
private_items = []
page = 1
while True:
    response = requests.get(
        BASE_URL + '/account/institution/articles?page_size={}&page={}'.format(PAGE_SIZE, page),
        headers=api_call_headers)
    #Stop on an HTTP error: an error body is a JSON object, and extending the list
    #with it would silently add its keys as if they were records
    response.raise_for_status()
    batch = json.loads(response.content)
    private_items.extend(batch)
    if len(batch) < PAGE_SIZE: #a short (or empty) page means there is nothing left to fetch
        break
    page += 1
print(len(private_items),'metadata records collected')


110 metadata records collected


In [4]:
#Keep records that are either public or fully embargoed, i.e. not drafts or never-published records.
#A record is kept when it carries a published date.
published_items = [record for record in private_items if record['published_date'] is not None]

removed_count = len(private_items) - len(published_items)
print(len(published_items), "records kept,", removed_count, "records removed")

88 records kept, 22 records removed


In [46]:
#Write the kept records to a JSON file stamped with today's date
date_stamp = datetime.datetime.now().strftime("%Y-%m-%d")
published_path = 'published_items-' + date_stamp + '.json'
with open(published_path, 'w') as out_file:
    json.dump(published_items, out_file)

## Collect metadata and views for each item
This also sets up lists of records for the metadata elements (authors, funding, categories, tags, files) that will become their own tables.

In [None]:
#Fetch the full metadata record and the view count of every kept item.
#Nested lists (authors, funding, categories, tags, files) are also collected into
#their own lists, each entry tagged with the id of the item it belongs to.
item_metadata = []
author_metadata = []
funding_metadata = []
categories_metadata = []
tags_data = []
files_metadata = []
unpublished_count = 0

for item in published_items:
    item_id = item['id']
    m = requests.get(BASE_URL + '/articles/' + str(item_id), headers=api_call_headers)

    if m.status_code != 200: #the record was previously published and then unpublished
        unpublished_count += 1
        continue

    #Decode the body only after the status check, so that an error response
    #whose body is not JSON cannot abort the whole run
    metadata = json.loads(m.text)

    #NOTE(review): the stats host is hard-coded to figshare.com while BASE_URL points at
    #figsh.com - confirm the item ids are valid on both before trusting the view counts
    views = json.loads(requests.get('https://stats.figshare.com/total/views/article/' + str(item_id), headers=api_call_headers).content)
    #Add views to the full metadata record and add to the main list
    metadata['views'] = views['totals']
    item_metadata.append(metadata)

    #Add item id to each set of content for individual tables.
    #The nested dicts are tagged in place, so the records kept in item_metadata carry 'item_id' too.
    linked_tables = (
        (metadata['authors'], author_metadata),
        (metadata['funding_list'], funding_metadata),
        (metadata['categories'], categories_metadata),
    )
    for records, table in linked_tables:
        for record in records:
            record['item_id'] = item_id
            table.append(record)

    for tag_name in metadata['tags']: #tags are a list of plain values, so a record is built for each one
        tags_data.append({'item_id': item_id, 'name': tag_name})

    if metadata['is_embargoed'] == 0: #If the record is not embargoed
        for file_record in metadata['files']: #an empty list means the record is 'metadata only'
            file_record['item_id'] = item_id
            files_metadata.append(file_record)


print('Full metadata for',len(item_metadata),'items retrieved.', unpublished_count,'items had a published date but are drafts')


In [111]:
#Show the retrieval summary again without re-running the collection loop
retrieved_count = len(item_metadata)
print('Full metadata for', retrieved_count, 'items retrieved.', unpublished_count, 'items had a published date but are drafts')


Full metadata for 70 items retrieved. 18 items had a published date but are drafts


### Create separate tables

In [112]:
#Build one dataframe per linked table and write each to its own CSV file.
#Every row carries the item_id of the record it came from.
authordf = pd.json_normalize(author_metadata)
authordf.to_csv(descriptor + '-authors.csv',encoding='utf-8')

funderdf = pd.json_normalize(funding_metadata)
funderdf.to_csv(descriptor + '-funding.csv',encoding='utf-8')

categorydf = pd.json_normalize(categories_metadata)
categorydf.to_csv(descriptor + '-categories.csv',encoding='utf-8')

tagdf = pd.DataFrame(tags_data, columns=["item_id","name"], index=None)
tagdf.to_csv(descriptor + '-tags.csv',encoding='utf-8') #the tags table was built but never written out

filesdf = pd.json_normalize(files_metadata)
filesdf.to_csv(descriptor + '-files.csv',encoding='utf-8')


In [72]:
#Write the full metadata records to a date-stamped JSON file.
#Change the file name to represent the list of ids you used.
date_stamp = datetime.datetime.now().strftime("%Y-%m-%d")
full_records_path = 'full_records-' + date_stamp + '.json'
with open(full_records_path, 'w') as out_file:
    json.dump(item_metadata, out_file)

In [113]:
#Build the main dataframe: one row per item, one column per top-level metadata field
df = pd.DataFrame(data=item_metadata)

## Format the spreadsheet

### Remove the columns that are now separate tables

In [114]:
#These nested fields were written to their own tables, so remove them from the main table.
#errors='ignore' lets this cell be run again after the columns are already gone.
df = df.drop(columns=['files', 'authors','funding_list','tags','categories'], errors='ignore')

### Split out the dates

In [115]:
#The dates are all contained within one column called 'timeline'. Flatten that column and associate the values
#with the proper article id in a new dataframe

#Build a fresh dict per item instead of writing 'id' into item['timeline']:
#those dicts are shared with item_metadata and with the 'timeline' column of df,
#so changing them in place would alter both
temp_date_list = [{**item['timeline'], 'id': item['id']} for item in item_metadata]

df_dates = pd.json_normalize(temp_date_list)

#Merge the dataframes
df_formatted = df.merge(df_dates, how='outer', on='id')

print("Dates split out and merged")

Dates split out and merged


### Add Group names
This retrieves a list of groups and then formats the dataframe so that each group has the id of its parent group. The top-level group has itself as its parent. The group names are then added to the main dataframe.

In [116]:
#Get list of groups.
s=requests.get(BASE_URL + '/account/institution/groups', headers=api_call_headers)
groups=json.loads(s.text)

#Create a dataframe of groups
df_groups = pd.json_normalize(groups)

df_groups_parent = df_groups[['id','name']] #Create reference dataframe
df_groups = df_groups.rename(columns={'id': 'group_id','name': 'group_name'}) #Rename id col in main dataframe
df_groups_parent = df_groups_parent.rename(columns={'name': 'parent_group_name'}) #Rename name col in reference dataframe

#After sorting by parent id the first row is the top level group (its parent id is zero)
df_groups = df_groups.sort_values(by=['parent_id'])
top_group_id = df_groups.iloc[0]['group_id'] #Store the group id for top group

df_groups.loc[df_groups['parent_id'] == 0, 'parent_id'] = top_group_id #For top level group, replace the zero value parent id with top level group id

df_groups = df_groups.merge(df_groups_parent, how='inner',left_on=['parent_id'], right_on=['id']) #Add parent group name

df_groups = df_groups[['group_id','group_name','parent_group_name']] #Pare down to needed columns


#Merge the dataframes.
#'left' keeps every metadata record, even one whose group_id is missing from the group list
#(an 'inner' merge would silently drop it), and adds no rows for groups with no records
#(an 'outer' merge would include a blank record for each of those).
df_formatted = df_formatted.merge(df_groups, how='left', on='group_id')

print("Names for",len(df_groups),"different groups were added to the metadata records")

Names for 11 different groups were added to the metadata records


### Split out custom fields
This creates new columns for each custom field.

If different groups have different custom metadata, check the output carefully to make sure things mapped properly

In [117]:
#The custom fields are all contained within one column called 'custom_fields'. Flatten that column and associate the values
#with the proper article id in a new dataframe
custom = pd.json_normalize(
    item_metadata,
    record_path=['custom_fields'],
    meta=['id']
)

if len(custom) > 0:
    #This reshapes the data so that metadata field names are columns and each row is an id.
    #NOTE(review): pivot raises if one item carries the same field name twice - confirm that cannot happen
    custom = custom.pivot(index="id", columns="name", values="value")

    #Merge the dataframes so that all the custom fields are visible along with all the other metadata.
    #'left' keeps records that have no custom metadata without adding rows for ids that are
    #no longer in df_formatted, which an 'outer' merge would do.
    df_formatted = df_formatted.merge(custom, how='left', on='id')
    print('Custom fields split out and merged')
else:
    print('No custom fields to split out')

Custom fields split out and merged


## Save the spreadsheet

In [118]:
#Write the combined metadata table to a CSV file. Change the file name if necessary to match dates.
main_csv_path = descriptor + '-main.csv'
save_file = df_formatted.to_csv(main_csv_path, encoding='utf-8')

In [91]:
#Or save an Excel file of all the metadata. Change the file name if necessary to match dates.
#save_file = df_formatted.to_excel('all-records-'+str(datetime.datetime.now().strftime("%Y-%m-%d"))+'.xlsx')