## This retrieves all metadata and statistics for an author - items and collections

There are two ways to collect records: by name or by ORCID

Note: This is set up to only return records from a given institution. It could be modified to search across all Figshare repositories by removing the institution id query term.


## Import libraries

In [1]:
import json
import requests
import pandas as pd
import csv
import datetime

## Set base URL

In [None]:
#Root of the Figshare REST API (version 2). The search cells below append
#their endpoint path (for example "/articles/search") to this value.
BASE_URL = 'https://api.figshare.com/v2'


## Retrieve Metadata by Author Name (Note this does not disambiguate people with the same name)

In [None]:
#Author name to search for. The search cells wrap it in double quotes to ask for an exact match.
name = "ENTER NAME BETWEEN QUOTES"

#Institution id. Must contain only the numeric id (digits), because the search cells use it as a number.
INST_ID = "ENTER ID HERE BETWEEN QUOTES" #Example INST_ID = "658"

In [None]:
#Search Figshare for items (articles) by author name within one institution.
#No authentication token is sent with this request, so presumably only public
#records are returned - confirm against the Figshare API documentation.
#NOTE: a single page of up to 100 records is requested. Raise page_size if the
#author has more items than that.

#Build the search parameters directly as a dictionary
y = {
    'search_for': ':author: "' + name + '"', #Quotes around the name ask for an exact match
    'institution': int(INST_ID), #Raises ValueError if INST_ID is not a number
    'page_size': 100,
}

r = requests.post(BASE_URL + "/articles/search", params=y)

#Check the status before parsing, so an error response is reported instead of
#being treated as a list of records
if r.status_code != 200:
    print('Something is wrong:',r.content)
    articles = [] #Keep the name defined so later cells see "no records"
else:
    articles = json.loads(r.text)
    print('Collected',len(articles),'metadata records')

In [None]:
#Load the list of item records returned by the search into a dataframe
#(one row per record, one column per metadata field)
dfbasic = pd.DataFrame(data=articles)

## Or, Retrieve Metadata by ORCID

In [None]:
#Author ORCID iD to search for
orcid = "ENTER ORCID BETWEEN QUOTES"



#Institution id
#Must contain only the numeric id (digits), because the search cells use it as a number.
INST_ID = "ENTER ID HERE BETWEEN QUOTES" #Example INST_ID = "658"

In [None]:
#Search Figshare for items (articles) by ORCID within one institution.
#No authentication token is sent with this request, so presumably only public
#records are returned - confirm against the Figshare API documentation.
#NOTE: a single page of up to 100 records is requested. Raise page_size if the
#author has more items than that.

#Build the search parameters directly as a dictionary.
#The search term uses the ORCID only; the author name is not needed here.
y = {
    'search_for': ':orcid: ' + orcid,
    'institution': int(INST_ID), #Raises ValueError if INST_ID is not a number
    'page_size': 100,
}

r = requests.post(BASE_URL + "/articles/search", params=y)

#Check the status before parsing, so an error response is reported instead of
#being treated as a list of records
if r.status_code != 200:
    print('Something is wrong:',r.content)
    articles = [] #Keep the name defined so later cells see "no records"
else:
    articles = json.loads(r.text)
    print('Collected',len(articles),'metadata records')

In [None]:
#Load the list of item records returned by the ORCID search into a dataframe
#(one row per record, one column per metadata field)
dfbasic = pd.DataFrame(data=articles)

## Collect stats



In [None]:
# Gather total views and downloads for every article, write them to a dated
# csv file, then load that file back as a dataframe (dfstats).

#Create a list of all the article ids
article_ids = [item['id'] for item in articles]

#Build the file name once so that the write and the read below always use the same file
stats_file = 'article-stats' + datetime.datetime.now().strftime("%Y-%m-%d") + '.csv'

#The 'with' block closes the file even if one of the requests fails part way through
with open(stats_file, 'w', newline='', encoding='utf8') as metadata:
    writer = csv.writer(metadata)
    #Write header row to csv
    writer.writerow(['id','views','downloads'])

    for article_id in article_ids:
        views_response = requests.get('https://stats.figshare.com/total/views/article/'+ str(article_id))
        views = json.loads(views_response.text)
        downloads_response = requests.get('https://stats.figshare.com/total/downloads/article/'+ str(article_id))
        downloads = json.loads(downloads_response.text)

        #For either .get(), adding ",'N/A'" will fill the null cells with 'N/A'.
        #However, metadata assessment counts non nulls, so they are left empty.
        writer.writerow([
            article_id,
            views.get('totals'),
            downloads.get('totals')])

#Open up the same file as a dataframe
dfstats = pd.read_csv(stats_file, encoding='utf8')

print('The resulting dataframe has',len(dfstats),'rows')

### Merge the dataframes

In [None]:
#Join the metadata and the stats on the record id. An inner join keeps only
#the records that appear in both dataframes.
dfmerged = pd.merge(dfbasic, dfstats, on='id', how='inner')
#Show the first rows as the cell output
dfmerged.head()

### If you have Collections, run this next cell. Otherwise, skip it.

In [None]:
#Search for collections by author name, gather their view/download totals and
#append the result to the article dataframe (dfmerged).
#NOTE: this cell always searches by `name`, also when the items above were found by ORCID.
#NOTE: a single page of up to 100 records is requested. Raise page_size if needed.

#Get collections
y = {
    'search_for': ':author: "' + name + '"', #Quotes around the name ask for an exact match
    'institution': int(INST_ID), #Raises ValueError if INST_ID is not a number
    'page_size': 100,
}

r = requests.post(BASE_URL + "/collections/search", params=y)

#Check the status before parsing, so an error response is reported instead of
#being treated as a list of records
if r.status_code != 200:
    print('Something is wrong:',r.content)
    collections = []
else:
    collections = json.loads(r.text)

#Create a dataframe from the JSON formatted data
dfcollbasic = pd.DataFrame(collections)

#Gather Stats
#Create a list of all the collection ids
coll_ids = [item['id'] for item in collections]

#Build the file name once so that the write and the read below always use the same file
coll_stats_file = 'collection-stats' + datetime.datetime.now().strftime("%Y-%m-%d") + '.csv'

#The 'with' block closes the file even if one of the requests fails part way through
with open(coll_stats_file, 'w', newline='', encoding='utf8') as metadata:
    writer = csv.writer(metadata)
    #Write header row to csv
    writer.writerow(['id','views','downloads'])

    for coll_id in coll_ids:
        #Collections have their own stats endpoints; the article endpoints would
        #look up an unrelated article that happens to share the same id
        views_response = requests.get('https://stats.figshare.com/total/views/collection/'+ str(coll_id))
        views = json.loads(views_response.text)
        downloads_response = requests.get('https://stats.figshare.com/total/downloads/collection/'+ str(coll_id))
        downloads = json.loads(downloads_response.text)

        writer.writerow([
            coll_id,
            views.get('totals'),
            downloads.get('totals')])

#Open up the same file as a dataframe
dfcollstats = pd.read_csv(coll_stats_file, encoding='utf8')

#Only merge when something was found: an empty dfcollbasic has no 'id' column to merge on
if collections:
    dfcollmerged = dfcollbasic.merge(dfcollstats, how='inner', on='id')

    #Append the collections rows to the article dataframe.
    #pd.concat is used because DataFrame.append was removed in pandas 2.0.
    dfmerged = pd.concat([dfmerged, dfcollmerged])


### Format the dates column

In [None]:
#The dates are all contained within one column called 'timeline'.
#Use the JSON to split them into separate columns, keyed by the record id,
#and merge the result with the metadata dataframe.

#Copy each timeline and add the record id, so the original records are not modified
temp_date_list = [{**item['timeline'], 'id': item['id']} for item in articles]

df_dates_items = pd.json_normalize(temp_date_list)

#The Collections cell is optional, so `collections` may not exist
try:
    coll_records = collections or []
except NameError:
    coll_records = []

if coll_records:
    temp_coll_date_list = [{**item['timeline'], 'id': item['id']} for item in coll_records]
    df_dates_coll = pd.json_normalize(temp_coll_date_list)

    #Append the collection dates to the item dates.
    #pd.concat is used because DataFrame.append was removed in pandas 2.0.
    df_dates = pd.concat([df_dates_items, df_dates_coll])
else:
    df_dates = df_dates_items

#Merge the date dataframe with the metadata dataframe
df_formatted = dfmerged.merge(df_dates, how='outer', on='id')

print("Dates split out and merged")

### View Totals

In [None]:
#Add up the views and downloads columns across every row and report both totals
total_views = df_formatted['views'].sum()
total_downloads = df_formatted['downloads'].sum()
print('Total views =', total_views, 'and total downloads =', total_downloads)


# Save the spreadsheet

## If you are running this in Google Colab

In [None]:
#Only works inside Google Colab, where the google.colab package is available.
#When you run this cell it will ask you to authenticate so that you can create files to download
from google.colab import drive
#Mount Google Drive at the path /drive
drive.mount('/drive')

In [None]:
#Only works inside Google Colab, where the google.colab package is available.
from google.colab import files

#Build the file name once so the file that is written is the file that is downloaded
csv_name = str(name) + '-published_records-' + datetime.datetime.now().strftime("%Y-%m-%d") + '.csv'
df_formatted.to_csv(csv_name, encoding='utf-8') #create the CSV
files.download(csv_name) #download to your computer

## If you are running this locally
That is, you downloaded the Jupyter Notebook file.

In [None]:
#Write all the metadata to a CSV file named "<name>-published_records-<today's date>.csv".
#Edit the pieces of csv_path below if you want a different file name.
today = datetime.datetime.now().strftime("%Y-%m-%d")
csv_path = str(name) + '-published_records-' + today + '.csv'
save_file = df_formatted.to_csv(csv_path, encoding='utf-8')

In [None]:
#Or write all the metadata to an Excel file named "<name>-published_records-<today's date>.xlsx".
#Edit the pieces of excel_path below if you want a different file name.
today = datetime.datetime.now().strftime("%Y-%m-%d")
excel_path = str(name) + '-published_records-' + today + '.xlsx'
save_file = df_formatted.to_excel(excel_path)

In [None]:
#OPTIONAL: save the json.
#Writes the item records returned by the search (the `articles` list) to a file
#named like the CSV/Excel exports: "<name>-published_records-<today's date>.json"
json_name = str(name) + '-published_records-' + datetime.datetime.now().strftime("%Y-%m-%d") + '.json'
with open(json_name, 'w', encoding='utf-8') as f:
    json.dump(articles, f)