## This retrieves all metadata and statistics for an author - items and collections

There are two ways to collect records: by name or by ORCID



## Import libraries

In [1]:
import json
import requests
import pandas as pd
import csv
import datetime

## Set base URL

In [2]:
#Root address of the Figshare API; every metadata request below is built on top of it
BASE_URL = "https://api.figshare.com/v2"


## Retrieve Metadata by Author Name (Note this does not disambiguate people with the same name)

In [5]:
#Author name to search for. Replace the placeholder text, keeping the quotes.
name = 'ENTER NAME BETWEEN QUOTES'



In [8]:
#Retrieve list of metadata
#SET THE PAGE SIZE to make sure you get all the records (only one page of results is requested)

#Gather basic metadata for items (articles) that meet your search criteria

#Build the search parameters directly as a dictionary.
#The name is wrapped in quotes for an exact match.
y = {'search_for': ':author: "' + name + '"', 'page_size': 100}

r = requests.post(BASE_URL + "/articles/search", params=y)

#Check the status BEFORE parsing: an error response is not a list of records,
#and later cells loop over 'articles' expecting one dictionary per item.
if r.status_code != 200:
    articles = []
    print('Something is wrong:',r.content)
else:
    articles = r.json()
    print('Collected',len(articles),'metadata records')
    if len(articles) >= y['page_size']:
        #A full page usually means there are more records than were returned
        print('Warning: the number of records equals the page size, so some records may be missing. Increase page_size.')

Collected 10 metadata records


In [9]:
#Turn the list of JSON records into a table: one row per item, one column per metadata field
dfbasic = pd.DataFrame(data=articles)

## Or, Retrieve Metadata by ORCID

In [None]:
#ORCID of the author to search for. Replace the placeholder text, keeping the quotes.
orcid = 'ENTER ORCID BETWEEN QUOTES'


In [None]:
#Retrieve list of metadata
#SET THE PAGE SIZE to make sure you get all the records (only one page of results is requested)

#Gather basic metadata for items (articles) that meet your search criteria

#Build the search parameters directly as a dictionary. Pasting the ORCID into a
#JSON string and parsing it would fail if the value contained a quote or backslash.
y = {'search_for': ':orcid:' + orcid, 'page_size': 100}

r = requests.post(BASE_URL + "/articles/search", params=y)

#Check the status BEFORE parsing: an error response is not a list of records,
#and later cells loop over 'articles' expecting one dictionary per item.
if r.status_code != 200:
    articles = []
    print('Something is wrong:',r.content)
else:
    articles = r.json()
    print('Collected',len(articles),'metadata records')
    if len(articles) >= y['page_size']:
        #A full page usually means there are more records than were returned
        print('Warning: the number of records equals the page size, so some records may be missing. Increase page_size.')

In [None]:
#Turn the list of JSON records into a table: one row per item, one column per metadata field
dfbasic = pd.DataFrame(data=articles)

## Collect stats



In [10]:
# Gather view and download totals for every article, save them to a csv file,
# then reopen the csv as a dataframe

#Create a list of all the article ids
article_ids = [item['id'] for item in articles]

#Build the file name once so that the file that is written is always the file that
#is read back (calling now() twice could give two different dates around midnight)
stats_file = 'article-stats'+str(datetime.datetime.now().strftime("%Y-%m-%d"))+'.csv'

#'with' closes the file (and releases all locks) even if one of the requests fails part way through
with open(stats_file, 'w', newline='', encoding='utf8') as metadata:
    writer = csv.writer(metadata)
    #Write header row to csv
    writer.writerow(['id','views','downloads'])

    for article_id in article_ids:
        views = json.loads(requests.get('https://stats.figshare.com/total/views/article/'+ str(article_id)).text)
        downloads = json.loads(requests.get('https://stats.figshare.com/total/downloads/article/'+ str(article_id)).text)

        #Write the values to the csv file.
        #For any of these .get(), adding ",'N/A'" will fill the null cells with 'N/A'. However, metadata assessment counts non nulls
        writer.writerow([
            article_id,
            views.get('totals'),
            downloads.get('totals')])

#Open up the same file as a dataframe, using the same encoding it was written with
dfstats = pd.read_csv(stats_file,encoding='utf8')

print('The resulting dataframe has',len(dfstats),'rows')

The resulting dataframe has 10 rows


### Merge the dataframes

In [11]:
#Join the metadata and the stats on the item id; only ids present in both tables are kept
dfmerged = pd.merge(dfbasic, dfstats, on='id', how='inner')
dfmerged.head()

Unnamed: 0,project_id,id,title,doi,handle,url,published_date,thumb,defined_type,defined_type_name,group_id,url_private_api,url_public_api,url_private_html,url_public_html,timeline,resource_title,resource_doi,views,downloads
0,,6876647,Sci-Tech 201: Infographics,10.6084/m9.figshare.6876647.v1,,https://api.figshare.com/v2/articles/6876647,2018-07-30T11:03:33Z,,7,presentation,,https://api.figshare.com/v2/account/articles/6...,https://api.figshare.com/v2/articles/6876647,https://figshare.com/account/articles/6876647,https://figshare.com/articles/presentation/Sci...,"{'posted': '2018-07-30T11:03:33', 'firstOnline...",,,118,40
1,,20291481,Is there Such Thing as a Typical Day? Libraria...,10.6084/m9.figshare.20291481.v3,,https://api.figshare.com/v2/articles/20291481,2022-07-27T16:48:21Z,,7,presentation,,https://api.figshare.com/v2/account/articles/2...,https://api.figshare.com/v2/articles/20291481,https://figshare.com/account/articles/20291481,https://figshare.com/articles/presentation/Is_...,"{'posted': '2022-07-27T16:48:21', 'firstOnline...",,,56,15
2,,13382954,Sci-Tech201: Researchers' Barriers to Publishi...,10.6084/m9.figshare.13382954.v1,,https://api.figshare.com/v2/articles/13382954,2020-12-15T21:25:55Z,https://s3-eu-west-1.amazonaws.com/pfigshare-u...,7,presentation,,https://api.figshare.com/v2/account/articles/1...,https://api.figshare.com/v2/articles/13382954,https://figshare.com/account/articles/13382954,https://figshare.com/articles/presentation/Sci...,"{'posted': '2020-12-15T21:25:55', 'firstOnline...",,,35,3
3,,6445418,Sci-Tech 101 Global Climate Change Resources -...,10.6084/m9.figshare.6445418.v1,,https://api.figshare.com/v2/articles/6445418,2018-06-05T11:30:17Z,https://ndownloader.figshare.com/files/1185359...,7,presentation,,https://api.figshare.com/v2/account/articles/6...,https://api.figshare.com/v2/articles/6445418,https://figshare.com/account/articles/6445418,https://figshare.com/articles/presentation/Sci...,"{'posted': '2018-06-05T11:30:17', 'firstOnline...",,,435,48
4,,6394226,Sci-Tech 101: Global Climate Change Resources-...,10.6084/m9.figshare.6394226.v3,,https://api.figshare.com/v2/articles/6394226,2018-06-05T11:30:52Z,https://ndownloader.figshare.com/files/1180448...,7,presentation,,https://api.figshare.com/v2/account/articles/6...,https://api.figshare.com/v2/articles/6394226,https://figshare.com/account/articles/6394226,https://figshare.com/articles/presentation/Sci...,"{'posted': '2018-06-05T11:30:52', 'firstOnline...",,,487,96


### If the author has Collections, run this next cell. Otherwise, skip it.

In [None]:
#Retrieve the list of collection metadata for the author, gather stats for each
#collection and add the collection rows to the article dataframe.
#SET THE PAGE SIZE to make sure you get all the records (only one page of results is requested)

#Get collections
#The name is wrapped in quotes for an exact match.
y = {'search_for': ':author: "' + name + '"', 'page_size': 100}

#INST_ID is not set anywhere in this notebook. The search is only narrowed to one
#institution if you have defined INST_ID yourself in an earlier cell.
inst_id = globals().get('INST_ID')
if inst_id is not None:
    y['institution'] = int(inst_id)

r = requests.post(BASE_URL + "/collections/search", params=y)

#Check the status before parsing: an error response is not a list of records
if r.status_code != 200:
    collections = []
    print('Something is wrong:',r.content)
else:
    collections = r.json()
    print('Collected',len(collections),'collection records')

#Create a dataframe from the JSON formatted data
dfcollbasic = pd.DataFrame(collections)

if collections:
    #Gather Stats
    #Create a list of all the collection ids
    coll_ids = [item['id'] for item in collections]

    #Build the file name once so the file written is always the file read back
    coll_stats_file = 'collection-stats'+str(datetime.datetime.now().strftime("%Y-%m-%d"))+'.csv'

    #'with' closes the file (and releases all locks) even if one of the requests fails part way through
    with open(coll_stats_file, 'w', newline='', encoding='utf8') as metadata:
        writer = csv.writer(metadata)
        #Write header row to csv
        writer.writerow(['id','views','downloads'])

        for coll_id in coll_ids:
            #Collections use the 'collection' stats endpoints; the 'article' endpoints
            #would look the id up as an article id
            views = json.loads(requests.get('https://stats.figshare.com/total/views/collection/'+ str(coll_id)).text)
            downloads = json.loads(requests.get('https://stats.figshare.com/total/downloads/collection/'+ str(coll_id)).text)

            #For any of these .get(), adding ",'N/A'" will fill the null cells with 'N/A'. However, metadata assessment counts non nulls
            writer.writerow([
                coll_id,
                views.get('totals'),
                downloads.get('totals')])

    #Open up the same file as a dataframe, using the same encoding it was written with
    dfcollstats = pd.read_csv(coll_stats_file,encoding='utf8')

    dfcollmerged = dfcollbasic.merge(dfcollstats, how='inner', on='id')

    #Add the collection rows to the article rows. DataFrame.append no longer exists in
    #pandas 2, so pd.concat is used. The article part is rebuilt from dfbasic and dfstats
    #so that running this cell twice does not add the collections twice.
    dfmerged = pd.concat([dfbasic.merge(dfstats, how='inner', on='id'), dfcollmerged])
else:
    print('No collections found, the article dataframe is unchanged')


### Format the dates column

In [12]:
#The dates are all contained within one column called 'timeline'.
#Use the JSON to create a better format and then merge with the dataframe
#with the proper article id in a new dataframe

def _timeline_rows(records):
    """Return one dictionary per record holding its timeline dates plus its 'id'.

    A new dictionary is built for every record so the 'timeline' dictionaries
    inside the original records are left untouched. A record without a
    'timeline' contributes a row that only holds its id.
    """
    return [{**(item.get('timeline') or {}), 'id': item['id']} for item in records]


df_dates_items = pd.json_normalize(_timeline_rows(articles))

#'collections' only exists if you ran the Collections cell above
try:
    coll_records = collections or []
except NameError:
    coll_records = []

df_dates_coll = pd.json_normalize(_timeline_rows(coll_records))

#Add the collection dates to the item dates (if collections have been found).
#DataFrame.append no longer exists in pandas 2, so pd.concat is used.
if df_dates_coll.empty:
    df_dates = df_dates_items
else:
    df_dates = pd.concat([df_dates_items, df_dates_coll])

#Merge the date dataframe with the metadata dataframe
df_formatted = dfmerged.merge(df_dates, how='outer', on='id')

print("Dates split out and merged")

Dates split out and merged


### View Totals

In [13]:
#Add up the views and downloads columns across every row and report both totals
total_views = df_formatted['views'].sum()
total_downloads = df_formatted['downloads'].sum()
print('Total views =', total_views, 'and total downloads =', total_downloads)


Total views = 1674 and total downloads = 330


## Get a breakdown of views by country by item

In [14]:
#Root address of the Figshare stats service (note that it ends with a slash)
BASE_URL2 = "https://stats.figshare.com/"

In [19]:
#Collect views broken down by geolocation for every item
breakdown = []
for i in articles:
    #BASE_URL2 ends with '/', so strip it to avoid a double slash in the address.
    #No start or end date is given, so the default timeframe of the stats service applies
    #(the original note here says that is the last month).
    URL = BASE_URL2.rstrip('/') + '/breakdown/total/views/article/' + str(i['id'])
    r = requests.get(URL)
    if r.status_code != 200:
        #Skip this item instead of storing an error message that the next cell cannot read
        print('Skipping item',i['id'],'- something is wrong:',r.content)
        continue
    result = r.json()
    result['item_id'] = i['id'] #Remember which item the numbers belong to
    breakdown.append(result)

In [22]:
#Each country entry may also hold nested values for cities.
#Keep only the total per country, one row per item and country, and create a dataframe.
country_totals = [
    {'item_id': record['item_id'], 'country': country, 'total': stats['total']}
    for record in breakdown
    for country, stats in record['breakdown']['total'].items()
]
df_geoviews = pd.DataFrame(country_totals)
df_geoviews.head(20)

Unnamed: 0,item_id,country,total
0,6876647,United States,17
1,6876647,China,8
2,6876647,Singapore,4
3,6876647,Ireland,2
4,6876647,Japan,2
5,20291481,United States,46
6,20291481,Japan,4
7,20291481,Singapore,2
8,20291481,China,1
9,20291481,Iceland,1


### Map the views

In [31]:
#Add up the views of all items per country, then turn the country index back into a column
views_per_country = df_geoviews.groupby(by='country')['total'].sum()
df_geototals = views_per_country.reset_index()
df_geototals.head()

Unnamed: 0,country,total
0,China,22
1,Denmark,8
2,Germany,5
3,Iceland,1
4,India,1


In [23]:
import plotly.express as px

In [33]:
#Draw a world map coloured by total views per country
#(based on this https://stackoverflow.com/questions/72991306/frequency-map-by-country)

url = "https://raw.githubusercontent.com/python-visualization/folium/master/examples/data"

#Country shapes are passed as the address of a GeoJSON file
world_geojson = f"{url}/world-countries.json"

fig = px.choropleth(
    df_geototals,
    locations="country",           #column holding the country names
    locationmode="country names",  #match the rows to map regions by country name
    geojson=world_geojson,
    color="total",                 #column that sets the colour of each country
)

fig.show()

# Save the spreadsheet

## If you are running this in Google Colab

In [None]:
#Running this cell asks you to authenticate with Google.
#That permission is needed so the notebook can create the files you will download.
from google.colab import drive

drive.mount('/drive')

In [None]:
from google.colab import files

#Build the file name once so the file that is downloaded is always the file that was
#just written (calling now() twice could give two different dates around midnight)
csv_name = str(name) + '-published_records-'+str(datetime.datetime.now().strftime("%Y-%m-%d"))+'.csv'
df_formatted.to_csv(csv_name,encoding='utf-8') #create the CSV
files.download(csv_name) #download to your computer

## If you are running this locally
That is, you downloaded the Jupyter Notebook file and are running it on your own computer.

In [35]:
#Save CSV files of all the metadata and of the views per country.
#Change the file names if necessary to match dates.
#to_csv returns nothing when it is given a file name, so the result is not stored.
date_stamp = str(datetime.datetime.now().strftime("%Y-%m-%d")) #computed once so both files share the same date
df_formatted.to_csv(str(name) + '-published_records-'+date_stamp+'.csv',encoding='utf-8')
df_geototals.to_csv(str(name) + '-record-views-country-'+date_stamp+'.csv',encoding='utf-8')


In [None]:
#Or save an Excel file of all the metadata. Change the file name if necessary to match dates.
#to_excel returns nothing, so the result is not stored.
df_formatted.to_excel(str(name) + '-published_records-'+str(datetime.datetime.now().strftime("%Y-%m-%d"))+'.xlsx')

In [None]:
#OPTIONAL: save the json.
#The item records collected by this notebook are held in 'articles'
#('published_records' is never created here, so using it would stop with a NameError).
with open(str(name) + 'published_records-'+str(datetime.datetime.now().strftime("%Y-%m-%d"))+'.json', 'w') as f:
    json.dump(articles, f)