### Aggregation of Digital Scholarship Archive File Data

The following code uses the REST API provided by Rice's Digital Scholarship Archive (https://scholarship.rice.edu/rest) to aggregate statistics for file views present in the archive by community and collection.

In [1]:
import requests
import json
import pandas as pd
from bs4 import BeautifulSoup as bs

In [2]:
def get_community(jsonroot, name):
    """
    Recursively find a community in dictionary jsonroot 
    with the given name.
    Returns None if no match is found.
    """
    community = None
    for subcommunity in jsonroot['community']:
        if subcommunity['name'] == name:
            return subcommunity
        subsearch = get_community(subcommunity, name)
        if subsearch:
            return subsearch
        
    return None

In [3]:
def get_all_collections(jsonroot):
    """
    Recursively find all collections in dictionary jsonroot.
    Returns a dataframe with columns id, name, and handle.
    """
    ids = []
    names = []
    handles = []
    
    for collection in jsonroot['collection']:
        ids.append(collection['id'])
        names.append(collection['name'])
        handles.append(collection['handle'])
        
    collections = pd.DataFrame({'id': ids, 'name': names, 'handle': handles})
    
    for subcommunity in jsonroot['community']:
        subframe = get_all_collections(subcommunity)
        collections = pd.concat([collections, subframe], ignore_index=True)
    
    return collections

Step 1: Find the community of interest.

In [4]:
# the hierarchy page contains the entire tree-like structure of the Digital Scholarship Archive,
#   where a community is an interior node and a collection is a leaf with associated file data.
hierarchyurl = 'https://scholarship.rice.edu/rest/hierarchy'

response = requests.get(hierarchyurl)
data = json.loads(response.text)
communitydata = get_community(data, 'TIMEA - Travelers in the Middle East Archive')

Step 2: Extract all collections.

In [5]:
allcollections = get_all_collections(communitydata)
allcollections['url'] = 'https://scholarship.rice.edu/rest/collections/' + allcollections['id'] + '/items'

handles = []
collectionnames = []
collectionurls = []

for idx, row in allcollections.iterrows():
    url = row['url']
    
    response = requests.get(url)
    jresponse = json.loads(response.text)

    # annoyingly, there are collections with no items.
    #  this test will throw out such collections.
    if len(jresponse) == 0:
        continue

    data = jresponse[0]
    handles.append(data['handle'].split("/")[1])
    collectionnames.append(row['name'])
    collectionurls.append(url)
    
allcollections = pd.DataFrame({'handle': handles, 'collectionname': collectionnames, 'link': collectionurls})
allcollections

Unnamed: 0,handle,collectionname,link
0,9305,TIMEA Historical Maps and Plans,https://scholarship.rice.edu/rest/collections/...
1,13086,TIMEA Research,https://scholarship.rice.edu/rest/collections/...
2,103799,TIMEA Texts,https://scholarship.rice.edu/rest/collections/...
3,10329,TIMEA Visual Materials,https://scholarship.rice.edu/rest/collections/...


Step 3: Find all item records that live in any collection. Tag each item with the collection it came from.

In [6]:
combine = []
parentcollections = []
names = []

for idx, row in allcollections.iterrows():
    response = requests.get(row['link'] + "?limit=1000")
    items = json.loads(response.text)

    for item in items:
        itemno = item['handle'].split('/')[1]
        combine.append(itemno)
        names.append(item['name'])
        parentcollections.append(row['handle'])

allitems = pd.DataFrame({'itemhandle': combine, 'handle': parentcollections, 'name': names})
allitems['link'] = 'https://scholarship.rice.edu/handle/1911/' + allitems['itemhandle'] + '/statistics'
allitems

Unnamed: 0,itemhandle,handle,name,link
0,9305,9305,Cairo (Masr El-Kahira),https://scholarship.rice.edu/handle/1911/9305/...
1,9417,9305,Map of Cook's Steamer and Dahabeah Service of ...,https://scholarship.rice.edu/handle/1911/9417/...
2,9299,9305,The Delta (Lower Egypt),https://scholarship.rice.edu/handle/1911/9299/...
3,9324,9305,Environs of Alexandria 1:150.000,https://scholarship.rice.edu/handle/1911/9324/...
4,103392,9305,The Suez Canal from the English & French Admir...,https://scholarship.rice.edu/handle/1911/10339...
5,103393,9305,The Suez Canal from the English & French Admir...,https://scholarship.rice.edu/handle/1911/10339...
6,103394,9305,Cairo,https://scholarship.rice.edu/handle/1911/10339...
7,9304,9305,Environs of Alexandria 1:150.000,https://scholarship.rice.edu/handle/1911/9304/...
8,9346,9305,Le Caire (Masr El-Kahira),https://scholarship.rice.edu/handle/1911/9346/...
9,9407,9305,District of Thebes,https://scholarship.rice.edu/handle/1911/9407/...


Step 4: Query each item record to find all associated files, and for each file, extract how many views it has.

In [7]:
allrecords = pd.DataFrame({"itemhandle": [], "filename":[], "views": []})

for idx, row in allitems.iterrows():
    url = row['link']
    r = requests.get(url)
    page = r.text
    soup = bs(page,'html.parser')
   
    res = soup.find_all('table', {'id':"aspect_statistics_StatisticsTransformer_table_list-table"})
    headings = soup.find_all('h3', class_='ds-table-head')
   
    labels = []
    data = []
    handles = []
    
    index = 0
   
    for heading in headings:
        if heading.string == 'File Visits':
            record = res[index]
            labels.extend(list(map(lambda rec: rec.string, record.find_all('td', class_='labelcell'))))
            data.extend(list(map(lambda rec: rec.string, record.find_all('td', class_='datacell'))))
            handles.extend(list(map(lambda rec: row['handle'], record.find_all('td', class_='datacell'))))
        index += 1
       
    itemfiles = pd.DataFrame({'itemhandle': handles, 'filename': labels, 'views': data})
    allrecords = pd.concat([allrecords, itemfiles], ignore_index=True)
    
allrecords

Unnamed: 0,itemhandle,filename,views
0,9305,197540,1671
1,9305,BaeEg2a_228be.jpg(legacy),733
2,9305,BaeEg2a_228be.jpg,103
3,9305,BaeEg2a_228be.jp2,42
4,9305,197585,2088
5,9305,BudNi1895_f05.jpg(legacy),344
6,9305,BudNi1895_f05.jpg,131
7,9305,BudNi1895_f05.jp2,33
8,9305,197589,1961
9,9305,BaeEg2a_007b.jpg(legacy),1730


Step 5: Merge the tables we have created to produce a final dataframe with only File Name, Views, and Collection Name as columns.

In [8]:
final = allrecords.merge(allitems.merge(allcollections, on='handle'),on='itemhandle')
final = final[['filename', 'views', 'collectionname']]
final['views'] = final.views.astype(int)
final

Unnamed: 0,filename,views,collectionname
0,197540,1671,TIMEA Historical Maps and Plans
1,BaeEg2a_228be.jpg(legacy),733,TIMEA Historical Maps and Plans
2,BaeEg2a_228be.jpg,103,TIMEA Historical Maps and Plans
3,BaeEg2a_228be.jp2,42,TIMEA Historical Maps and Plans
4,197585,2088,TIMEA Historical Maps and Plans
5,BudNi1895_f05.jpg(legacy),344,TIMEA Historical Maps and Plans
6,BudNi1895_f05.jpg,131,TIMEA Historical Maps and Plans
7,BudNi1895_f05.jp2,33,TIMEA Historical Maps and Plans
8,197589,1961,TIMEA Historical Maps and Plans
9,BaeEg2a_007b.jpg(legacy),1730,TIMEA Historical Maps and Plans


Step 6: Group by collection name and aggregate the number of views for all files in each community.

In [9]:
final.groupby('collectionname').count()

Unnamed: 0_level_0,filename,views
collectionname,Unnamed: 1_level_1,Unnamed: 2_level_1
TIMEA Historical Maps and Plans,595,595
TIMEA Research,28,28
TIMEA Texts,699,699
TIMEA Visual Materials,3930,3930


In [10]:
final.groupby('collectionname').views.sum()

collectionname
TIMEA Historical Maps and Plans     303800
TIMEA Research                        8851
TIMEA Texts                        1296840
TIMEA Visual Materials             1019152
Name: views, dtype: int64