### Aggregation of Digital Scholarship Archive File Data

The following code uses the REST API provided by Rice's Digital Scholarship Archive (https://scholarship.rice.edu/rest) to aggregate statistics for file views present in the archive by community and collection.

In [1]:
import requests
import json
import pandas as pd
from bs4 import BeautifulSoup as bs

In [2]:
def get_community(jsonroot, name):
    """
    Recursively search the hierarchy dictionary jsonroot for a community
    with the given name.

    Parameters
    ----------
    jsonroot : dict
        A hierarchy node. Its child communities are read from the
        'community' key; a node without that key (or with a null value
        there) is treated as having no children.
    name : str
        Exact community name to look for.

    Returns
    -------
    dict or None
        The first matching community node in depth-first order, or None
        if no match is found. jsonroot itself is never compared against
        name; only its descendants are.
    """
    for subcommunity in jsonroot.get('community') or []:
        if subcommunity['name'] == name:
            return subcommunity
        # Descend before moving on to the next sibling (depth-first search).
        subsearch = get_community(subcommunity, name)
        if subsearch is not None:
            return subsearch

    return None

In [3]:
def _collect_collection_records(node, prefix, records):
    """
    Append one (id, name, handle) tuple to records for every collection
    under node, visiting node's own collections first and then each
    sub-community depth-first.
    """
    for collection in node.get('collection') or []:
        records.append((collection['id'],
                        prefix + str(collection['name']),
                        collection['handle']))

    for subcommunity in node.get('community') or []:
        _collect_collection_records(
            subcommunity,
            prefix + str(subcommunity['name']) + " > ",
            records,
        )


def get_all_collections(jsonroot, prefix=""):
    """
    Recursively find all collections in dictionary jsonroot.

    Parameters
    ----------
    jsonroot : dict
        A community node. Collections are read from its 'collection' key
        and sub-communities from its 'community' key; a missing or null
        key is treated as an empty list.
    prefix : str, optional
        Path prepended to every collection name. If it does not end in
        " > ", it is replaced by the name of jsonroot followed by " > ".

    Returns
    -------
    pandas.DataFrame
        Columns id, name, and handle, one row per collection. Each name is
        the " > "-separated path of community names followed by the
        collection name. Rows are in depth-first order with a fresh
        0..n-1 index.
    """
    if not prefix[-3:] == " > ":
        prefix = str(jsonroot['name']) + " > "

    # Gather plain tuples during the recursion and build the frame once,
    # rather than concatenating a (possibly empty) DataFrame per community.
    records = []
    _collect_collection_records(jsonroot, prefix, records)
    return pd.DataFrame(records, columns=['id', 'name', 'handle'])

Step 1: Find the community of interest.

In [4]:
# The hierarchy page contains the entire tree-like structure of the Digital Scholarship Archive,
#   where a community is an interior node and a collection is a leaf with associated file data.
hierarchyurl = 'https://scholarship.rice.edu/rest/hierarchy'
communityname = 'Chao Center for Asian Studies'

# A timeout keeps the cell from hanging indefinitely if the server stalls, and
#   raise_for_status reports an HTTP error here instead of as a JSON decoding failure below.
response = requests.get(hierarchyurl, timeout=60)
response.raise_for_status()
data = json.loads(response.text)

communitydata = get_community(data, communityname)
if communitydata is None:
    # get_community returns None when nothing matches; stop here with a clear message
    #   rather than letting a later cell fail while indexing into None.
    raise ValueError("Community not found in hierarchy: " + communityname)

Step 2: Extract all collections.

In [5]:
# Flatten the selected community into a table of collections, then preview
#   the first five fully-qualified collection names.
allcollections = get_all_collections(communitydata)
allcollections['name'].head(5).tolist()

['Chao Center for Asian Studies > Bobby Joe Moon family genealogy collection (MS 581)',
 "Chao Center for Asian Studies > Braes Republican Women's Club records, 1980-2009 (MS 601)",
 'Chao Center for Asian Studies > Chinese American Citizens Alliance records (Houston Chapter) (MS 606)',
 'Chao Center for Asian Studies > Chuan-Hua Gershom Lowe, National Party, Chinese diplomat papers, 1923-1996 (MS 588)',
 'Chao Center for Asian Studies > Dr. Ed C. M. Chen collection of oral history interviews with Asian American Houstonians, 1980s (MS 646)']

In [6]:
# Rebuild the collection table from the hierarchy first: allcollections is overwritten with a
#   different set of columns at the end of this cell, so starting fresh keeps the cell re-runnable.
allcollections = get_all_collections(communitydata)
allcollections['url'] = 'https://scholarship.rice.edu/rest/collections/' + allcollections['id'] + '/items'

handles = []
collectionnames = []
collectionurls = []

for collectionname, url in zip(allcollections['name'], allcollections['url']):
    # The timeout avoids an indefinite hang; raise_for_status reports HTTP errors
    #   directly instead of as a confusing JSON/indexing failure further down.
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    jresponse = json.loads(response.text)

    # annoyingly, there are collections with no items.
    #  this test will throw out such collections.
    if len(jresponse) == 0:
        continue

    # The 'handle' recorded for a collection is the part after the "/" of the handle of the
    #   FIRST item returned for it (not the collection's own handle); later cells use this
    #   value as the key that ties items and files back to the collection.
    firstitem = jresponse[0]
    handles.append(firstitem['handle'].split("/")[1])
    collectionnames.append(collectionname)
    collectionurls.append(url)

allcollections = pd.DataFrame({'handle': handles, 'collectionname': collectionnames, 'link': collectionurls})
allcollections

Unnamed: 0,handle,collectionname,link
0,75178,Chao Center for Asian Studies > Bobby Joe Moon...,https://scholarship.rice.edu/rest/collections/...
1,77623,Chao Center for Asian Studies > Braes Republic...,https://scholarship.rice.edu/rest/collections/...
2,75797,Chao Center for Asian Studies > Chinese Americ...,https://scholarship.rice.edu/rest/collections/...
3,77631,Chao Center for Asian Studies > Chuan-Hua Gers...,https://scholarship.rice.edu/rest/collections/...
4,64031,Chao Center for Asian Studies > Dr. Ed C. M. C...,https://scholarship.rice.edu/rest/collections/...
5,101509,Chao Center for Asian Studies > Edward Chen fa...,https://scholarship.rice.edu/rest/collections/...
6,75180,Chao Center for Asian Studies > Gene and Hedy ...,https://scholarship.rice.edu/rest/collections/...
7,68486,Chao Center for Asian Studies > Vietnamese Ame...,https://scholarship.rice.edu/rest/collections/...
8,100395,Chao Center for Asian Studies > Ephemera Archi...,https://scholarship.rice.edu/rest/collections/...
9,100667,Chao Center for Asian Studies > Ephemera Archi...,https://scholarship.rice.edu/rest/collections/...


Step 3: Find all item records that live in any collection. Tag each item with the collection it came from.

In [7]:
combine = []            # item handle suffixes (the part after the "/")
parentcollections = []  # collection key (allcollections['handle']) for each item
names = []              # item names

pagesize = 1000

for collectionhandle, itemsurl in zip(allcollections['handle'], allcollections['link']):
    # handle pagination
    offset = 0

    while True:
        # get a page of data; the timeout avoids an indefinite hang and raise_for_status
        #   reports HTTP errors here rather than as a JSON decoding failure.
        response = requests.get(itemsurl, params={'limit': pagesize, 'offset': offset}, timeout=60)
        response.raise_for_status()
        items = json.loads(response.text)

        # extract item handle, item name, and tag with parent collection.
        for item in items:
            combine.append(item['handle'].split('/')[1])
            names.append(item['name'])
            parentcollections.append(collectionhandle)

        # if we have fewer than pagesize records, then we are done.
        #  otherwise, get the next page.
        # NOTE(review): this assumes the server returns up to `pagesize` items per request;
        #   if it caps pages at a smaller size, the loop would stop early. TODO confirm.
        if len(items) < pagesize:
            break
        offset += pagesize

allitems = pd.DataFrame({'itemhandle': combine, 'handle': parentcollections, 'name': names})
allitems['link'] = 'https://scholarship.rice.edu/handle/1911/' + allitems['itemhandle'] + '/statistics'
allitems

Unnamed: 0,itemhandle,handle,name,link
0,75178,75178,Moon family at time of arrival in United State...,https://scholarship.rice.edu/handle/1911/75178...
1,75176,75178,"Joe Bros. store, Cleveland, Mississippi",https://scholarship.rice.edu/handle/1911/75176...
2,75177,75178,Newsclipping featuring image of parents of Bob...,https://scholarship.rice.edu/handle/1911/75177...
3,77623,77623,Braes Republican Women's Club swearing in cere...,https://scholarship.rice.edu/handle/1911/77623...
4,77624,77623,Greater Houston Council of Federated Republica...,https://scholarship.rice.edu/handle/1911/77624...
5,77622,77623,Braes Republican Women's Club bylaws and philo...,https://scholarship.rice.edu/handle/1911/77622...
6,77625,77623,Braes Republican Women scrapbook pages featuri...,https://scholarship.rice.edu/handle/1911/77625...
7,75797,75797,Miss Chinatown Houston Pageant Day proclamation,https://scholarship.rice.edu/handle/1911/75797...
8,75798,75797,Chinese American Citizens Alliance Houston Lod...,https://scholarship.rice.edu/handle/1911/75798...
9,75799,75797,Houston Lodge Chinese American Citizens Allian...,https://scholarship.rice.edu/handle/1911/75799...


Step 4: Query each item record to find all associated files, and for each file, extract how many views it has.

In [8]:
# Collect one small frame per item and concatenate once at the end; calling pd.concat
#   inside the loop re-copies every row accumulated so far on each iteration.
itemframes = []

for idx, row in allitems.iterrows():
    url = row['link']
    # The timeout avoids an indefinite hang. The HTTP status is deliberately not checked:
    #   as before, a page without the expected tables just contributes no rows.
    r = requests.get(url, timeout=60)
    page = r.text
    soup = bs(page, 'html.parser')

    res = soup.find_all('table', {'id': "aspect_statistics_StatisticsTransformer_table_list-table"})
    headings = soup.find_all('h3', class_='ds-table-head')

    labels = []
    data = []
    handles = []

    # The n-th heading is paired with the n-th statistics table on the page;
    #   only tables under a 'File Visits' heading are read.
    for index, heading in enumerate(headings):
        if heading.string != 'File Visits':
            continue
        record = res[index]
        labelcells = record.find_all('td', class_='labelcell')
        datacells = record.find_all('td', class_='datacell')
        labels.extend(cell.string for cell in labelcells)
        data.extend(cell.string for cell in datacells)
        # NOTE: row['handle'] is the parent-collection key of this item, so the
        #   'itemhandle' column below identifies the collection the file belongs to,
        #   not the individual item. The downstream merge relies on these values.
        handles.extend([row['handle']] * len(datacells))

    itemfiles = pd.DataFrame({'itemhandle': handles, 'filename': labels, 'views': data})
    if not itemfiles.empty:
        itemframes.append(itemfiles)

if itemframes:
    allrecords = pd.concat(itemframes, ignore_index=True)
else:
    # No file statistics found anywhere: keep the same empty three-column schema.
    allrecords = pd.DataFrame({"itemhandle": [], "filename": [], "views": []})

allrecords

Unnamed: 0,itemhandle,filename,views
0,75178,wrc03901.jpg(legacy),72
1,75178,wrc03901.jp2(legacy),24
2,75178,wrc03901.jp2,18
3,75178,wrc03901.jpg,2
4,75178,wrc03899.jpg(legacy),65
5,75178,wrc03899.jpg,35
6,75178,wrc03899.jp2,23
7,75178,wrc03899.jp2(legacy),14
8,75178,wrc03900.pdf(legacy),256
9,75178,wrc03900.pdf,58


Step 5: Merge the tables we have created to produce a final dataframe with only File Name, Views, and Collection Name as columns.

In [9]:
# Attach the collection name to every item, then attach that to every file record.
# NOTE(review): allrecords['itemhandle'] is filled from the parent-collection key, and that key
#   is the handle of the collection's first item, so each file record matches that one item row
#   and picks up the right collection name. Keep this in mind before changing either column.
itemcollections = allitems.merge(allcollections, on='handle')

# Build the result as one chain and convert views with .assign: assigning a column on a
#   frame that was just sliced with [[...]] can raise SettingWithCopyWarning.
final = (
    allrecords
    .merge(itemcollections, on='itemhandle')
    [['filename', 'views', 'collectionname']]
    .assign(views=lambda frame: frame['views'].astype(int))
)
final

Unnamed: 0,filename,views,collectionname
0,wrc03901.jpg(legacy),72,Chao Center for Asian Studies > Bobby Joe Moon...
1,wrc03901.jp2(legacy),24,Chao Center for Asian Studies > Bobby Joe Moon...
2,wrc03901.jp2,18,Chao Center for Asian Studies > Bobby Joe Moon...
3,wrc03901.jpg,2,Chao Center for Asian Studies > Bobby Joe Moon...
4,wrc03899.jpg(legacy),65,Chao Center for Asian Studies > Bobby Joe Moon...
5,wrc03899.jpg,35,Chao Center for Asian Studies > Bobby Joe Moon...
6,wrc03899.jp2,23,Chao Center for Asian Studies > Bobby Joe Moon...
7,wrc03899.jp2(legacy),14,Chao Center for Asian Studies > Bobby Joe Moon...
8,wrc03900.pdf(legacy),256,Chao Center for Asian Studies > Bobby Joe Moon...
9,wrc03900.pdf,58,Chao Center for Asian Studies > Bobby Joe Moon...


Step 6: Group by collection name, then count the files and sum the file views in each collection.

In [10]:
# Number of file records in each collection.
final.groupby('collectionname')['filename'].count()

collectionname
Chao Center for Asian Studies > Bobby Joe Moon family genealogy collection (MS 581)                                                                10
Chao Center for Asian Studies > Braes Republican Women's Club records, 1980-2009 (MS 601)                                                          13
Chao Center for Asian Studies > Chinese American Citizens Alliance records (Houston Chapter) (MS 606)                                              50
Chao Center for Asian Studies > Chuan-Hua Gershom Lowe, National Party, Chinese diplomat papers, 1923-1996 (MS 588)                                 8
Chao Center for Asian Studies > Dr. Ed C. M. Chen collection of oral history interviews with Asian American Houstonians, 1980s (MS 646)           105
Chao Center for Asian Studies > Edward Chen family collection, 1920-2010 (MS 561)                                                                   6
Chao Center for Asian Studies > Ephemera Archive > Chinese Commercial Advertisement A

In [11]:
# Total file views in each collection.
final.groupby('collectionname')['views'].sum()

collectionname
Chao Center for Asian Studies > Bobby Joe Moon family genealogy collection (MS 581)                                                                 567
Chao Center for Asian Studies > Braes Republican Women's Club records, 1980-2009 (MS 601)                                                           930
Chao Center for Asian Studies > Chinese American Citizens Alliance records (Houston Chapter) (MS 606)                                              2361
Chao Center for Asian Studies > Chuan-Hua Gershom Lowe, National Party, Chinese diplomat papers, 1923-1996 (MS 588)                                 322
Chao Center for Asian Studies > Dr. Ed C. M. Chen collection of oral history interviews with Asian American Houstonians, 1980s (MS 646)            6086
Chao Center for Asian Studies > Edward Chen family collection, 1920-2010 (MS 561)                                                                  4908
Chao Center for Asian Studies > Ephemera Archive > Chinese Commercial Adv