### Aggregation of Digital Scholarship Archive File Data

The following code uses the REST API provided by Rice's Digital Scholarship Archive (https://scholarship.rice.edu/rest) to aggregate statistics for file views present in the archive by community and collection.

In [1]:
import requests
import json
import pandas as pd
from bs4 import BeautifulSoup as bs

In [2]:
def get_community(jsonroot, name):
    """
    Recursively find a community in dictionary jsonroot 
    with the given name.
    Returns None if no match is found.
    """
    community = None
    for subcommunity in jsonroot['community']:
        if subcommunity['name'] == name:
            return subcommunity
        subsearch = get_community(subcommunity, name)
        if subsearch:
            return subsearch
        
    return None

In [3]:
def get_all_collections(jsonroot):
    """
    Recursively find all collections in dictionary jsonroot.
    Returns a dataframe with columns id, name, and handle.
    """
    ids = []
    names = []
    handles = []
    
    for collection in jsonroot['collection']:
        ids.append(collection['id'])
        names.append(collection['name'])
        handles.append(collection['handle'])
        
    collections = pd.DataFrame({'id': ids, 'name': names, 'handle': handles})
    
    for subcommunity in jsonroot['community']:
        subframe = get_all_collections(subcommunity)
        collections = pd.concat([collections, subframe], ignore_index=True)
    
    return collections

Step 1: Find the community of interest.

In [4]:
# the hierarchy page contains the entire tree-like structure of the Digital Scholarship Archive,
#   where a community is an interior node and a collection is a leaf with associated file data.
hierarchyurl = 'https://scholarship.rice.edu/rest/hierarchy'

response = requests.get(hierarchyurl)
data = json.loads(response.text)
communitydata = get_community(data, 'Subway Advertisement Archive')

Step 2: Extract all collections.

In [5]:
allcollections = get_all_collections(communitydata)
allcollections['url'] = 'https://scholarship.rice.edu/rest/collections/' + allcollections['id'] + '/items'

handles = []
collectionnames = []
collectionurls = []

for idx, row in allcollections.iterrows():
    url = row['url']
    
    response = requests.get(url)
    jresponse = json.loads(response.text)

    # annoyingly, there are collections with no items.
    #  this test will throw out such collections.
    if len(jresponse) == 0:
        continue

    data = jresponse[0]
    handles.append(data['handle'].split("/")[1])
    collectionnames.append(row['name'])
    collectionurls.append(url)
    
allcollections = pd.DataFrame({'handle': handles, 'collectionname': collectionnames, 'link': collectionurls})
allcollections

Unnamed: 0,handle,collectionname,link
0,57879,Beijing,https://scholarship.rice.edu/rest/collections/...
1,41175,Hongkong,https://scholarship.rice.edu/rest/collections/...
2,41543,Kaoshiung,https://scholarship.rice.edu/rest/collections/...
3,40034,Shanghai,https://scholarship.rice.edu/rest/collections/...
4,42080,Singapore,https://scholarship.rice.edu/rest/collections/...
5,43568,Taipei,https://scholarship.rice.edu/rest/collections/...


Step 3: Find all item records that live in any collection. Tag each item with the collection it came from.

In [6]:
combine = []
parentcollections = []
names = []

pagesize = 1000

for idx, row in allcollections.iterrows():
    # handle pagination
    offset = 0
    stop = False
    
    while not stop:
        # get a page of data
        response = requests.get(row['link'], params={'limit': pagesize, 'offset': offset})
        items = json.loads(response.text)

        # extract item handle, item name, and tag with parent collection.
        for item in items:
            itemno = item['handle'].split('/')[1]
            combine.append(itemno)
            names.append(item['name'])
            parentcollections.append(row['handle'])
            
        # if we have fewer than pagesize records, then we are done.
        #  otherwise, get the next page.
        if len(items) < pagesize:
            stop = True
        else:
            offset += pagesize

allitems = pd.DataFrame({'itemhandle': combine, 'handle': parentcollections, 'name': names})
allitems['link'] = 'https://scholarship.rice.edu/handle/1911/' + allitems['itemhandle'] + '/statistics'
allitems

Unnamed: 0,itemhandle,handle,name,link
0,57879,57879,"China Putian Cell Phone, China Putian. www.cap...",https://scholarship.rice.edu/handle/1911/57879...
1,57904,57879,"Motorola Cell Phone, Motorola C300.",https://scholarship.rice.edu/handle/1911/57904...
2,57914,57879,"China Merchants Securities Co., Ltd. (CCSC),",https://scholarship.rice.edu/handle/1911/57914...
3,57934,57879,"Olympus Camera, Olympus",https://scholarship.rice.edu/handle/1911/57934...
4,57935,57879,"Colgate toothbrush, Colgate",https://scholarship.rice.edu/handle/1911/57935...
5,57943,57879,"Cadbury Chocolate, Cadbury Chocolate",https://scholarship.rice.edu/handle/1911/57943...
6,57946,57879,"Beijing Metro Service Standards,",https://scholarship.rice.edu/handle/1911/57946...
7,57947,57879,"Sparkle Fashion, Sparkle. Everyday fashion.",https://scholarship.rice.edu/handle/1911/57947...
8,57949,57879,"China Orchestra,",https://scholarship.rice.edu/handle/1911/57949...
9,57954,57879,"South Australia Trip, Perth. Western Australia...",https://scholarship.rice.edu/handle/1911/57954...


Step 4: Query each item record to find all associated files, and for each file, extract how many views it has.

In [7]:
allrecords = pd.DataFrame({"itemhandle": [], "filename":[], "views": []})

for idx, row in allitems.iterrows():
    url = row['link']
    r = requests.get(url)
    page = r.text
    soup = bs(page,'html.parser')
   
    res = soup.find_all('table', {'id':"aspect_statistics_StatisticsTransformer_table_list-table"})
    headings = soup.find_all('h3', class_='ds-table-head')
   
    labels = []
    data = []
    handles = []
    
    index = 0
   
    for heading in headings:
        if heading.string == 'File Visits':
            record = res[index]
            labels.extend(list(map(lambda rec: rec.string, record.find_all('td', class_='labelcell'))))
            data.extend(list(map(lambda rec: rec.string, record.find_all('td', class_='datacell'))))
            handles.extend(list(map(lambda rec: row['handle'], record.find_all('td', class_='datacell'))))
        index += 1
       
    itemfiles = pd.DataFrame({'itemhandle': handles, 'filename': labels, 'views': data})
    allrecords = pd.concat([allrecords, itemfiles], ignore_index=True)
    
allrecords

Unnamed: 0,itemhandle,filename,views
0,57879,587293,69
1,57879,putiancellphonea02b1.jpg(legacy),66
2,57879,putiancellphonea02b2.jpg(legacy),50
3,57879,putiancellphonea02b3.jpg(legacy),49
4,57879,putiancellphonea02b2.jpg.jpg(legacy),29
5,57879,587291,27
6,57879,putiancellphonea02b1.jpg,18
7,57879,putiancellphonea02b3.jpg,3
8,57879,putiancellphonea02b2.jpg,2
9,57879,587412,91


Step 5: Merge the tables we have created to produce a final dataframe with only File Name, Views, and Collection Name as columns.

In [8]:
final = allrecords.merge(allitems.merge(allcollections, on='handle'),on='itemhandle')
final = final[['filename', 'views', 'collectionname']]
final['views'] = final.views.astype(int)
final

Unnamed: 0,filename,views,collectionname
0,587293,69,Beijing
1,putiancellphonea02b1.jpg(legacy),66,Beijing
2,putiancellphonea02b2.jpg(legacy),50,Beijing
3,putiancellphonea02b3.jpg(legacy),49,Beijing
4,putiancellphonea02b2.jpg.jpg(legacy),29,Beijing
5,587291,27,Beijing
6,putiancellphonea02b1.jpg,18,Beijing
7,putiancellphonea02b3.jpg,3,Beijing
8,putiancellphonea02b2.jpg,2,Beijing
9,587412,91,Beijing


Step 6: Group by collection name and aggregate the number of views for all files in each community.

In [9]:
final.groupby('collectionname').filename.count()

collectionname
Beijing      4730
Hongkong     1755
Kaoshiung     171
Shanghai     4957
Singapore    1965
Taipei       4090
Name: filename, dtype: int64

In [10]:
final.groupby('collectionname').views.sum()

collectionname
Beijing      249143
Hongkong     113365
Kaoshiung      7800
Shanghai     264322
Singapore    161815
Taipei       191555
Name: views, dtype: int64