In [1]:
%load_ext autoreload
%autoreload 2

import htrc_features
import htrc_features.resolvers
from htrc_features import Volume, resolvers, FeatureReader, caching
import os
from pathlib import Path


# A resolver chain


A resolver chain. It works from the bottom up: it first looks for a feather file in /drobo/hathi-ef; if that doesn't exist, it creates one from
the json.bz2 file in the same directory; if **that** doesn't exist, it fetches the volume over HTTP. This will likely need to be edited for your setup.
In general, my preference is for this stuff to live in a local json file.

In [2]:
# Resolver specifications, listed in the order make_resolver_chain expects.
# Both stubbytree entries point at the same local directory.
resolver_specs = [
    dict(method="http"),
    dict(method="stubbytree", format="json", compression="bz2", dir="/drobo/hathi-ef"),
    dict(method="stubbytree", format="feather", compression="zstd", dir="/drobo/hathi-ef"),
]
my_resolver = resolvers.make_resolver_chain(resolver_specs)

In [3]:
import pandas as pd
# All the Hathi books with 'novel' in the title.
novels = pd.read_csv(
    "novels.csv",
    names=["title", "id"],
)

In [4]:
# Shuffle the whole table (frac=1 samples every row); the fixed seed keeps the order repeatable.
n = novels.sample(random_state=1, frac=1)

In [6]:
# Preview the first five rows of the (unshuffled) table.
novels.head(n=5)

Unnamed: 0,title,id
0,Dorothy Forster : a novel /--- v.1,uiuo.ark:/13960/t5h99d66w
1,Lost for love : a novel /--- v.3,uiuo.ark:/13960/t19k4w11x
2,The Hampstead mystery : a novel /--- v.3,uiuo.ark:/13960/t2988r476
3,"The king's mirror; a novel,",uc2.ark:/13960/t07w6fx15
4,"The coast of Bohemia; a novel,",coo1.ark:/13960/t8x92rx3w


In [5]:
from pathlib import Path

In [9]:
import gzip
import json
import re

# This is just demo code: the loop stops once the index exceeds this value,
# so volumes 0..DEMO_LIMIT are processed.
DEMO_LIMIT = 10
# Target number of words per chunk; each chunk becomes one Bookworm document.
CHUNK_TARGET = 5000
# Width of the text progress bar, in characters.
BAR_WIDTH = 80
# Metadata fields copied verbatim from the volume into the catalog entry.
META_FIELDS = ['id', 'pub_date', 'language', 'access_profile', 'genre', 'contributor', 'handle_url',
               'lcc', 'source_institution', 'pub_place', 'title', 'publisher']


def _to_formfeed_csv(frame):
    """Return `frame` as header-less, index-less CSV text whose rows end in a form feed.

    pandas renamed the `line_terminator` keyword to `lineterminator` (1.5) and
    later dropped the old spelling, so try the new name first and fall back to
    the old one on pandas versions that do not know it.
    """
    try:
        return frame.to_csv(header=False, lineterminator="\f", index=False)
    except TypeError:
        return frame.to_csv(header=False, line_terminator="\f", index=False)


# `with` guarantees both files are flushed and closed even if a volume raises
# something unexpected. The names `output`, `catalog`, `id` and `v` are kept
# because later cells in this notebook read them.
with gzip.open(Path("input.unigrams.txt.gz").expanduser(), "w") as output, \
        open(Path("jsoncatalog.txt").expanduser(), "w") as catalog:
    for i, id in enumerate(n.id):
        if i > DEMO_LIMIT:
            break

        # A progress bar, redrawn in place via the leading carriage return.
        fract = int(BAR_WIDTH * i / n.shape[0])
        print(f"\r{i:4}" + "=" * fract + "_" * (BAR_WIDTH - fract) + "|", end="")

        # Open the volume with htrc-feature-reader.
        try:
            v = Volume(id, id_resolver=my_resolver)
        except FileNotFoundError:
            print(id, "not found")
            continue

        # Start the metadata record we'll fill up.
        meta = {k: v.parser.meta[k] for k in META_FIELDS}

        # Best effort: take the first four-digit number in the contributor field
        # as the author's birth year. Skip silently when there is no such number
        # (IndexError) or the field is not a string (TypeError).
        try:
            meta['first_author_birth'] = int(re.findall("[0-9]{4}", meta['contributor'])[0])
        except (IndexError, TypeError):
            pass

        try:
            # For this project, I only wanted English stuff.
            if "language" not in meta or "eng" not in meta['language']:
                continue

            # IMPORTANT--getting the tokenlist out. Here I'm building a bookworm
            # where the documents are CHUNK_TARGET-word chunks.
            chunked = (
                v.tokenlist(chunk=True, chunk_target=CHUNK_TARGET)
                .reset_index().groupby(["chunk", "token"])['count']
                .sum().reset_index().groupby("chunk")
            )
            # Number of chunks is loop-invariant; the small epsilon keeps the
            # position ratios below strictly under 1 when ix equals n_chunks.
            n_chunks = len(chunked) + 1e-4

            for ix, group in chunked:
                # IMPORTANT: CSV with a form-feed row terminator is the output
                # type that Bookworm wants.
                tokencount = _to_formfeed_csv(group[["token", "count"]])

                # Tab is a valid character in some tokens in htrc-features, but
                # screws things up for Bookworm. Kludgy.
                tokencount = tokencount.replace("\t", "{tab}")

                # Populate the per-chunk metadata.
                meta['filename'] = meta['id'] + "-" + str(ix)
                meta['searchstring'] = f"<a href={meta['handle_url']}>{meta['title']}</a>"
                meta['third'] = 1 + int(3 * ix / n_chunks)
                meta['sixth'] = 1 + int(6 * ix / n_chunks)
                meta['twelfth'] = 1 + int(12 * ix / n_chunks)

                # One line per chunk in input.unigrams.txt.gz, one JSON record
                # per chunk in jsoncatalog.txt.
                output.write(f"{meta['filename']}\t{tokencount}\n".encode("utf-8"))
                catalog.write(json.dumps(meta) + "\n")
        except Exception as err:
            # Keep going on a bad volume, but say what went wrong. Catching
            # Exception (not a bare except) lets Ctrl-C interrupt the run.
            print("Some error on", id, "-", repr(err))
            continue

0
   0________________________________________________________________________________|1
   1________________________________________________________________________________|2
   2________________________________________________________________________________|3
   3________________________________________________________________________________|4
   4________________________________________________________________________________|5
   5________________________________________________________________________________|6
   6________________________________________________________________________________|7
   7________________________________________________________________________________|8
   8________________________________________________________________________________|9
   9________________________________________________________________________________|10
  10________________________________________________________________________________|11


In [None]:
# Re-open a volume for interactive inspection. `id` here is whatever value the
# export loop's `for i, id in enumerate(n.id)` left behind, so this cell only
# works after that loop cell has been run.
v = Volume(id, id_resolver=my_resolver)

In [None]:
# Peek at ten randomly chosen rows of the volume's token list.
v.tokenlist().sample(n=10)

In [None]:
# Close the token file first, then the catalog file.
for handle in (output, catalog):
    handle.close()

In [None]:
"""
{
  "plottype": "linechart",
  "smoothingSpan": 0,
  "host": "http://localhost:10012/",
  "database": "fiction",
  "aesthetic": {
    "x": "sixth",
    "y": "WordsPerMillion"
  },
  "search_limits": {"word":["sleep"]},
  "vega": {
    "width": 400,
    "title": "Number of books in the Hathi trust by year, top 12 languages."
  }
}
"""