In [2]:
# For development, use local paths.

import sys
sys.path.append("..")

In [3]:
# Load local
%load_ext autoreload
%autoreload 2

In [4]:
import nonconsumptive as nc


# Feature counts from text files.

This notebook creating a set of document-level feature count files akin to those distributed by the Hathi Trust, but from Project Gutenberg text files in the folder `{nonconsumptive_root}/sample_inputs/gutenberg/texts`.
Metadata is read from a file called "metadata.json" and bound to files based on their filenames.

Files are stored as parquet, which allows for fast processing.

# Create a corpus

First, create a corpus. Every corpus has to be build from a strategy for retrieving texts, and a strategy for retrieving metadata.

Ideally these will be disentangled. Some strategies might include:

* metadata from { csv, yaml header block, TEI header blocks }
* text from { set of files }
* ids from { filename, filename plus directory, first column of mallet input,  etc. }

The ids allow looking up the texts in the metadata.



In [5]:
import tempfile
from pathlib import Path
Path(tempfile.gettempdir() + "/gutenberg4").mkdir(exist_ok = True)

gutenberg = nc.Corpus(
    texts = "../sample_inputs/gutenberg/texts",
    metadata = "../sample_inputs/gutenberg/metadata.ndjson",
    dir = Path(tempfile.gettempdir() + "/gutenberg4"),
    compression = "gz",
    cache_set = {"tokenization"}
)

## Metadata

The metadata is stored internally as a pyarrow table with some wrappers to ensure type integrity.

Based on internal data and column types, this will leverage some Bookworm code to determine that "date"  or "year" are date type columns.
It should also be able to discriminate between "categorical" types (or in library parlance, "controlled vocabulary" fields and free entry ones, perhaps with additional help from configuration files.


In [6]:
gutenberg.metadata.tb.to_pandas().head(4)



 None 




Unnamed: 0,htid,pubdate,title,author,@id
0,dul1.ark+=13960=t3kw6ns1s,1851,"Moby-Dick; or, The Whale","Melville, Herman",15
1,coo.31924014152700,1894,Far from the Madding Crowd,"Hardy, Thomas",27
2,nyp.33433075744890,1905,The Scarlet Pimpernel,"Orczy, Emmuska Orczy, Baroness",60
3,hvd.32044004480208,1917,A Princess of Mars,"Burroughs, Edgar Rice",62


Individual entries can be retrieved by their identifier. The identifier field should be called 'filename' or 'id,' or (ultimately) specified in the definition.

In [7]:
gutenberg.metadata.get("27")

{'htid': 'coo.31924014152700',
 'pubdate': 1894,
 'title': 'Far from the Madding Crowd',
 'author': 'Hardy, Thomas',
 '@id': '27'}

In [8]:
gutenberg.get_document("27")

<DOCUMENT> {"htid": "coo.31924014152700", "pubdate": 1894, "title": "Far from the Madding Crowd", "author": "Hardy, Thomas", "@id": "27"}

Getting from the corpus as a whole returns a "Document" item. 

You can also access components of any stream (tokenization, token_counts, bigram_counts, etc.) directly through
the get_id function. This is not especially fast because it has to look up locations, and generally if you're working on the full corpus you should just iterate through in order.

In [9]:
gutenberg.tokenization.get_id("15")['token'][3320:3340]

Building cache


<pyarrow.lib.StringArray object at 0x11a116b40>
[
  ",",
  "you",
  "would",
  "make",
  "them",
  "speak",
  "like",
  "great",
  "whales",
  ".”",
  "—",
  "_Goldsmith",
  "to",
  "Johnson_",
  ".",
  "“",
  "In",
  "the",
  "afternoon",
  "we"
]

# Documents

The documents part of the corpus is structured as an iterator, because it's generally foolhardy to read in all the documents at once.

Right now, the text of the document is read at iteration. Ultimately, the strategy would be to read only the parts of the document 
as needed from the corpus. (For example, if you request feature counts, it's fine if the raw document isn't there.)

In [10]:
gutenberg.documents # Is an iterable.

<generator object Corpus.documents at 0x11a0952d0>

## An individual document

`random` and `first` are convenience methods to get a single document.

In [11]:
one_book = gutenberg.first()
one_book

<DOCUMENT> {"htid": "dul1.ark+=13960=t3kw6ns1s", "pubdate": 1851, "title": "Moby-Dick; or, The Whale", "author": "Melville, Herman", "@id": "15"}

In [12]:
counts = gutenberg.token_counts.get_id(one_book.metadata['@id'])
# Capital lettered words among the 200 most common.
counts.to_pandas().sort_values("count")[-200:].query("token.str.match('[A-Z]')")

Building cache


Unnamed: 0,token,count
758,There,150
131,A,155
241,Pequod,174
987,What,174
397,Starbuck,198
695,For,199
2274,Captain,205
470,He,229
286,In,239
399,Queequeg,252


## Wordcounts

Wordcounts are a basic element of nonconsumptive reading that can be used in analysis or stored. They are returned as a pyarrow RecordBatch.

In [13]:
import pyarrow as pa
pa.Table.from_batches([one_book.wordcounts]).to_pandas().query("token.str.match('whal')").head(5)

Unnamed: 0,token,count
500,whale,914
598,whales,239
1413,whale_,3
1829,whalemen,70
2443,whaling,118


## Metadata on wordcounts

The schema includes metadata. Figuring out how to dress this up into full json-ld  is a major goal.

In [14]:
one_book.wordcounts.schema.metadata

{b'nc_metadata': b'{"htid": "dul1.ark+=13960=t3kw6ns1s", "pubdate": 1851, "title": "Moby-Dick; or, The Whale", "author": "Melville, Herman", "@id": "15"}'}

In [16]:
for p in gutenberg.token_counts:
    pass

In [18]:
one_book.wordcounts



pyarrow.RecordBatch
token: string
count: uint32

## Iterating over feature counts

We can now iterate over the token count files to get a list of--say--which books use words the most.

In [17]:
whales = []

for batch in gutenberg.token_counts:
    print(id)
    words = counts.to_pandas()['count'].sum()
    
    title = meta['title']
    whale_counts = counts.to_pandas().query("token=='whale'")['count'].sum()
    whales.append((whale_counts, title, words))

whales.sort(reverse = True)
whales[:10]

<built-in function id>


NameError: name 'meta' is not defined

# SRP embeddings

SRP provides a way of grouping documents based on their wordcounts that is yet more
nonconsumptive than wordcounts themselves. It uses random projection to scatter tokenized
counts in a high-dimensional space according to vocabulary; books with similar vocabularies
should be close in this space.

In the version associated with `nonconsumptive`, I also include a further reduction in SRP size
from the 1280-dimensional float vectors to a binary embedding that takes up 1/64 the size; 
these are suitable for making quick, rough comparisons between books.

Here, we look at the books that are closest in the binary space (technically known 
as a "Hamming" space). Even with only 160 bytes of information per book and no language model,
almost all of the most common books are those that have a shared author.

In [18]:
from nonconsumptive.extensions import embed_to_SRP

In [19]:
b = embed_to_SRP(gutenberg, flush_every = 5)

flushing ['15', '76', '64', '158', '170', '234']
flushing ['95', '224', '78', '139', '144']
flushing ['103', '172', '91', '176', '164']
flushing ['62', '140', '27', '178', '105']
flushing ['174', '142', '60', '121', '133']
flushing ['72', '222', '84', '171', '161']
flushing ['173', '126', '86', '94', '141']
flushing ['177', '165', '215', '73', '179']


In [20]:
gutenberg.token_counts.__iter__().__next__()

pyarrow.RecordBatch
token: string
count: uint32

In [81]:
srp_data = pa.feather.read_table(b)
bits = srp_data['SRP_bits']
ids = srp_data['id']


In [34]:
import numpy as np
bitwise = [np.frombuffer(b.as_buffer(), np.int32) for b in bits]

In [35]:
POPCOUNT_TABLE16 = np.zeros(2**16, dtype=int) #has to be an array

for index in range(len(POPCOUNT_TABLE16)):
    POPCOUNT_TABLE16[index] = (index & 1) + POPCOUNT_TABLE16[index >> 1]

def popcount32_table16(v):
    return (POPCOUNT_TABLE16[ v        & 0xffff] +
            POPCOUNT_TABLE16[(v >> 16) & 0xffff])

def count1s(v):
    return popcount32_table16(v).sum()

In [85]:
counts = []
for i in range(len(bitwise)-1):
    for j in range(i+1, len(bitwise)):
        d = count1s(bitwise[i] ^ bitwise[j])
        counts.append((d, ids[i].as_py(), ids[j].as_py()))
counts.sort()

In [86]:
import pandas as pd
dists = pd.DataFrame(counts, columns = ['hamming_dist', 'id1', 'id2'])


In [87]:
id1 = srp_data['id'][id_1]
id2 = srp_data['id'][id_2]

In [88]:
metadata = gutenberg.metadata.tb.to_pandas()[['author', 'title', '@id']]

In [91]:
joined = metadata.merge(dists, left_on = '@id', right_on = 'id1').merge(metadata, right_on = '@id', left_on = 'id2')
joined.sort_values("hamming_dist").head(20)

Unnamed: 0,author_x,title_x,@id_x,hamming_dist,id1,id2,author_y,title_y,@id_y
805,"Austen, Jane",Sense and Sensibility,161,198,161,141,"Austen, Jane",Mansfield Park,141
577,"Austen, Jane",Persuasion,105,207,105,161,"Austen, Jane",Sense and Sensibility,161
796,"Austen, Jane",Persuasion,105,207,105,141,"Austen, Jane",Mansfield Park,141
490,"Burroughs, Edgar Rice",The Gods of Mars,64,213,64,72,"Burroughs, Edgar Rice","Thuvia, Maid of Mars",72
478,"Austen, Jane",Emma,158,217,158,105,"Austen, Jane",Persuasion,105
804,"Austen, Jane",Emma,158,220,158,141,"Austen, Jane",Mansfield Park,141
523,"Austen, Jane",Persuasion,105,223,105,121,"Austen, Jane",Northanger Abbey,121
578,"Austen, Jane",Northanger Abbey,121,224,121,161,"Austen, Jane",Sense and Sensibility,161
584,"Austen, Jane",Emma,158,224,158,161,"Austen, Jane",Sense and Sensibility,161
335,"James, Henry",Confidence,178,224,178,177,"James, Henry",The American,177
