# BookCorpusOpen searches

In [None]:
__author__ = "Christopher Potts"

## Set-up

In [None]:
import glob
import os
from joblib import Parallel, delayed
import tqdm
import pandas as pd
from nltk.tokenize import sent_tokenize

import utils

In [None]:
# Directory that the readers below glob for `*.epub.txt` book files
# (a relative path, so it is resolved against the current working directory).
BOOKS_HOME = "books1/epubtxt"

## Word counts

In [None]:
def word_counts(filename):
    """Return the number of Treebank tokens in a single text file.

    The whole file is read into memory and tokenized with NLTK's
    `TreebankWordTokenizer`; the result is the length of the token list.

    Parameters
    ----------
    filename : str
        Path to the text file to count.

    Returns
    -------
    int
        Number of tokens produced by the tokenizer.

    """
    # Kept as a function-local import, as in the original.
    # NOTE(review): presumably so each parallel worker resolves the
    # tokenizer on its own -- confirm before moving to module level.
    from nltk.tokenize import TreebankWordTokenizer

    tokenizer = TreebankWordTokenizer()
    # Decode explicitly rather than with the locale-dependent default, so
    # counts do not vary by platform.
    # NOTE(review): assumes the corpus files are UTF-8 -- confirm.
    with open(filename, encoding="utf-8") as f:
        text = f.read()
    return len(tokenizer.tokenize(text))

def word_counts_parallel(dirname=BOOKS_HOME, n_jobs=10):
    """Return the total token count over every `*.epub.txt` file in a directory.

    Each file is counted by `word_counts` in a joblib worker, with a tqdm
    progress bar tracking how many files have been dispatched.

    Parameters
    ----------
    dirname : str
        Directory to search (non-recursively) for `*.epub.txt` files.
    n_jobs : int
        Number of joblib workers. Defaults to 10, the value that was
        previously hard-coded.

    Returns
    -------
    int
        Sum of the per-file token counts; 0 if no file matches.

    """
    filenames = glob.glob(os.path.join(dirname, "*.epub.txt"))
    per_file = Parallel(n_jobs=n_jobs)(
        delayed(word_counts)(f) for f in tqdm.tqdm(filenames)
    )
    return sum(per_file)

In [None]:
# Total token count over all books under BOOKS_HOME (tokenizes every file).
word_counts_parallel()

## Sentences with counts

In [None]:
def books_reader(filename):
    """Sentence-split one book and collect the sentences that match.

    The file is read in full, split with NLTK's `sent_tokenize`, and each
    sentence is tested with `utils.is_match`.

    Parameters
    ----------
    filename : str
        Path to the text file to read.

    Returns
    -------
    tuple
        `(sent_count, data)`: `sent_count` is the number of sentences in
        the file, and `data` is a list of dicts with keys `"filename"`
        (the file's basename) and `"sentence"`, one per matching sentence,
        in document order.

    """
    # Decode explicitly rather than with the locale-dependent default.
    # NOTE(review): assumes the corpus files are UTF-8 -- confirm.
    with open(filename, encoding="utf-8") as f:
        sents = sent_tokenize(f.read())
    # The basename is the same for every row, so compute it once.
    basename = os.path.basename(filename)
    data = [
        {"filename": basename, "sentence": sent}
        for sent in sents
        if utils.is_match(sent)
    ]
    return len(sents), data

def books_reader_parallel(dirname=BOOKS_HOME, n_jobs=10):
    """Run `books_reader` over every `*.epub.txt` file in a directory.

    Files are processed in joblib workers, with a tqdm progress bar
    tracking how many files have been dispatched.

    Parameters
    ----------
    dirname : str
        Directory to search (non-recursively) for `*.epub.txt` files.
    n_jobs : int
        Number of joblib workers. Defaults to 10, the value that was
        previously hard-coded.

    Returns
    -------
    tuple
        `(sent_count, examples)`: the total number of sentences across all
        files, and one flat list of the per-file match dicts, kept in the
        order the files were globbed.

    """
    # Build the pattern with os.path.join, matching word_counts_parallel.
    filenames = glob.glob(os.path.join(dirname, "*.epub.txt"))
    results = Parallel(n_jobs=n_jobs)(
        delayed(books_reader)(f) for f in tqdm.tqdm(filenames)
    )
    sent_count = sum(count for count, _ in results)
    examples = [ex for _, exs in results for ex in exs]
    return sent_count, examples

In [None]:
# Total sentence count, plus the list of {"filename", "sentence"} dicts for
# the matching sentences, over all books under BOOKS_HOME.
sent_count, matches = books_reader_parallel()

In [None]:
# Number of sentences tokenized across all books.
sent_count

In [None]:
# Number of sentences for which utils.is_match was truthy.
len(matches)

In [None]:
# One row per matching sentence; columns come from the dict keys
# ("filename" and "sentence").
df = pd.DataFrame(matches)

In [None]:
# Shuffle all rows (frac=1.0); the fixed seed makes the order reproducible.
df = df.sample(frac=1.0, random_state=1234)

In [None]:
# Write the full shuffled set of matches. The index is written as the first
# column (to_csv default).
# NOTE(review): assumes the samples/ directory already exists -- confirm.
df.to_csv("samples/pipp-bookcorpusopen.csv")

In [None]:
# First 1000 rows of the shuffled frame (fewer if df has fewer rows).
sample = df.iloc[: 1000]

In [None]:
# Write the sample to its own CSV next to the full file.
sample.to_csv("samples/pipp-sample-bookcorpusopen.csv")