In [None]:
import os
from tqdm import tqdm
import pandas as pd
import numpy as np

In [None]:
import nltk.data

# Sentence tokenizer shared by parse_book(); loaded once at notebook level so
# the model is not re-read from disk for every book.
# NOTE(review): recent NLTK releases may no longer ship the pickled Punkt
# model at this path — confirm against the installed NLTK version.
PUNKT_ENGLISH_RESOURCE = 'tokenizers/punkt/english.pickle'
tokenizer = nltk.data.load(PUNKT_ENGLISH_RESOURCE)

In [None]:
def parse_book(fname, sample_frac=0.1, random_state=42, encoding=None):
    """Return a reproducible random sample of sentences from one book file.

    The text following the last "*** START OF THIS PROJECT GUTENBERG EBOOK"
    marker is kept (the remainder of the marker line itself is dropped), split
    into paragraphs on blank lines, and each paragraph is sentence-tokenized
    with the notebook-level ``tokenizer``.  If the marker is absent, the whole
    file minus its first line is used.

    NOTE(review): only the START marker is handled; any trailing license /
    footer text stays in the content and may be sampled — confirm whether
    that is intended.

    Parameters
    ----------
    fname : str or path-like
        Path of the plain-text book.
    sample_frac : float, default 0.1
        Fraction of the cleaned sentences to sample (rounded down).
    random_state : int, default 42
        Seed forwarded to ``Series.sample`` so repeated calls agree.
    encoding : str or None, default None
        Encoding forwarded to ``open``; ``None`` keeps the platform default.

    Returns
    -------
    numpy.ndarray or list
        Object array of sampled sentence strings.  An empty list is returned
        when the file cannot be opened or decoded (best-effort skip).

    Raises
    ------
    Exception
        Anything raised by the tokenizer (or a missing ``tokenizer`` global)
        propagates instead of being silently turned into an empty result.
    """
    start_marker = "*** START OF THIS PROJECT GUTENBERG EBOOK"

    # Only the file read is best-effort.  Catching everything here (as a bare
    # ``except`` would) also hides KeyboardInterrupt and setup errors.
    try:
        with open(fname, "r", encoding=encoding) as src:
            raw = src.read()
    except (OSError, UnicodeError):
        return []

    # Keep what follows the last marker, minus the rest of the marker line.
    content = "\n".join(raw.split(start_marker)[-1].split("\n")[1:]).strip()

    sentences = []
    for paragraph in content.split("\n\n"):
        # Hard-wrapped lines inside a paragraph belong to the same sentences.
        sentences.extend(tokenizer.tokenize(paragraph.replace("\n", " ")))

    cleaned_sentences = [
        stripped
        for stripped in (sent.strip() for sent in sentences)
        if stripped
    ]

    n_samples = int(sample_frac * len(cleaned_sentences))
    # dtype=object keeps the (possibly empty) Series a plain string container.
    return (
        pd.Series(cleaned_sentences, dtype=object)
        .sample(n_samples, random_state=random_state)
        .values
    )

In [None]:
# Accumulate the sampled sentences of every .txt book into one set, which
# also removes exact duplicates across books.
paragraphs = set()
for book in tqdm(os.listdir("data/raw")):
    if book.endswith(".txt"):
        paragraphs.update(parse_book(f"data/raw/{book}"))

In [None]:
# Number of distinct sampled sentences collected across all books.
len(paragraphs)

In [None]:
import gc

In [None]:
# Run a full garbage-collection pass; presumably to release temporaries left
# over from parsing before the next stages — confirm it is still needed.
gc.collect()

In [None]:
# Sorting first makes the row order independent of set iteration order.
sorted_sentences = sorted(paragraphs)
texts = pd.Series(sorted_sentences)
texts.head()

In [None]:
# Remove a leading bracketed number such as "[12]" from each sentence.
# regex=True must be explicit: since pandas 2.0 Series.str.replace treats the
# pattern as a literal string by default, which made this a silent no-op.
# The raw string avoids the invalid "\[" escape sequence.
texts = texts.str.replace(r"^\[[0-9]+\]", "", regex=True)
texts.head()

In [None]:
# Order sentences from shortest to longest and rebuild the Series with a
# fresh 0..n-1 index; sorted() is stable, so equal-length sentences keep
# their previous relative order.
texts = pd.Series(sorted(texts.tolist(), key=len))
texts.head()

In [None]:
# Sentence count before duplicates are dropped.
texts.shape

In [None]:
# Keep only the first occurrence of each sentence (the original index labels
# of the surviving rows are preserved).
texts = texts[~texts.duplicated(keep="first")]
texts.shape

In [None]:
# Character count of every sentence, aligned with the index of `texts`.
lengths = texts.map(len)

In [None]:
# Trim length outliers: drop the shortest 5% and the longest 0.1% of
# sentences (both bounds inclusive).
min_length = lengths.quantile(0.05)
max_length = lengths.quantile(0.999)
texts = texts[lengths.between(min_length, max_length)]
texts

In [None]:
from sentence_transformers import SentenceTransformer

# Pre-trained sentence-embedding model used to encode the filtered sentences.
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

In [None]:
import torch

# Use the GPU when one is present, otherwise stay on the CPU.  An
# unconditional model.cuda() raises on machines without CUDA, which made the
# notebook impossible to run end-to-end there.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

In [None]:
# Encode every remaining sentence; normalize_embeddings=True asks the model
# for unit-length vectors.
sentences = texts.tolist()
embeddings = model.encode(
    sentences,
    normalize_embeddings=True,
    show_progress_bar=True,
)
embeddings.shape

In [None]:
# Persist the sentences as a single "text" column, in the same order that
# was passed to model.encode, so CSV row i matches embedding row i.
texts.to_frame(name="text").to_csv(
    "data/guttenberg-sentences-base.csv",
    index=False,
)

In [None]:
# Store the embedding matrix next to the sentence CSV.
np.save(file="data/guttenberg-sentences-embeddings.npy", arr=embeddings)