In [8]:
import os 
from dataclasses import dataclass
from typing import Callable, Optional
from spacy.lang.en import English
import pandas as pd
from sentence_transformers import SentenceTransformer

TEXT_FOLDER_PATH = "./scrapers/out_text"
ACTS_PARQUET_PATH = "./acts.parquet"

# Parsing

From text file to DataFrame

In [9]:
@dataclass
class Act:
    filename: str
    char_count: int
    word_count: int
    sentence_count_raw: int
    token_count: float
    text: str

    sentences_spacy: list[str]
    sentence_count_spacy: int

    nlp = English()
    nlp.add_pipe("sentencizer")
    nlp.max_length = 9999999

    @classmethod
    def from_text(cls, filename: str, text: str):
        doc = cls.nlp(text)
        sentences = [str(sentence) for sentence in doc.sents]

        return Act(
            filename=filename,
            char_count=len(text),
            word_count=len(text.split(" ")), 
            sentence_count_raw=len(text.split(".")),
            token_count=len(text.split(" ")) / 4,
            text=text,
            sentences_spacy=sentences, 
            sentence_count_spacy=len(sentences)
        ) 

def get_acts(text_folder_path: str, formatter: Optional[Callable] = None) -> list[Act]:
    acts : list[Act] = []

    for dpath, dnames, fnames in os.walk(text_folder_path):
        for fname in fnames:
            fpath = os.path.join(dpath, fname)
            with open(fpath) as f:
                text = f.read()
                if formatter:
                    text = formatter(text)
                if len(text) < 1000:
                    print(f"[WARN] {fpath} is too short ({len(text)} chars), not adding to list")
                    continue
                acts.append(Act.from_text(fname, text))
    return acts

def format_text(text: str):
    cleaned_text = text.replace("\n", " ").strip() 
    return cleaned_text


In [10]:
if not os.path.isfile(ACTS_PARQUET_PATH):
    acts = get_acts(TEXT_FOLDER_PATH, format_text)
    df = pd.DataFrame(acts)
    df.to_parquet(ACTS_PARQUET_PATH)
else:
    print(f"[INFO] reading from {ACTS_PARQUET_PATH}")
    df = pd.read_parquet(ACTS_PARQUET_PATH)

[INFO] reading from ./acts.parquet


In [11]:
df.head()

Unnamed: 0,filename,char_count,word_count,sentence_count_raw,token_count,text,sentences_spacy,sentence_count_spacy
0,MCA1987-Minors’-Contracts-Act-1987.txt,1563,273,12,68.25,Disapplication of Infants Relief Act 1874 1. ...,"[Disapplication of Infants Relief Act 1874 1.,...",8
1,ISA1987-Intoxicating-Substances-Act-1987.txt,45626,7359,186,1839.75,Short title 1. This Act is the Intoxicating S...,"[Short title 1., This Act is the Intoxicating...",54
2,TCMPA2000-Traditional-Chinese-Medicine-Practit...,61884,9305,227,2326.25,Short title 1. This Act is the Traditional Ch...,"[Short title 1., This Act is the Traditional ...",47
3,MSA1995-Merchant-Shipping-Act-1995.txt,193962,32590,765,8147.5,Short title 1. This Act is the Merchant Shipp...,"[Short title 1., This Act is the Merchant Shi...",272
4,CAMC1999A2007-Carriage-by-Air-(Montreal-Conven...,7986,1357,40,339.25,Short title 1. This Act is the Carriage by Ai...,"[Short title 1., This Act is the Carriage by ...",21


In [12]:
df.describe().round(1)

Unnamed: 0,char_count,word_count,sentence_count_raw,token_count,sentence_count_spacy
count,496.0,496.0,496.0,496.0,496.0
mean,78006.7,13993.4,237.2,3498.4,60.2
std,152539.6,45697.7,375.1,11424.4,106.6
min,1092.0,183.0,7.0,45.8,3.0
25%,11144.2,1842.0,46.2,460.5,18.0
50%,34446.0,5450.5,117.5,1362.6,37.0
75%,92415.8,14658.5,294.5,3664.6,67.0
max,1777447.0,893234.0,3791.0,223308.5,1738.0


# Chunking

In [13]:
CHUNK_SIZE = 5 # sentences

def split_list(input_list: list[str], slice_size: int) -> list[list[str]]:
    """
    Splits the input_list into sublists of size slice_size (or as close as possible).
    For example, a list of 17 sentences would be split into two lists of [[10], [7]]
    """
    return [input_list[min(i-2,0):max(i + slice_size + 2, len(input_list))] for i in range(0, len(input_list), slice_size)]

def get_chunk(sents):
    chunks: list[str] = split_list(sents, CHUNK_SIZE)
    return chunks

df["chunks"] = df["sentences_spacy"].apply(get_chunk)
df["chunk_count"] = df["chunks"].apply(lambda chunks: len(chunks)) 

In [14]:
import pandas as pd
import re

def x(df:pd.DataFrame):
    count = 0
    items = []
    for i, row in df.iterrows():
        for chunk in row["chunks"]:
            chunk_dict = {}
            joined_chunk = "".join(chunk).replace("  ", " ").strip()
            joined_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_chunk) 
            chunk_dict["chunk"] = joined_chunk
            # Get stats about the chunk
            chunk_dict["chunk_char_count"] = len(joined_chunk)
            chunk_dict["chunk_word_count"] = len([word for word in joined_chunk.split(" ")])
            chunk_dict["chunk_token_count"] = len(joined_chunk) / 4 # 1 token = ~4 characters
            if chunk_dict["chunk_word_count"] < 30:
                count += 1
                continue
            items.append(chunk_dict)
    print(f"[WARN] {count} chunks filtered out")
    return items

items = x(df)

[WARN] 57 chunks filtered out


In [15]:
len(items)

6115

In [21]:
chunk_df = pd.DataFrame(items)

In [22]:
chunk_df.describe().round(1)

Unnamed: 0,chunk_char_count,chunk_word_count,chunk_token_count
count,6115.0,6115.0,6115.0
mean,223959.6,55361.7,55989.9
std,312096.4,113408.9,78024.1
min,167.0,30.0,41.8
25%,40828.0,6544.0,10207.0
50%,106602.0,16947.0,26650.5
75%,246634.0,42523.0,61658.5
max,1777136.0,485547.0,444284.0


In [23]:
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2", device="mps") 

In [29]:
%%time

chunk_df["embedding"] = chunk_df["chunk"].apply(lambda chunk: embedding_model.encode(chunk, batch_size=32))

CPU times: user 57min 57s, sys: 34min 57s, total: 1h 32min 54s
Wall time: 1d 7h 12min 14s
