In [19]:
from pathlib import Path
import re
import unicodedata
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from collections import Counter
import pandas as pd
from typing import List, Dict
from nltk.stem import WordNetLemmatizer


nltk.download("averaged_perceptron_tagger_eng")
nltk.download("wordnet")
nltk.download("omw-1.4")

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/agastyaharta/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/agastyaharta/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/agastyaharta/nltk_data...


True

In [2]:
raw_dir = Path("data_raw")
raw_files = sorted(raw_dir.glob("*.txt"))

print(f"Found {len(raw_files)} .txt files")
for f in raw_files[:10]:
    print(" -", f.name)

def read_text(fp: Path) -> str:
    try:
        return fp.read_text(encoding="utf-8")
    except UnicodeDecodeError:
        return fp.read_text(encoding="latin-1")

raw_texts = {fp.stem: read_text(fp) for fp in raw_files}
print("Loaded books:", list(raw_texts.keys())[:10])

Found 10 .txt files
 - Crime_and_Punishment.txt
 - Notes_from_the_Underground.txt
 - Poor_Folk.txt
 - Short_Stories.txt
 - The_Brothers_Karamazov.txt
 - The_Gambler.txt
 - The_Grand_Inquisitor.txt
 - The_Idiot.txt
 - The_Possessed _or_The_Devils.txt
 - White_Nights_and_Other_Stories.txt
Loaded books: ['Crime_and_Punishment', 'Notes_from_the_Underground', 'Poor_Folk', 'Short_Stories', 'The_Brothers_Karamazov', 'The_Gambler', 'The_Grand_Inquisitor', 'The_Idiot', 'The_Possessed _or_The_Devils', 'White_Nights_and_Other_Stories']


In [None]:
lengths = pd.Series({k: len(v) for k, v in raw_texts.items()}).sort_values(ascending=False)
print(lengths)

print("\nShortest book:", lengths.index[-1], lengths.iloc[-1])
print("Longest book:", lengths.index[0], lengths.iloc[0])

In [3]:
def normalise_text_base(text: str) -> str:
    text = text.replace("\r", "\n")
    text = re.sub(r"[^\x09\x0A\x0D\x20-\x7E]", " ", text)  # strip non-printing
    text = re.sub(r"\s+", " ", text).strip()
    return text

books_base = {book_id: normalise_text_base(txt) for book_id, txt in raw_texts.items()}

sample_book = next(iter(books_base))
print(sample_book, "->", books_base[sample_book][:300])

Crime_and_Punishment -> CRIME AND PUNISHMENT PART I CHAPTER I On an exceptionally hot evening early in July a young man came out of the garret in which he lodged in S. Place and walked slowly, as though in hesitation, towards K. bridge. He had successfully avoided meeting his landlady on the staircase. His garret was under


In [4]:
def chunk_by_words(text: str, chunk_words: int = 800, overlap_words: int = 200,
                   min_tail_words: int = 300) -> List[str]:
    assert 0 <= overlap_words < chunk_words
    words = text.split()
    step = chunk_words - overlap_words

    chunks = []
    for start in range(0, len(words), step):
        window = words[start:start + chunk_words]
        if len(window) < min_tail_words:
            break
        chunks.append(" ".join(window))
    return chunks

docs = []
for book_id, text in books_base.items():
    chunks = chunk_by_words(text, chunk_words=800, overlap_words=200, min_tail_words=300)
    for i, ch in enumerate(chunks):
        docs.append({
            "book_id": book_id,
            "chunk_index": i,
            "chunk_id": f"{book_id}__{i:04d}",
            "text_base": ch
        })

df_docs = pd.DataFrame(docs)
print("chunk-documents:", df_docs.shape)
df_docs

chunk-documents: (2395, 4)


Unnamed: 0,book_id,chunk_index,chunk_id,text_base
0,Crime_and_Punishment,0,Crime_and_Punishment__0000,CRIME AND PUNISHMENT PART I CHAPTER I On an ex...
1,Crime_and_Punishment,1,Crime_and_Punishment__0001,of mind; he walked along not observing what wa...
2,Crime_and_Punishment,2,Crime_and_Punishment__0002,the other into the street. This house was let ...
3,Crime_and_Punishment,3,Crime_and_Punishment__0003,"paused, as though hesitating; then stepped on ..."
4,Crime_and_Punishment,4,Crime_and_Punishment__0004,he reflected. So she carries the keys in a poc...
...,...,...,...,...
2390,White_Nights_and_Other_Stories,192,White_Nights_and_Other_Stories__0192,"man had actually gone off his head, was utterl..."
2391,White_Nights_and_Other_Stories,193,White_Nights_and_Other_Stories__0193,"must, while completely unconscious, have dragg..."
2392,White_Nights_and_Other_Stories,194,White_Nights_and_Other_Stories__0194,"back, called up Okeanov, asked for the key of ..."
2393,White_Nights_and_Other_Stories,195,White_Nights_and_Other_Stories__0195,"bed it at once aroused suspicion, and some of ..."


Inspecting chunks of documents per books

In [5]:
counts = df_docs.groupby("book_id")["chunk_id"].count().sort_values(ascending=False)
print(counts)
print("\nTotal chunks:", len(df_docs))

book_id
The_Brothers_Karamazov            600
The_Possessed _or_The_Devils      423
The_Idiot                         412
Crime_and_Punishment              345
White_Nights_and_Other_Stories    197
Short_Stories                     135
The_Gambler                       102
Poor_Folk                          91
Notes_from_the_Underground         74
The_Grand_Inquisitor               16
Name: chunk_id, dtype: int64

Total chunks: 2395


In [16]:
lemmatiser = WordNetLemmatizer()

my_stopwords = set(stopwords.words("english"))
my_stopwords |= {
    "said", "say", "tell",
    "would", "could", "must", "shall",
    "one",
    "though", "even", "well", "like"
}

def _to_wordnet_pos(tag: str):
    # penn tag -> wordnet pos
    if tag.startswith("J"):
        return wordnet.ADJ
    if tag.startswith("V"):
        return wordnet.VERB
    if tag.startswith("N"):
        return wordnet.NOUN
    if tag.startswith("R"):
        return wordnet.ADV
    return wordnet.NOUN

def preprocess_for_lda(text: str) -> str:
    text = text.lower()
    text = re.sub(r"[^a-z\s]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()

    tokens = text.split()
    tokens = [t for t in tokens if t not in my_stopwords and len(t) > 2]

    tags = nltk.pos_tag(tokens)
    tokens = [lemmatiser.lemmatize(t, pos=_to_wordnet_pos(tag)) for t, tag in tags]

    return " ".join(tokens)

In [17]:
def preprocess_for_bert(text: str) -> str:
    text = re.sub(r"_+", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    text = text.lower()
    return text

In [20]:
df_docs["text_lda"] = df_docs["text_base"].apply(preprocess_for_lda)
df_docs["text_bert"] = df_docs["text_base"].apply(preprocess_for_bert)

before = len(df_docs)
df_docs = df_docs[df_docs["text_lda"].str.len() > 0].reset_index(drop=True)
print("dropped empty lda chunks:", before - len(df_docs))

#df_docs[["book_id", "chunk_id", "text_lda", "text_bert"]]
df_docs.head(7)

dropped empty lda chunks: 0


Unnamed: 0,book_id,chunk_index,chunk_id,text_base,text_lda,text_bert
0,Crime_and_Punishment,0,Crime_and_Punishment__0000,CRIME AND PUNISHMENT PART I CHAPTER I On an ex...,crime punishment part chapter exceptionally ho...,crime and punishment part i chapter i on an ex...
1,Crime_and_Punishment,1,Crime_and_Punishment__0001,of mind; he walked along not observing what wa...,mind walk along observe care observe time time...,of mind; he walked along not observing what wa...
2,Crime_and_Punishment,2,Crime_and_Punishment__0002,the other into the street. This house was let ...,street house let tiny tenement inhabit work pe...,the other into the street. this house was let ...
3,Crime_and_Punishment,3,Crime_and_Punishment__0003,"paused, as though hesitating; then stepped on ...",pause hesitating step side point door room let...,"paused, as though hesitating; then stepped on ..."
4,Crime_and_Punishment,4,Crime_and_Punishment__0004,he reflected. So she carries the keys in a poc...,reflect carry key pocket right bunch steel rin...,he reflected. so she carries the keys in a poc...
5,Crime_and_Punishment,5,Crime_and_Punishment__0005,drank off the first glassful. At once he felt ...,drank first glassful felt easier thought becom...,drank off the first glassful. at once he felt ...
6,Crime_and_Punishment,6,Crime_and_Punishment__0006,an atmosphere might well make a man drunk. The...,atmosphere might make man drunk chance meeting...,an atmosphere might well make a man drunk. the...


In [21]:
counts = df_docs.groupby("book_id")["chunk_id"].count().sort_values(ascending=False)
print(counts)
print("\nTotal chunks:", len(df_docs))

book_id
The_Brothers_Karamazov            600
The_Possessed _or_The_Devils      423
The_Idiot                         412
Crime_and_Punishment              345
White_Nights_and_Other_Stories    197
Short_Stories                     135
The_Gambler                       102
Poor_Folk                          91
Notes_from_the_Underground         74
The_Grand_Inquisitor               16
Name: chunk_id, dtype: int64

Total chunks: 2395


In [22]:
print("empty lda chunks:", (df_docs["text_lda"].str.len() == 0).sum())
print("\nexample lda snippet:\n", df_docs["text_lda"].iloc[0][:400])
print("\nexample raw snippet:\n", df_docs["text_base"].iloc[0][:400])

empty lda chunks: 0

example lda snippet:
 crime punishment part chapter exceptionally hot even early july young man come garret lodged place walk slowly hesitation towards bridge successfully avoided meeting landlady staircase garret roof high five storied house cupboard room landlady provide garret dinner attendance lived floor every time go obliged pas kitchen door invariably stand open time pass young man sick frighten feel make scowl 

example raw snippet:
 CRIME AND PUNISHMENT PART I CHAPTER I On an exceptionally hot evening early in July a young man came out of the garret in which he lodged in S. Place and walked slowly, as though in hesitation, towards K. bridge. He had successfully avoided meeting his landlady on the staircase. His garret was under the roof of a high, five-storied house and was more like a cupboard than a room. The landlady who p


In [23]:
out_dir = Path("data_cleaned")
out_dir.mkdir(exist_ok=True)

out_path = out_dir / "dostoevsky_chunks.csv"
df_docs.to_csv(out_path, index=False)

print("saved:", out_path.resolve())
print("rows:", len(df_docs), "cols:", df_docs.shape[1])

saved: /Users/agastyaharta/Desktop/wne_uw/3/project/text_mining_final/text_mining_topic_modelling/data_cleaned/dostoevsky_chunks.csv
rows: 2395 cols: 6


### PREVIOUS WORKS!

In [None]:
def prelim_clean(text: str) -> str:
    return re.sub(r"[^a-zA-Z0-9 \n\.\']", '', text)

prelim_texts = {}
for book, text in texts.items():
    prelim = prelim_clean(text)
    prelim_texts[book] = prelim

print(prelim_texts[first_key][:500])

In [None]:
def remove_extra_spaces(text: str) -> str:
    return re.sub(r' +', ' ', text)

def remove_punctuation_remaining(text: str) -> str:
    return re.sub(r'[^\w\s]', '', text)

def remove_numbers(text: str) -> str:
    return re.sub(r'\d', '', text)

clean_texts = {}
for book, text in prelim_texts.items():
    t = remove_extra_spaces(text)
    t = remove_punctuation_remaining(t)
    t = remove_numbers(t)
    clean_texts[book] = t

print(clean_texts[first_key][:500])

In [None]:
lower_texts = {book: text.lower() for book, text in clean_texts.items()}
print(lower_texts[first_key][:500])

In [None]:
stop_words = set(stopwords.words("english"))

def remove_stopwords(text: str) -> list[str]:
    text = re.sub(r'\s+', ' ', text).strip()
    tokens = word_tokenize(text)
    return [w for w in tokens if w not in stop_words and w.strip() != ""]

filtered_tokens = {}
filtered_texts = {}

for book, text in lower_texts.items():
    tokens_kept = remove_stopwords(text)
    filtered_tokens[book] = tokens_kept
    filtered_texts[book] = " ".join(tokens_kept)

print("Original token sample:", word_tokenize(lower_texts[first_key])[:40])
print("After stopword removal:", filtered_tokens[first_key][:40])

maybe i'll do word count here and see what is the most recent word and see if i should remove them

In [None]:
all_tokens = []
for tokens in filtered_tokens.values():
    all_tokens.extend(tokens)

freq = Counter(all_tokens)
freq.most_common(30)

In [None]:
custom_stopwords = {
    "said","say","tell",
    "would","could","must","shall",
    "one",
    "though","even","well","like",
    "come","go","went","see"
}

In [None]:
ps = PorterStemmer()

stemmed_tokens = {}
stemmed_texts = {}

for book, tokens in filtered_tokens.items(): 
    stems = [ps.stem(w) for w in tokens]
    stemmed_tokens[book] = stems
    stemmed_texts[book] = " ".join(stems)

In [None]:
BOOK_OUT = Path("data_cleaned")
BOOK_OUT.mkdir(parents=True, exist_ok=True)

for book, text in stemmed_texts.items():
    (BOOK_OUT / f"{book}_clean.txt").write_text(text, encoding="utf-8")

print("Saved cleaned books to:", BOOK_OUT)