In [1]:
from pathlib import Path
import re
import unicodedata
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from collections import Counter
import pandas as pd
from typing import List, Dict
from nltk.stem import WordNetLemmatizer


# Fetch every NLTK resource this notebook relies on so it runs on a fresh
# environment. The "stopwords" corpus is needed by stopwords.words("english")
# in the stop-word cell; without it NLTK raises LookupError.
nltk.download("stopwords")
nltk.download("averaged_perceptron_tagger_eng")
nltk.download("wordnet")
nltk.download("omw-1.4")

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/agastyaharta/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/agastyaharta/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/agastyaharta/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [2]:
# Locate the raw book files (one .txt per book) in a stable, sorted order.
raw_dir = Path("data_raw")
raw_files = list(raw_dir.glob("*.txt"))
raw_files.sort()

print(f"Found {len(raw_files)} .txt files")
# Preview at most the first ten file names.
for txt_path in raw_files[:10]:
    print(" -", txt_path.name)

def read_text(fp: Path) -> str:
    """Return the contents of ``fp`` as text.

    The file is decoded as UTF-8 first; if that raises ``UnicodeDecodeError``
    the file is read again as Latin-1.
    """
    try:
        content = fp.read_text(encoding="utf-8")
    except UnicodeDecodeError:
        # Not valid UTF-8: fall back to Latin-1.
        content = fp.read_text(encoding="latin-1")
    return content

# Map each book's file stem to its full raw text.
raw_texts = {}
for book_path in raw_files:
    raw_texts[book_path.stem] = read_text(book_path)
print("Loaded books:", list(raw_texts)[:10])

Found 10 .txt files
 - Crime_and_Punishment.txt
 - Notes_from_the_Underground.txt
 - Poor_Folk.txt
 - Short_Stories.txt
 - The_Brothers_Karamazov.txt
 - The_Gambler.txt
 - The_Grand_Inquisitor.txt
 - The_Idiot.txt
 - The_Possessed _or_The_Devils.txt
 - White_Nights_and_Other_Stories.txt
Loaded books: ['Crime_and_Punishment', 'Notes_from_the_Underground', 'Poor_Folk', 'Short_Stories', 'The_Brothers_Karamazov', 'The_Gambler', 'The_Grand_Inquisitor', 'The_Idiot', 'The_Possessed _or_The_Devils', 'White_Nights_and_Other_Stories']


In [3]:
# Character count per book, largest first.
book_sizes = {name: len(body) for name, body in raw_texts.items()}
lengths = pd.Series(book_sizes).sort_values(ascending=False)
print(lengths)

# The series is sorted descending, so the extremes sit at the two ends.
shortest_name, longest_name = lengths.index[-1], lengths.index[0]
print("\nShortest book:", shortest_name, lengths.iloc[-1])
print("Longest book:", longest_name, lengths.iloc[0])

The_Brothers_Karamazov            1930493
The_Possessed _or_The_Devils      1433585
The_Idiot                         1347584
Crime_and_Punishment              1130549
White_Nights_and_Other_Stories     647262
Short_Stories                      446001
The_Gambler                        331469
Poor_Folk                          291758
Notes_from_the_Underground         238698
The_Grand_Inquisitor                53221
dtype: int64

Shortest book: The_Grand_Inquisitor 53221
Longest book: The_Brothers_Karamazov 1930493


In [4]:
# Typographic quotation marks mapped to their ASCII equivalents so that
# contractions such as "don\u2019t" survive the ASCII filter as "don't".
_ASCII_QUOTES = str.maketrans({
    "\u2018": "'", "\u2019": "'", "\u201a": "'", "\u201b": "'",
    "\u201c": '"', "\u201d": '"', "\u201e": '"', "\u201f": '"',
})


def normalise_text_base(text: str) -> str:
    """Return ``text`` as single-spaced printable ASCII.

    Steps, in order:
      1. Carriage returns become newlines.
      2. Unicode is decomposed (NFKD) and combining marks are dropped, so
         accented letters keep their base letter instead of being blanked out.
      3. Curly quotes/apostrophes are mapped to their ASCII counterparts.
      4. Any character still outside tab/newline/printable ASCII becomes a space.
      5. Runs of whitespace collapse to one space and the ends are trimmed.

    Args:
        text: Raw text of a document.

    Returns:
        The normalised text (possibly empty).
    """
    text = text.replace("\r", "\n")
    # Decompose first: otherwise the ASCII filter below would turn e.g.
    # "caf\u00e9" into "caf " and split words at every accented letter.
    text = unicodedata.normalize("NFKD", text)
    text = "".join(ch for ch in text if not unicodedata.combining(ch))
    text = text.translate(_ASCII_QUOTES)
    text = re.sub(r"[^\x09\x0A\x0D\x20-\x7E]", " ", text)  # blank out anything still non-ASCII / non-printing
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Normalise every book; keys (book ids) and their order are carried over from raw_texts.
books_base = dict(zip(raw_texts, map(normalise_text_base, raw_texts.values())))

# Show the first 300 characters of the first book as a quick visual check.
sample_book = next(iter(books_base))
sample_preview = books_base[sample_book][:300]
print(sample_book, "->", sample_preview)

Crime_and_Punishment -> CRIME AND PUNISHMENT PART I CHAPTER I On an exceptionally hot evening early in July a young man came out of the garret in which he lodged in S. Place and walked slowly, as though in hesitation, towards K. bridge. He had successfully avoided meeting his landlady on the staircase. His garret was under


In [5]:
def chunk_by_words(text: str, chunk_words: int = 800, overlap_words: int = 200,
                   min_tail_words: int = 300) -> List[str]:
    """Split ``text`` into overlapping windows of whitespace-separated words.

    Consecutive windows start ``chunk_words - overlap_words`` words apart.
    Chunking stops as soon as a window reaches the end of the text, so no
    chunk is ever entirely contained in the previous one.

    Args:
        text: Text to split; words are obtained with ``str.split()``.
        chunk_words: Maximum number of words per chunk.
        overlap_words: Number of words shared by consecutive chunks; must
            satisfy ``0 <= overlap_words < chunk_words``.
        min_tail_words: A window with fewer words than this is discarded and
            chunking stops (this also applies to a text shorter than the
            threshold, which yields no chunks).

    Returns:
        The chunks, each re-joined with single spaces, in document order.

    Raises:
        AssertionError: If the overlap is negative or not smaller than the
            chunk size.
    """
    assert 0 <= overlap_words < chunk_words, \
        "overlap_words must satisfy 0 <= overlap_words < chunk_words"
    words = text.split()
    step = chunk_words - overlap_words

    chunks = []
    for start in range(0, len(words), step):
        window = words[start:start + chunk_words]
        if len(window) < min_tail_words:
            break
        chunks.append(" ".join(window))
        # This window already covers the last word: any further window would
        # only repeat words from it (happens when min_tail_words <= overlap_words).
        if start + chunk_words >= len(words):
            break
    return chunks

# Cut every normalised book into 350-word windows (50-word overlap, tails under
# 150 words dropped) and flatten the result into one record per chunk.
docs = [
    {
        "book_id": book_id,
        "chunk_index": idx,
        "chunk_id": f"{book_id}__{idx:04d}",
        "text_base": chunk_text
    }
    for book_id, book_text in books_base.items()
    for idx, chunk_text in enumerate(
        chunk_by_words(book_text, chunk_words=350, overlap_words=50, min_tail_words=150)
    )
]

df_docs = pd.DataFrame(docs)
print("chunk-documents:", df_docs.shape)
df_docs

chunk-documents: (4787, 4)


Unnamed: 0,book_id,chunk_index,chunk_id,text_base
0,Crime_and_Punishment,0,Crime_and_Punishment__0000,CRIME AND PUNISHMENT PART I CHAPTER I On an ex...
1,Crime_and_Punishment,1,Crime_and_Punishment__0001,"the street, he became acutely aware of his fea..."
2,Crime_and_Punishment,2,Crime_and_Punishment__0002,of mind; he walked along not observing what wa...
3,Crime_and_Punishment,3,Crime_and_Punishment__0003,one side in a most unseemly fashion. Not shame...
4,Crime_and_Punishment,4,Crime_and_Punishment__0004,the other into the street. This house was let ...
...,...,...,...,...
4782,White_Nights_and_Other_Stories,389,White_Nights_and_Other_Stories__0389,be amiss to cut up the whole mattress with sci...
4783,White_Nights_and_Other_Stories,390,White_Nights_and_Other_Stories__0390,"bed it at once aroused suspicion, and some of ..."
4784,White_Nights_and_Other_Stories,391,White_Nights_and_Other_Stories__0391,"not a million, though it did turn out to be a ..."
4785,White_Nights_and_Other_Stories,392,White_Nights_and_Other_Stories__0392,beyond his means. The landlady wailed without ...


Inspecting the number of chunk-documents per book

In [6]:
# Number of chunks per book, most-chunked book first.
chunk_ids_by_book = df_docs.groupby("book_id")["chunk_id"]
counts = chunk_ids_by_book.count().sort_values(ascending=False)
print(counts)
print("\nTotal chunks:", len(df_docs))

book_id
The_Brothers_Karamazov            1201
The_Possessed _or_The_Devils       846
The_Idiot                          824
Crime_and_Punishment               689
White_Nights_and_Other_Stories     394
Short_Stories                      269
The_Gambler                        204
Poor_Folk                          181
Notes_from_the_Underground         148
The_Grand_Inquisitor                31
Name: chunk_id, dtype: int64

Total chunks: 4787


In [7]:
lemmatiser = WordNetLemmatizer()

# High-frequency filler words added on top of NLTK's English stop-word list.
_GENERIC_FILLER = (
    "said", "say", "tell",
    "would", "could", "must", "shall",
    "one",
    "though", "even", "well", "like", "know", "come", "look", "see", "go", "make", "man", "time",
)

# Character names and name-like tokens, grouped by the work they are listed under.
# A few names appear under more than one work; the final set de-duplicates them.
_NAME_TOKENS_BY_WORK = {
    "Crime and Punishment": (
        "rodion", "romanovich", "raskolnikov", "sofya", "semyonovna", "marmeladova",
        "sonya", "sonia", "avdotya", "dunya", "dmitri", "prokofych", "razumikhin",
        "porfiry", "petrovich", "pyotr", "luzhin", "arkady", "ivanovich",
        "svidrigailov", "pulcheria", "alexandrovna", "semyon", "zakharovich",
        "marmeladov", "katerina", "ivanovna", "alyona", "lizaveta",
    ),
    "Notes from Underground": (
        "liza", "zverkov", "simonov", "ferfichkin", "trudolyubov", "apollon",
        "anton", "antonich", "syetochkin",
    ),
    "Poor Folk": (
        "makar", "alekseyevich", "devushkin", "varvara", "dobroselova", "bykov",
    ),
    "The Gambler": (
        "alexei", "general", "sagoryanski", "polina", "des", "grieux", "astley",
        "blanche", "cominges", "antonida", "vasilevna", "tarasevitcheva",
        "maria", "filippovna", "wurmerhelm", "potapyts", "marfa",
    ),
    "The Idiot": (
        "lev", "nikolayevich", "myshkin", "anastassya", "nastassya", "filippovna",
        "barashkov", "parfyon", "semyonovich", "rogozhin", "aglaya", "yepanchin",
        "gavril", "ganya", "ardalionovich", "ivolgin", "ivan", "fyodorovich",
        "lizaveta", "prokofyevna", "adelaida", "alexandra", "afanasy", "totsky",
        "lebedev", "ippolit", "terentyev", "ardalion", "kolya", "nikolay", "varya",
    ),
    "Demons / The Devils": (
        "nikolai", "vsevolodovich", "stavrogin", "verkhovensky",
        "stepanovich", "stepan", "trofimovich", "stavrogina", "pavlovich",
        "shatov", "aleksey", "nilych", "kirillov", "lebyadkin", "marya",
        "lebyadkina",
    ),
    "The Brothers Karamazov + Grand Inquisitor frame": (
        "fyodor", "pavlovich", "karamazov", "mitya", "pavel", "smerdyakov",
        "agrafena", "svetlova", "grushenka", "verkhovtseva", "zosima", "grigory",
        "vasilievich", "alyosha",
    ),
    "Grand Inquisitor parable tokens": (
        "christ", "jesus", "inquisitor", "seville",
    ),
    "White Nights": (
        "nastenka", "matryona", "fyokla",
    ),
    "Other stories mentioned (common editions)": (
        "vasya", "ivanovitch", "julian", "mastakovich",
        "polzunkov", "fedosey", "nikolaitch", "osip", "mihalitch",
        "prokharchin",
    ),
}

# Final stop-word set: NLTK English list + filler words + every name token above.
my_stopwords = set(stopwords.words("english"))
my_stopwords.update(_GENERIC_FILLER)
my_stopwords.update(*_NAME_TOKENS_BY_WORK.values())

def _to_wordnet_pos(tag: str):
    """Translate a Penn Treebank tag into the matching WordNet POS constant.

    Only the first letter of ``tag`` is inspected: ``J`` -> adjective,
    ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb. Every other tag
    (including an empty one) is treated as a noun.
    """
    first_letter_to_pos = {
        "J": wordnet.ADJ,
        "V": wordnet.VERB,
        "N": wordnet.NOUN,
        "R": wordnet.ADV,
    }
    return first_letter_to_pos.get(tag[:1], wordnet.NOUN)

def preprocess_for_lda(text: str) -> str:
    """Turn ``text`` into a space-joined string of lemmas for topic modelling.

    The text is lower-cased, everything except ``a-z`` and whitespace is
    replaced by a space, and tokens of one or two characters are dropped.
    Surviving tokens are POS-tagged and lemmatised with the matching WordNet
    POS; finally, lemmas that are in ``my_stopwords`` or shorter than three
    characters are removed.
    """
    letters_only = re.sub(r"[^a-z\s]", " ", text.lower())
    single_spaced = re.sub(r"\s+", " ", letters_only).strip()
    candidates = [word for word in single_spaced.split() if len(word) > 2]

    kept = []
    for word, penn_tag in nltk.pos_tag(candidates):
        lemma = lemmatiser.lemmatize(word, pos=_to_wordnet_pos(penn_tag))
        # Filter on the lemma, not the surface form, so inflected variants of
        # a stop word (e.g. a past tense) are removed as well.
        if len(lemma) > 2 and lemma not in my_stopwords:
            kept.append(lemma)

    return " ".join(kept)

In [8]:
def preprocess_for_bert(text: str) -> str:
    """Lightly clean ``text`` for an embedding model.

    Runs of underscores become a single space, whitespace runs are collapsed
    to one space, the ends are trimmed, and the result is lower-cased.
    Punctuation and stop words are kept.
    """
    without_underscores = re.sub(r"_+", " ", text)
    single_spaced = re.sub(r"\s+", " ", without_underscores)
    return single_spaced.strip().lower()

In [9]:
# Derive the two model-specific text columns from the normalised chunk text.
for target_col, cleaner in (("text_lda", preprocess_for_lda),
                            ("text_bert", preprocess_for_bert)):
    df_docs[target_col] = df_docs["text_base"].apply(cleaner)

# Drop chunks whose LDA text ended up empty and report how many were removed.
rows_before = len(df_docs)
has_lda_text = df_docs["text_lda"].str.len() > 0
df_docs = df_docs.loc[has_lda_text].reset_index(drop=True)
print("dropped empty lda chunks:", rows_before - len(df_docs))

# Narrower alternative view: df_docs[["book_id", "chunk_id", "text_lda", "text_bert"]]
df_docs.head(7)

dropped empty lda chunks: 0


Unnamed: 0,book_id,chunk_index,chunk_id,text_base,text_lda,text_bert
0,Crime_and_Punishment,0,Crime_and_Punishment__0000,CRIME AND PUNISHMENT PART I CHAPTER I On an ex...,crime punishment part chapter exceptionally ho...,crime and punishment part i chapter i on an ex...
1,Crime_and_Punishment,1,Crime_and_Punishment__0001,"the street, he became acutely aware of his fea...",street become acutely aware fear want attempt ...,"the street, he became acutely aware of his fea..."
2,Crime_and_Punishment,2,Crime_and_Punishment__0002,of mind; he walked along not observing what wa...,mind walk along observe care observe mutter so...,of mind; he walked along not observing what wa...
3,Crime_and_Punishment,3,Crime_and_Punishment__0003,one side in a most unseemly fashion. Not shame...,side unseemly fashion shame however quite anot...,one side in a most unseemly fashion. not shame...
4,Crime_and_Punishment,4,Crime_and_Punishment__0004,the other into the street. This house was let ...,street house let tiny tenement inhabit work pe...,the other into the street. this house was let ...
5,Crime_and_Punishment,5,Crime_and_Punishment__0005,"clearly before him.... He started, his nerves ...",clearly start nerve terribly overstrain little...,"clearly before him.... he started, his nerves ..."
6,Crime_and_Punishment,6,Crime_and_Punishment__0006,"paused, as though hesitating; then stepped on ...",pause hesitate step side point door room let v...,"paused, as though hesitating; then stepped on ..."


In [10]:
# Re-check the per-book chunk counts after the empty-chunk filter.
per_book = df_docs.groupby("book_id")["chunk_id"].count()
counts = per_book.sort_values(ascending=False)
print(counts)
print("\nTotal chunks:", len(df_docs))

book_id
The_Brothers_Karamazov            1201
The_Possessed _or_The_Devils       846
The_Idiot                          824
Crime_and_Punishment               689
White_Nights_and_Other_Stories     394
Short_Stories                      269
The_Gambler                        204
Poor_Folk                          181
Notes_from_the_Underground         148
The_Grand_Inquisitor                31
Name: chunk_id, dtype: int64

Total chunks: 4787


In [11]:
# Sanity check: count empty LDA texts and compare the first chunk before/after cleaning.
n_empty_lda = df_docs["text_lda"].str.len().eq(0).sum()
first_chunk = df_docs.iloc[0]
print("empty lda chunks:", n_empty_lda)
print("\nexample lda snippet:\n", first_chunk["text_lda"][:400])
print("\nexample raw snippet:\n", first_chunk["text_base"][:400])

empty lda chunks: 0

example lda snippet:
 crime punishment part chapter exceptionally hot early july young garret lodge place walk slowly hesitation towards bridge successfully avoid meet landlady staircase garret roof high five storied house cupboard room landlady provide garret dinner attendance live floor every obliged pas kitchen door invariably stand open pass young sick frighten feeling scowl feel ashamed hopelessly debt landlady af

example raw snippet:
 CRIME AND PUNISHMENT PART I CHAPTER I On an exceptionally hot evening early in July a young man came out of the garret in which he lodged in S. Place and walked slowly, as though in hesitation, towards K. bridge. He had successfully avoided meeting his landlady on the staircase. His garret was under the roof of a high, five-storied house and was more like a cupboard than a room. The landlady who p


In [12]:
# Write the chunk table to data_cleaned/dostoevsky_chunks.csv (without the index column).
out_dir = Path("data_cleaned")
out_dir.mkdir(exist_ok=True)
out_path = out_dir / "dostoevsky_chunks.csv"

df_docs.to_csv(out_path, index=False)

n_rows, n_cols = len(df_docs), df_docs.shape[1]
print("saved:", out_path.resolve())
print("rows:", n_rows, "cols:", n_cols)

saved: /Users/agastyaharta/Desktop/wne_uw/3/project/text_mining_final/text_mining_topic_modelling/data_cleaned/dostoevsky_chunks.csv
rows: 4787 cols: 6


### Previous work (commented out, kept for reference)

In [13]:
# def prelim_clean(text: str) -> str:
#     return re.sub(r"[^a-zA-Z0-9 \n\.\']", '', text)

# prelim_texts = {}
# for book, text in texts.items():
#     prelim = prelim_clean(text)
#     prelim_texts[book] = prelim

# print(prelim_texts[first_key][:500])

In [14]:
# def remove_extra_spaces(text: str) -> str:
#     return re.sub(r' +', ' ', text)

# def remove_punctuation_remaining(text: str) -> str:
#     return re.sub(r'[^\w\s]', '', text)

# def remove_numbers(text: str) -> str:
#     return re.sub(r'\d', '', text)

# clean_texts = {}
# for book, text in prelim_texts.items():
#     t = remove_extra_spaces(text)
#     t = remove_punctuation_remaining(t)
#     t = remove_numbers(t)
#     clean_texts[book] = t

# print(clean_texts[first_key][:500])

In [15]:
# lower_texts = {book: text.lower() for book, text in clean_texts.items()}
# print(lower_texts[first_key][:500])

In [16]:
# stop_words = set(stopwords.words("english"))

# def remove_stopwords(text: str) -> list[str]:
#     text = re.sub(r'\s+', ' ', text).strip()
#     tokens = word_tokenize(text)
#     return [w for w in tokens if w not in stop_words and w.strip() != ""]

# filtered_tokens = {}
# filtered_texts = {}

# for book, text in lower_texts.items():
#     tokens_kept = remove_stopwords(text)
#     filtered_tokens[book] = tokens_kept
#     filtered_texts[book] = " ".join(tokens_kept)

# print("Original token sample:", word_tokenize(lower_texts[first_key])[:40])
# print("After stopword removal:", filtered_tokens[first_key][:40])

Maybe I'll do a word count here to see which words are the most frequent and whether I should remove them.

In [17]:
# all_tokens = []
# for tokens in filtered_tokens.values():
#     all_tokens.extend(tokens)

# freq = Counter(all_tokens)
# freq.most_common(30)

In [18]:
# custom_stopwords = {
#     "said","say","tell",
#     "would","could","must","shall",
#     "one",
#     "though","even","well","like",
#     "come","go","went","see"
# }

In [19]:
# ps = PorterStemmer()

# stemmed_tokens = {}
# stemmed_texts = {}

# for book, tokens in filtered_tokens.items(): 
#     stems = [ps.stem(w) for w in tokens]
#     stemmed_tokens[book] = stems
#     stemmed_texts[book] = " ".join(stems)

In [20]:
# BOOK_OUT = Path("data_cleaned")
# BOOK_OUT.mkdir(parents=True, exist_ok=True)

# for book, text in stemmed_texts.items():
#     (BOOK_OUT / f"{book}_clean.txt").write_text(text, encoding="utf-8")

# print("Saved cleaned books to:", BOOK_OUT)