In [1]:
from pathlib import Path
import re
import unicodedata
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from collections import Counter

In [2]:
RAW_DIR = Path("data_raw")
raw_files = sorted(RAW_DIR.glob("*.txt"))

print(f"Found {len(raw_files)} .txt files")
for f in raw_files[:10]:
    print(" -", f.name)

def read_text(fp: Path) -> str:
    try:
        return fp.read_text(encoding="utf-8")
    except UnicodeDecodeError:
        return fp.read_text(encoding="latin-1")

raw_texts = {fp.stem: read_text(fp) for fp in raw_files}
print("Loaded books:", list(raw_texts.keys())[:5])

Found 10 .txt files
 - Crime_and_Punishment.txt
 - Notes_from_the_Underground.txt
 - Poor_Folk.txt
 - Short_Stories.txt
 - The_Brothers_Karamazov.txt
 - The_Gambler.txt
 - The_Grand_Inquisitor.txt
 - The_Idiot.txt
 - The_Possessed _or_The_Devils.txt
 - White_Nights_and_Other_Stories.txt
Loaded books: ['Crime_and_Punishment', 'Notes_from_the_Underground', 'Poor_Folk', 'Short_Stories', 'The_Brothers_Karamazov']


In [3]:
texts = {fp.stem: read_text(fp) for fp in raw_files}

first_key = next(iter(texts))
print("Example book:", first_key)
print(texts[first_key][:500])

Example book: Crime_and_Punishment
CRIME AND PUNISHMENT



PART I



CHAPTER I

On an exceptionally hot evening early in July a young man came out of
the garret in which he lodged in S. Place and walked slowly, as though
in hesitation, towards K. bridge.

He had successfully avoided meeting his landlady on the staircase. His
garret was under the roof of a high, five-storied house and was more
like a cupboard than a room. The landlady who provided him with garret,
dinners, and attendance, lived on the floor below, and every time
h


In [4]:
def prelim_clean(text: str) -> str:
    return re.sub(r"[^a-zA-Z0-9 \n\.\']", '', text)

prelim_texts = {}
for book, text in texts.items():
    prelim = prelim_clean(text)
    prelim_texts[book] = prelim

print(prelim_texts[first_key][:500])

CRIME AND PUNISHMENT



PART I



CHAPTER I

On an exceptionally hot evening early in July a young man came out of
the garret in which he lodged in S. Place and walked slowly as though
in hesitation towards K. bridge.

He had successfully avoided meeting his landlady on the staircase. His
garret was under the roof of a high fivestoried house and was more
like a cupboard than a room. The landlady who provided him with garret
dinners and attendance lived on the floor below and every time
he went o


In [5]:
def remove_extra_spaces(text: str) -> str:
    return re.sub(r' +', ' ', text)

def remove_punctuation_remaining(text: str) -> str:
    return re.sub(r'[^\w\s]', '', text)

def remove_numbers(text: str) -> str:
    return re.sub(r'\d', '', text)

clean_texts = {}
for book, text in prelim_texts.items():
    t = remove_extra_spaces(text)
    t = remove_punctuation_remaining(t)
    t = remove_numbers(t)
    clean_texts[book] = t

print(clean_texts[first_key][:500])

CRIME AND PUNISHMENT



PART I



CHAPTER I

On an exceptionally hot evening early in July a young man came out of
the garret in which he lodged in S Place and walked slowly as though
in hesitation towards K bridge

He had successfully avoided meeting his landlady on the staircase His
garret was under the roof of a high fivestoried house and was more
like a cupboard than a room The landlady who provided him with garret
dinners and attendance lived on the floor below and every time
he went out he


In [6]:
lower_texts = {book: text.lower() for book, text in clean_texts.items()}
print(lower_texts[first_key][:500])

crime and punishment



part i



chapter i

on an exceptionally hot evening early in july a young man came out of
the garret in which he lodged in s place and walked slowly as though
in hesitation towards k bridge

he had successfully avoided meeting his landlady on the staircase his
garret was under the roof of a high fivestoried house and was more
like a cupboard than a room the landlady who provided him with garret
dinners and attendance lived on the floor below and every time
he went out he


In [7]:
stop_words = set(stopwords.words("english"))

def remove_stopwords(text: str) -> list[str]:
    text = re.sub(r'\s+', ' ', text).strip()
    tokens = word_tokenize(text)
    return [w for w in tokens if w not in stop_words and w.strip() != ""]

filtered_tokens = {}
filtered_texts = {}

for book, text in lower_texts.items():
    tokens_kept = remove_stopwords(text)
    filtered_tokens[book] = tokens_kept
    filtered_texts[book] = " ".join(tokens_kept)

print("Original token sample:", word_tokenize(lower_texts[first_key])[:40])
print("After stopword removal:", filtered_tokens[first_key][:40])

Original token sample: ['crime', 'and', 'punishment', 'part', 'i', 'chapter', 'i', 'on', 'an', 'exceptionally', 'hot', 'evening', 'early', 'in', 'july', 'a', 'young', 'man', 'came', 'out', 'of', 'the', 'garret', 'in', 'which', 'he', 'lodged', 'in', 's', 'place', 'and', 'walked', 'slowly', 'as', 'though', 'in', 'hesitation', 'towards', 'k', 'bridge']
After stopword removal: ['crime', 'punishment', 'part', 'chapter', 'exceptionally', 'hot', 'evening', 'early', 'july', 'young', 'man', 'came', 'garret', 'lodged', 'place', 'walked', 'slowly', 'though', 'hesitation', 'towards', 'k', 'bridge', 'successfully', 'avoided', 'meeting', 'landlady', 'staircase', 'garret', 'roof', 'high', 'fivestoried', 'house', 'like', 'cupboard', 'room', 'landlady', 'provided', 'garret', 'dinners', 'attendance']


maybe i'll do word count here and see what is the most recent word and see if i should remove them

In [8]:
all_tokens = []
for tokens in filtered_tokens.values():
    all_tokens.extend(tokens)

freq = Counter(all_tokens)
freq.most_common(30)

[('one', 5555),
 ('would', 4298),
 ('said', 4082),
 ('know', 3602),
 ('though', 3273),
 ('man', 3123),
 ('could', 2975),
 ('come', 2809),
 ('dont', 2796),
 ('like', 2793),
 ('time', 2733),
 ('even', 2603),
 ('see', 2423),
 ('go', 2326),
 ('well', 2304),
 ('say', 2217),
 ('must', 2099),
 ('nothing', 1951),
 ('suddenly', 1914),
 ('little', 1905),
 ('last', 1874),
 ('prince', 1835),
 ('something', 1759),
 ('tell', 1734),
 ('never', 1687),
 ('away', 1678),
 ('old', 1671),
 ('thought', 1669),
 ('went', 1657),
 ('shall', 1590)]

In [9]:
ps = PorterStemmer()

stemmed_tokens = {}
stemmed_texts = {}

for book, tokens in filtered_tokens.items(): 
    stems = [ps.stem(w) for w in tokens]
    stemmed_tokens[book] = stems
    stemmed_texts[book] = " ".join(stems)

In [10]:
BOOK_OUT = Path("data_cleaned")
BOOK_OUT.mkdir(parents=True, exist_ok=True)

for book, text in stemmed_texts.items():
    (BOOK_OUT / f"{book}_clean.txt").write_text(text, encoding="utf-8")

print("Saved cleaned books to:", BOOK_OUT)

Saved cleaned books to: data_cleaned
