In [13]:
import random, re, math
from pathlib import Path
from typing import List, Dict, Tuple
from collections import Counter
import numpy as np
import pandas as pd

# 0. Data Preprocessing

In [14]:
token = re.compile(r"[A-Za-z][A-Za-z0-9_']+")

In [15]:
def strip_first4(text: str) -> str:
    lines = text.splitlines()
    return "\n".join(lines[4:])

In [16]:
def tokenize(text: str) -> List[str]:
    return [t.lower() for t in token.findall(text)]

# 1.  Data Refining

In [17]:
def list_class_files(root: Path) -> List[Tuple[str, Path]]:
    if not root.exists() or not root.is_dir():
        raise ValueError(f"[list_class_files] Not a directory: {root}")
    
    return [
        (d.name, f)
        for d in sorted(p for p in root.iterdir() if p.is_dir())
        for f in sorted(d.iterdir())
        if f.is_file()
    ]

In [18]:
def stratified_half_split(items: List[Tuple[str, Path]], seed: int = 42):
    rng = random.Random(seed)
    by_cls: Dict[str, List[Path]] = {}
    for c, fp in items:
        by_cls.setdefault(c, []).append(fp)

    train, test = [], []
    for c, files in by_cls.items():
        rng.shuffle(files)
        k = len(files) // 2
        train.extend(files[:k])
        test.extend(files[k:])
    return train, test

# 2. NB

In [19]:
def train_naive_bayes(docs: List[List[str]], labels: List[str], stop_top_k: int = 200):
    classes = sorted(set(labels))
    N_doc = len(docs)
    N_c = Counter(labels)
    logprior = {c: math.log(N_c[c] / N_doc + 1e-15) for c in classes}

    bigdoc = {c: Counter() for c in classes}
    for toks, y in zip(docs, labels):
        bigdoc[y].update(toks)

    total_counts = Counter()
    for c in classes:
        total_counts.update(bigdoc[c])

    #V_set = set(total_counts.keys())
    stop_words = set([w for w, _ in total_counts.most_common(stop_top_k)])

    V_set = set(total_counts.keys()) - stop_words
    V_size = len(V_set)

    loglikelihood = {c: {} for c in classes}
    for c in classes:
        total_c = sum(bigdoc[c].values())
        denom_log = math.log(total_c + V_size)
        for w in V_set:
            num = bigdoc[c][w] + 1
            loglikelihood[c][w] = math.log(num) - denom_log
    
    return V_set, logprior, loglikelihood

In [20]:
def predict_nb(doc_tokens: List[str], Vset, logprior, loglikelihood) -> str:
    tf = Counter([w for w in doc_tokens if w in Vset])
    best_c, best_score = None, float('-inf')
    for c in logprior.keys():
        s = logprior[c]
        llc = loglikelihood[c]
        for w, cnt in tf.items():
            s += cnt * llc[w]
        if s > best_score:
            best_c, best_score = c, s
    
    return best_c

In [21]:
def predict_many_nb(test_docs: List[List[str]], Vset, logprior, loglikelihood) -> List[str]:
    return [predict_nb(toks, Vset, logprior, loglikelihood) for toks in test_docs]

# 3. Pipeline

In [22]:
def load_and_tokenize(paths: List[Path]) -> List[List[str]]:
    docs = []
    for fp in paths:
        try:
            raw = fp.read_text(encoding = "utf-8", errors = "ignore")
        except UnicodeDecodeError:
            raw = fp.read_text(encoding = "latin-1", errors = "ignore")
        body = strip_first4(raw)
        docs.append(tokenize(body))
    
    return docs

In [23]:
def run_pipeline(root: Path, seed: int):
    items = list_class_files(root)
    train_paths, test_paths = stratified_half_split(items, seed = seed)

    y_train = [p.parent.name for p in train_paths]
    y_test = [p.parent.name for p in test_paths]

    Xtr_tokens = load_and_tokenize(train_paths)
    Xte_tokens = load_and_tokenize(test_paths)

    V_set, logprior, loglikelihood = train_naive_bayes(Xtr_tokens, y_train)
    y_hat = predict_many_nb(Xte_tokens, V_set, logprior, loglikelihood)

    acc = np.mean([predicted == trained for predicted, trained in zip(y_hat, y_test)])
    print(f"[RESULT] Test Accuracy: {acc:.4f}")

    cm = pd.crosstab(pd.Series(y_test, name = "True"),
                     pd.Series(y_hat, name = "Pred"),
                     dropna=False)
    print(cm)

    
    cm.to_csv("confusion_matrix.csv", encoding = "utf-8")
    print("[INFO] Confusion matrix saved to confusion_matrix.csv")


    with open("accuracy.txt", "w", encoding="utf-8") as f:
        f.write(f"{acc:.4f}\n")
    print("[INFO] Accuracy saved to accuracy.txt")


In [24]:
from pathlib import Path
run_pipeline(Path("/Users/gwaec/umd_lectures_projects/cmsc422/hw1/20_newsgroups"), seed=42)

[RESULT] Test Accuracy: 0.8383
Pred                      alt.atheism  comp.graphics  comp.os.ms-windows.misc  \
True                                                                            
alt.atheism                       410              0                        0   
comp.graphics                       0            434                        1   
comp.os.ms-windows.misc             0             84                      195   
comp.sys.ibm.pc.hardware            1             22                        6   
comp.sys.mac.hardware               1             12                        2   
comp.windows.x                      0             46                        0   
misc.forsale                        1             10                        2   
rec.autos                           2              2                        0   
rec.motorcycles                     0              0                        0   
rec.sport.baseball                  0              1                        1 