# Streamlined Term-Level Analysis (spaCy + NLTK)
- spaCy tokenization & **lemmatization**
- NLTK + custom stopwords
- **Per-document** bigrams
- URL / emoji / simple system-message filtering
- Unigram & bigram **TF/DF** + bigram **MI** (smoothed, PMI-style association score)

In [27]:
# !pip install spacy nltk pandas tqdm
# !python -m spacy download en_core_web_sm

import re, math, os
from collections import Counter
from typing import Iterable, List, Tuple, Set, Dict
import pandas as pd

import nltk
# Download the NLTK stopword corpus only when it cannot be found locally.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

import spacy

def load_spacy():
    """Load the spaCy pipeline used for tokenization and lemmatization.

    The pretrained ``en_core_web_sm`` model is tried first. If loading it
    fails for any reason, a blank English pipeline is returned instead, with
    a lookup-mode lemmatizer attached when one can be set up.

    A lookup lemmatizer has to load its tables through ``nlp.initialize()``
    before it can process text; a component that was added but not
    initialized is expected to raise the first time the pipeline is called.
    When setup fails (for example because the lookup tables are not
    installed), the component is removed again so the returned pipeline can
    still tokenize.

    Returns:
        A spaCy language pipeline.
    """
    try:
        return spacy.load("en_core_web_sm")
    except Exception:
        # Deliberate best-effort: any load failure falls back to a blank pipeline.
        nlp = spacy.blank("en")
        try:
            nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
            # Loads the lemmatizer's lookup tables; without this the
            # component has no data to work with.
            nlp.initialize()
        except Exception as exc:
            # Do not leave a half-configured component in the pipeline.
            if "lemmatizer" in nlp.pipe_names:
                nlp.remove_pipe("lemmatizer")
            print(f"Lookup lemmatizer unavailable ({exc}); continuing without a lemmatizer.")
        print("Using blank 'en' pipeline (install en_core_web_sm for better lemmatization).")
        return nlp

# Module-level pipeline instance, created once and reused by the tokenization code in this file.
nlp = load_spacy()

In [28]:
class Config:
    """Preprocessing switches and precompiled patterns for the text pipeline.

    All values are class attributes, so they can be read either from the
    class itself or from an instance.
    """

    # Token normalization switches.
    lowercase = True
    keep_only_alpha = True
    min_token_len = 2
    use_lemma = True
    remove_numbers = True

    # Stopwords: optional NLTK English list plus project-specific additions.
    use_nltk_stopwords = True
    extra_stopwords = {
        "server", "joined", "scroll", "papyrus", "image", "brett", "olsen", "entry", "start", "thread", "moshe", "levy", "casey", "handmer", "mae", "sawatzky", "like", "value", "seldon", "ben"
    }

    # Filters: whole-message patterns that mark automated system messages.
    system_message_patterns = [
        re.compile(r"^\s*Joined the server\.?\s*$", re.I),
        re.compile(r"^\s*Started a thread\.?\s*$", re.I),
    ]
    # Matches a URL anywhere in a message.
    url_inline_re = re.compile(r"(https?://\S+|www\.\S+)", re.I)
    # Matches a message made up of nothing but http(s) URLs and whitespace.
    url_only_re   = re.compile(r"^\s*(https?://\S+\s*)+$", re.I)
    # Emoji and pictographic symbols, listed as explicit ranges. A single
    # span from U+24C2 to U+1F251 would also swallow CJK, Hangul, Braille,
    # mathematical alphabets and other ordinary text, so the symbol blocks
    # inside that span are enumerated individually instead.
    emoji_re = re.compile(
        "["
        "\U0001F000-\U0001F251"  # tiles/cards, enclosed alphanumerics, flags, enclosed ideographs
        "\U0001F300-\U0001F5FF"
        "\U0001F600-\U0001F64F"
        "\U0001F680-\U0001F6FF"
        "\U0001F700-\U0001F77F"
        "\U0001F780-\U0001F7FF"
        "\U0001F800-\U0001F8FF"
        "\U0001F900-\U0001F9FF"
        "\U0001FA00-\U0001FA6F"
        "\U0001FA70-\U0001FAFF"
        "\U00002702-\U000027B0"  # dingbats
        "\U000024C2"             # circled M
        "\U000025A0-\U000025FF"  # geometric shapes
        "\U00002600-\U000026FF"  # miscellaneous symbols
        "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
        "\U00003030\U0000303D\U00003297\U00003299"  # emoji code points in CJK symbol blocks
        "\U0000FE0F"             # emoji variation selector
        "]+")
    # Any character from the Greek or Greek Extended blocks.
    greek_re = re.compile(r"[\u0370-\u03FF\u1F00-\u1FFF]")
        
# Shared settings object; the helper functions in this file read their options from it.
cfg = Config()

def contains_greek(text: str) -> bool:
    """Report whether ``text`` has at least one match for ``cfg.greek_re``.

    Falsy input (``None`` or an empty string) yields ``False``.
    """
    if not text:
        return False
    return cfg.greek_re.search(text) is not None


In [29]:
from nltk.corpus import stopwords as nltk_stop

def build_stopword_set() -> Set[str]:
    """Assemble the lowercase stopword set described by ``cfg``.

    Starts from ``cfg.extra_stopwords`` and, when ``cfg.use_nltk_stopwords``
    is set, adds the NLTK English stopword list.
    """
    words = set(cfg.extra_stopwords)
    if cfg.use_nltk_stopwords:
        words.update(nltk_stop.words("english"))
    return {word.lower() for word in words}

# Built once here; other code in this file passes it to doc_to_tokens.
STOPWORDS = build_stopword_set()
len(STOPWORDS)  # notebook display: size of the combined stopword set

218

In [30]:
def is_system_message(text: str) -> bool:
    """Return True when any pattern in ``cfg.system_message_patterns`` matches ``text``."""
    return any(pattern.search(text) for pattern in cfg.system_message_patterns)

def strip_urls(text: str) -> str:
    """Remove URLs from ``text``.

    A message matched in full by ``cfg.url_only_re`` becomes the empty
    string; otherwise each ``cfg.url_inline_re`` match is replaced by one
    space. Falsy input is treated as an empty string.
    """
    source = text or ""
    if cfg.url_only_re.match(source):
        return ""
    return cfg.url_inline_re.sub(" ", source)

def strip_emoji(text: str) -> str:
    """Replace each run matched by ``cfg.emoji_re`` with a single space.

    Falsy input is treated as an empty string.
    """
    source = text if text else ""
    return cfg.emoji_re.sub(" ", source)

def clean_text(text: str) -> str:
    """Normalize one raw message before tokenization.

    Empty input and system messages yield ``""``. Otherwise URLs and then
    emoji are stripped, whitespace runs collapse to a single space, and the
    result is trimmed.
    """
    if not text or is_system_message(text):
        return ""
    without_noise = strip_emoji(strip_urls(text))
    collapsed = re.sub(r"\s+", " ", without_noise)
    return collapsed.strip()

In [31]:
def doc_to_tokens(doc_text: str, nlp, stopwords: Set[str]) -> List[str]:
    """Turn one raw message into a list of normalized tokens.

    The text is first passed through ``clean_text``; an empty result yields
    ``[]``. Each token produced by ``nlp`` is then filtered and normalized
    according to ``cfg``:

    * dropped if ``cfg.keep_only_alpha`` is set and the token text is not
      purely alphabetic, or if ``cfg.remove_numbers`` is set and it contains
      a digit;
    * normalized to its lowercased lemma when ``cfg.use_lemma`` is set and a
      lemma is available, otherwise to the (optionally lowercased) text;
    * replaced by the literal ``"GREEK_LETTERS"`` when it contains Greek
      characters;
    * dropped if shorter than ``cfg.min_token_len`` or if it is a stopword.

    Args:
        doc_text: Raw message text.
        nlp: Callable pipeline returning an iterable of tokens that expose
            ``text`` and ``lemma_``.
        stopwords: Lowercase words to exclude.

    Returns:
        The surviving normalized tokens, in document order.
    """
    cleaned = clean_text(doc_text)
    if not cleaned:
        return []
    toks = []
    for token in nlp(cleaned):
        surface = token.text
        if cfg.keep_only_alpha and not surface.isalpha():
            continue
        if cfg.remove_numbers and any(ch.isdigit() for ch in surface):
            continue
        raw = surface.lower() if cfg.lowercase else surface
        norm = token.lemma_.lower() if (cfg.use_lemma and token.lemma_) else raw
        if contains_greek(norm):
            norm = "GREEK_LETTERS"  # keep uppercase on purpose
        if len(norm) < cfg.min_token_len:
            continue
        # Stopword lists hold surface forms, so test the surface form as well
        # as the lemma: an inflected entry (e.g. "joined") would otherwise
        # never match once the token has been reduced to its lemma.
        if norm in stopwords or surface.lower() in stopwords:
            continue
        toks.append(norm)
    return toks

def docs_to_token_lists(texts: Iterable[str]) -> List[List[str]]:
    """Tokenize every text with the module-level ``nlp`` and ``STOPWORDS``.

    Returns one token list per input text, in input order.
    """
    token_lists = []
    for text in texts:
        token_lists.append(doc_to_tokens(text, nlp, STOPWORDS))
    return token_lists

In [32]:
def bigrams_from_tokens_per_doc(tokens_per_doc: List[List[str]]) -> List[List[Tuple[str,str]]]:
    """Build adjacent-token pairs separately for each document.

    Pairs never cross document boundaries; a document with fewer than two
    tokens contributes an empty list.
    """
    return [list(zip(doc_tokens, doc_tokens[1:])) for doc_tokens in tokens_per_doc]

In [33]:
def unigram_counts(tokens_per_doc: List[List[str]]):
    """Count term frequency and document frequency for unigrams.

    Returns:
        ``(tf, df)`` Counters: ``tf`` counts every occurrence, ``df`` counts
        each term at most once per document.
    """
    term_freq = Counter()
    doc_freq = Counter()
    for doc_tokens in tokens_per_doc:
        term_freq.update(doc_tokens)
        for term in set(doc_tokens):
            doc_freq[term] += 1
    return term_freq, doc_freq

def bigram_counts(bigrams_per_doc: List[List[Tuple[str,str]]]):
    """Count term frequency and document frequency for bigrams.

    Returns:
        ``(tf, df)`` Counters keyed by bigram tuple: ``tf`` counts every
        occurrence, ``df`` counts each bigram at most once per document.
    """
    pair_freq = Counter()
    doc_freq = Counter()
    for doc_pairs in bigrams_per_doc:
        pair_freq.update(doc_pairs)
        for pair in set(doc_pairs):
            doc_freq[pair] += 1
    return pair_freq, doc_freq

In [34]:
def mutual_information_bigrams(unigram_tf: Counter, bigram_tf: Counter, smoothing: int = 1):
    """Score each bigram with a smoothed, PMI-style association measure.

    For a bigram ``(w1, w2)`` with count ``c12`` the score is::

        log2(((c12 + s) * T) / ((c1 + s) * (c2 + s)))

    where ``c1`` and ``c2`` are the unigram counts (0 when missing), ``s`` is
    ``smoothing`` and ``T`` is the total bigram count plus ``s``.

    Returns:
        Dict mapping each bigram tuple to its score.
    """
    total = sum(bigram_tf.values()) + smoothing
    scores = {}
    for pair, pair_count in bigram_tf.items():
        first, second = pair
        numerator = (pair_count + smoothing) * total
        left_count = unigram_tf.get(first, 0) + smoothing
        right_count = unigram_tf.get(second, 0) + smoothing
        denominator = left_count * right_count
        scores[(first, second)] = math.log2(numerator / denominator)
    return scores

In [35]:
def analyze_corpus(docs, top_k=5000, output_dir="/Users/nikhil/PycharmProjects/vesuvius_discord_study/term_level/outputs", pretokenized=False):
    """Compute unigram, bigram and bigram-association tables for a corpus.

    Args:
        docs: Raw texts (``pretokenized=False``) or token lists, one per
            document (``pretokenized=True``).
        top_k: Maximum number of rows kept in each table after sorting.
        output_dir: Directory that receives ``unigrams.csv``, ``bigrams.csv``
            and ``bigrams_mi.csv``; created if missing. A falsy value
            disables writing.
        pretokenized: Whether ``docs`` is already tokenized.

    Returns:
        Dict with DataFrames under ``"unigrams"``, ``"bigrams"`` and
        ``"bigrams_mi"``. The frequency tables are sorted by tf, then df
        (both descending), then the term; the association table by score,
        then tf (both descending), then the bigram.
    """
    tokens_per_doc = docs if pretokenized else docs_to_token_lists(docs)

    bigrams_per_doc = bigrams_from_tokens_per_doc(tokens_per_doc)
    uni_tf, uni_df = unigram_counts(tokens_per_doc)
    bi_tf, bi_df = bigram_counts(bigrams_per_doc)

    unigram_rows = [(term, count, uni_df[term]) for term, count in uni_tf.items()]
    df_uni = pd.DataFrame(unigram_rows, columns=["term", "tf", "df"])
    df_uni = df_uni.sort_values(["tf", "df", "term"], ascending=[False, False, True])
    df_uni = df_uni.head(top_k)

    bigram_rows = [(" ".join(pair), count, bi_df[pair]) for pair, count in bi_tf.items()]
    df_bi = pd.DataFrame(bigram_rows, columns=["bigram", "tf", "df"])
    df_bi = df_bi.sort_values(["tf", "df", "bigram"], ascending=[False, False, True])
    df_bi = df_bi.head(top_k)

    mi = mutual_information_bigrams(uni_tf, bi_tf, smoothing=1)
    mi_rows = [(f"{w1} {w2}", bi_tf[(w1, w2)], score) for (w1, w2), score in mi.items()]
    df_mi = pd.DataFrame(mi_rows, columns=["bigram", "tf", "mi"])
    df_mi = df_mi.sort_values(["mi", "tf", "bigram"], ascending=[False, False, True])
    df_mi = df_mi.head(top_k)

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
        csv_targets = (
            ("unigrams.csv", df_uni),
            ("bigrams.csv", df_bi),
            ("bigrams_mi.csv", df_mi),
        )
        for filename, frame in csv_targets:
            frame.to_csv(os.path.join(output_dir, filename), index=False)

    return {"unigrams": df_uni, "bigrams": df_bi, "bigrams_mi": df_mi}


## Loading JSON and running term analysis

In [36]:
import json
import os

def load_json_records_with_tokens(folder: str) -> List[Dict]:
    """Load messages from every ``*.json`` file in ``folder`` and tokenize them.

    Each message goes through the full ``doc_to_tokens`` pipeline:
      - URLs/emoji/system-message stripping
      - spaCy tokenization + lemmatization
      - NLTK+custom stopwords
    Messages with no content, or with no tokens left after normalization,
    are dropped.

    Files are processed in sorted name order so the record order is
    reproducible. Files that cannot be read or parsed, or whose top level is
    not a JSON object, are reported and skipped; message entries that are not
    objects are ignored.

    Args:
        folder: Directory containing the JSON files; the code expects a
            ``"messages"`` list and optional ``"channel"``/``"name"`` keys.

    Returns:
        Records with the schema
          { author, timestamp, channel, raw, tokens, context }
        where `context` is a comma-separated string of tokens.
    """
    records = []
    # os.listdir returns names in arbitrary order; sort for reproducible output.
    files = sorted(f for f in os.listdir(folder) if f.lower().endswith(".json"))
    for fname in files:
        fpath = os.path.join(folder, fname)
        try:
            with open(fpath, "r", encoding="utf-8") as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers both JSON syntax errors and bad UTF-8.
            print(f"⚠️ Skipped {fname}: {e}")
            continue
        if not isinstance(data, dict):
            print(f"⚠️ Skipped {fname}: expected a JSON object, got {type(data).__name__}")
            continue

        # Channel name: nested channel object, then top-level name, then file stem.
        channel_info = data.get("channel")
        if not isinstance(channel_info, dict):
            channel_info = {}
        channel = channel_info.get("name") or data.get("name") or os.path.splitext(fname)[0]

        for m in data.get("messages") or []:
            if not isinstance(m, dict):
                continue
            content = m.get("content")
            raw = content.strip() if isinstance(content, str) else ""
            if not raw:
                continue

            # Full normalization through the doc_to_tokens pipeline
            tokens = doc_to_tokens(raw, nlp, STOPWORDS)
            if not tokens:
                continue

            author_info = m.get("author")
            if not isinstance(author_info, dict):
                author_info = {}
            author = author_info.get("name") or author_info.get("id") or "Unknown"
            ts = m.get("timestamp") or ""
            records.append({
                "author": author,
                "timestamp": ts,
                "channel": channel,
                "raw": raw,
                "tokens": tokens,
                # comma-separated tokens rather than space-joined
                "context": ", ".join(tokens),
            })
    print(f"Loaded {len(records)} normalized messages from {len(files)} files.")
    return records

# Load and normalize every message from the export folder, then keep just the
# token lists so they can be passed to analyze_corpus as pretokenized input.
records = load_json_records_with_tokens("/Users/nikhil/PycharmProjects/vesuvius_discord_study/term_level/filtered_JSON")
docs_tokens = [r["tokens"] for r in records]  # pretokenized


Loaded 7649 normalized messages from 2 files.


In [37]:
import pandas as pd
import os

# Single definition of the output location, used for both the CSV files
# written by analyze_corpus and the Excel workbook written below.
output_dir = "/Users/nikhil/PycharmProjects/vesuvius_discord_study/term_level/outputs"

results = analyze_corpus(docs_tokens, top_k=100000, output_dir=output_dir, pretokenized=True)


# Build the cleaned-messages DataFrame
message_columns = ["author", "timestamp", "channel", "raw", "context"]
df_msgs = pd.DataFrame(
    [{k: r[k] for k in message_columns} for r in records],
    columns=message_columns
)
xlsx_path = os.path.join(output_dir, "term_level_analysis.xlsx")

# Choose an engine: prefer xlsxwriter when it is installed, else openpyxl.
try:
    import xlsxwriter  # noqa
    engine = "xlsxwriter"
except ImportError:
    engine = "openpyxl"

# One workbook with a sheet per result table plus the cleaned messages.
with pd.ExcelWriter(xlsx_path, engine=engine) as writer:
    results["unigrams"].to_excel(writer, sheet_name="unigrams", index=False)
    results["bigrams"].to_excel(writer,  sheet_name="bigrams",  index=False)
    results["bigrams_mi"].to_excel(writer, sheet_name="bigrams_mi", index=False)
    df_msgs.to_excel(writer, sheet_name="messages_cleaned", index=False)

print("Wrote Excel to:", xlsx_path)

# Quick sanity check: how many rows remained identical after FULL normalization?
same = (df_msgs["raw"] == df_msgs["context"]).sum()
print(f"Rows where raw==context after full normalization: {same} / {len(df_msgs)}")
df_msgs.head(5)


Wrote Excel to: /Users/nikhil/PycharmProjects/vesuvius_discord_study/term_level/outputs/term_level_analysis.xlsx
Rows where raw==context after full normalization: 114 / 7649


Unnamed: 0,author,timestamp,channel,raw,context
0,mjq13,2023-03-15T03:44:06.141+00:00,general,"I see this sentence ""To make it easier to try...","see, sentence, make, easy, try, datum, also, r..."
1,natfriedman,2023-03-15T04:26:56.585+00:00,general,"Good find, we will remove that!","good, find, remove"
2,natfriedman,2023-03-15T04:27:10.993+00:00,general,We are now providing the data files untarred s...,"provide, datum, file, untarred, fetch, whateve..."
3,natfriedman,2023-03-15T04:29:56.175+00:00,general,Each .tif file in the full scroll sans is 8 mi...,"file, full, san, micrometer, tall, want, grab,..."
4,natfriedman,2023-03-15T15:48:38.026+00:00,general,Hello!,hello
