# Topic Interpretability

Compute diversity, coherence, and other measures of topic interpretability.
This notebook uses OCTIS on Python 3.11.

In [1]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..', '..')))  # Adjust as needed
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..', '..','scripts')))  # Adjust as needed
import pandas as pd
import numpy as np
from scripts.my_text_cleaning import clean_dataframe
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
from octis.evaluation_metrics.coherence_metrics import Coherence
import ast

# Dataset to evaluate. Processed files are expected for:
#   'covid_tweets_en', 'cop26_tweets_en', 'ukraine_tweets_en'
chosen_dataset = 'ukraine_tweets_en'

PROCESSED_DIR = './../../data/processed/'

#ur_df = pd.read_parquet('./../../data/raw/'+chosen_dataset+'.parquet')
# Per-document topic assignment (document text + assigned topic id).
doc_info = pd.read_csv(PROCESSED_DIR + 'document_info_' + chosen_dataset + '.csv')[['Document', 'Topic']]
doc_info['Topic'] = doc_info['Topic'].astype(int)

# Per-topic word lists. Only 'Representation' is parsed from its string form
# here; 'MMR' is left exactly as read from the CSV.
topic_info = pd.read_csv(PROCESSED_DIR + 'topic_info_' + chosen_dataset + '_with_MMR.csv')[['Topic', 'Representation', 'MMR']]
topic_info['Representation'] = topic_info['Representation'].apply(ast.literal_eval)

# NOTE(review): `embeddings` is only referenced by the commented-out
# clean_dataframe call below in the cells visible here.
embeddings = np.load(PROCESSED_DIR + chosen_dataset + '.parquet.npy')
#unique_docs, unique_embeddings = clean_dataframe(ur_df, embeddings, 'text')

print(chosen_dataset)
print(f"N of documents: {len(doc_info)}")
# Count real topics explicitly instead of assuming a topic -1 row is present.
n_topics = int((topic_info['Topic'] != -1).sum())
print(f"N of topics: {n_topics}")

ukraine_tweets_en
N of documents: 787872
N of topics: 138


In [2]:
import pandas as pd
import re
import ast
import spacy
import pickle
import gzip
from nltk.corpus import stopwords
from tqdm.notebook import tqdm
from octis.evaluation_metrics.coherence_metrics import Coherence
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
import os

# =====================================
# Setup
# =====================================
# The GPU has to be requested *before* the pipeline is loaded; a pipeline that
# is loaded first is not moved to the GPU by a later require_gpu()/prefer_gpu().
spacy.require_gpu()  # raises if no GPU is available
print("GPU enabled:", spacy.prefer_gpu())
nlp = spacy.load("en_core_web_sm")

# NLTK English stop-word list.
stop_words = set(stopwords.words('english'))

# =====================================
# Cleaning functions
# =====================================
def clean_text(text: str) -> str:
    """Strip emojis, @mentions, links and #hashtags, then normalise whitespace.

    ``text`` is coerced with ``str()`` first, so non-string values are accepted.
    Returns the cleaned string with runs of whitespace collapsed to a single
    space and no leading/trailing whitespace.
    """
    emoji_ranges = (
        "\U0001F600-\U0001F64F",
        "\U0001F300-\U0001F5FF",
        "\U0001F680-\U0001F6FF",
        "\U0001F1E0-\U0001F1FF",
        "\U00002700-\U000027BF",
        "\U0001F900-\U0001F9FF",
        "\U00002600-\U000026FF",
    )
    emoji_re = re.compile("[" + "".join(emoji_ranges) + "]+", flags=re.UNICODE)

    # Removal order matters and is kept: emojis, mentions, links, hashtags.
    removals = (emoji_re, r'@\w+', r'http\S+|www\S+|https\S+', r'#\w+')
    cleaned = str(text)
    for pattern in removals:
        cleaned = re.sub(pattern, '', cleaned)

    return re.sub(r'\s+', ' ', cleaned).strip()

# =====================================
# Document cleaning with caching
# =====================================
def clean_documents_cached(doc_df: pd.DataFrame, text_col: str, cache_path: str, batch_size=3000):
    """Return one list of lemmas per document, backed by a gzip-pickle cache.

    If ``cache_path`` exists, its content is unpickled and returned as-is;
    ``doc_df`` is not consulted in that case. Otherwise every value of
    ``doc_df[text_col]`` is cleaned with ``clean_text``, lemmatized through
    ``nlp.pipe`` (``batch_size`` texts per batch), written to ``cache_path``
    and returned.

    NOTE(review): ``pickle.load`` can execute arbitrary code; only point
    ``cache_path`` at files written by this notebook.
    """
    if not os.path.exists(cache_path):
        print("Cleaning documents...")
        token_lists = []
        for raw in doc_df[text_col]:
            token_lists.append(clean_text(str(raw)).split())

        print("Lemmatizing documents...")
        joined = [" ".join(tokens) for tokens in token_lists]
        progress = tqdm(
            nlp.pipe(joined, batch_size=batch_size),
            total=len(token_lists),
            desc="Processing docs",
        )
        # Keep every non-empty lemma of every parsed document.
        lemmas_per_doc = [
            [tok.lemma_ for tok in parsed if tok.lemma_ != '']
            for parsed in progress
        ]

        print(f"Saving preprocessed documents to {cache_path} ...")
        with gzip.open(cache_path, 'wb') as sink:
            pickle.dump(lemmas_per_doc, sink)
        return lemmas_per_doc

    print(f"Loading preprocessed documents from {cache_path} ...")
    with gzip.open(cache_path, 'rb') as source:
        return pickle.load(source)

# =====================================
# Topic cleaning
# =====================================
def clean_topics(topic_df: pd.DataFrame, represetation_col:str, topic_col: str = 'Topic', exclude_topics=None, min_words=3):
    """Clean and lemmatize the word list of every topic.

    Parameters
    ----------
    topic_df : pd.DataFrame
        One row per topic.
    represetation_col : str
        Column holding the topic words, either as a list or as the string
        form of a list (as read back from CSV).
    topic_col : str
        Column holding the topic id, used only for ``exclude_topics``.
    exclude_topics : list, optional
        Topic ids whose rows are dropped before cleaning.
    min_words : int
        Topics with fewer lemmas than this after cleaning are dropped.

    Returns
    -------
    list[list[str]]
        Lemmatized word lists of the topics that were kept, in row order.
    """
    exclude_topics = exclude_topics or []
    filtered = topic_df.loc[~topic_df[topic_col].isin(exclude_topics), :].copy()
    # Ensure the *representation* column contains lists. The parse used to be
    # applied to the topic-id column, which left string-encoded word lists
    # untouched and made the loop below iterate over single characters.
    filtered[represetation_col] = filtered[represetation_col].apply(
        lambda x: ast.literal_eval(x) if isinstance(x, str) else x
    )

    cleaned_topics = []
    for topic in tqdm(filtered[represetation_col], total=len(filtered), desc="Processing topics"):
        topic_clean = [clean_text(str(t)) for t in topic if t]
        # Words that clean_text reduced to '' would only add extra spaces to
        # the joined text (and thus whitespace tokens), so drop them.
        topic_clean = [word for word in topic_clean if word]
        if not topic_clean:
            continue
        doc = list(nlp.pipe([" ".join(topic_clean)], batch_size=1))[0]
        lemmatized_topic = [token.lemma_ for token in doc if token.lemma_ != '']
        if len(lemmatized_topic) >= min_words:
            cleaned_topics.append(lemmatized_topic)
    return cleaned_topics

# =====================================
# Compute coherence and diversity
# =====================================
def compute_coherence_diversity(doc_texts, topic_list, coherence_measure='c_v', topics = None, exclude_topics=None):
    """Score a topic list with OCTIS ``Coherence`` and ``TopicDiversity``.

    Parameters
    ----------
    doc_texts : list[list[str]]
        Tokenized documents used as the coherence reference corpus.
    topic_list : list[list[str]]
        Word list of every topic to score.
    coherence_measure : str
        Passed to ``Coherence`` as ``measure``.
    topics : sequence, optional
        Topic id assigned to each document, aligned with ``doc_texts``.
    exclude_topics : iterable, optional
        Topic ids whose documents are removed from ``doc_texts``. Filtering
        only happens when both ``topics`` and ``exclude_topics`` are given.

    Returns
    -------
    tuple
        ``(coherence_score, diversity_score)`` as returned by the two
        ``score`` calls.

    Raises
    ------
    ValueError
        If ``topic_list`` is empty, or if ``topics`` and ``doc_texts`` have
        different lengths when filtering is requested.
    """
    if not topic_list:
        raise ValueError("topic_list is empty: there are no topics to score")

    print(f"{len(doc_texts)} documents, {len(topic_list)} topics")
    if exclude_topics is not None and topics is not None:
        # A length mismatch (e.g. a stale document cache) would otherwise pair
        # documents with the wrong topic labels without any error.
        if len(topics) != len(doc_texts):
            raise ValueError(
                f"topics has {len(topics)} entries but doc_texts has {len(doc_texts)}; "
                "they must be aligned one-to-one"
            )
        excluded = set(exclude_topics)
        doc_texts = [doc for doc, label in zip(doc_texts, topics) if label not in excluded]
    print(f"{len(doc_texts)} documents, {len(topic_list)} topics")

    # Automatically set topk to the minimum topic length (capped at 10).
    min_topic_len = min(len(t) for t in topic_list)
    topk = min(10, min_topic_len)
    print(f"Using topk={topk} for coherence computation (min topic length={min_topic_len})")

    coherence = Coherence(texts=doc_texts, measure=coherence_measure, topk=topk)
    coherence_dict = {"topics": topic_list}
    coherence_score = coherence.score(coherence_dict)

    diversity = TopicDiversity(topk=topk)
    diversity_dict = {"topics": topic_list}
    diversity_score = diversity.score(diversity_dict)

    return coherence_score, diversity_score

GPU enabled: True


In [3]:
# Lemmatized documents are cached per dataset: the file is written on the
# first run and loaded on later runs.
# NOTE(review): the file name has no separator between the dataset name and
# "clean_lemmatized_docs"; it is left unchanged so caches written by earlier
# runs are still found.
doc_cache_path = f'./../../data/processed/{chosen_dataset}clean_lemmatized_docs.pkl.gz'
doc_texts = clean_documents_cached(
    doc_info,
    text_col='Document',
    cache_path=doc_cache_path,
)

Cleaning documents...
Lemmatizing documents...


Processing docs:   0%|          | 0/787872 [00:00<?, ?it/s]

Saving preprocessed documents to ./../../data/processed/ukraine_tweets_enclean_lemmatized_docs.pkl.gz ...


In [6]:
# Topic ids to leave out of both the topic list and the reference documents.
# Set to None to keep everything; [-1] leaves out topic -1.
exclude_topics = [-1]  # you can change this dynamically

# Column of topic_info that holds the topic words ('MMR' is the alternative).
representation = 'Representation'

topics_cleaned = clean_topics(
    topic_info,
    represetation_col=representation,
    exclude_topics=exclude_topics,
    min_words=1,
)

# Compute coherence & diversity on the saved documents and the filtered topics.
coherence_score, diversity_score = compute_coherence_diversity(
    doc_texts,
    topics_cleaned,
    coherence_measure='c_v',
    exclude_topics=exclude_topics,
    topics=doc_info['Topic'].tolist(),
)
print("Coherence:", coherence_score)
print("Diversity:", diversity_score)

Processing topics:   0%|          | 0/138 [00:00<?, ?it/s]

787872 documents, 138 topics
404922 documents, 138 topics
Using topk=10 for coherence computation (min topic length=10)
Coherence: 0.6096611121621709
Diversity: 0.5007246376811594


| Dataset | N Docs | Min Clust Size | N Topics | Noise % | RV | C_V | Diversity | WED |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| Cop26 | 105,383 | 400 | 27 | 22.15% | 39.67% | 49.12% | 49.64% | 80.88% |
| Covid | 545,032 | 250 | 163 | 41.74% | 34.75% | 55.13% | 43.78% | 78.43% |
| Ukraine | 787,872 | 500 | 137 | 48.61% | 32.35% | 61.03% | 49.71% | 74.04% |

## Examine Topics
Manual examination of some of the topics

In [None]:
import pandas as pd
import random

# -----------------------------
# INPUTS: adjust to your data
# -----------------------------
# doc_info: DataFrame with at least columns ['Document', 'Topic', 'Representative_document']
# topic_info: DataFrame with at least columns ['Representation'] (list of top words per topic)
# num_topics_sample: number of topics to inspect
# num_docs_per_topic: number of documents to show per topic

num_topics_sample = 10  # number of topics to inspect
num_docs_per_topic = 3  # number of documents printed for each inspected topic

# -----------------------------
# Helper: sample topics
# -----------------------------
def sample_topics(topic_info, num_samples, doc_df=None):
    """Pick topics to inspect, favouring topics with more assigned documents.

    Parameters
    ----------
    topic_info : pd.DataFrame
        One row per topic. When it has a 'Topic' column the candidate ids are
        read from it (topic -1 is skipped); otherwise the row positions
        ``0 .. len(topic_info) - 1`` are used.
    num_samples : int
        Number of topics to return.
    doc_df : pd.DataFrame, optional
        Frame with a 'Topic' column giving the topic of every document.
        Defaults to the notebook-level ``doc_info``.

    Returns
    -------
    list[int]
        Topic ids without duplicates. All candidates are returned when
        ``num_samples`` is at least the number of candidates.
    """
    if doc_df is None:
        doc_df = doc_info  # notebook-level frame

    # Use the real topic ids rather than row positions: with a topic -1 row
    # present, positions 0..len-1 include an id that does not exist.
    if 'Topic' in topic_info.columns:
        candidate_ids = [int(t) for t in topic_info['Topic'] if t != -1]
    else:
        candidate_ids = list(range(len(topic_info)))

    if num_samples >= len(candidate_ids):
        return candidate_ids

    # Topic size = number of documents assigned to the topic.
    sizes = doc_df['Topic'].value_counts().reindex(candidate_ids, fill_value=0).tolist()

    # Weighted draws *without* replacement, so no topic is shown twice.
    pool_ids = list(candidate_ids)
    pool_weights = list(sizes)
    chosen = []
    while len(chosen) < num_samples and sum(pool_weights) > 0:
        pos = random.choices(range(len(pool_ids)), weights=pool_weights, k=1)[0]
        chosen.append(pool_ids.pop(pos))
        pool_weights.pop(pos)

    # Only topics without documents are left: fill up uniformly.
    if len(chosen) < num_samples:
        chosen.extend(random.sample(pool_ids, num_samples - len(chosen)))
    return chosen

# -----------------------------
# Helper: show representative documents
# -----------------------------
def show_topic(topic_idx):
    """Print the top words and a few documents of one topic.

    Reads the notebook-level ``topic_info``, ``doc_info`` and
    ``num_docs_per_topic``.

    Parameters
    ----------
    topic_idx : int
        Topic id, matched against the 'Topic' column of both frames.
    """
    print(f"\n=== Topic {topic_idx} ===")

    # Look the topic up by id, not by row label: when topic_info has a row for
    # topic -1, row label k does not belong to topic k.
    if 'Topic' in topic_info.columns:
        matches = topic_info.loc[topic_info['Topic'] == topic_idx, 'Representation']
        if matches.empty:
            print("Topic not found in topic_info.")
            return
        top_words = matches.iloc[0]
    else:
        top_words = topic_info.loc[topic_idx, 'Representation']
    print("Top words:", top_words)

    # get documents assigned to this topic
    assigned = doc_info.loc[doc_info['Topic'] == topic_idx]
    text_col = 'Document'
    if 'Representative_document' in assigned.columns:
        flag = assigned['Representative_document']
        if flag.dtype == bool:
            # presumably the per-document boolean flag written by BERTopic's
            # get_document_info -- TODO confirm. Prefer flagged documents.
            if flag.any():
                assigned = assigned.loc[flag]
        else:
            # Non-boolean column: treat it as the text to show.
            text_col = 'Representative_document'
    docs = assigned[text_col]

    if docs.empty:
        print("No documents assigned to this topic.")
        return

    print(f"Showing {min(num_docs_per_topic, len(docs))} representative documents:")
    for i, doc in enumerate(docs.head(num_docs_per_topic), start=1):
        text = str(doc)  # guards against NaN / non-string cells
        print(f"Doc {i}: {text[:300]}{'...' if len(text) > 300 else ''}")

# -----------------------------
# Run inspection
# -----------------------------
# Seed the sampler so the same topics are inspected on every run.
random.seed(42)
sampled_topics = sample_topics(topic_info, num_topics_sample)

for idx in sampled_topics:
    show_topic(idx)


## Word-Embedding-based Diversity


In [2]:
from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_distances
import numpy as np

# -------------------------
# 1. Load topics from BERTopic
# -------------------------

# Word list of every row of topic_info, in row order.
# NOTE(review): no rows are filtered here, so a topic -1 row (if present) is
# part of the WED computation -- confirm this is intended.
topic_words = topic_info['Representation'].tolist()

# -------------------------
# 2. Load pre-trained embeddings
# -------------------------
# Example: GoogleNews word2vec (download first from GoogleNews-vectors-negative300.bin.gz)
# The path is relative to the notebook's working directory.
path_to_embeddings = "../../models/GoogleNews-vectors-negative300.bin.gz"
w2v = KeyedVectors.load_word2vec_format(path_to_embeddings, binary=True)

# -------------------------
# 3. Compute WED
# -------------------------
def compute_wed(topics, model):
    """Mean cosine distance between all word pairs that share a topic.

    ``topics`` is an iterable of word lists. ``model`` must support
    ``word in model`` and ``model[word]``; words it does not contain are
    skipped. Pairs are formed only inside a topic, and the distances of all
    topics are pooled before averaging. Returns ``0.0`` when there is no
    pair at all.
    """
    known_words = {word for topic in topics for word in topic if word in model}
    embedding_of = {word: model[word] for word in known_words}

    distances = []
    for topic in topics:
        in_vocab = [word for word in topic if word in embedding_of]
        # every unordered pair (first, second) with first before second
        for pos, first in enumerate(in_vocab):
            for second in in_vocab[pos + 1:]:
                pair = cosine_distances([embedding_of[first]], [embedding_of[second]])
                distances.append(pair[0][0])

    if not distances:
        return 0.0
    return np.mean(distances)

# Score the topic word lists with the loaded word2vec vectors.
wed_score = compute_wed(topic_words, w2v)
print(f"Word Embedding-based Diversity (WED): {wed_score:.4f}")


Word Embedding-based Diversity (WED): 0.7404


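### Old Code

Note: the cells below redefine `clean_text` and `clean_topics` with different signatures. Running them shadows the versions defined earlier in this notebook.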

In [4]:
import pandas as pd
import re
import ast
import spacy
from nltk.corpus import stopwords
from tqdm.notebook import tqdm
from octis.evaluation_metrics.coherence_metrics import Coherence

# =====================================
# Setup
# =====================================
# Request the GPU before loading the pipeline; a pipeline loaded first is not
# moved to the GPU afterwards.
spacy.require_gpu()  # enable GPU (raises if none is available)
print("GPU enabled:", spacy.prefer_gpu())
nlp = spacy.load("en_core_web_sm")  # small model

# English + Spanish NLTK stop words plus a few Twitter-specific tokens.
stop_words = set(stopwords.words('english')).union(set(stopwords.words('spanish'))).union(['rt', 'via', '…'])

# =====================================
# Cleaning functions
# =====================================
def clean_text(text: str) -> str:
    """Return ``text`` without emojis, mentions, links and hashtags.

    The value is converted with ``str()`` first. Whitespace runs are collapsed
    to one space and the result is stripped.
    """
    emoji_class = (
        "["
        "\U0001F600-\U0001F64F"
        "\U0001F300-\U0001F5FF"
        "\U0001F680-\U0001F6FF"
        "\U0001F1E0-\U0001F1FF"
        "\U00002700-\U000027BF"
        "\U0001F900-\U0001F9FF"
        "\U00002600-\U000026FF"
        "]+"
    )
    without_emoji = re.compile(emoji_class, flags=re.UNICODE).sub(r'', str(text))
    without_mentions = re.sub(r'@\w+', '', without_emoji)
    without_links = re.sub(r'http\S+|www\S+|https\S+', '', without_mentions)
    without_hashtags = re.sub(r'#\w+', '', without_links)
    return re.sub(r'\s+', ' ', without_hashtags).strip()

def preprocess_tokens(tokens):
    """Drop stop words and return the spaCy lemmas of the remaining tokens.

    A token is dropped when its lower-cased form is in ``stop_words``. The
    surviving tokens are joined with spaces and parsed with ``nlp``; empty
    lemmas are skipped.
    """
    kept = filter(lambda word: word.lower() not in stop_words, tokens)
    parsed = nlp(" ".join(kept))
    lemmas = []
    for token in parsed:
        if token.lemma_ != '':
            lemmas.append(token.lemma_)
    return lemmas

# =====================================
# Document cleaning with GPU + tqdm
# =====================================
def clean_documents(series: pd.Series, batch_size=3000) -> list:
    """Clean and lemmatize every document of ``series``.

    Each value is passed through ``clean_text``, split on whitespace, re-joined
    and parsed with ``nlp.pipe`` in batches of ``batch_size``; a tqdm bar
    reports progress. Returns one list of non-empty lemmas per document.
    """
    print("Cleaning documents...")
    stripped = list(map(lambda raw: clean_text(str(raw)), series))

    print("Tokenizing documents...")
    token_lists = list(map(str.split, stripped))

    print("Lemmatizing documents...")
    joined = [" ".join(tokens) for tokens in token_lists]
    # tqdm wraps the nlp.pipe generator, one tick per parsed document
    progress = tqdm(nlp.pipe(joined, batch_size=batch_size),
                    total=len(token_lists), desc="Processing docs")
    lemmas_per_doc = [
        [tok.lemma_ for tok in parsed if tok.lemma_ != '']
        for parsed in progress
    ]

    print("Document cleaning completed.")
    return lemmas_per_doc

# =====================================
# Topic cleaning with GPU + tqdm
# =====================================
def clean_topics(series: pd.Series, min_words=3, batch_size=500) -> list:
    """Clean and lemmatize topic word lists, dropping topics that end up too short.

    ``series`` holds one word list per topic (or its string form, which is
    parsed with ``ast.literal_eval``). Topics with fewer than ``min_words``
    lemmas after cleaning are left out. ``batch_size`` is accepted but not
    used by the body.
    """
    print("Cleaning topics...")
    # Make sure every entry is a list, not the string form of one.
    parsed_series = series.apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

    kept_topics = []
    for words in tqdm(parsed_series, total=len(parsed_series), desc="Processing topics"):
        cleaned_words = [clean_text(str(word)) for word in words if word]
        if cleaned_words:
            # One text in, one parsed document out.
            [parsed_doc] = nlp.pipe([" ".join(cleaned_words)], batch_size=1)
            lemmas = [tok.lemma_ for tok in parsed_doc if tok.lemma_ != '']
            if len(lemmas) >= min_words:
                kept_topics.append(lemmas)

    return kept_topics

# =====================================
# Compute coherence with auto topk
# =====================================
def compute_coherence(doc_series, topic_series):
    """Clean documents and topics, then return ``(coherence, diversity)``.

    Documents go through ``clean_documents`` and topics through
    ``clean_topics`` with ``min_words=1``. ``topk`` is the shortest cleaned
    topic length, capped at 10, and is used for both the 'c_v' ``Coherence``
    and ``TopicDiversity``.

    NOTE(review): ``TopicDiversity`` is not imported in this cell; it has to
    be in the kernel namespace already.
    """
    # Clean data
    doc_texts = clean_documents(doc_series)
    topics_cleaned = clean_topics(topic_series, min_words=1)  # min_words=1 to keep all topics

    # topk follows the shortest topic, but never exceeds 10
    shortest = min(map(len, topics_cleaned))
    topk = shortest if shortest < 10 else 10

    print(f"Using topk={topk} for coherence computation (min topic length={shortest})")

    coherence_score = Coherence(texts=doc_texts, measure='c_v', topk=topk).score(
        {"topics": topics_cleaned}
    )
    diversity_score = TopicDiversity(topk=topk).score({"topics": topics_cleaned})
    return coherence_score, diversity_score

GPU enabled: True


In [11]:

# Score all documents and all topic rows; no topic is filtered out here.
topic_coherence_score, topic_diversity_score = compute_coherence(doc_info['Document'], topic_info['Representation'])
print(topic_coherence_score, topic_diversity_score)

Cleaning documents...
Tokenizing documents...
Lemmatizing documents...


Processing docs:   0%|          | 0/105383 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Same scoring, but rows with Topic == -1 are removed from both frames first.
topic_coherence_score, topic_diversity_score = compute_coherence(doc_info.query("Topic != -1")['Document'], topic_info.query("Topic != -1")['Representation'])
print(topic_coherence_score, topic_diversity_score)


Cleaning documents...
Tokenizing documents...
Lemmatizing documents...


Processing docs:   0%|          | 0/372470 [00:00<?, ?it/s]

METRICS NOISELESS

Cop26: 
Coherence = 0.5711331303192272
Diversity = 0.53125

Covid
Coherence = 0.5350817136076244
Diversity = 0.4392

Ukraine
Coherence = 0.6236733698956629
Diversity = 0.4772151898734177

METRICS WITH NOISE

Cop26: 
Coherence = 0.5514339316450143
Diversity = 0.5224489795918368

Covid
Coherence = 0.5435738180981629
Diversity = 0.4357142857142857

Ukraine
Coherence = 0.6041440436052993
Diversity = 0.4742138364779874