### Class  for evaluating BERTopic models using OCTIS framework

In [1]:
# %% [code]
import os
import sys
import numpy as np
import pandas as pd
from typing import Dict, List, Any, Tuple
from collections import Counter
from sklearn.metrics import silhouette_score, calinski_harabasz_score
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora import Dictionary
import networkx as nx
from bertopic import BERTopic
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import warnings
import traceback

# Define the LemmaTokenizer so that it is available during unpickling.
class LemmaTokenizer:
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        # Tokenize and lemmatize the document.
        return [self.wnl.lemmatize(token) for token in word_tokenize(doc)]

# Download required NLTK data
try:
    nltk.download('wordnet')
    nltk.download('omw-1.4')
    nltk.download('punkt')
    NLTK_AVAILABLE = True

except Exception as e:
    print(f"Warning: NLTK data download failed: {e}")
    NLTK_AVAILABLE = False

class OCTISEvaluator:
    """
    OCTIS-based evaluator for comprehensive topic model assessment.
    
    This class implements evaluation metrics from the OCTIS framework, combined with
    recommendations from the BERTopic authors. It calculates coherence, diversity,
    quality, clustering, and significance metrics.
    """
    
    def __init__(self, model_path: str):
        self.model_path = model_path
        self.model = None
        self.topics = {}
        self.topic_words = []
        self.documents = []
        self.tokenized_docs = []
        self.metrics = {}
        
    def load_model(self) -> bool:
        """Load the BERTopic model."""
        try:
            print(f"Loading model from {self.model_path}...")
            self.model = BERTopic.load(self.model_path)
            return True
        except Exception as e:
            print(f"Error loading model: {e}")
            return False
            
    def extract_topics(self) -> bool:
        """Extract topics from the model."""
        try:
            if not self.model:
                print("Model not loaded. Call load_model() first.")
                return False
            self.topics = self.model.get_topics()
            # Convert topics for coherence calculation
            self.topic_words = []
            for topic_id, words in self.topics.items():
                if topic_id != -1:  # Skip outlier topic
                    top_words = [word for word, _ in words[:10]]
                    self.topic_words.append(top_words)
            return True
        except Exception as e:
            print(f"Error extracting topics: {e}")
            return False
            
    def load_documents(self, docs_path: str) -> bool:
        """Load and preprocess documents."""
        try:
            if docs_path.endswith('.xlsx'):
                df = pd.read_excel(docs_path)
                display(df.head(2))
                display(df.dtypes)
                self.documents = df["abstract_content_clean_en"].fillna("").tolist()
            else:
                with open(docs_path, 'r', encoding='utf-8') as f:
                    self.documents = f.readlines()
            self._tokenize_documents()
            
            return True
        except Exception as e:
            print(f"Error loading documents: {e}")
            return False
            
    def _tokenize_documents(self):
        """Tokenize documents for coherence calculation."""
        if not NLTK_AVAILABLE:
            print("NLTK not available. Install nltk for tokenization.")
            return
        try:
            tokenizer = WordNetLemmatizer()
            self.tokenized_docs = []
            for doc in self.documents:
                if not isinstance(doc, str):
                    doc = str(doc)
                tokens = word_tokenize(doc.lower())
                tokens = [tokenizer.lemmatize(token) for token in tokens]
                tokens = [token for token in tokens if len(token) > 2 and any(c.isalpha() for c in token)]
                if tokens:
                    self.tokenized_docs.append(tokens)
        except Exception as e:
            print(f"Error in tokenization: {e}")
            traceback.print_exc()

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Chris\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Chris\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Chris\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Instance

In [2]:
model_path = "model/bertopic_model"
docs_path = "data/archives_articles_contents.xlsx"

In [3]:
# STEP 1: Ingest and clean data
articles_contents_df = pd.read_excel(docs_path)

In [4]:
# STEP 2: Load model
evaluator = OCTISEvaluator(model_path=model_path)
if evaluator.load_model():

    # create the topics using pre-saved model 
    topic_model = evaluator.model
    abstracts_and_contents = articles_contents_df['abstract_content_clean_en'].tolist()
    topics, probabilities = topic_model.fit_transform(abstracts_and_contents)

else:
    print("Failed to load model.")

Loading model from model/bertopic_model...


Batches: 100%|██████████| 76/76 [06:28<00:00,  5.11s/it]
2025-03-24 01:07:39,833 - BERTopic - Embedding - Completed ✓
2025-03-24 01:07:39,843 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-03-24 01:08:04,941 - BERTopic - Dimensionality - Completed ✓
2025-03-24 01:08:04,945 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-03-24 01:08:05,493 - BERTopic - Cluster - Completed ✓
2025-03-24 01:08:05,495 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-03-24 01:08:20,364 - BERTopic - Representation - Completed ✓
2025-03-24 01:08:20,374 - BERTopic - Topic reduction - Reducing number of topics
2025-03-24 01:08:20,376 - BERTopic - Topic reduction - Number of topics (35) is equal or higher than the clustered topics(35).
2025-03-24 01:08:20,377 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-03-24 01:08:30,688 - BERTopic - Representation - Completed ✓


In [None]:
# STEP 3: Evaluate the model -> Topic diversity
from octis.evaluation_metrics.diversity_metrics import TopicDiversity

topics = topic_model.get_topics()
print(f"Topics: {topics}")
# print(f"Type of topics: {type(topics)}")

# Convert the dictionary from BERTopic to a list of lists
topics_list = [[word for word, _ in words] for _, words in topic_model.get_topics().items()]
print(f"Topics_list: {type(topics_list)}")
# print(f"Type of topics_list: {type(topics_list)}")

model_output = {"topics": topics_list}
# print(f"Model_output: {model_output}")
# print(f"Type of model_output: type({model_output})")

# Initialize the metric
metric = TopicDiversity(topk=10) # topk - the number of words to include in a topic

# Compute score
try:
    topic_score = metric.score(model_output)
    print(f"Topic diversity score: {topic_score}")
except ValueError as e:
    print(f"Error occurred: {e}")


Topic diversity score: 0.9514285714285714


In [None]:
# STEP 4: Evaluate the model -> Topic coherence
from octis.evaluation_metrics.coherence_metrics import Coherence

topics = topic_model.get_topics()
print(f"Topics: {topics}")
# print(f"Type of topics: {type(topics)}")

# Convert the dictionary from BERTopic to a list of lists
topics_list = [[word for word, _ in words] for _, words in topic_model.get_topics().items()]
# print(f"Topics_list: {type(topics_list)}")
# print(f"Type of topics_list: {type(topics_list)}")

model_output = {"topics": topics_list}
# print(f"Model_output: {model_output}")
# print(f"Type of model_output: type({model_output})")

# Initialize the metric
metric = Coherence(topk=10, texts=topics_list, measure="c_v")

# Compute score
try:
    topic_score = metric.score(model_output)
    print(f"Topic coherence score: {topic_score}")
except ValueError as e:
    print(f"Error occurred: {e}")

Topic coherence score: 0.9988282741686131


In [7]:
# SAMPLE

# from octis.evaluation_metrics.coherence_metrics import Coherence

# # Example tokenized documents (texts used to calculate coherence)
# texts = [
#     ["open", "source", "software", "linux", "development"],
#     ["digital", "divide", "inequality", "skill", "gap"],
#     ["bot", "disinformation", "election", "tweet", "propaganda"],
#     ["library", "public", "book", "librarian", "community"],
#     ["fan", "movie", "fiction", "photo", "art"]
# ]

# # Example topics (list of lists containing topic terms)
# topics = [
#     ["open", "source", "software", "linux", "development"],
#     ["digital", "divide", "inequality", "skill", "gap"],
#     ["bot", "disinformation", "election", "tweet", "propaganda"],
#     ["library", "public", "book", "librarian", "community"],
#     ["fan", "movie", "fiction", "photo", "art"]
# ]

# # Initialize the Coherence metric
# metric = Coherence(topk=5, texts=None, measure="c_npmi")

# # Compute coherence score
# coherence_score = metric.score({"topics": topics})

# # Display the coherence score
# print(f"Coherence Score: {coherence_score}")