### Class for evaluating BERTopic models using the OCTIS framework

In [1]:
# %% [code]
import os
import sys
import numpy as np
import pandas as pd
from typing import Dict, List, Any, Tuple
from collections import Counter
from sklearn.metrics import silhouette_score, calinski_harabasz_score
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora import Dictionary
import networkx as nx
from bertopic import BERTopic
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import warnings
import traceback

# Define the LemmaTokenizer so that it is available during unpickling.
class LemmaTokenizer:
    """Callable that tokenizes a document and lemmatizes every token.

    Defined at module level so that pickled objects referring to this class
    can be restored when a saved model is loaded.
    """

    def __init__(self):
        # NOTE(review): the attribute name `wnl` is presumably part of any
        # pickled instance's state, so it is kept unchanged.
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Return the lemmatized tokens of ``doc`` in their original order."""
        lemmas = []
        for token in word_tokenize(doc):
            lemmas.append(self.wnl.lemmatize(token))
        return lemmas

# Download required NLTK data
try:
    # Resources needed by WordNetLemmatizer and word_tokenize.
    # NOTE(review): nltk.download appears to report failures through its
    # return value rather than by raising -- confirm whether the flag below
    # should take that return value into account.
    for _nltk_package in ('wordnet', 'omw-1.4', 'punkt'):
        nltk.download(_nltk_package)
except Exception as e:
    print(f"Warning: NLTK data download failed: {e}")
    NLTK_AVAILABLE = False
else:
    NLTK_AVAILABLE = True

class OCTISEvaluator:
    """
    OCTIS-based evaluator for comprehensive topic model assessment.
    
    This class implements evaluation metrics from the OCTIS framework, combined with
    recommendations from the BERTopic authors. It calculates coherence, diversity,
    quality, clustering, and significance metrics.
    """
    
    def __init__(self, model_path: str):
        self.model_path = model_path
        self.model = None
        self.topics = {}
        self.topic_words = []
        self.documents = []
        self.tokenized_docs = []
        self.metrics = {}
        
    def load_model(self) -> bool:
        """Load the BERTopic model."""
        try:
            print(f"Loading model from {self.model_path}...")
            self.model = BERTopic.load(self.model_path)
            return True
        except Exception as e:
            print(f"Error loading model: {e}")
            return False
            
    def extract_topics(self) -> bool:
        """Extract topics from the model."""
        try:
            if not self.model:
                print("Model not loaded. Call load_model() first.")
                return False
            self.topics = self.model.get_topics()
            # Convert topics for coherence calculation
            self.topic_words = []
            for topic_id, words in self.topics.items():
                if topic_id != -1:  # Skip outlier topic
                    top_words = [word for word, _ in words[:10]]
                    self.topic_words.append(top_words)
            return True
        except Exception as e:
            print(f"Error extracting topics: {e}")
            return False
            
    def load_documents(self, docs_path: str) -> bool:
        """Load and preprocess documents."""
        try:
            if docs_path.endswith('.xlsx'):
                df = pd.read_excel(docs_path)
                display(df.head(2))
                display(df.dtypes)
                self.documents = df["abstract_content_clean_en"].fillna("").tolist()
            else:
                with open(docs_path, 'r', encoding='utf-8') as f:
                    self.documents = f.readlines()
            self._tokenize_documents()
            
            return True
        except Exception as e:
            print(f"Error loading documents: {e}")
            return False
            
    def _tokenize_documents(self):
        """Build ``self.tokenized_docs`` from ``self.documents``.

        Each document is lower-cased, tokenized and lemmatized. Only lemmas
        longer than two characters that contain at least one letter are kept,
        and documents left without tokens are dropped. Does nothing (beyond
        printing a notice) when NLTK is flagged as unavailable; errors during
        tokenization are printed with a traceback.
        """
        if not NLTK_AVAILABLE:
            print("NLTK not available. Install nltk for tokenization.")
            return
        try:
            lemmatizer = WordNetLemmatizer()
            self.tokenized_docs = []
            for raw_doc in self.documents:
                text = raw_doc if isinstance(raw_doc, str) else str(raw_doc)
                kept = []
                for token in word_tokenize(text.lower()):
                    lemma = lemmatizer.lemmatize(token)
                    if len(lemma) > 2 and any(ch.isalpha() for ch in lemma):
                        kept.append(lemma)
                if kept:
                    self.tokenized_docs.append(kept)
        except Exception as e:
            print(f"Error in tokenization: {e}")
            traceback.print_exc()
            
    def calculate_coherence(self) -> Dict[str, float]:
        """
        Calculate coherence metrics: c_v, u_mass, and c_npmi.
        """
        if not self.tokenized_docs or not self.topic_words:
            print("Documents or topics not available.")
            return {}
        try:
            dictionary = Dictionary(self.tokenized_docs)
            coherence_scores = {}
            for measure in ['c_v', 'u_mass', 'c_npmi']:
                try:
                    coherence_model = CoherenceModel(
                        topics=self.topic_words,
                        texts=self.tokenized_docs,
                        dictionary=dictionary,
                        coherence=measure
                    )
                    score = coherence_model.get_coherence()
                    coherence_scores[f"coherence_{measure}"] = score
                except Exception as e:
                    print(f"Error calculating {measure} coherence: {e}")
            return coherence_scores
        except Exception as e:
            print(f"Error in coherence calculation: {e}")
            return {}
            
    def calculate_diversity(self) -> Dict[str, float]:
        """
        Calculate diversity metrics including topic diversity and word overlap.
        """
        try:
            all_words = [word for topic in self.topic_words for word in topic]
            unique_words = set(all_words)
            diversity = len(unique_words) / len(all_words) if all_words else 0
            total_overlap = 0
            max_overlap = 0
            min_overlap = float('inf')
            overlaps = []
            for i in range(len(self.topic_words)):
                for j in range(i+1, len(self.topic_words)):
                    set_i = set(self.topic_words[i])
                    set_j = set(self.topic_words[j])
                    overlap = len(set_i.intersection(set_j))
                    total_overlap += overlap
                    max_overlap = max(max_overlap, overlap)
                    min_overlap = min(min_overlap, overlap)
                    overlaps.append(overlap)
            avg_overlap = total_overlap / len(overlaps) if overlaps else 0
            overlap_std = np.std(overlaps) if overlaps else 0
            return {
                "topic_diversity": diversity,
                "avg_word_overlap": avg_overlap,
                "max_word_overlap": max_overlap,
                "min_word_overlap": min_overlap,
                "overlap_std": overlap_std,
                "unique_words_ratio": len(unique_words) / len(self.topic_words) if self.topic_words else 0
            }
        except Exception as e:
            print(f"Error calculating diversity: {e}")
            return {}
            
    def calculate_topic_quality(self) -> Dict[str, float]:
        """
        Calculate topic quality metrics including average words per topic and word score statistics.
        """
        try:
            words_per_topic = [len(topic) for topic in self.topic_words]
            avg_words = np.mean(words_per_topic)
            std_words = np.std(words_per_topic)
            all_scores = []
            for topic_id, words in self.topics.items():
                if topic_id != -1:
                    scores = [score for _, score in words]
                    all_scores.extend(scores)
            return {
                "avg_words_per_topic": avg_words,
                "std_words_per_topic": std_words,
                "avg_word_score": np.mean(all_scores) if all_scores else 0,
                "min_word_score": min(all_scores) if all_scores else 0,
                "max_word_score": max(all_scores) if all_scores else 0,
                "word_score_std": np.std(all_scores) if all_scores else 0
            }
        except Exception as e:
            print(f"Error calculating topic quality: {e}")
            return {}
            
    def calculate_clustering_metrics(self) -> Dict[str, float]:
        """
        Calculate clustering metrics: silhouette score and Calinski-Harabasz score.
        """
        try:
            if not hasattr(self.model, 'embedding_model') or not self.documents:
                return {}
            embeddings = self.model.embedding_model.encode(self.documents)
            doc_topics = self.model.transform(self.documents)[0]
            try:
                silhouette = silhouette_score(embeddings, doc_topics)
            except:
                silhouette = 0
            try:
                calinski = calinski_harabasz_score(embeddings, doc_topics)
            except:
                calinski = 0
            return {
                "silhouette_score": silhouette,
                "calinski_harabasz_score": calinski
            }
        except Exception as e:
            print(f"Error calculating clustering metrics: {e}")
            return {}
            
    def calculate_topic_significance(self) -> Dict[str, float]:
        """
        Calculate topic significance metrics: entropy and topic size statistics.
        """
        try:
            if not self.model or not hasattr(self.model, 'topic_sizes_'):
                return {}
            topic_sizes = self.model.topic_sizes_
            total_docs = sum(topic_sizes.values())
            probs = np.array(list(topic_sizes.values())) / total_docs
            entropy = -np.sum(probs * np.log2(probs + 1e-10))
            sizes = np.array(list(topic_sizes.values()))
            return {
                "topic_entropy": entropy,
                "avg_topic_size": np.mean(sizes),
                "std_topic_size": np.std(sizes),
                "min_topic_size": np.min(sizes),
                "max_topic_size": np.max(sizes)
            }
        except Exception as e:
            print(f"Error calculating topic significance: {e}")
            return {}
            
    def evaluate(self) -> Dict[str, Any]:
        """
        Run comprehensive evaluation following OCTIS framework and return metrics.
        """
        print("\nStarting OCTIS-based evaluation...")
        if not self.model and not self.load_model():
            return {}
        if not self.topics and not self.extract_topics():
            return {}
        metrics = {}
        print("\nCalculating coherence metrics...")
        coherence = self.calculate_coherence()
        if coherence:
            metrics.update(coherence)
        print("Calculating diversity metrics...")
        diversity = self.calculate_diversity()
        if diversity:
            metrics.update(diversity)
        print("Calculating quality metrics...")
        quality = self.calculate_topic_quality()
        if quality:
            metrics.update(quality)
        print("Calculating clustering metrics...")
        clustering = self.calculate_clustering_metrics()
        if clustering:
            metrics.update(clustering)
        print("Calculating significance metrics...")
        significance = self.calculate_topic_significance()
        if significance:
            metrics.update(significance)
        self.metrics = metrics
        return metrics
        
    def display_results(self):
        """
        Display evaluation results as formatted DataFrames in the notebook.
        """
        try:
            # Build DataFrame for Table 1: Top 15 Topics
            topics_data = []
            if hasattr(self.model, 'topic_sizes_'):
                top_topics = sorted(self.model.topic_sizes_.items(), key=lambda x: x[1], reverse=True)[:15]
                total_docs = sum(self.model.topic_sizes_.values())
                rep_docs = {}
                if hasattr(self.model, 'representative_docs_'):
                    rep_docs = self.model.representative_docs_
                for topic_id, size in top_topics:
                    if topic_id != -1:
                        topic_words = [f"{word} ({score:.3f})" for word, score in self.topics[topic_id][:10]]
                        topic_label = self.model.topic_labels_[topic_id] if hasattr(self.model, 'topic_labels_') else f"Topic {topic_id}"
                        doc_percent = (size / total_docs) * 100
                        rep_doc = rep_docs.get(topic_id, "-") if rep_docs else "-"
                        topics_data.append({
                            "Topic ID": topic_id,
                            "Topic Label": topic_label,
                            "Key Terms": ", ".join(topic_words),
                            "Document %": f"{doc_percent:.1f}%",
                            "Representative Article": rep_doc
                        })
            topics_df = pd.DataFrame(topics_data)
            
            # Build DataFrame for Table 3: Evaluation Metrics
            metrics_data = []
            metrics_info = {
                'topic_diversity': {
                    'name': 'Topic Diversity',
                    'benchmark': '>0.7 considered good',
                    'interpret': lambda x: 'Excellent - minimal vocabulary overlap' if x > 0.9 
                            else 'Good - distinct vocabularies' if x > 0.7 
                            else 'Fair'
                },
                'avg_word_overlap': {
                    'name': 'Average Word Overlap',
                    'benchmark': '<0.05 considered good',
                    'interpret': lambda x: 'Very low - semantically distinct' if x < 0.02
                            else 'Good - clear separation' if x < 0.05
                            else 'Fair'
                },
                'coherence_c_v': {
                    'name': 'C_v Coherence',
                    'benchmark': '>0.4 considered good',
                    'interpret': lambda x: 'Excellent - highly coherent' if x > 0.5
                            else 'Good - semantically related' if x > 0.4
                            else 'Fair'
                },
                'coherence_u_mass': {
                    'name': 'U Measure',
                    'benchmark': '<-8.0 acceptable',
                    'interpret': lambda x: 'Good - strong specificity' if x < -10
                            else 'Acceptable' if x < -8
                            else 'Fair'
                },
                'silhouette_score': {
                    'name': 'Silhouette Coefficient',
                    'benchmark': '>0 considered good',
                    'interpret': lambda x: 'Good - clear separation' if x > 0.2
                            else 'Fair' if x > 0
                            else 'Poor'
                }
            }
            for metric_key, info in metrics_info.items():
                if metric_key in self.metrics:
                    value = self.metrics[metric_key]
                    interpretation = info['interpret'](value)
                    metrics_data.append({
                        "Metric": info['name'],
                        "Value": value,
                        "Interpretation": interpretation,
                        "Benchmark": info['benchmark']
                    })
            metrics_df = pd.DataFrame(metrics_data)
            
            # Optional: Display some introductory text with corpus statistics
            num_topics = len([t for t in self.topics.keys() if t != -1])
            total_docs = sum(self.model.topic_sizes_.values()) if hasattr(self.model, 'topic_sizes_') else 0
            vocab_size = len(set([word for topic in self.topics.values() for word, _ in topic]))
            avg_words_per_topic = np.mean([len(words) for words in self.topics.values()])
            intro_text = (f"The BERTopic model successfully identified {num_topics} major thematic areas within the corpus.\n"
                        f"Corpus Statistics: {total_docs:,} articles processed, with an effective vocabulary of {vocab_size:,} unique terms, "
                        f"and an average of {avg_words_per_topic:.1f} key terms per topic.\n")
            print(intro_text)
            
            # Display the DataFrames in Jupyter Notebook
            print("Table 1: Top 15 Topics Identified in Corpus")
            display(topics_df)
            print("\nTable 3: BERTopic Model Evaluation Metrics")
            display(metrics_df)
            
        except Exception as e:
            print(f"Error displaying results: {e}")
            traceback.print_exc()
    
    # The following helper methods are stubs for additional analysis sections.
    def _write_topic_analysis_section(self, output_func):
        output_func("## 5.1 In-Depth Analysis of Key Topics\n")
        significant_topics = self._get_significant_topics()
        for topic_id in significant_topics:
            if topic_id in self.topics:
                topic_words = self.topics[topic_id]
                topic_label = self.model.topic_labels_[topic_id] if hasattr(self.model, 'topic_labels_') else f"Topic {topic_id}"
                topic_size = self.model.topic_sizes_[topic_id] if hasattr(self.model, 'topic_sizes_') else 0
                total_docs = sum(self.model.topic_sizes_.values()) if hasattr(self.model, 'topic_sizes_') else 1
                topic_percentage = (topic_size / total_docs) * 100
                output_func(f"### {topic_label}\n")
                output_func(f"This topic covers {topic_percentage:.1f}% of the corpus with key terms:\n")
                for word, score in topic_words[:10]:
                    output_func(f"- **{word}** ({score:.3f})\n")
    
    def _write_temporal_analysis_section(self, output_func):
        output_func("## 5.2 Temporal Analysis\n")
        output_func("Temporal analysis not implemented.\n")
                
    def _write_lifecycle_analysis_section(self, output_func):
        output_func("## 5.3 Topic Lifecycle Analysis\n")
        output_func("Lifecycle analysis not implemented.\n")
                
    def _write_trending_topics_section(self, output_func):
        output_func("## 5.4 Trending and Declining Topics\n")
        output_func("Trending topics analysis not implemented.\n")
            
    def _write_cooccurrence_section(self, output_func):
        output_func("## 5.5 Topic Co-occurrence Patterns\n")
        output_func("Co-occurrence analysis not implemented.\n")
                
    def _write_similarity_section(self, output_func):
        output_func("## 5.6 Topic Similarity Mapping\n")
        output_func("Similarity mapping not implemented.\n")
                
    def _write_hierarchy_section(self, output_func):
        output_func("## 5.7 Topic Hierarchy\n")
        output_func("Hierarchy analysis not implemented.\n")
                
    def _get_significant_topics(self) -> List[int]:
        if not hasattr(self.model, 'topic_sizes_'):
            return []
        topic_scores = {}
        for topic_id, size in self.model.topic_sizes_.items():
            if topic_id != -1:
                coherence = self.metrics.get(f'topic_coherence_{topic_id}', 0)
                topic_scores[topic_id] = (size * coherence) if coherence > 0 else size
        return sorted(topic_scores.keys(), key=lambda x: topic_scores[x], reverse=True)[:5]
    
    def _analyze_temporal_patterns(self, evolution: Dict) -> Dict[str, List[Tuple[int, str]]]:
        return {}
        
    def _analyze_topic_lifecycles(self) -> Dict[str, List[int]]:
        return {}
        
    def _analyze_topic_trends(self) -> Tuple[List[Tuple[int, float]], List[Tuple[int, float]]]:
        return [], []
        
    def _analyze_topic_clusters(self) -> Dict[str, List[int]]:
        return {}
        
    def _analyze_topic_similarities(self) -> Dict[str, List[Tuple[int, int, float]]]:
        return {}
        
    def _analyze_topic_hierarchy(self) -> Dict[str, List[int]]:
        return {}


  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Chris\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Chris\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Chris\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Instance

In [2]:
# %% [code]
# Update these paths with the correct locations for your BERTopic model and document file.
model_path = "model/bertopic_model"  # e.g., "./model/bertopic_model"
docs_path = "data/archives_articles_contents.xlsx"  # a plain-text file with one document per line also works

# Build the evaluator around the saved model.
evaluator = OCTISEvaluator(model_path=model_path)

if not evaluator.load_model():
    print("Failed to load model.")
else:
    # Pull the topic representations out of the loaded model.
    evaluator.extract_topics()

    # The document file must exist; for Excel input the text column must be present.
    if not evaluator.load_documents(docs_path):
        print("Failed to load documents.")
    else:
        # Compute every metric group and list the raw values.
        metrics = evaluator.evaluate()
        print("\nEvaluation Metrics:")
        for key, value in metrics.items():
            print(f"{key}: {value}")

        # Render the formatted report in the notebook.
        evaluator.display_results()


Loading model from model/bertopic_model...


Unnamed: 0,article_url,article_title,article_title_clean,article_publication_date,author,author_clean,keyword,abstract,abstract_clean,abstract_clean_en,lang,archive_url,archive_title,archive_title_clean,archive_publication_date,content_url,content,content_clean,abstract_content_clean_en
0,https://firstmonday.org/ojs/index.php/fm/artic...,The Lives and Death of Moore's Law,the lives and death of moore's law,2002-11-04,Ilkka Tuomi,Ilkka Tuomi,,Moore's Law has been an important benchmark f...,moore's law has been an important benchmark f...,moore's law has been an important benchmark f...,en,https://firstmonday.org/ojs/index.php/fm/issue...,,,2002-11-04,https://firstmonday.org/ojs/index.php/fm/artic...,The lives and death of Moore's Law\nMoore’s La...,the lives and death of moore's law moore s law...,moore's law has been an important benchmark f...
1,https://firstmonday.org/ojs/index.php/fm/artic...,Terms of public service: Framing mobile privac...,terms of public service: framing mobile privac...,2019-11-01,Pawel Popiel,Pawel Popiel,"mobile privacy, framing, privacy coverage, pri...",Engaging normative theories of the press and r...,engaging normative theories of the press and r...,engaging normative theories of the press and r...,en,https://firstmonday.org/ojs/index.php/fm/issue...,,,2019-11-01,https://firstmonday.org/ojs/index.php/fm/artic...,Terms of public service: Framing mobile privac...,terms of public service: framing mobile privac...,engaging normative theories of the press and r...


article_url                          object
article_title                        object
article_title_clean                  object
article_publication_date     datetime64[ns]
author                               object
author_clean                         object
keyword                              object
abstract                             object
abstract_clean                       object
abstract_clean_en                    object
lang                                 object
archive_url                          object
archive_title                        object
archive_title_clean                 float64
archive_publication_date     datetime64[ns]
content_url                          object
content                              object
content_clean                        object
abstract_content_clean_en            object
dtype: object


Starting OCTIS-based evaluation...

Calculating coherence metrics...


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Calculating diversity metrics...
Calculating quality metrics...
Calculating clustering metrics...
Error calculating clustering metrics: 'SentenceTransformerBackend' object has no attribute 'encode'
Calculating significance metrics...

Evaluation Metrics:
coherence_c_v: 0.525482328992997
coherence_u_mass: nan
coherence_c_npmi: nan
topic_diversity: 0.9514285714285714
avg_word_overlap: 0.031932773109243695
max_word_overlap: 3
min_word_overlap: 0
overlap_std: 0.22601136771783853
unique_words_ratio: 9.514285714285714
avg_words_per_topic: 10.0
std_words_per_topic: 0.0
avg_word_score: 0.218897328192405
min_word_score: 0.1477035672955659
max_word_score: 0.5187392100088899
word_score_std: 0.051049585521936634
topic_entropy: 5.010404555984249
avg_topic_size: 68.94285714285714
std_topic_size: 27.994839174536494
min_topic_size: 23
max_topic_size: 125
The BERTopic model successfully identified 35 major thematic areas within the corpus.
Corpus Statistics: 2,413 articles processed, with an effective 

Unnamed: 0,Topic ID,Topic Label,Key Terms,Document %,Representative Article
0,6,6_education_learning_distance education_student,"education (0.240), learning (0.220), distance ...",5.2%,[ globalization represents a significant threa...
1,15,15_twitter_tweet_political_social medium,"twitter (0.242), tweet (0.178), political (0.1...",5.1%,[the social live streaming service twitch was ...
2,33,33_museum_archive_preservation_digital collection,"museum (0.297), archive (0.228), preservation ...",5.1%,[ syracuse university library's belfer audio l...
3,7,7_queer_shame_hiv_woman,"queer (0.256), shame (0.233), hiv (0.233), wom...",4.4%,[the internet and hiv biomedical technologies ...
4,17,17_censorship_russia_authoritarian_internet,"censorship (0.211), russia (0.200), authoritar...",4.4%,[ in october the bbc aired a short series of r...
5,16,16_journal_publishing_open access_scholarly pu...,"journal (0.301), publishing (0.295), open acce...",4.4%,"[ historically, agricultural research and educ..."
6,2,2_bot_qanon_troll_election,"bot (0.256), qanon (0.232), troll (0.212), ele...",3.9%,[an internet troll is a person who deliberatel...
7,32,32_economy_sharing economy_gift_gift economy,"economy (0.225), sharing economy (0.225), gift...",3.8%,[ web . has been a dominant concept in recent ...
8,30,30_political participation_political_civic_par...,"political participation (0.215), political (0....",3.8%,[the late case of the facebook content moderat...
9,0,0_open source_floss_linux_source software,"open source (0.355), floss (0.313), linux (0.2...",3.6%,[ starting with eric raymond's groundbreaking ...



Table 3: BERTopic Model Evaluation Metrics


Unnamed: 0,Metric,Value,Interpretation,Benchmark
0,Topic Diversity,0.951429,Excellent - minimal vocabulary overlap,>0.7 considered good
1,Average Word Overlap,0.031933,Good - clear separation,<0.05 considered good
2,C_v Coherence,0.525482,Excellent - highly coherent,>0.4 considered good
3,U Measure,,Fair,<-8.0 acceptable
