<a href="https://colab.research.google.com/github/banned-books/project_banned_books/blob/main/unsupervised_topic_modeling/Evaluation_Functions_BERTopic_Modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Mount GDrive

In [1]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install git+https://github.com/MaartenGr/BERTopic.git

## Import Libraries

In [None]:
# Import data cleaning & manipulation libraries
import pandas as pd
import numpy as np
from tqdm import tqdm
import re
from itertools import combinations

# Import libraries for NLP work
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from bertopic.representation import MaximalMarginalRelevance, PartOfSpeech
from scipy import linalg
import gensim
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel
import nltk
nltk.download('punkt')
nltk.download('stopwords')

%matplotlib inline
np.set_printoptions(suppress=True)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# BERTopic Modeling | Custom Evaluation Functions


### Calculate the coherence score evaluation method

We delved into the BERTopic docs and issues channel to find this Gensim CoherenceModel solution proposed by the author of BERTopic: https://github.com/MaartenGr/BERTopic/issues/90





In [None]:
def calculate_coherence(docs, topics, topic_model=None, coherence='c_v'):
    """
    Calculate the topic coherence of a fitted BERTopic model with Gensim.

    Parameters
    ----------
    docs: the documents, one string per document
    topics: the topic id assigned to each document (same length as docs)
    topic_model: fitted BERTopic model; when omitted, the notebook-level
        variable named `topic_model` is used (the original behaviour)
    coherence: Gensim coherence measure, 'c_v' by default; 'c_uci',
        'c_npmi' or 'u_mass' may also be used

    Returns
    -------
    coherence: the score returned by Gensim's CoherenceModel.get_coherence()

    Raises
    ------
    NameError: if no model is passed and no global `topic_model` exists
    """
    if topic_model is None:
        # Backward compatibility: earlier versions of this function read the
        # model from a notebook global instead of taking it as an argument.
        try:
            topic_model = globals()['topic_model']
        except KeyError:
            raise NameError("name 'topic_model' is not defined") from None

    # Concatenate all documents of a topic into one long document per topic
    documents = pd.DataFrame({"Document": docs, "Topic": topics})
    documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
    cleaned_docs = topic_model._preprocess_text(documents_per_topic.Document.values)

    # Tokenize with the same analyzer BERTopic's vectorizer uses
    analyzer = topic_model.vectorizer_model.build_analyzer()

    # Extract features for Topic Coherence evaluation
    tokens = [analyzer(doc) for doc in cleaned_docs]
    dictionary = corpora.Dictionary(tokens)
    corpus = [dictionary.doc2bow(token) for token in tokens]

    # Use the topic ids actually present in `topics`, skipping the outlier
    # topic (-1), rather than assuming ids are exactly 0..n-1.
    topic_ids = sorted({int(topic) for topic in topics} - {-1})

    # Keep only topic words known to the dictionary; drop topics left empty
    topic_words = []
    for topic in topic_ids:
        words = [word for word, _ in topic_model.get_topic(topic)
                 if word in dictionary.token2id]
        if words:
            topic_words.append(words)

    # Evaluate Coherence
    coherence_model = CoherenceModel(topics=topic_words,
                                     texts=tokens,
                                     corpus=corpus,
                                     dictionary=dictionary,
                                     coherence=coherence)

    return coherence_model.get_coherence()

### Calculate the perplexity score evaluation method

This took some time to figure out. We calculate the `log_perplexity` from the `probs` variable and then exponentiate it to obtain the perplexity score.

In [None]:
def calculate_perplexity(probs=None):
    """
    Calculate the perplexity of a BERTopic model.

    Parameters
    ----------
    probs: array-like of topic probabilities. A 2-D input is treated as one
        row per document and summed along the last axis; a 1-D input is
        treated as one probability per document. When omitted, the
        notebook-level variable named `probs` is used (the original
        behaviour).

    Returns
    -------
    perplexity: exp(log_perplexity)
    log_perplexity: negative mean of the log per-document probabilities

    Raises
    ------
    NameError: if no probabilities are passed and no global `probs` exists
    ValueError: if the resolved probabilities are None
    """
    if probs is None:
        # Backward compatibility: earlier versions of this function read the
        # probabilities from a notebook global instead of taking an argument.
        try:
            probs = globals()['probs']
        except KeyError:
            raise NameError("name 'probs' is not defined") from None
        if probs is None:
            raise ValueError('probs is None; no probabilities to evaluate')

    probs = np.asarray(probs, dtype=float)

    # Sum per document. Summing the whole matrix at once would collapse it to
    # a single scalar, making the mean below a no-op and the score dependent
    # on the number of documents.
    doc_probs = probs.sum(axis=-1) if probs.ndim > 1 else probs

    log_perplexity = -1 * np.mean(np.log(doc_probs))
    perplexity = np.exp(log_perplexity)

    return perplexity, log_perplexity

### Calculate the proportion of unique words evaluation method

We calculate the proportion of unique words in a series of topics (to aid with understanding topic diversity).

This code was pieced together through many Stack Overflow posts around building topic model evaluations from scratch.

In [None]:
def proportion_unique_words(topic_model, topk):
    """
    Calculate the proportion of unique words across topics.

    Parameters
    ----------
    topic_model: fitted BERTopic model
    topk: top k words on which the topic diversity will be computed

    Returns
    -------
    puw: number of distinct words among the top k words of every topic,
        divided by (topk * number of topics)

    Raises
    ------
    ValueError: if topk is not positive, if the model has no topics other
        than the outlier topic (-1), or if any topic has fewer than topk words
    """
    if topk <= 0:
        raise ValueError('topk must be a positive integer')

    # Skip the outlier topic (-1) explicitly instead of assuming it exists;
    # subtracting one from the topic count would drop a real topic whenever
    # the model has no outliers.
    topics = [[word for word, _ in word_scores]
              for topic_id, word_scores in topic_model.get_topics().items()
              if topic_id != -1]

    if not topics:
        raise ValueError('The topic model has no topics besides the outlier topic')

    # Every topic must supply topk words, otherwise the denominator below
    # overstates the number of words considered.
    if any(topk > len(words) for words in topics):
        raise ValueError('Words in these topics are less than ' + str(topk))

    unique_words = set()
    for words in topics:
        unique_words.update(words[:topk])

    return len(unique_words) / (topk * len(topics))


### Calculate the average pairwise Jaccard distance between topics (to aid with understanding topic diversity).

In [None]:
def pairwise_jaccard_diversity(topic_model, topk):
    '''
    Calculate the average pairwise Jaccard distance between the topics

    Parameters
    ----------
    topic_model: fitted BERTopic model
    topk: top k words on which the topic diversity will be computed

    Returns
    -------
    pjd: average pairwise jaccard distance

    Raises
    ------
    ValueError: if topk is not positive, or if the model has fewer than two
        topics other than the outlier topic (-1)
    '''
    if topk <= 0:
        raise ValueError('topk must be a positive integer')

    # Restrict every topic to its top k words, and skip the outlier topic
    # (-1) explicitly instead of assuming it is always present.
    topic_word_sets = [{word for word, _ in word_scores[:topk]}
                       for topic_id, word_scores in topic_model.get_topics().items()
                       if topic_id != -1]

    if len(topic_word_sets) < 2:
        raise ValueError('At least two topics are required to compute a pairwise distance')

    # Jaccard distance = 1 - |intersection| / |union| for each pair of topics
    distances = [1 - len(first & second) / len(first | second)
                 for first, second in combinations(topic_word_sets, 2)]

    return sum(distances) / len(distances)