In [17]:
import re
import sys
import json
import socket
import logging
import pickle
import pandas as pd
import numpy as np
import httpx
import h5py
import os
import umap
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.patches import Polygon
from matplotlib.colors import Normalize
from matplotlib.colors import ListedColormap
from scipy.interpolate import griddata
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords

# Ensure you have the NLTK stopwords downloaded
import nltk
nltk.download('stopwords')

from tqdm import tqdm
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI

from gensim import corpora
from gensim.models.ldamodel import LdaModel
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from bertopic.representation import TextGeneration
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from bertopic.representation import KeyBERTInspired

from umap import UMAP
from hdbscan import HDBSCAN
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from scipy.spatial import ConvexHull
from datetime import datetime 

import openai
from bertopic.representation import OpenAI

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ru007471\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Functions

In [18]:
## BERTopic modelling
def create_topic_model(docs, embeddings, nr_topics=36, n_gram_range=(1, 3), min_cluster_size=50,
                       random_state=None):
    """
    Fit a BERTopic model on precomputed embeddings and return per-document topics.

    The pipeline is UMAP -> HDBSCAN -> CountVectorizer -> c-TF-IDF, with an
    additional KeyBERT-inspired representation aspect. After fitting, outlier
    documents (topic -1) are reassigned to the closest topic using the
    "c-tf-idf" strategy and the topic representations are recomputed.

    Parameters:
    docs (list of str): The documents to be analyzed.
    embeddings (ndarray): Precomputed embeddings for the documents, one row per
        document (passed straight to ``BERTopic.fit_transform``).
    nr_topics (int): Maximum number of topics to extract.
    n_gram_range (tuple): The range of n-grams to consider for topic tokenization.
    min_cluster_size (int): Minimum size of clusters.
    random_state (int or None): Seed forwarded to UMAP. UMAP is stochastic, so
        pass an integer to get repeatable topics. The default (None) keeps the
        previous, unseeded behaviour.

    Returns:
    pd.DataFrame: One row per document, in the same order as ``docs``, with columns
        - 'cluster': topic id after outlier reduction,
        - 'Probability': value returned by the initial ``fit_transform``; it is
          NOT recomputed for documents that were reassigned from the outlier
          topic,
        - 'label': generated topic label (topic id prefix plus up to 10 words
          joined by '_').
    """

    # Step 1 - Embedding model. The document embeddings are supplied by the caller;
    # presumably this model is still needed by the KeyBERT-inspired representation
    # to embed candidate words - NOTE(review): confirm it matches the model that
    # produced `embeddings`.
    embedding_model = 'sentence-transformers/all-MiniLM-L6-v2'

    # Step 2 - Reduce dimensionality
    umap_model = UMAP(n_neighbors=30,   # Neighboring sample points (increasing means a more global view)
                      n_components=20,  # Reduced dimensions of the embeddings
                      min_dist=0.0,
                      metric='euclidean',
                      random_state=random_state)  # None = unseeded, as before

    # Step 3 - Cluster reduced embeddings
    hdbscan_model = HDBSCAN(min_cluster_size=min_cluster_size,
                            metric='euclidean',
                            cluster_selection_method='eom',
                            prediction_data=True)

    # Step 4 - Tokenize topics
    vectorizer_model = CountVectorizer(stop_words="english",      # Removes stopwords
                                       ngram_range=n_gram_range)  # No. of words in a topic term

    # Step 5 - Create topic representation
    ctfidf_model = ClassTfidfTransformer()

    # Step 6 - (Optional) Additional topic representation computed by a
    # `bertopic.representation` model. The dict key is the aspect name.
    representation_model = {
        "KeyBERET": KeyBERTInspired(
            top_n_words=15,   # The top n words to extract per topic (default 10)
            nr_repr_docs=12,  # The number of representative documents to extract per cluster (default 5)
            nr_samples=300    # The number of candidate documents to extract per cluster (default 500)
        )
    }

    # All steps together
    topic_model = BERTopic(
        embedding_model=embedding_model,            # Step 1 - Embedding model
        umap_model=umap_model,                      # Step 2 - Reduce dimensionality
        hdbscan_model=hdbscan_model,                # Step 3 - Cluster reduced embeddings
        vectorizer_model=vectorizer_model,          # Step 4 - Tokenize topics
        ctfidf_model=ctfidf_model,                  # Step 5 - Extract topic words
        representation_model=representation_model,  # Step 6 - (Optional) extra representations
        nr_topics=nr_topics,                        # Maximum number of topics
        n_gram_range=n_gram_range                   # No. of words per topic term
    )

    # Fit the model on the caller-supplied embeddings
    topics, ini_probs = topic_model.fit_transform(docs, embeddings)

    # Fold the outlier documents (topic -1) back into the regular topics
    new_topics = topic_model.reduce_outliers(docs, topics, strategy="c-tf-idf")

    # Recompute the topic representations for the new assignment. The custom
    # vectorizer and c-TF-IDF models are passed again on purpose: per the
    # BERTopic documentation, `update_topics` otherwise rebuilds the
    # representation with default models, which would bring English stop words
    # back into the labels.
    topic_model.update_topics(docs,
                              topics=new_topics,
                              vectorizer_model=vectorizer_model,
                              ctfidf_model=ctfidf_model)

    # Per-document result. `ini_probs` comes from the initial fit (see docstring).
    df_res = pd.DataFrame({'Topic': new_topics, 'Probability': ini_probs})

    # Pair every label with its real topic id. Labels are generated for the
    # sorted unique topic ids, which still include -1 whenever some outliers
    # could not be reassigned; numbering them 0..n-1 would then shift every
    # label by one. If ids and labels ever disagree in length, DataFrame
    # construction raises instead of silently mislabelling.
    topic_ids = sorted(set(topic_model.topics_))
    topic_labels = topic_model.generate_topic_labels(nr_words=10,
                                                     topic_prefix=True,
                                                     word_length=None,
                                                     separator='_',
                                                     aspect=None)
    df_topic = pd.DataFrame({'Topic': topic_ids, 'Name': topic_labels})

    # Left merge keeps one row per document in the original document order
    df_res = (
        df_res
        .merge(df_topic, how='left', on='Topic')
        .rename(columns={'Name': 'label', 'Topic': 'cluster'})
    )

    return df_res