<a href="https://colab.research.google.com/github/amandeep25/Topic_modelling/blob/main/Octis_metrics_LDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# OCTIS with LDA

In [None]:
!pip install octis

In [2]:
from octis.models.LDA import LDA
from octis.dataset.dataset import Dataset
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
from octis.evaluation_metrics.coherence_metrics import Coherence

In [3]:
# Define dataset
dataset = Dataset()
dataset.fetch_dataset("20NewsGroup")

In [4]:
# Create Model
model = LDA(num_topics=20, alpha=0.1)

In [5]:
# Train the model using default partitioning choice
output = model.train_model(dataset)

print(*list(output.keys()), sep="\n") # Print the output identifiers



topic-word-matrix
topics
topic-document-matrix
test-topic-document-matrix


In [6]:
for t in output['topics'][:5]:
  print(" ".join(t))

key chip encryption work keyboard algorithm public send system phone
people time question make give sin answer man point word
armenian turkish russian genocide people government population greek war village
file program image window include version application software widget list
post problem good mail time make list read thing people


In [7]:
# Initialize metric
cv_ = Coherence(texts=dataset.get_corpus(), topk=10, measure='c_v')

In [8]:
# Initialize metric
npmi = Coherence(texts=dataset.get_corpus(), topk=10, measure='c_npmi')

In [9]:
# Initialize metric
topic_diversity = TopicDiversity(topk=10)

In [11]:
# Retrieve metrics score
topic_diversity_score = topic_diversity.score(output)
print("Topic diversity: "+str(topic_diversity_score))

cv = cv_.score(output)
print("Coherence: "+str(cv))

Topic diversity: 0.72
Coherence: 0.5515237487783734


# OCTIS with Bertopic

In [None]:
!pip install bertopic
!pip install octis

In [2]:
from bertopic import BERTopic
from octis.dataset.dataset import Dataset
from octis.models.model import AbstractModel
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
from octis.evaluation_metrics.coherence_metrics import Coherence

In [3]:
import pandas as pd
from sklearn.datasets import fetch_20newsgroups

In [4]:
import nltk
import gensim
import gensim.corpora as corpora
import pandas as pd
import numpy as np
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from gensim.models.coherencemodel import CoherenceModel
from sklearn.datasets import fetch_20newsgroups

In [5]:
twenty_docs_everything = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
twenty_docs = twenty_docs_everything['data']
import re
newdocs = [re.sub(' +',' ', doc.strip().replace('\n', ' ').replace('\r', ' ').replace('\t', ' ').replace('"', '\'').replace('\x0c', '').replace('\x1c', '')) for doc in twenty_docs]


In [6]:
# write the two files needed to create an OCTIS dataset
with open("corpus.tsv", "w") as f :
    f.write("\n".join(map(str, newdocs)))
f.close()
words = []
for line in newdocs :
  words.extend(line.split())

with open("vocabulary.txt", "w") as f :
    f.write("\n".join(map(str, words)))
f.close()

In [7]:
class BERTopicModelImpl(AbstractModel) :

  def __init__(self, min_cluster_size=None,
               min_sample_size=None,
               embeddings=None,
               nr_topics=None) :

    super().__init__()
    self.hyperparameters = dict()
    self.hyperparameters['min_cluster_size'] = min_cluster_size
    self.hyperparameters['min_sample_size'] = min_sample_size
    self.hyperparameters['embeddings'] = embeddings
    self.BERTopic_model = None
    self.BERTopic_topics = None


    vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english", min_df=10)

    self.init_params = {'vectorizer_model' : vectorizer_model}

    if self.hyperparameters['min_cluster_size'] is not None :
      hdbscan_model = HDBSCAN(metric='euclidean',
                              cluster_selection_method='eom',
                              prediction_data=False,
                              min_cluster_size=self.hyperparameters['min_cluster_size'],
                              min_samples=self.hyperparameters['min_sample_size'])
      self.init_params['hdbscan_model'] = hdbscan_model

    if nr_topics is not None :
      self.init_params['nr_topics'] = nr_topics

    self.BERTopic_model = BERTopic(**self.init_params)

  def train_model(self, dataset):

    bertdata = [" ".join(words) for words in dataset.get_corpus()]
    self.BERTopic_topics, _ = self.BERTopic_model.fit_transform(bertdata,
                                                           embeddings=self.hyperparameters['embeddings'])

    bertopic_topics = [
        [topicwords[0] for topicwords in self.BERTopic_model.get_topic(i)[:10]]
          for i in range(len(set(self.BERTopic_topics)) - 1)]

    result = dict()
    result['topics'] = bertopic_topics
    return result

In [9]:
embedding_model = SentenceTransformer("multi-qa-mpnet-base-dot-v1")

.gitattributes:   0%|          | 0.00/737 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/8.66k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/25.5k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.9k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [19]:
from octis.dataset.dataset import Dataset
dataset = Dataset()
dataset.fetch_dataset("20NewsGroup")

In [20]:
bert_base_model = BERTopicModelImpl(embedding_model, 10)
results = bert_base_model.train_model(dataset)
num_topics = len(results['topics'])

# Assuming you want to consider the top 10 words for coherence calculation
topk = 10

npmi = Coherence(texts=dataset.get_corpus(), topk=topk, measure='c_v').score(results)
td = TopicDiversity().score(results)

print(f'Num Topics: {num_topics}, NPMI: {npmi}, Topic Diversity: {td}')


ValueError: Min samples and min cluster size must be integers!

In [22]:
# Step 2.1 - Extract embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

In [23]:

# Step 2.2 - Reduce dimensionality
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')

In [24]:
# Step 2.3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

In [25]:

# Step 2.4 - Tokenize topics
vectorizer_model = CountVectorizer(stop_words="english")


In [26]:

# Step 2.5 - Create topic representation
ctfidf_model = ClassTfidfTransformer()

In [27]:

model = BERTopic(
  embedding_model=embedding_model,    # Step 1 - Extract embeddings
  umap_model=umap_model,              # Step 2 - Reduce dimensionality
  hdbscan_model=hdbscan_model,        # Step 3 - Cluster reduced embeddings
  vectorizer_model=vectorizer_model,  # Step 4 - Tokenize topics
  ctfidf_model=ctfidf_model,          # Step 5 - Extract topic words
  nr_topics=10                        # Step 6 - Diversify topic words
)


In [28]:

topics, probabilities = model.fit_transform(newdocs)

In [None]:
bert_base_model = BERTopicModelImpl(embedding_model, 10)
results = bert_base_model.train_model(dataset)
num_topics = len(results['topics'])

# Assuming you want to consider the top 10 words for coherence calculation
topk = 10

npmi = Coherence(texts=dataset.get_corpus(), topk=topk, measure='c_v').score(results)
td = TopicDiversity().score(results)

print(f'Num Topics: {num_topics}, NPMI: {npmi}, Topic Diversity: {td}')