Code adapted from https://github.com/MIND-Lab/OCTIS

# Modeling

In [None]:
!pip install octis
!pip install bertopic
!pip install scikit-learn

In [None]:
# Mount Google Drive at /content/gdrive (this import is only available
# inside Google Colab) so files under that path can be read.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
from octis.dataset.dataset import Dataset
#from octis.models.BERTopic import BERTopic

# Load the custom OCTIS dataset stored in the mounted Drive folder.
dataset = Dataset()
dataset.load_custom_dataset_from_folder("/content/gdrive/My Drive/AllComments/Chicago")
corpus = dataset.get_corpus()

# Each corpus entry is a sequence of string tokens; BERTopic is fitted on
# plain strings, so join the tokens of every document with single spaces.
# str.join avoids the trailing space that repeated `+=` concatenation left
# behind (which showed up as an empty token when splitting on " " later).
docs = [" ".join(tokens) for tokens in corpus]

# Alias for the loaded dataset. Bound once at cell level rather than inside
# the document loop, so it is defined even when the corpus is empty.
data = dataset

In [None]:
from bertopic import BERTopic

# Configure the topic model, then fit it on the document strings.
# fit_transform returns two values, kept here as `topics` and `probs`.
topic_model = BERTopic(
    language="english",
    calculate_probabilities=True,
    verbose=True,
)
topics, probs = topic_model.fit_transform(docs)

In [None]:
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel
from octis.evaluation_metrics.diversity_metrics import TopicDiversity

# --- Tokenised view of the documents for gensim -----------------------------
# split() (no argument) drops the empty tokens that split(" ") produces for
# leading/trailing/repeated spaces.
doc_tokens = [text.split() for text in docs]
texts = doc_tokens
id2word = corpora.Dictionary(doc_tokens)
# NOTE: from here on `corpus` holds gensim bag-of-words vectors, i.e. lists of
# (token_id, count) tuples, not the token lists returned by the dataset.
corpus = [id2word.doc2bow(text) for text in texts]

# --- Top words per topic -----------------------------------------------------
# Use the topic ids actually assigned by the model and skip -1 (the outlier
# topic), instead of assuming the ids are exactly 0..n-2 and that an outlier
# topic always exists.
topic_ids = sorted(int(topic) for topic in set(topics) if topic != -1)
topic_words = [
    [word for word, _score in topic_model.get_topic(topic_id)]
    for topic_id in topic_ids
]

# --- Topic coherence (NPMI) ----------------------------------------------------
coherence_model = CoherenceModel(topics=topic_words, texts=texts, corpus=corpus, dictionary=id2word, coherence='c_npmi')
# Compute the score once; it is expensive and was previously evaluated twice.
topic_coherence = coherence_model.get_coherence()
print("topic coherence: ", topic_coherence)

# --- Topic diversity -----------------------------------------------------------
# Build the vocabulary from the document tokens. The earlier version iterated
# over `corpus`, which by this point contains (token_id, count) tuples, so no
# topic word ever matched and every word was replaced by the same placeholder,
# collapsing the diversity score to 1 / (number of topic words).
all_words = [word for tokens in doc_tokens for word in tokens]
vocabulary = set(all_words)  # set gives O(1) membership tests

# Top-10 words of every topic; words that do not occur in the tokenised
# documents fall back to the first corpus token, as in the adapted code.
bertopic_topics = [
    [word if word in vocabulary else all_words[0] for word in words[:10]]
    for words in topic_words
]

topic_dict = {'topics': bertopic_topics}

#print(topic_dict['topics'])
metric = TopicDiversity(topk=10) # Initialize metric
topic_diversity_score = metric.score(topic_dict) # Compute score of the metric
print("Topic diversity:", str(topic_diversity_score))

topic coherence:  -0.14203198450569243
Topic diversity: 0.0038461538461538464


In [None]:
# Fetch the per-topic summary table from the fitted model and display its
# first five rows.
freq = topic_model.get_topic_info()
freq.head(5)

Unnamed: 0,Topic,Count,Name
0,-1,513,-1_city_move_live_people
1,0,174,0_crime_people_gun_community
2,1,156,1_pizza_domino_bad_cheap
3,2,145,2_always_cool_back_job
4,3,75,3_move_back_away_ever


In [None]:
topic_model.get_topic(3) # Inspect the (word, score) pairs of topic 3

[('move', 0.2392870481502074),
 ('back', 0.0843565261392131),
 ('away', 0.05250847651782002),
 ('ever', 0.05073979251702995),
 ('never', 0.04272781335516146),
 ('share', 0.03759253231481573),
 ('stay', 0.03522161723236146),
 ('decision', 0.032268583685616345),
 ('week', 0.02965467668055358),
 ('next', 0.02817729378588917)]

# Visualization

In [None]:
# Plot the fitted topics in a 500x500 figure.
topic_model.visualize_topics(width=500, height= 500)

In [None]:
# Plot the topic probabilities of the document at index 200 (requires at
# least 201 fitted documents). min_probability=0.015 presumably hides topics
# below that probability -- confirm against the BERTopic documentation.
topic_model.visualize_distribution(probs[200], min_probability=0.015)

In [None]:
# Plot the topic hierarchy, limited via top_n_topics=50.
topic_model.visualize_hierarchy(top_n_topics=50)

In [None]:
# Plot per-topic bar charts, limited via top_n_topics=10.
topic_model.visualize_barchart(top_n_topics=10)

In [None]:
# Plot the topic heatmap in a 500x500 figure with n_clusters=14.
topic_model.visualize_heatmap(n_clusters=14, width=500, height=500)

In [None]:
# Plot the term-rank visualization with the library's default settings.
topic_model.visualize_term_rank()