In [1]:
%%capture
!pip install bertopic
!pip install octis

In [2]:
from bertopic import BERTopic
import pandas as pd
import numpy as np

import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models.coherencemodel import CoherenceModel

from octis.evaluation_metrics.diversity_metrics import TopicDiversity

import spacy

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Data

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
#from sklearn.datasets import fetch_20newsgroups
#docs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']

df = pd.read_csv("drive/MyDrive/UW Pharm/Data/choiceboard_data.csv")
#data = data.dropna()
#print('There are ' + str(data.shape[0]) + ' rows.')
#data.head()


In [7]:
df_cpd = df[["CPD_Q1", "CPD_Q3"]]
df_cpd.head()

Unnamed: 0,CPD_Q1,CPD_Q3
0,The class time devoted to health and wellness ...,I believe self-care to be really important in ...
1,The class time devoted to health and wellness ...,The time spent on my choice of self-care allow...
2,The class time has reminded me on the importan...,The time spent on my choice of self care affec...
3,The class time that was devoted to health and ...,It made me more relaxed and less stressed abou...
4,I have learnt to listen to people without inte...,This has been a lifeline because I always felt...


In [8]:
corpus_1 = df_cpd["CPD_Q1"].dropna()
corpus_1 = corpus_1.astype(str).tolist()
#corpus_1

corpus_3 = df_cpd["CPD_Q3"].dropna()
corpus_3 = corpus_3.astype(str).tolist()
#corpus_3

# **Topic Modeling**




## Training


In [32]:
vectorizer_model = CountVectorizer(ngram_range=(1, 3), stop_words="english")
topic_model = BERTopic(language="english",
                       calculate_probabilities=True,
                       verbose=True,
                       min_topic_size=8,
                       n_gram_range = (1, 2),
                       vectorizer_model=vectorizer_model)
topics, probs = topic_model.fit_transform(corpus_1)

2024-03-08 02:43:34,802 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-03-08 02:43:35,721 - BERTopic - Embedding - Completed ✓
2024-03-08 02:43:35,724 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-03-08 02:43:37,738 - BERTopic - Dimensionality - Completed ✓
2024-03-08 02:43:37,740 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-03-08 02:43:37,750 - BERTopic - Cluster - Completed ✓
2024-03-08 02:43:37,755 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-03-08 02:43:37,790 - BERTopic - Representation - Completed ✓


## Extracting Topics

In [35]:
freq = topic_model.get_topic_info(); freq.head(5)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,32,-1_time_class_health_wellness,"[time, class, health, wellness, health wellnes...",[I found the time devoted to health and wellne...
1,0,42,0_time_health_class_wellness,"[time, health, class, wellness, mental, think,...",[I’m really happy that class time was devoted ...
2,1,31,1_mentor_team_time_mentor team,"[mentor, team, time, mentor team, discussions,...","[Over the past three quarters, the class time ..."


In [34]:
freq.shape

(3, 5)

-1 refers to all outliers and should typically be ignored. Next, let's take a look at a frequent topic that were generated:

In [12]:
topic_model.get_topic(0)  # Select the most frequent topic

[('time', 0.0786985796040377),
 ('health', 0.049685927874383175),
 ('class', 0.04386454468200667),
 ('wellness', 0.03799504503088137),
 ('class time', 0.03333745000228414),
 ('mental', 0.03290946121384678),
 ('health wellness', 0.032159783439784054),
 ('mental health', 0.02916373896773352),
 ('think', 0.02836428963235505),
 ('selfcare', 0.023749386639308524)]

For reducing outliers: https://maartengr.github.io/BERTopic/getting_started/outlier_reduction/outlier_reduction.html#update-topic-representation

In [13]:
new_topics = topic_model.reduce_outliers(corpus_1, topics)
#topic_model.get_topic_info()
#new_topics

100%|██████████| 1/1 [00:00<00:00, 47.24it/s]


**NOTE**: BERTopic is stocastich which mmeans that the topics might differ across runs. This is mostly due to the stocastisch nature of UMAP.

## Calculate Coherence Score

In [14]:
cleaned_docs = topic_model._preprocess_text(corpus_1)
vectorizer = topic_model.vectorizer_model
analyzer = vectorizer.build_analyzer()
tokens = [analyzer(doc) for doc in cleaned_docs]
dictionary = corpora.Dictionary(tokens)
corpus = [dictionary.doc2bow(token) for token in tokens]

In [29]:
def get_coherence(topic_model):
  topics = topic_model.get_topics()
  topics.pop(-1, None)
  topic_words = [[word for word, _ in topic_model.get_topic(topic) if word != ""] for topic in topics]
  topic_words = [[words for words, _ in topic_model.get_topic(topic)] for topic in range(len(set(topics))-1)]

  coherence_model = CoherenceModel(topics=topic_words,
                                  texts=tokens,
                                  corpus=corpus,
                                  dictionary=dictionary,
                                  coherence='c_v')
  coherence = coherence_model.get_coherence()
  return coherence

def get_diversity(topic_model):
  topics = topic_model.get_topics()
  topics.pop(-1, None)
  diversity_model = TopicDiversity()

  octis_friendly_topics = []
  for topic in topics:
    octis_friendly_topics.append([t[0] for t in topics[topic]])

  #print(octis_friendly_topics)
  return diversity_model.score({'topics': octis_friendly_topics})

In [28]:
get_diversity(topic_model)

[['time', 'health', 'class', 'wellness', 'class time', 'mental', 'health wellness', 'mental health', 'think', 'selfcare'], ['mentor', 'team', 'time', 'mentor team', 'discussions', 'team discussions', 'class', 'mentor team discussions', 'school', 'health']]


0.85

In [30]:
%%time

min_topic_size_min = 3
min_topic_size_max = 8


n_gram_ranges = [(1, 2), (1, 3), (2, 3)]

model_results = {
                  'n_gram_range': [],
                  'min_topic_size': [],
                  'Coherence': [],
                  'Diversity': [],
                }

for min_topic_size in range(min_topic_size_min, min_topic_size_max + 1):
  for n_gram_range in n_gram_ranges:
    vectorizer_model = CountVectorizer(ngram_range=(1, 3), stop_words="english")
    topic_model = BERTopic(language="english",
                          calculate_probabilities=True,
                          verbose=False,
                          min_topic_size=min_topic_size,
                          n_gram_range = n_gram_ranges,
                          vectorizer_model=vectorizer_model)
    topics, probs = topic_model.fit_transform(corpus_1)
    model_results['n_gram_range'].append(n_gram_range)
    model_results['min_topic_size'].append(min_topic_size)
    model_results['Coherence'].append(get_coherence(topic_model))
    model_results['Diversity'].append(get_diversity(topic_model))


model_results = pd.DataFrame(model_results)

In [31]:
model_results.sort_values('Diversity', ascending=False)

Unnamed: 0,n_gram_range,min_topic_size,Coherence,Diversity
15,"(1, 2)",8,0.656294,0.85
10,"(1, 3)",6,0.68208,0.825
17,"(2, 3)",8,0.692144,0.766667
12,"(1, 2)",7,0.674716,0.766667
16,"(1, 3)",8,0.711735,0.733333
7,"(1, 3)",5,0.700056,0.733333
1,"(1, 3)",3,0.573193,0.71
13,"(1, 3)",7,0.619341,0.7
11,"(2, 3)",6,0.630475,0.7
9,"(1, 2)",6,0.701601,0.7
