In [4]:
%%capture
!pip install bertopic

In [5]:
from bertopic import BERTopic
import pandas as pd
import numpy as np

import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models.coherencemodel import CoherenceModel

import spacy

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# Data

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
#from sklearn.datasets import fetch_20newsgroups
#docs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']

df = pd.read_csv("drive/MyDrive/UW Pharm/Data/choiceboard_data.csv")
#data = data.dropna()
#print('There are ' + str(data.shape[0]) + ' rows.')
#data.head()


In [7]:
df_cpd = df[["CPD_Q1", "CPD_Q3"]]
df_cpd.head()

Unnamed: 0,CPD_Q1,CPD_Q3
0,The class time devoted to health and wellness ...,I believe self-care to be really important in ...
1,The class time devoted to health and wellness ...,The time spent on my choice of self-care allow...
2,The class time has reminded me on the importan...,The time spent on my choice of self care affec...
3,The class time that was devoted to health and ...,It made me more relaxed and less stressed abou...
4,I have learnt to listen to people without inte...,This has been a lifeline because I always felt...


In [8]:
corpus_1 = df_cpd["CPD_Q1"].dropna()
corpus_1 = corpus_1.astype(str).tolist()
#corpus_1

corpus_3 = df_cpd["CPD_Q3"].dropna()
corpus_3 = corpus_3.astype(str).tolist()
#corpus_3

# **Topic Modeling**




## Training


In [9]:
vectorizer_model = CountVectorizer(ngram_range=(1, 3), stop_words="english")
topic_model = BERTopic(language="english",
                       calculate_probabilities=True,
                       verbose=True,
                       min_topic_size=8,
                       n_gram_range = (2, 3),
                       vectorizer_model=vectorizer_model)
topics, probs = topic_model.fit_transform(corpus_1)

2024-02-23 01:40:01,533 - BERTopic - Embedding - Transforming documents to embeddings.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2024-02-23 01:40:09,721 - BERTopic - Embedding - Completed ✓
2024-02-23 01:40:09,724 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-02-23 01:40:18,411 - BERTopic - Dimensionality - Completed ✓
2024-02-23 01:40:18,415 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-02-23 01:40:18,431 - BERTopic - Cluster - Completed ✓
2024-02-23 01:40:18,442 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-02-23 01:40:18,519 - BERTopic - Representation - Completed ✓


## Extracting Topics

In [10]:
freq = topic_model.get_topic_info(); freq.head(5)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,29,-1_class_time_wellness_health,"[class, time, wellness, health, care, health w...","[Honestly, I think there should be a whole cla..."
1,0,42,0_time_health_class_wellness,"[time, health, class, wellness, mental, class ...",[Over the past 3 quarters I have tried my best...
2,1,34,1_mentor_team_time_mentor team,"[mentor, team, time, mentor team, discussions,...","[Over the past three quarters, the class time ..."


In [11]:
freq.shape

(3, 5)

-1 refers to all outliers and should typically be ignored. Next, let's take a look at a frequent topic that were generated:

In [12]:
topic_model.get_topic(0)  # Select the most frequent topic

[('time', 0.07946662723921077),
 ('health', 0.051379766324646234),
 ('class', 0.04535992688707508),
 ('wellness', 0.03560686215784657),
 ('mental', 0.03544934860866545),
 ('class time', 0.033148032672725716),
 ('mental health', 0.031665855208715205),
 ('health wellness', 0.030697975101612058),
 ('think', 0.027998015231110798),
 ('selfcare', 0.024559024820194033)]

For reducing outliers: https://maartengr.github.io/BERTopic/getting_started/outlier_reduction/outlier_reduction.html#update-topic-representation

In [14]:
new_topics = topic_model.reduce_outliers(corpus_1, topics)
#new_topics

100%|██████████| 1/1 [00:00<00:00, 50.47it/s]


**NOTE**: BERTopic is stocastich which mmeans that the topics might differ across runs. This is mostly due to the stocastisch nature of UMAP.

## Calculate Coherence Score

In [11]:
cleaned_docs = topic_model._preprocess_text(corpus_1)
vectorizer = topic_model.vectorizer_model
analyzer = vectorizer.build_analyzer()
tokens = [analyzer(doc) for doc in cleaned_docs]
dictionary = corpora.Dictionary(tokens)
corpus = [dictionary.doc2bow(token) for token in tokens]

In [12]:
def get_coherence(topic_model):
  topics = topic_model.get_topics()
  topics.pop(-1, None)
  topic_words = [[word for word, _ in topic_model.get_topic(topic) if word != ""] for topic in topics]
  topic_words = [[words for words, _ in topic_model.get_topic(topic)] for topic in range(len(set(topics))-1)]

  coherence_model = CoherenceModel(topics=topic_words,
                                  texts=tokens,
                                  corpus=corpus,
                                  dictionary=dictionary,
                                  coherence='c_v')
  coherence = coherence_model.get_coherence()
  return coherence

In [13]:
get_coherence(topic_model)

0.7192997528461225

In [18]:
min_topic_size_min = 3
min_topic_size_max = 8


n_gram_ranges = [(1, 2), (1, 3), (2, 3)]
#min_n_gram_range = 1
#max_n_gram_range = 3

model_results = {
                  'n_gram_range': [],
                  'min_topic_size': [],
                  'Coherence': []
                }

for min_topic_size in range(min_topic_size_min, min_topic_size_max + 1):
  for n_gram_range in n_gram_ranges:
    vectorizer_model = CountVectorizer(ngram_range=(1, 3), stop_words="english")
    topic_model = BERTopic(language="english",
                          calculate_probabilities=True,
                          verbose=False,
                          min_topic_size=min_topic_size,
                          n_gram_range = n_gram_ranges,
                          vectorizer_model=vectorizer_model)
    topics, probs = topic_model.fit_transform(corpus_1)
    model_results['n_gram_range'].append(n_gram_range)
    model_results['min_topic_size'].append(min_topic_size)
    model_results['Coherence'].append(get_coherence(topic_model))

model_results = pd.DataFrame(model_results)

In [19]:
model_results.sort_values('Coherence', ascending=False)

Unnamed: 0,n_gram_range,min_topic_size,Coherence
17,"(2, 3)",8,0.71897
9,"(1, 2)",6,0.71897
6,"(1, 2)",5,0.707494
10,"(1, 3)",6,0.705771
16,"(1, 3)",8,0.678306
13,"(1, 3)",7,0.677749
12,"(1, 2)",7,0.676658
11,"(2, 3)",6,0.674442
4,"(1, 3)",4,0.647149
14,"(2, 3)",7,0.639251
