### Topic modeling
Extracting topics from the reddit messages 

##### Imports 

In [2]:
# Imports
from bertopic import BERTopic
import pandas as pd
import numpy as np
import nltk 

##### Data 

Loading the data 

In [3]:
# getting subset of data 
messages_df = pd.read_csv("reddit_22_51/messages.csv", sep="\t")
messages_df.head()

Unnamed: 0,id,user,text
0,j0s252k,HexagonOfVirtue,"im gonna find it just to check, it's not the ..."
1,j0s25h2,Teephex,According to you criticizing and being skeptic...
2,j0s25ht,1platesquat,Gotcha. Can you explain to me why your opinion...
3,j0s25l5,YouLostTheGame,"Euros, which some argue is actually harder tha..."
4,j0s25nr,HMID_Delenda_Est,You've been sounding more like PunishedSubSist...


Info on the data 

In [4]:
# info on data
column_list = messages_df.columns
shape = messages_df.shape

print("columns: ", column_list)
print("shape ", shape)

columns:  Index(['id', 'user', 'text'], dtype='object')
shape  (290898, 3)


**Data cleaning: stop word removal**

Stop word are very frequent words e.g. “the” and “a” that can impact the topics generated by the bertopic model due to their high frequency across most documents/texts - can be removed to get clearer more informative topics 

In [5]:
# METHOD 1: use a predefined list of stop words 
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = stopwords.words('english')

def remove_stop_words(text, stop_words): 
    words = text.split() 
    # Use a list comprehension to remove stop words 
    filtered_words = [word for word in words if word.lower() not in stop_words] 
  
  # Join the filtered words back into a sentence 
    return ' '.join(filtered_words)

messages_df_sw = messages_df.copy()
messages_df_sw["text"] = messages_df_sw["text"].apply(lambda row: remove_stop_words(row, stop_words))
messages_df_sw.head()

[nltk_data] Downloading package stopwords to /home/jnye/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,id,user,text
0,j0s252k,HexagonOfVirtue,"im gonna find check, ai art one right? trigger..."
1,j0s25h2,Teephex,According criticizing skeptical active politic...
2,j0s25ht,1platesquat,Gotcha. explain opinion important me?
3,j0s25l5,YouLostTheGame,"Euros, argue actually harder WC"
4,j0s25nr,HMID_Delenda_Est,sounding like PunishedSubSister tbh


Convert data to list of strings (input needed by bertopic)

In [6]:
# convert to list of strings (input needed by bertopic model)
messages_list = messages_df_sw["text"].astype(str).tolist()
len(messages_list)

290898

Get subset of data to work with 

In [7]:
# subset of data - 5000 rows
messages_subset = messages_list[:10000]

##### Bertopic model 

In [8]:
topic_model = BERTopic()
# topic_model = BERTopic.load("MaartenGr/BERTopic_Wikipedia")

In [9]:
# fitting the bertopic model 
topic_model_fitted = topic_model.fit(messages_list)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [10]:
# parameters of the general model
topic_model_fitted.get_params()

{'calculate_probabilities': False,
 'ctfidf_model': ClassTfidfTransformer(),
 'embedding_model': <bertopic.backend._sentencetransformers.SentenceTransformerBackend at 0x7f329caf0a60>,
 'hdbscan_model': HDBSCAN(min_cluster_size=10, prediction_data=True),
 'language': 'english',
 'low_memory': False,
 'min_topic_size': 10,
 'n_gram_range': (1, 1),
 'nr_topics': None,
 'representation_model': None,
 'seed_topic_list': None,
 'top_n_words': 10,
 'umap_model': UMAP(angular_rp_forest=True, low_memory=False, metric='cosine', min_dist=0.0, n_components=5, tqdm_kwds={'bar_format': '{desc}: {percentage:3.0f}%| {bar} {n_fmt}/{total_fmt} [{elapsed}]', 'desc': 'Epochs completed', 'disable': True}),
 'vectorizer_model': CountVectorizer(),
 'verbose': False,
 'zeroshot_min_similarity': 0.7,
 'zeroshot_topic_list': None}

In [14]:
# get info on topics (names, important words, representative reddit message/document)
topic_info = topic_model_fitted.get_topic_info()
topic_info.to_csv("topic_info_full.csv", sep="\t", index=False)

In [12]:
topic_keywords = topic_model.get_topics()

# Print the top words for each topic
for topic, keywords in topic_keywords.items():
    print(f"Topic {topic}: {keywords[:10]}")

Topic -1: [('russia', np.float64(0.0004657108180287409)), ('trump', np.float64(0.0004204047788697832)), ('ukraine', np.float64(0.0004198793928134626)), ('us', np.float64(0.0004088083105379225)), ('people', np.float64(0.0004076616755930348)), ('like', np.float64(0.0004075707644457844)), ('money', np.float64(0.00040544289223144443)), ('would', np.float64(0.00040360888274398974)), ('war', np.float64(0.0004023351824552959)), ('left', np.float64(0.0004020856231407213))]
Topic 0: [('vaccine', np.float64(0.011672993158784036)), ('covid', np.float64(0.009511939685562451)), ('vaccines', np.float64(0.00944150040043642)), ('vaccinated', np.float64(0.008449257750013964)), ('flu', np.float64(0.005670472163514823)), ('immunity', np.float64(0.00489290686066106)), ('virus', np.float64(0.0048642741829124396)), ('unvaccinated', np.float64(0.0047889874376414385)), ('vaccination', np.float64(0.004501469139672086)), ('deaths', np.float64(0.004185430709837665))]
Topic 1: [('abortion', np.float64(0.010779252

In [13]:
# get the top words for each topic id (form the topic names)
topic_model_fitted.get_topics()

{-1: [('russia', np.float64(0.0004657108180287409)),
  ('trump', np.float64(0.0004204047788697832)),
  ('ukraine', np.float64(0.0004198793928134626)),
  ('us', np.float64(0.0004088083105379225)),
  ('people', np.float64(0.0004076616755930348)),
  ('like', np.float64(0.0004075707644457844)),
  ('money', np.float64(0.00040544289223144443)),
  ('would', np.float64(0.00040360888274398974)),
  ('war', np.float64(0.0004023351824552959)),
  ('left', np.float64(0.0004020856231407213))],
 0: [('vaccine', np.float64(0.011672993158784036)),
  ('covid', np.float64(0.009511939685562451)),
  ('vaccines', np.float64(0.00944150040043642)),
  ('vaccinated', np.float64(0.008449257750013964)),
  ('flu', np.float64(0.005670472163514823)),
  ('immunity', np.float64(0.00489290686066106)),
  ('virus', np.float64(0.0048642741829124396)),
  ('unvaccinated', np.float64(0.0047889874376414385)),
  ('vaccination', np.float64(0.004501469139672086)),
  ('deaths', np.float64(0.004185430709837665))],
 1: [('abortion',

##### Saving the model 

In [None]:
topic_model_fitted.save("bertopic_model", save_embedding_model=True)



##### Testing the saved model 

In [None]:
topic_model = BERTopic.load("bertopic_model")

In [None]:
# New documents to analyze
new_docs = ["The health care system is flawed and something should be done to improve it",
            "Climate change and global warming are really important issues and are getting worse."]

# Get topics for the new documents
topics, probs = topic_model.transform(new_docs)

[np.int64(-1), np.int64(15)]


In [24]:
# Function to get topic names
def get_topic_name(topic_model, topic_num):
    if topic_num == -1:
        return "Outlier (No strong topic match)"
    topic_words = topic_model.get_topic(topic_num)
    if topic_words:
        return ", ".join([word for word, _ in topic_words])
    return "Unknown Topic"

# Print topic assignments with topic names
for doc, topic_num in zip(new_docs, topics):
    topic_name = get_topic_name(topic_model, topic_num)
    print(f"Document: {doc}\nAssigned Topic: {topic_num} - {topic_name}\n")


Document: The health care system is flawed and something should be done to improve it
Assigned Topic: -1 - Outlier (No strong topic match)

Document: Climate change and global warming are really important issues and are getting worse.
Assigned Topic: 15 - climate, co2, warming, temperature, atmosphere, greenhouse, ice, global, carbon, scientists

