### Topic modeling
Extracting topics from the reddit messages 

##### Imports 

In [31]:
# Imports
from bertopic import BERTopic
import pandas as pd
import numpy as np
import nltk 

##### Data 

Loading the data 

In [32]:
# getting subset of data 
messages_df = pd.read_csv("reddit_22_51/messages.csv", sep="\t")
messages_df.head()

Unnamed: 0,id,user,text
0,j0s252k,HexagonOfVirtue,"im gonna find it just to check, it's not the ..."
1,j0s25h2,Teephex,According to you criticizing and being skeptic...
2,j0s25ht,1platesquat,Gotcha. Can you explain to me why your opinion...
3,j0s25l5,YouLostTheGame,"Euros, which some argue is actually harder tha..."
4,j0s25nr,HMID_Delenda_Est,You've been sounding more like PunishedSubSist...


Info on the data 

In [33]:
# info on data
column_list = messages_df.columns
shape = messages_df.shape

print("columns: ", column_list)
print("shape ", shape)

columns:  Index(['id', 'user', 'text'], dtype='object')
shape  (290898, 3)


**Data cleaning: stop word removal**

Stop word are very frequent words e.g. “the” and “a” that can impact the topics generated by the bertopic model due to their high frequency across most documents/texts - can be removed to get clearer more informative topics 

In [34]:
# METHOD 1: use a predefined list of stop words 
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = stopwords.words('english')

def remove_stop_words(text, stop_words): 
    words = text.split() 
    # Use a list comprehension to remove stop words 
    filtered_words = [word for word in words if word.lower() not in stop_words] 
  
  # Join the filtered words back into a sentence 
    return ' '.join(filtered_words)

[nltk_data] Downloading package stopwords to /home/jnye/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [35]:
# METHOD 2: remove most frequent words found in the data 
most_frequent_words = pd.Series(' '.join(messages_df['text']).lower().split()).value_counts()[:100].index.tolist()

messages_df_sw2 = messages_df.copy()
messages_df_sw2["text"] = messages_df_sw2["text"].apply(lambda row: remove_stop_words(row, most_frequent_words))

In [36]:
# combined 
stop_words_combined = list(set(most_frequent_words[:30] + stop_words)) 

messages_df_combined = messages_df.copy()
messages_df_combined["text"] = messages_df_combined["text"].apply(lambda row: remove_stop_words(row, stop_words_combined))

In [37]:
# import matplotlib.pyplot as plt

# # Calculate word frequencies
# word_frequencies = pd.Series(' '.join(messages_df['text']).lower().split()).value_counts()
# # Plot the top N words
# top_n = 100  # Change this to the number of words you want to plot
# word_frequencies.head(top_n).plot(kind='bar', figsize=(10, 6), color='skyblue')

# # Add labels and title
# plt.title('Top 20 Word Frequencies', fontsize=16)
# plt.xlabel('Words', fontsize=12)
# plt.ylabel('Frequency', fontsize=12)
# plt.xticks(rotation=45)
# plt.show()

Convert data to list of strings (input needed by bertopic)

In [38]:
# convert to list of strings (input needed by bertopic model)
messages_list = messages_df_combined["text"].astype(str).tolist()
len(messages_list)

290898

Get subset of data to work with 

In [39]:
# subset of data - 5000 rows
messages_subset = messages_list[:10000]
messages_subset

['im gonna find check, ai art one right? triggered rare',
 'According criticizing skeptical active politican mental disorder?',
 'Gotcha. explain opinion important me?',
 'Euros, argue actually harder WC',
 'sounding like PunishedSubSister tbh',
 'hate race hate born',
 'communism abysmal failure ever single instances attempted? Yeah. Hans would agree.',
 'made genuine effort try estimate cost moving affordable place?',
 'implemented rule saying link social media sites (Instagram, facebook, Mastodon, etc)',
 'Yes. Every single human earth eats tiny amounts poison regular basis. know know that? literally everything poisonous depending dose. Water poisonous drink enough it.',
 'set base couple days. Go read Volume 2 AFH 10-222 see done.',
 'welcome opinion.',
 'Literally least worst Trump adjacent person could sell to.',
 'Speech thought. Thought police.',
 'know happened school system.',
 'Thread OP: "so need landlords?" Me: "unironically yes" tbh long one rental company get monopoly ce

##### Bertopic model 

In [40]:
topic_model = BERTopic()
# topic_model = BERTopic.load("MaartenGr/BERTopic_Wikipedia")

In [41]:
# fitting the bertopic model 
topic_model_fitted = topic_model.fit(messages_subset)

In [42]:
# parameters of the general model
topic_model_fitted.get_params()

{'calculate_probabilities': False,
 'ctfidf_model': ClassTfidfTransformer(),
 'embedding_model': <bertopic.backend._sentencetransformers.SentenceTransformerBackend at 0x7f05d81f29e0>,
 'hdbscan_model': HDBSCAN(min_cluster_size=10, prediction_data=True),
 'language': 'english',
 'low_memory': False,
 'min_topic_size': 10,
 'n_gram_range': (1, 1),
 'nr_topics': None,
 'representation_model': None,
 'seed_topic_list': None,
 'top_n_words': 10,
 'umap_model': UMAP(angular_rp_forest=True, low_memory=False, metric='cosine', min_dist=0.0, n_components=5, tqdm_kwds={'bar_format': '{desc}: {percentage:3.0f}%| {bar} {n_fmt}/{total_fmt} [{elapsed}]', 'desc': 'Epochs completed', 'disable': True}),
 'vectorizer_model': CountVectorizer(),
 'verbose': False,
 'zeroshot_min_similarity': 0.7,
 'zeroshot_topic_list': None}

In [43]:
# get info on topics (names, important words, representative reddit message/document)
topic_info = topic_model_fitted.get_topic_info()
topic_info.to_csv("topic_info_10000_combined30.csv", sep="\t", index=False)

In [44]:
topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,4571,-1_like_im_think_one,"[like, im, think, one, right, would, its, that...",[FBI didn’t suppress (in opinion) shouldn’t ab...
1,0,335,0_biden_republicans_vote_trump,"[biden, republicans, vote, trump, democrats, s...","[republicans hold house, senate, executive bra..."
2,1,187,1_capitalism_socialism_communism_marx,"[capitalism, socialism, communism, marx, capit...","[Look, get post vague attempt (I don’t think d..."
3,2,175,2_lol_mad_yeah_exactly,"[lol, mad, yeah, exactly, sense, accurate, hah...","[Yeah, makes sense., Didn’t know lol, mean tha..."
4,3,167,3_church_sin_catholic_jesus,"[church, sin, catholic, jesus, god, religion, ...","[Sin literal definition wickedness, even dont ..."
...,...,...,...,...,...
145,144,11,144_population_grew_record_wished,"[population, grew, record, wished, turn, large...",[assumption? It’s basic math said elections “r...
146,145,10,145_justifications_contradictory_tired_fallacy,"[justifications, contradictory, tired, fallacy...","[Lol ooo Fallacy Fallacy, point CMV? different..."
147,146,10,146_illusion_reality_complex_idiom,"[illusion, reality, complex, idiom, existentia...","[argument, matter. Enjoying illusion could rea..."
148,147,10,147_hypocritical_hypocrites_hypocrite_bias,"[hypocritical, hypocrites, hypocrite, bias, ei...","[Well say hypocritical, must be. mean, fucking..."


In [45]:
# get the top words for each topic id (form the topic names)
topic_model_fitted.get_topics()

{-1: [('like', np.float64(0.005138240936190621)),
  ('im', np.float64(0.004839576584854077)),
  ('think', np.float64(0.004732134500751472)),
  ('one', np.float64(0.004406618430629048)),
  ('right', np.float64(0.004375696176533897)),
  ('would', np.float64(0.004160755302452885)),
  ('its', np.float64(0.0041037327596619945)),
  ('thats', np.float64(0.0040606156876722916)),
  ('get', np.float64(0.004010343074977448)),
  ('it', np.float64(0.0038521713647388985))],
 0: [('biden', np.float64(0.021951505430448038)),
  ('republicans', np.float64(0.02040279044825776)),
  ('vote', np.float64(0.018765080870852664)),
  ('trump', np.float64(0.018730963181831674)),
  ('democrats', np.float64(0.018216301573532045)),
  ('senate', np.float64(0.015682099418260473)),
  ('republican', np.float64(0.015204453833972117)),
  ('party', np.float64(0.015133742619958389)),
  ('obama', np.float64(0.013971249487186125)),
  ('dems', np.float64(0.012220841161121111))],
 1: [('capitalism', np.float64(0.032568334070023