In [73]:
# In this jupyter notebook we will investigate different Topic Modeling techniques and apply them to
# the subreddit wallstreetbets dataset from kaggle.
#                                              ( https://www.kaggle.com/datasets/mattpodolak/reddit-wallstreetbets-comments)
# NOTE We have done some preprocessing steps on the data.
#      The columns were misaligned and required alteration to correctly realign the data.
#      Also we have dropped many of the columns that are not needed for out purposes.


# The Topic Modeling that we will apply in this notebook are the following,

# 1. LDA or latent dirichlet allocation
#                                      Latent dirichlet allocation is a probabilistic model that treats documents
#                                      as mixtures of topics and identifies topics as collections of words.
#                                      It doesn't focus on word order but rather on the distribution of words
#                                     across documents and topics, allowing it to uncover hidden thematic patterns
#                                     in text corpora.

# 2. BERTopic
#          BERTopic is a sophisticated topic modeling approach that combines transformer-based language models
#          and contextual TF-IDF to generate dense clusters of topics from documents.

In [74]:
import pandas as pd
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN

In [75]:
import os

In [76]:
import warnings

In [78]:
# This will load the enviroment variables for the file paths

with open('.env') as f:
    config_lines = f.read().splitlines()

for line in config_lines:
    exec(line)

In [79]:
# Here we are loading in the data
df = pd.read_csv(pm9_1_5)
df2 = pd.read_csv(pm1_5_2)
df3 = pd.read_csv(pm2_3)
df4 = pd.read_csv(pm3_4_5)

In [53]:
# Here we are inspecting the data to make sure it is in working order

In [80]:
df.head()

Unnamed: 0,author,body,created_utc,total_awards_received
0,keyokenx1017,"['jimmy', 'buffet']",1591352266,0
1,2dum2FUNction,"['rdcp', 'biatch']",1591352263,0
2,xXMeebleXx,"['know', 'hurt', 'knowing', '20', '6', '19', '...",1591352261,0
3,infamousnj69,"['always', 'good', 'start', 'day', 'woke', 'cl...",1591352260,0
4,yxctz,"['premium', 'youd', 'receive', 'would', 'low',...",1591352260,0


In [81]:
df2.head()

Unnamed: 0,author,body,created_utc,total_awards_received
0,testernamed,"['100', 'today']",1589351000.0,0.0
1,Sea-School,"['hey', 'manbearpigsrs', 'okay']",1589351000.0,0.0
2,PNWPlayZ,"['danger', 'vix', 'call', 'fighting', 'jpow', ...",1589351000.0,0.0
3,krippsaiditwrong,"['better', 'poised', 'future', 'move', 'cash',...",1589351000.0,0.0
4,Sea-School,"['hey', 'manbearpigsrs', 'okay']",1589351000.0,0.0


In [82]:
df3.head()

Unnamed: 0,author,body,created_utc,total_awards_received
0,VacationLover1,"['u', 'redcedar53', 'big', 'gay', 'trump']",1608416008,0
1,numbnah,"['lmao', 'well', 'bad']",1608416007,0
2,EatAnimals_Yum,[],1608416006,0
3,upsetrobinhood,"['cathie', 'wood']",1608416002,0
4,VualkPwns,"['confirmed', 'rich', 'people', 'dance']",1608416000,0


In [83]:
df4.head()

Unnamed: 0,author,body,created_utc,total_awards_received
0,Killerslug,['ban'],1610751337,0
1,Quirky-Donut1269,"['r', 'fuk']",1610751336,0
2,Reckstar,"['went', 'ball', 'deep', '35c', '1', '22', 'to...",1610751335,0
3,Sketchdota,"['show', 'stock', 'certificate', 'hand']",1610751334,0
4,legalrock,"['someone', 'missee', 'om', 'ride', 'must', 'o...",1610751334,0


In [84]:
# Here we combine the two loaded dfs
comdf = pd.concat([df, df2, df3, df4], ignore_index=True)

In [85]:
# The below step is taken to get the best data. 
# There is alot of noise in this dataset since it is a subreddit. So if we filter to just have the posts that got
# at least 1 award then the resulting comments will contain relevant topics that are widely like, respected, or salient.

bestdata = comdf[comdf.total_awards_received >= 1]

In [86]:
# Gensim is a good library for LDA

import gensim
from gensim import corpora
from gensim.models import LdaModel

In [87]:
# ast has a function that is needed to format the data
import ast

In [88]:
# This shows how many data points we have in our dataset
len(bestdata)

15025

In [89]:
# Convert sample to literal lists
# The data was saved after being tokenized and is of type str and we need it as a list.
# The function turn_to_list will make the data of type str to a list

def turn_to_list(row):
    return ast.literal_eval(row)

In [90]:
betterdata = pd.DataFrame()

In [91]:
betterdata["body"] = bestdata.body.apply(turn_to_list)

In [93]:
# Here we are isolating the data to just be the lists and not the other columns

bestestdata = betterdata["body"]

In [94]:
# Creating first lda model (3 topics)
# We will do hyper parameter tuning on our LDA model by looking at different values for out number of topics.

# Create a dictionary and model
dictionary4 = corpora.Dictionary(bestestdata)
corpus3 = [dictionary4.doc2bow(doc) for doc in bestestdata]
num_top = 3  # Number of topics

# train model
lda_mopoop = LdaModel(corpus3, num_topics=num_top, id2word=dictionary4)

In [95]:
# Inspect the topics and their associated probabilities

for topic_id, topic in lda_mopoop.print_topics():
    print(f"Topic {topic_id}: {topic}")

Topic 0: 0.014*"call" + 0.010*"elon" + 0.009*"gold" + 0.008*"u" + 0.008*"put" + 0.007*"today" + 0.006*"award" + 0.006*"spy" + 0.006*"give" + 0.006*"fuck"
Topic 1: 0.010*"like" + 0.010*"get" + 0.008*"go" + 0.007*"money" + 0.007*"time" + 0.007*"people" + 0.006*"fucking" + 0.006*"market" + 0.005*"fuck" + 0.005*"mgm"
Topic 2: 0.013*"http" + 0.013*"gt" + 0.011*"com" + 0.011*"u" + 0.011*"amp" + 0.010*"call" + 0.007*"lt" + 0.007*"day" + 0.007*"www" + 0.007*"sayter"


In [96]:
# Creating second lda model with (5 topics hyperparameter tuning)

# Creating dictionary and model
dictionary4 = corpora.Dictionary(bestestdata)
corpus3 = [dictionary4.doc2bow(doc) for doc in bestestdata]
num_top = 5  # Number of topics

# Creating model
lda_mopoop = LdaModel(corpus3, num_topics=num_top, id2word=dictionary4)

In [97]:
# Printing the topics and their probabilities

for topic_id, topic in lda_mopoop.print_topics():
    print(f"Topic {topic_id}: {topic}")

Topic 0: 0.016*"http" + 0.015*"com" + 0.012*"amp" + 0.009*"market" + 0.009*"www" + 0.009*"day" + 0.007*"go" + 0.007*"stock" + 0.007*"time" + 0.006*"one"
Topic 1: 0.012*"tomorrow" + 0.008*"going" + 0.008*"na" + 0.008*"silver" + 0.007*"green" + 0.007*"red" + 0.006*"like" + 0.006*"gon" + 0.005*"3" + 0.005*"utm"
Topic 2: 0.009*"like" + 0.009*"mgm" + 0.008*"call" + 0.008*"make" + 0.008*"one" + 0.006*"get" + 0.006*"tesla" + 0.006*"go" + 0.006*"know" + 0.006*"money"
Topic 3: 0.016*"u" + 0.014*"gold" + 0.013*"get" + 0.012*"elon" + 0.011*"like" + 0.009*"money" + 0.008*"guy" + 0.008*"award" + 0.007*"give" + 0.007*"sayter"
Topic 4: 0.024*"call" + 0.018*"gt" + 0.011*"tsla" + 0.011*"bear" + 0.011*"lt" + 0.011*"put" + 0.009*"1" + 0.009*"mgm" + 0.009*"fuck" + 0.008*"spy"


In [98]:
# Creating Third LDA model with (8 topics hyperparameter tuning)

# Defining dictionary and model
dictionary4 = corpora.Dictionary(bestestdata)
corpus3 = [dictionary4.doc2bow(doc) for doc in bestestdata]
num_top = 8  # Number of topics

# Create and train model
lda_mopoop = LdaModel(corpus3, num_topics=num_top, id2word=dictionary4)

In [99]:
# Output the topics and probabilities
for topic_id, topic in lda_mopoop.print_topics():
    print(f"Topic {topic_id}: {topic}")

Topic 0: 0.018*"bear" + 0.011*"gang" + 0.008*"bull" + 0.007*"like" + 0.007*"people" + 0.007*"short" + 0.005*"option" + 0.005*"many" + 0.005*"internet" + 0.005*"thread"
Topic 1: 0.015*"na" + 0.013*"week" + 0.011*"gon" + 0.011*"day" + 0.011*"time" + 0.009*"get" + 0.008*"put" + 0.008*"market" + 0.007*"next" + 0.007*"money"
Topic 2: 0.043*"u" + 0.040*"gt" + 0.023*"lt" + 0.022*"sayter" + 0.019*"pop" + 0.011*"award" + 0.007*"6" + 0.007*"http" + 0.007*"guy" + 0.006*"like"
Topic 3: 0.034*"mgm" + 0.029*"call" + 0.017*"get" + 0.016*"like" + 0.014*"fuck" + 0.011*"fucking" + 0.009*"silver" + 0.009*"put" + 0.008*"would" + 0.008*"lol"
Topic 4: 0.016*"call" + 0.014*"go" + 0.011*"market" + 0.010*"stock" + 0.009*"buy" + 0.008*"buying" + 0.008*"back" + 0.007*"share" + 0.007*"tsla" + 0.007*"put"
Topic 5: 0.025*"elon" + 0.015*"gold" + 0.014*"give" + 0.012*"award" + 0.012*"fuck" + 0.010*"guy" + 0.009*"shit" + 0.008*"u" + 0.008*"one" + 0.008*"want"
Topic 6: 0.016*"gold" + 0.009*"gme" + 0.008*"good" + 0.008*

In [100]:
# creating fourth LDA model with (11 topics hyperparameter tuning)

# Creating dictionary and model
dictionary4 = corpora.Dictionary(bestestdata)
corpus3 = [dictionary4.doc2bow(doc) for doc in bestestdata]
num_top = 11  # Number of topics

# Defining model
lda_mopoop = LdaModel(corpus3, num_topics=num_top, id2word=dictionary4)

In [101]:
# Outputting the topics and probabilities
for topic_id, topic in lda_mopoop.print_topics():
    print(f"Topic {topic_id}: {topic}")

Topic 0: 0.014*"never" + 0.013*"day" + 0.012*"tomorrow" + 0.012*"green" + 0.009*"get" + 0.009*"damn" + 0.008*"gang" + 0.007*"money" + 0.007*"u" + 0.007*"thread"
Topic 1: 0.050*"na" + 0.038*"gon" + 0.013*"casino" + 0.013*"wrong" + 0.013*"mgm" + 0.010*"vega" + 0.009*"im" + 0.009*"right" + 0.009*"0" + 0.008*"fucking"
Topic 2: 0.027*"call" + 0.019*"got" + 0.013*"tsla" + 0.010*"6" + 0.010*"fucking" + 0.010*"fuck" + 0.010*"put" + 0.010*"7" + 0.009*"bought" + 0.007*"buy"
Topic 3: 0.024*"go" + 0.016*"day" + 0.011*"get" + 0.010*"future" + 0.009*"award" + 0.009*"going" + 0.008*"fucking" + 0.008*"every" + 0.008*"play" + 0.008*"back"
Topic 4: 0.014*"market" + 0.012*"like" + 0.011*"time" + 0.010*"good" + 0.008*"money" + 0.008*"get" + 0.007*"year" + 0.007*"sell" + 0.007*"still" + 0.007*"3"
Topic 5: 0.020*"day" + 0.019*"call" + 0.017*"put" + 0.017*"elon" + 0.014*"mgm" + 0.013*"silver" + 0.012*"fuck" + 0.011*"gold" + 0.011*"tomorrow" + 0.010*"guy"
Topic 6: 0.052*"http" + 0.046*"com" + 0.032*"amp" + 0.

In [102]:
# creating fifth LDA model with (18 topics hyperparameter tuning)

# Creating dictionary and model
dictionary4 = corpora.Dictionary(bestestdata)
corpus3 = [dictionary4.doc2bow(doc) for doc in bestestdata]
num_top = 18  # Number of topics

# Creating model
lda_mopoop = LdaModel(corpus3, num_topics=num_top, id2word=dictionary4)

In [103]:
# Printing out the topics and probabilities
for topic_id, topic in lda_mopoop.print_topics():
    print(f"Topic {topic_id}: {topic}")

Topic 0: 0.031*"thanks" + 0.029*"u" + 0.021*"bro" + 0.019*"guy" + 0.018*"always" + 0.018*"type" + 0.015*"yes" + 0.013*"debt" + 0.013*"go" + 0.013*"edit"
Topic 1: 0.034*"call" + 0.033*"open" + 0.031*"bear" + 0.030*"tsla" + 0.030*"tomorrow" + 0.025*"5" + 0.019*"bull" + 0.018*"let" + 0.017*"mgm" + 0.016*"red"
Topic 2: 0.032*"watch" + 0.025*"v" + 0.020*"youtube" + 0.018*"http" + 0.016*"bull" + 0.015*"r" + 0.015*"bear" + 0.014*"loss" + 0.011*"fuk" + 0.011*"get"
Topic 3: 0.025*"money" + 0.020*"make" + 0.015*"like" + 0.012*"people" + 0.012*"gay" + 0.012*"one" + 0.011*"need" + 0.011*"get" + 0.009*"could" + 0.009*"state"
Topic 4: 0.018*"retard" + 0.016*"today" + 0.014*"look" + 0.013*"minute" + 0.013*"usd" + 0.012*"poor" + 0.012*"vega" + 0.012*"dkng" + 0.011*"boy" + 0.011*"nkla"
Topic 5: 0.120*"gt" + 0.071*"lt" + 0.059*"pop" + 0.012*"mod" + 0.012*"black" + 0.012*"flair" + 0.011*"apple" + 0.009*"got" + 0.008*"texas" + 0.008*"mother"
Topic 6: 0.065*"http" + 0.060*"com" + 0.031*"www" + 0.017*"amp" 

In [104]:
# Review and Summary of LDA Model:
# 
# The method LDA performed unconvincingly. Many of the topics are poor and have little theme. The best performing topic
# was associating "www" with "com", and "http". The second best topic was grouping "elon" with "tesla". That was about as
# good as it got. As for the hyper parameter tuning the best performing number of topics was 8 and 11. That seemed to be the 
# sweet spot. The models on the edges ie 3 and 18 both were poor. The model with 5 was okay, but 8 and 11 were better.
# Overall, LDA seems to not be the choice for this dataset. 
# Next steps, either alter the underlying data or try a new model.
# What we will do, we will apply a new model.

In [106]:
#

In [107]:
#

In [108]:
# Second Model

In [109]:
#

In [110]:
#

In [111]:
#

In [112]:
# The second model will be BERTopic

In [114]:
warnings.filterwarnings("ignore")

In [115]:
from bertopic import BERTopic

In [116]:
# Build the model
model = BERTopic(language="english")

In [117]:
# The BERTopic model takes text data in as a regular sentence. It does not take text that has been pretokenized.
# Thus, to use BERTopic we must repreprocess the data from the raw form and leave it as a linear sentence (untokenized).
uncleandf = pd.read_csv(m9_1_5)
uncleandf2 = pd.read_csv(m1_5_2)
uncleandf3 = pd.read_csv(m2_3)
uncleandf4 = pd.read_csv(m3_4_5)

In [118]:
import re

In [119]:
# filter so we work with only quality comments that recieved awards.
#     (This keeps the data consistent with the previous LDA model explored above.)

uncleandf = uncleandf[uncleandf.total_awards_received >= 1]
uncleandf2 = uncleandf2[uncleandf2.total_awards_received >= 1]
uncleandf3 = uncleandf3[uncleandf3.total_awards_received >= 1]
uncleandf4 = uncleandf4[uncleandf4.total_awards_received >= 1]

In [120]:
# Define how to remove special characters
def cleaner(text):
    return re.sub(r'[^\x00-\x7F]+', ' ', text)

In [121]:
# Removing na values

# clean df 1

uncleanishdf = uncleandf.dropna()
cleanerdf = uncleanishdf.body.apply(cleaner)

In [122]:
# clean df 2
uncleanishdf2 = uncleandf2.dropna()
cleanerdf2 = uncleanishdf2.body.apply(cleaner)

In [123]:
# clean df 3
uncleanishdf3 = uncleandf3.dropna()
cleanerdf3 = uncleanishdf3.body.apply(cleaner)

In [124]:
# clean df 4
uncleanishdf4 = uncleandf4.dropna()
cleanerdf4 = uncleanishdf4.body.apply(cleaner)

In [125]:
# combine the dfs

combertdf = pd.concat([cleanerdf, cleanerdf2, cleanerdf3, cleanerdf4], ignore_index=True)

In [126]:
combertdf.head()

0    MGM $20 6/12 and AAL $16 6/12... hold over the...
1                                  I SAY WHEN WE SELL 
2    Imagine having Elon musk tendies and gilding p...
3                                        Why the gold?
4    I am holding bud and I am drinking bud. It wil...
Name: body, dtype: object

In [127]:
# Convert the data from a dataframe to a list with each row in the df being an element in the list.
# This is the format that BERTopic expects.
list_of_rows = combertdf.tolist()

In [128]:
len(list_of_rows)

15025

In [129]:
list_of_rows

['MGM $20 6/12 and AAL $16 6/12... hold over the weekend or dump EOD?',
 'I SAY WHEN WE SELL ',
 'Imagine having Elon musk tendies and gilding ppl just to fuel the rumors that you lurk here  ',
 'Why the gold?',
 'I am holding bud and I am drinking bud. It will take me to Valhalla\n\n19/6 50c',
 'Just saw someone complaining about having to wake up at 6:30am to trade...try 3:30am  ',
 'any recommendations for tmrw s plays ?',
 'Why do you guys like NKLA?',
 'Time for vote:\n1. Go to bed\n2. Collect more awards\n\nPut your answer in the comments',
 "Can't sleep as a poor put bag holder",
 'Bears are dead algos won. same thing that happened in 2019 happened now on a bigger scale. Even airlines are going up and we know those are trash except a very select few. Only hope bears have is a double dip recession.',
 'Please pump more bby',
 "I think at this point, you must be crazy to sell if you're holding long. Who is going to want to sell?",
 'Holding calls on BABA, EWH, INTC, ISRG, JD, MAT,

In [130]:
# Train model
topics, probabilities = model.fit_transform(list_of_rows)

In [131]:
topics_df = model.get_topics()

In [132]:
# View results
topics_df

{-1: [('and', 0.004419129868802029),
  ('the', 0.004270971199948869),
  ('to', 0.004153243317809024),
  ('my', 0.0040347774722752465),
  ('of', 0.003996356414691705),
  ('in', 0.0039410854627743306),
  ('it', 0.003862879361615106),
  ('you', 0.003856461233478777),
  ('is', 0.0038531265607990384),
  ('that', 0.003776825240043343)],
 0: [('awards', 0.06859637962156721),
  ('award', 0.05982105377982032),
  ('reddit', 0.017894621277735157),
  ('an', 0.01641639324533728),
  ('giving', 0.014146414585188139),
  ('coins', 0.012179489354300998),
  ('give', 0.012143841312476752),
  ('usayter', 0.009160628983455262),
  ('these', 0.008406233172019644),
  ('who', 0.007806662062497542)],
 1: [('bears', 0.05140677802815847),
  ('bear', 0.03605884493233548),
  ('bulls', 0.018124512269866094),
  ('bull', 0.013669959324598198),
  ('gay', 0.009005121170816),
  ('bearish', 0.007156786625106949),
  ('fuk', 0.006731813139756219),
  ('are', 0.005659201420826336),
  ('all', 0.005398255420157646),
  ('we', 0.0

In [144]:
# Analyizing results

# The BERTopic transformer based embedding model did very well.
# The model not only made more coherent categories, it generated many more interesting categories than LDA.
# It was able to capture many of the subreddits main themes like "Telsa", "Game Stop", and gambling themes. 
# Below I pulled out 6 topics that I think the model performed the best on.

# Number one is Tesla, or "tesla"

#  3: [('tesla', 0.08158662258873242),
#  ('elon', 0.00905030191481464),
#  ('battery', 0.008545873757071432),
#  ('nations', 0.006948797608525675),
#  ('factory', 0.006469769663712668),
#  ('teslas', 0.006168023753011131),
#  ('2000c', 0.006118294979098997),
#  ('nio', 0.005998332742021278),
#  ('will', 0.005762107913299955),
#  ('texas', 0.005611765978407114)],

# The "tesla" grouping by the model makes sense and is very consistent, the only outlier is "something".


# The next topic is GameStop or "gme" as it's stock ticker is named.

#  4: [('gme', 0.09175089768379087),
#  ('vol', 0.012417485862585294),
#  ('010821', 0.011650041697443578),
#  ('squeeze', 0.010333077406847004),
#  ('shares', 0.008889474257940717),
#  ('hold', 0.008425048487025516),
#  ('cohen', 0.008137934757482226),
#  ('pm', 0.0076450378957698345),
#  ('short', 0.007081204534924276),
#  ('am', 0.006109906155460143)],

# This grouping ("gme") is very good. It catpures the most noteable thing the subreddit is known for,
# the GameStop short squeeze. If the reader of this does not know about the short squeeze here is a quick summary.
# The redditors learned that GameStop was highly shorted by various headge funds and the redditors bought the stock "gme"
# in a large quantity with the goal of raising the price and starting a "squeeze". This is a process where the short sellers
# are forced to either paid a high margin call or buy the stock at the inflated price there by raising the price even more
# causing a feed back loop. The redditors successfully pulled this off and rocketed the stock thousands of percent higher.
# The term "cohen" refers to Steve Cohen who was bailing out some of the hedge funds on the receiving end of the squeeze.
#           " Cohen’s Point72 Asset Management — with $750 million — joined with Citadel
#            to infuse money into Melvin Capital, one of the firms hit hard because of 
#             its short position on GameStop stock. "
#                     - https://ftw.usatoday.com/2021/01/gamestop-gme-stock-reddit-mets-steve-cohen
# The rest of the terms like "vol" refers to the stocks trading volume, "hold" means hold the stock, and so on.


# The remaining topics are grouping that I think demonstrate the quality of the BERTopic model.

#  6: [('covid', 0.030001353765252484),
#  ('vaccine', 0.020655497107332017),
#  ('cases', 0.013490710965769558),
# ('coronavirus', 0.010561340156497645),
#  ('covid19', 0.009714651270327397),
#  ('deaths', 0.009268591628925313),
#  ('tested', 0.008082732390620441),
#  ('flu', 0.007017042710241926),
#  ('positive', 0.00692928405119588)],

# The subreddit is known for gambling and calling it 'investing'.
# Below you will notice the topic is "lost" and the most likely word associated with it is "5k"
# which I think is quite telling.


# 17: [('lost', 0.03848050792614471),
#  ('5k', 0.025656289533326108),
#  ('gains', 0.01692183198178607),
#  ('lose', 0.015836177631764793),
#  ('turned', 0.012828144766663054),
#  ('text', 0.0126496145637523),
#  ('money', 0.011860365709450104),
#  ('20k', 0.011241361052477332),
#  ('4k', 0.010893339966800381),
#  ('10k', 0.010732867285126691)],


#  29: [('vegas', 0.07793905714985044),
#  ('casinos', 0.03122405971849316),
#  ('las', 0.01959177332939142),
#  ('casino', 0.016838398877917344),
#  ('mgm', 0.015637326578297053),
#  ('reopening', 0.012871299850492405),
#  ('roundtable', 0.01243115966323418),
#  ('packed', 0.011646405021639859),
#  ('opening', 0.011287646023743256),
#  ('maps', 0.010873861249223)],

#  45: [('meth', 0.06450988839204563),
#  ('cocaine', 0.04074308740550251),
#  ('drugs', 0.026563132691135694),
#  ('adderall', 0.025838110805884235),
#  ('acid', 0.02153175900490353),
#  ('mushrooms', 0.015041276306591318),
#  ('surge', 0.012919055402942118),
#  ('dopamine', 0.012919055402942118),
#  ('heroin', 0.012919055402942118),
#  ('strip', 0.012291298532983221)],

In [134]:
# Applying the BERTopic model to all of the data.
# We will use the same model but apply it to more data and see the performance.

In [135]:
# Here we are defining how we will gather all our data
# The list named "data" is a list that contains the names of the files that we wish to import

data = [m0_9, m9_1_5, m1_5_2, m2_3, m3_4_5, m5_6, m6_7, m7_7_5, m7_5_8, m8_9, m9_10, m10_10_5, m10_5_11, m11_12, m12_13_5, m13_5_14, m14_15, m15_16, m16_16_5, m16_5_18, m18_19, m19_19_5, m19_5_20, m20_21, m21_22, m22_22_5, m22_5_24, m24_249, m25_26, m26_27_5, m27_5_28, m28_29]

# Below "data_adder" is a function that will load in the data, processes it to drop na values, filter out comments
# with less than 1 award, and add the resultant data to our df "empty_df". The function will iterate this process through
# the list "data" until it reaches the desired number of files specified by the parameter "num".

# the parameter "num" is the number of above datasets that will be processed and added

def data_adder(num):
    empty_df = pd.DataFrame()
    for i in range(num):
        tempdf = pd.read_csv(data[i])
        r = tempdf[tempdf.total_awards_received >= 1]
        f = r.dropna()
        o = f.body.apply(cleaner)
        empty_df = pd.concat([empty_df, o])
    return empty_df
        

        

In [136]:
# Here we are saving all the data that we will load in to the variable named "test"

test = data_adder(28)

# And we will output the number of rows in the dataset

len(test)

92051

In [137]:
# Here we will concatenate the data with the previous data to have a df with all the data we have available

newdata = pd.concat([combertdf, test.stack()])

In [138]:
# Here we are converting the data from type dataframe to a list with each element being a row

moredataisalwaysbetter = newdata.tolist()

In [139]:
# Build the second BERTopic model

model2 = BERTopic(language="english")

In [140]:
# Training the model

topics2, probabilities2 = model2.fit_transform(moredataisalwaysbetter)

In [141]:
# Getting the outputs of the model

secondtopicdf = model2.get_topics()

In [142]:
# Here we will see the outputs of the second model

secondtopicdf

{-1: [('and', 0.0007464168852368623),
  ('market', 0.0007126064277738983),
  ('of', 0.0007112934793451529),
  ('that', 0.0007060008684774118),
  ('the', 0.0007040985798314045),
  ('to', 0.000699298334263239),
  ('money', 0.000698724034908958),
  ('stock', 0.0006977062114296381),
  ('in', 0.0006976207602412313),
  ('but', 0.00069650805319462)],
 0: [('tesla', 0.03887975977318596),
  ('2000', 0.0028740603498530015),
  ('battery', 0.0027658985648652723),
  ('earnings', 0.0026681292506576),
  ('car', 0.0025334173643646613),
  ('model', 0.0024996921752613406),
  ('run', 0.0024867831575173514),
  ('nikola', 0.0023821307432788743),
  ('800', 0.002351570271457291),
  ('900', 0.0023430918716459055)],
 1: [('calls', 0.016414042087210905),
  ('1002', 0.009664419814061069),
  ('call5', 0.009140644178322244),
  ('exp', 0.008973349872646497),
  ('x1000', 0.008404656892458845),
  ('x500', 0.007939817135792088),
  ('x5000', 0.007192959253231302),
  ('call', 0.0069953486054680205),
  ('1016', 0.0068375

In [143]:
# Summary of the BERTopic model:
# The BERTopic model performed very well. The topics are coherent, well formed, and logical. BERTopic is also able to output 
# a huge amount of topics and has in-built functions to go more indepth on any particular topic. The strange thing is that
# the quality of each topic seemed to drop after expanding the dataset from 15,025 to 92,051.
#                                                            (NOTE: These low numbers are a result of filtering for comments
#                                                                   with awards from the larger dataset that has 29,000,000
#                                                                   comments. 15,025 and 92,051 are just the awarded comments.
#                                                                    The runtime of BERTopic on all 29,000,000 comments would
#                                                                     be about 12 days.)
# The reason for this drop could be due to the following reasons,
# - More data in a subreddit will increased the time period that these discussions took place over and thus each topic
#    may change overtime. 
# - The small amount of data introduced less noise and happened to have higher quality comments.
# - The larger amount of data made the model have to fit more datapoints and thus made the topics become more general.
#
#  Regardless of the reason the model dropped in performance, BERTopic is a strong topic modeler and produced great results.
