## BERTopic Modeling

In [1]:
#ASK TEAM ON HOW TO DETERMINE A GOOD TOPIC MODEL 
#LOOK INTO SETTING A SEED FOR THE TOPIC MODELING TO SAVE THE BEST MODEL*
#LOOK INTO HUGGINGFACE FOR SENTIMENT ANALYSIS MODEL 

In [96]:
import pandas as pd
import numpy as np
from bertopic import BERTopic
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from sentence_transformers import SentenceTransformer

In [97]:
#read in original dataset
dep5 = pd.read_csv('dep.csv')
dep5

Unnamed: 0,index,subreddit,date,post,covid period,gender
0,0,depression,2019-08-28,the sun hasnt even came up and im f ready to g...,pre,female
1,1,depression,2019-08-28,i am just so exhausted im f currently in an in...,pre,female
2,2,depression,2019-10-08,not sure if i am depressed burnt out or if soc...,pre,female
3,3,depression,2019-10-08,hi this is my first post just wanted to talk t...,pre,female
4,4,depression,2018-11-07,i f felt like i had my shit together at now i ...,pre,female
...,...,...,...,...,...,...
847,440,depression,2023-07-25,hello all i m and my girlfriend f have been to...,post,male
848,441,depression,2023-07-25,hey yall been depressed since i was m now bee...,post,male
849,442,depression,2023-07-25,i am m i used to be a chain smoker and i decid...,post,male
850,443,depression,2023-07-24,im m i am studying in college but never had a ...,post,male


In [98]:
#From the eda, remove popular words that are not that significant 
common_words = ["feel", "like", "just", "im", "know", "need", "tl", 
               "dr", "tldr","amp", "nbsp", "really", "mom", "mum"
               , "parent", "nparent", "nmom", "dad", "father", 
                "brother", "ex", "nex", "sister", "ns", "mother"
               "because", "wife", "girlfriend", "husband", "boyfriend", "gf", "bf", "ive", "ve", "don"
               "his", "she", "shes", "hes", "https", "www", "com", "spotify", "youtube", "google", "docs"]

time_words = ["time", "today", "tomorrow", "yesterday", "morning", 
              "afternoon", "night", "day", "week", "weekend", "month",
             "year", "days", "weeks", "weekends", "months", "years", 
              "monday", "tuesday", "wednesday", "thursday", "friday", 
              "saturday", "sunday", "mon", "tues", "wed", "thurs", "fri",
              "sat", "sun", "january", "february", "march", "april", "may",
              "june", "july", "august", "september", "october", "november",
              "decemebr", "jan", "feb", "mar", "april", "may", "jun", "jul",
              "aug", "sep", "oct", "nov", "dec"]
        

#add both lists together
additional_stop_words = common_words + time_words

In [99]:
default_stop_words = set(TfidfVectorizer(stop_words="english").get_stop_words())
all_stop_words = list(default_stop_words.union(additional_stop_words))

In [100]:
all_stop_words

['noone',
 'perhaps',
 'us',
 'down',
 'eleven',
 'two',
 'weeks',
 'google',
 'me',
 'any',
 'others',
 'because',
 'days',
 'nmom',
 'next',
 'five',
 'sat',
 'he',
 'at',
 'mon',
 'then',
 'may',
 'ex',
 'many',
 'month',
 'to',
 'feb',
 'is',
 'which',
 're',
 'docs',
 'time',
 'november',
 'been',
 'back',
 'jan',
 'that',
 'nevertheless',
 'up',
 'former',
 'just',
 'nobody',
 'six',
 'motherbecause',
 'very',
 'nowhere',
 'toward',
 'whoever',
 'the',
 'since',
 'october',
 'cry',
 'four',
 'sunday',
 'somehow',
 'whom',
 'who',
 'towards',
 'none',
 'tuesday',
 'april',
 'of',
 'keep',
 'over',
 'someone',
 'hereupon',
 'how',
 'amount',
 'becomes',
 'always',
 'amoungst',
 'where',
 'much',
 'throughout',
 'do',
 'yours',
 'brother',
 'everywhere',
 'sister',
 'was',
 'a',
 'full',
 'one',
 'nothing',
 'yesterday',
 'am',
 'empty',
 'further',
 'months',
 'wednesday',
 'whole',
 'sincere',
 'beyond',
 'against',
 'ltd',
 'seeming',
 'hes',
 'com',
 'from',
 'have',
 'these',
 

In [112]:
data = dep5['post']
vectorizer_model =  TfidfVectorizer(stop_words=all_stop_words,
                                    ngram_range=(2,3), sublinear_tf=True) 
sentence_model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
topic_model = BERTopic(vectorizer_model = vectorizer_model)
topics, probs = topic_model.fit_transform(data)

2023-08-03 18:24:32,497 - BERTopic - Transformed documents to Embeddings
2023-08-03 18:24:37,328 - BERTopic - Reduced dimensionality
2023-08-03 18:24:37,363 - BERTopic - Clustered reduced embeddings


In [113]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,305,-1_bad luck_switch schools_breast cancer_weigh...
1,0,82,0_ton work_motivated work_work semester_consta...
2,1,72,1_weight gain_bring doctor_mg wellbutrin_birth...
3,2,69,2_help help_help depression_thinks burden_expe...
4,3,38,3_medical student_tell parents_parents depress...
5,4,38,4_personal level_age look_make friendships_tru...
6,5,37,5_person thoughts_tell people kill_lot suicida...
7,6,36,6_open track_owmfjtupls lgphmhscwymepbbdz_cilq...
8,7,33,7_leave country_times wanted_end marriage_voic...
9,8,28,8_tells feels_didnt reply_depression coming_ha...


In [114]:
topic_grams = []
for k in range(len(set(topics))):
    cur_top = topic_model.get_topic(k)
    if cur_top:
        cur_d = {'topic number': k}
        for j in range(10):
            cur_d[f'topic ngram {j+1}'] = cur_top[j][0]
        topic_grams.append(cur_d)
topics_df = pd.DataFrame(topic_grams)

In [115]:
topics_df

Unnamed: 0,topic number,topic ngram 1,topic ngram 2,topic ngram 3,topic ngram 4,topic ngram 5,topic ngram 6,topic ngram 7,topic ngram 8,topic ngram 9,topic ngram 10
0,0,ton work,motivated work,work semester,constant reminder,lets talk,instead working,tired wanna,bell lets talk,depression anxiety better,feels break
1,1,weight gain,bring doctor,mg wellbutrin,birth control,brain zaps,breaking things,low dose,prescribed mg,taking lexapro,wellbutrin xl
2,2,help help,help depression,thinks burden,experienced depression,care theres,say help,longer depressed,overcome depressed,questioning deserves thinks,felt potential
3,3,medical student,tell parents,parents depression,making world war,depression function,hand fair help,hand fair,think ending miserable,hours pretty introverted,hospital dont want
4,4,personal level,age look,make friendships,true friend,friends getting,exhausting talk,dont leave house,age look old,awkward good,thoughts racing
5,5,person thoughts,tell people kill,lot suicidal,lot suicidal thoughts,thoughts suicide,gonna tonight,fourth hs,fine hate,leave sub,thoughts id
6,6,open track,owmfjtupls lgphmhscwymepbbdz,cilqvodbwctphoukbs zll,hza tx,text epa,rrxfnbcijduqjpdznryij rh fthhb,owmfjtupls lgphmhscwymepbbdz kknu,px ipuiqleegvkftaenz jowwbbsjmkgogy,px ipuiqleegvkftaenz,gg tn
7,7,leave country,times wanted,end marriage,voice god,wanted kill,money parents,getting divorce,dont rlly,waiting home,emotionally abusive
8,8,tells feels,didnt reply,depression coming,happened told,genuinely care,needs help,doesnt want,want leave house,mother left,help hey
9,9,end seriously,sure wont,girl truly,taking life,edge breakup,closest got crying,friends suggested did,polyamorous said texting,close taking,close taking life


In [117]:
topic_model.get_topic(6)

[('open track', 0.006578701689857102),
 ('owmfjtupls lgphmhscwymepbbdz', 0.006436095405171005),
 ('cilqvodbwctphoukbs zll', 0.006436095405171005),
 ('hza tx', 0.006436095405171005),
 ('text epa', 0.006436095405171005),
 ('rrxfnbcijduqjpdznryij rh fthhb', 0.006436095405171005),
 ('owmfjtupls lgphmhscwymepbbdz kknu', 0.006436095405171005),
 ('px ipuiqleegvkftaenz jowwbbsjmkgogy', 0.006436095405171005),
 ('px ipuiqleegvkftaenz', 0.006436095405171005),
 ('gg tn', 0.006436095405171005)]

In [132]:
#Saving the bertopic model as a csv
topics_df.to_csv("BERTopic_model.csv", index = False)

In [120]:
dep5.head()

Unnamed: 0,index,subreddit,date,post,covid period,gender
0,0,depression,2019-08-28,the sun hasnt even came up and im f ready to g...,pre,female
1,1,depression,2019-08-28,i am just so exhausted im f currently in an in...,pre,female
2,2,depression,2019-10-08,not sure if i am depressed burnt out or if soc...,pre,female
3,3,depression,2019-10-08,hi this is my first post just wanted to talk t...,pre,female
4,4,depression,2018-11-07,i f felt like i had my shit together at now i ...,pre,female


In [131]:
pre = pd.read_csv("depression_pre.csv")

pre[pre["post"].str.contains("spotify")].to_csv("spotify.csv", index = False)

In [130]:
pre = pd.read_csv("depression_pre.csv")
pre.iloc[7893]['post']

'I made a Hip Hop playlist called "Lamenting &amp; Sad Hip Hop". It\'s helped me through tough times, so maybe it can help you too. [The Playlist](https://open.spotify.com/user/9jif2jecb7qpfw25qyz20c9ed/playlist/2Ka8i0P9BGogjccGz0FiE5?si=-p9yt61-QLqQhbqS2uED1w)\n\nHip Hop has helped me get through a lot of rough patches in my life. I\'ve been listening to it for 10+ years, so I hope this playlist can really help anyone dealing with depression or anxiety like I am.'