In [1]:
!pip install bertopic
!pip install openai
!pip install umap-learn
!pip install hdbscan




In [2]:
import pandas as pd
import openai
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, OpenAI, PartOfSpeech
from sentence_transformers import SentenceTransformer
import pickle
from umap import UMAP
from hdbscan import HDBSCAN

from sklearn.feature_extraction.text import CountVectorizer

In [3]:
import os
os.getcwd()

'/content'

In [4]:
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Change to the desired directory in Google Drive
os.chdir('/content/drive/My Drive/Colab Notebooks/Mental Health Depression Bertopic')

# Verify by listing files in the current directory
print(os.listdir())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
['data', 'my_model_dir', 'Sentiment Dsitribution and Topic Analysis.ipynb', 'previous reddir small data', 'Depression Data Cleaning and Preprocess', 'DepressionRobertaSentimentAnalysis', 'Topic_modeling_BERTopic.ipynb']


In [5]:
#Read data
data = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Mental Health Depression Bertopic/data/depression_sentiments_roberta.csv')

In [6]:
print(data.shape)

(7730, 10)


In [7]:
data.head()

Unnamed: 0,id,label,document,created_utc,clean_text,negative,neutral,positive,sentiment,user
0,0,tensor(0),we understand that most people who reply immed...,0,understand people reply immediately op invitat...,0.123658,0.683168,0.193175,neutral,depression
1,1,tensor(0),welcome to r depression s check in post a plac...,0,welcome r depression check post place take mom...,0.265393,0.642364,0.092243,neutral,depression
2,2,tensor(0),anyone else instead of sleeping more when depr...,0,anyone else instead sleeping depressed stay ni...,0.225754,0.600676,0.17357,neutral,depression
3,3,tensor(0),i ve kind of stuffed around a lot in my life d...,0,kind stuffed around lot life delaying inevitab...,0.868262,0.117783,0.013955,negative,depression
4,4,tensor(0),sleep is my greatest and most comforting escap...,0,sleep greatest comforting escape whenever wake...,0.355922,0.439947,0.204132,neutral,depression


In [8]:
print(data.groupby('user').id.nunique())

abstracts = data['clean_text'].astype('str')


user
control       3900
depression    3830
Name: id, dtype: int64


In [9]:
# Pre-calculate embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embedding = embedding_model.encode(abstracts, show_progress_bar=True)

'''with open('doc_embedding.pickle', 'wb') as pkl:
    pickle.dump(embeddings, pkl)'''

'''with open('doc_embedding.pickle', 'rb') as pkl:
    embedding = pickle.load(pkl)'''

umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42,low_memory=True)

hdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

vectorizer_model = CountVectorizer(stop_words="english", min_df=2, ngram_range=(1, 2))


Batches:   0%|          | 0/242 [00:00<?, ?it/s]

In [10]:
with open('doc_embedding.pickle', 'wb') as pkl:
    pickle.dump(embedding, pkl)

'''with open('doc_embedding.pickle', 'rb') as pkl:
    embedding = pickle.load(pkl)'''

"with open('doc_embedding.pickle', 'rb') as pkl:\n    embedding = pickle.load(pkl)"

In [11]:
# KeyBERT
keybert_model = KeyBERTInspired()

# Part-of-Speech
#pos_model = PartOfSpeech("en_core_web_sm")

# MMR
mmr_model = MaximalMarginalRelevance(diversity=0.3)


# All representation models
representation_model = {
    "KeyBERT": keybert_model,
    "MMR": mmr_model,
    #"POS": pos_model
}


In [12]:
topic_model = BERTopic(

  # Pipeline models
  embedding_model=embedding_model,
  umap_model=umap_model,
  hdbscan_model=hdbscan_model,
  vectorizer_model=vectorizer_model,
  representation_model=representation_model,

  # Hyperparameters
  top_n_words=10,
  verbose=True
)

topics,prob = topic_model.fit_transform(abstracts, embedding)
topic_model.get_topic_info()

2024-08-18 22:24:43,616 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-08-18 22:25:33,236 - BERTopic - Dimensionality - Completed ✓
2024-08-18 22:25:33,238 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-08-18 22:25:33,914 - BERTopic - Cluster - Completed ✓
2024-08-18 22:25:33,925 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-08-18 22:26:18,370 - BERTopic - Representation - Completed ✓


Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,MMR,Representative_Docs
0,-1,2994,-1_depression_wa_like_feel,"[depression, wa, like, feel, know, day, time, ...","[depression, depressed, anxiety, feel, feeling...","[depression, like, feel, day, work, people, ma...",[idk guess vent something whatever lately ever...
1,0,1448,0_life_want_feel_wa,"[life, want, feel, wa, like, friend, know, peo...","[depression, depressed, suicidal, care, feel, ...","[life, friend, feel like, think, family, day, ...",[starter never really childhood wa parent woul...
2,1,700,1_anxiety_panic_attack_like,"[anxiety, panic, attack, like, anxious, wa, fe...","[anxiety attack, anxiety, panic attack, zoloft...","[anxiety, panic, anxious, panic attack, sympto...",[whole life dealt trifecta depression anxiety ...
3,2,272,2_sleep_bed_awake_morning,"[sleep, bed, awake, morning, hour, wake, night...","[sleep, need sleep, sleep sleep, sleeping, abl...","[morning, woke, slept, going sleep, tired slee...","[cant sleep ugghhh, enough sleep, celesteclara..."
4,3,164,3_depression_depression depression_hide depres...,"[depression, depression depression, hide depre...","[depression depression, depression, result dep...","[hide depression, depression real, depression ...","[rsk depression, depression arai, engysmohamed..."
...,...,...,...,...,...,...,...
80,79,11,79_mileycyrus_mile_xoxo_counting,"[mileycyrus, mile, xoxo, counting, sleep, bett...","[mileycyrus, mile, cyrus, mile away, thousand ...","[mileycyrus, mile, xoxo, cyrus, sick love, tho...",[mileycyrus awww u seriously cutest dog miley ...
81,80,11,80_facebook_facebook com_apps facebook_http apps,"[facebook, facebook com, apps facebook, http a...","[apps facebook, dogbook profile, profile view,...","[facebook, facebook com, apps facebook, apps, ...",[strider sick little puppy http apps facebook ...
82,81,10,81_late_deadline_stayed late_looking tempting,"[late, deadline, stayed late, looking tempting...","[deadline, late work, delayed, busy, stayed la...","[deadline, mutt, spray paint, problem finished...","[damn missed gsoc apply deadline, damn late fi..."
83,82,10,82_hate_money people_gave love_know story,"[hate, money people, gave love, know story, st...","[hate, hating, dislike, strongly dislike, real...","[hate, money people, gave love, strongly disli...","[everyone hate much, never thought could hate ..."


In [13]:
df = data.copy()
posts = abstracts.to_list()
ids = df['id'].to_list()
sentiments = df['sentiment'].to_list()
users = df['user'].to_list()
created_utc = df['created_utc'].to_list()
negative = df['negative'].to_list()
neutral = df['neutral'].to_list()
positive = df['positive'].to_list()

In [14]:
# Reduce outliers with pre-calculate embeddings instead
new_topics = topic_model.reduce_outliers(abstracts, topics, strategy="c-tf-idf")
topic_model.update_topics(abstracts, topics=new_topics)
topic_model.get_topic_info()



Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,MMR,Representative_Docs
0,-1,37,-1_snoo_ahhhhhhhhhhhhh_nomadicmatt_featherinair,"[snoo, ahhhhhhhhhhhhh, nomadicmatt, featherina...","[depression, depressed, anxiety, feel, feeling...","[depression, like, feel, day, work, people, ma...",[idk guess vent something whatever lately ever...
1,0,1994,0_life_want_feel_wa,"[life, want, feel, wa, like, know, friend, eve...","[depression, depressed, suicidal, care, feel, ...","[life, friend, feel like, think, family, day, ...",[starter never really childhood wa parent woul...
2,1,843,1_anxiety_panic_attack_wa,"[anxiety, panic, attack, wa, like, anxious, fe...","[anxiety attack, anxiety, panic attack, zoloft...","[anxiety, panic, anxious, panic attack, sympto...",[whole life dealt trifecta depression anxiety ...
3,2,347,2_sleep_bed_awake_morning,"[sleep, bed, awake, morning, hour, ugh, night,...","[sleep, need sleep, sleep sleep, sleeping, abl...","[morning, woke, slept, going sleep, tired slee...","[cant sleep ugghhh, enough sleep, celesteclara..."
4,3,311,3_depression_co_battle_http,"[depression, co, battle, http, cure, real, sta...","[depression depression, depression, result dep...","[hide depression, depression real, depression ...","[rsk depression, depression arai, engysmohamed..."
...,...,...,...,...,...,...,...
80,79,25,79_mileycyrus_hahaha_nite_counting,"[mileycyrus, hahaha, nite, counting, ouch, mil...","[mileycyrus, mile, cyrus, mile away, thousand ...","[mileycyrus, mile, xoxo, cyrus, sick love, tho...",[mileycyrus awww u seriously cutest dog miley ...
81,80,21,80_facebook_apps_profile_view,"[facebook, apps, profile, view, dogbook, com, ...","[apps facebook, dogbook profile, profile view,...","[facebook, facebook com, apps facebook, apps, ...",[strider sick little puppy http apps facebook ...
82,81,44,81_late_fb_deadline_spray,"[late, fb, deadline, spray, delayed, unfortuna...","[deadline, late work, delayed, busy, stayed la...","[deadline, mutt, spray paint, problem finished...","[damn missed gsoc apply deadline, damn late fi..."
83,82,37,82_hate_comment_ohhh_boy,"[hate, comment, ohhh, boy, ride, update, oanhl...","[hate, hating, dislike, strongly dislike, real...","[hate, money people, gave love, strongly disli...","[everyone hate much, never thought could hate ..."


In [None]:
# This part is removed, we have much less topics than the original research
"""
# reduce topics
topic_model.reduce_topics(abstracts, nr_topics='auto')

# Access updated topics
topics = topic_model.topics_
topic_model.get_topic_info()
"""

In [15]:
# Access updated topics
topics = topic_model.topics_
topic_model.get_topic_info()


Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,MMR,Representative_Docs
0,-1,37,-1_snoo_ahhhhhhhhhhhhh_nomadicmatt_featherinair,"[snoo, ahhhhhhhhhhhhh, nomadicmatt, featherina...","[depression, depressed, anxiety, feel, feeling...","[depression, like, feel, day, work, people, ma...",[idk guess vent something whatever lately ever...
1,0,1994,0_life_want_feel_wa,"[life, want, feel, wa, like, know, friend, eve...","[depression, depressed, suicidal, care, feel, ...","[life, friend, feel like, think, family, day, ...",[starter never really childhood wa parent woul...
2,1,843,1_anxiety_panic_attack_wa,"[anxiety, panic, attack, wa, like, anxious, fe...","[anxiety attack, anxiety, panic attack, zoloft...","[anxiety, panic, anxious, panic attack, sympto...",[whole life dealt trifecta depression anxiety ...
3,2,347,2_sleep_bed_awake_morning,"[sleep, bed, awake, morning, hour, ugh, night,...","[sleep, need sleep, sleep sleep, sleeping, abl...","[morning, woke, slept, going sleep, tired slee...","[cant sleep ugghhh, enough sleep, celesteclara..."
4,3,311,3_depression_co_battle_http,"[depression, co, battle, http, cure, real, sta...","[depression depression, depression, result dep...","[hide depression, depression real, depression ...","[rsk depression, depression arai, engysmohamed..."
...,...,...,...,...,...,...,...
80,79,25,79_mileycyrus_hahaha_nite_counting,"[mileycyrus, hahaha, nite, counting, ouch, mil...","[mileycyrus, mile, cyrus, mile away, thousand ...","[mileycyrus, mile, xoxo, cyrus, sick love, tho...",[mileycyrus awww u seriously cutest dog miley ...
81,80,21,80_facebook_apps_profile_view,"[facebook, apps, profile, view, dogbook, com, ...","[apps facebook, dogbook profile, profile view,...","[facebook, facebook com, apps facebook, apps, ...",[strider sick little puppy http apps facebook ...
82,81,44,81_late_fb_deadline_spray,"[late, fb, deadline, spray, delayed, unfortuna...","[deadline, late work, delayed, busy, stayed la...","[deadline, mutt, spray paint, problem finished...","[damn missed gsoc apply deadline, damn late fi..."
83,82,37,82_hate_comment_ohhh_boy,"[hate, comment, ohhh, boy, ride, update, oanhl...","[hate, hating, dislike, strongly dislike, real...","[hate, money people, gave love, strongly disli...","[everyone hate much, never thought could hate ..."


In [17]:
d = pd.DataFrame({'ids':ids,'clean_text': posts,'topic': topics, 'probs':prob,'sentiments':sentiments,
                   'users':users,'created_utc':created_utc,'negative':negative,'neutral':neutral,
                 'positive':positive})

d.to_csv("Topic_df_with_sentiments.csv",index=False) # this new datafile contains original data + sentiment labels + topic modeling result

In [18]:
topics_df = pd.DataFrame(topic_model.get_topic_info())
topics_df.to_csv("Topic_representation.csv",index=False) # this is extra datafile from topic modeling which contains topic representation i.e. 10 words representation,which help us to explore the topics.

In [19]:
embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
topic_model.save("my_model_dir", serialization="safetensors", save_ctfidf=True, save_embedding_model=embedding_model)


In [20]:
# Load from directory
loaded_model = BERTopic.load("my_model_dir")
loaded_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,MMR,Representative_Docs
0,-1,37,-1_snoo_ahhhhhhhhhhhhh_nomadicmatt_featherinair,"[snoo, ahhhhhhhhhhhhh, nomadicmatt, featherina...","[depression, depressed, anxiety, feel, feeling...","[depression, like, feel, day, work, people, ma...",
1,0,1994,0_life_want_feel_wa,"[life, want, feel, wa, like, know, friend, eve...","[depression, depressed, suicidal, care, feel, ...","[life, friend, feel like, think, family, day, ...",
2,1,843,1_anxiety_panic_attack_wa,"[anxiety, panic, attack, wa, like, anxious, fe...","[anxiety attack, anxiety, panic attack, zoloft...","[anxiety, panic, anxious, panic attack, sympto...",
3,2,347,2_sleep_bed_awake_morning,"[sleep, bed, awake, morning, hour, ugh, night,...","[sleep, need sleep, sleep sleep, sleeping, abl...","[morning, woke, slept, going sleep, tired slee...",
4,3,311,3_depression_co_battle_http,"[depression, co, battle, http, cure, real, sta...","[depression depression, depression, result dep...","[hide depression, depression real, depression ...",
...,...,...,...,...,...,...,...
80,79,25,79_mileycyrus_hahaha_nite_counting,"[mileycyrus, hahaha, nite, counting, ouch, mil...","[mileycyrus, mile, cyrus, mile away, thousand ...","[mileycyrus, mile, xoxo, cyrus, sick love, tho...",
81,80,21,80_facebook_apps_profile_view,"[facebook, apps, profile, view, dogbook, com, ...","[apps facebook, dogbook profile, profile view,...","[facebook, facebook com, apps facebook, apps, ...",
82,81,44,81_late_fb_deadline_spray,"[late, fb, deadline, spray, delayed, unfortuna...","[deadline, late work, delayed, busy, stayed la...","[deadline, mutt, spray paint, problem finished...",
83,82,37,82_hate_comment_ohhh_boy,"[hate, comment, ohhh, boy, ride, update, oanhl...","[hate, hating, dislike, strongly dislike, real...","[hate, money people, gave love, strongly disli...",


In [21]:
d.columns

Index(['ids', 'clean_text', 'topic', 'probs', 'sentiments', 'users',
       'created_utc', 'negative', 'neutral', 'positive'],
      dtype='object')

In [22]:
topics_df.columns

Index(['Topic', 'Count', 'Name', 'Representation', 'KeyBERT', 'MMR',
       'Representative_Docs'],
      dtype='object')

In [23]:
topics_df['Representative_Docs'].isna().sum()

0

In [24]:
topics_df['Representative_Docs']

Unnamed: 0,Representative_Docs
0,[idk guess vent something whatever lately ever...
1,[starter never really childhood wa parent woul...
2,[whole life dealt trifecta depression anxiety ...
3,"[cant sleep ugghhh, enough sleep, celesteclara..."
4,"[rsk depression, depression arai, engysmohamed..."
...,...
80,[mileycyrus awww u seriously cutest dog miley ...
81,[strider sick little puppy http apps facebook ...
82,"[damn missed gsoc apply deadline, damn late fi..."
83,"[everyone hate much, never thought could hate ..."


In [25]:
topics_df['Representation']

Unnamed: 0,Representation
0,"[snoo, ahhhhhhhhhhhhh, nomadicmatt, featherina..."
1,"[life, want, feel, wa, like, know, friend, eve..."
2,"[anxiety, panic, attack, wa, like, anxious, fe..."
3,"[sleep, bed, awake, morning, hour, ugh, night,..."
4,"[depression, co, battle, http, cure, real, sta..."
...,...
80,"[mileycyrus, hahaha, nite, counting, ouch, mil..."
81,"[facebook, apps, profile, view, dogbook, com, ..."
82,"[late, fb, deadline, spray, delayed, unfortuna..."
83,"[hate, comment, ohhh, boy, ride, update, oanhl..."


In [27]:
sorted_topics = topics_df.sort_values(by="Count", ascending=False)

In [44]:
sorted_topics.loc[0, 'Representation']

['snoo',
 'ahhhhhhhhhhhhh',
 'nomadicmatt',
 'featherinair',
 'beeeaaach',
 'zaibatsu',
 'kateblogs',
 'keongzai',
 'yuddylicious',
 'marlonjenglish']

In [48]:
sorted_topics.loc[1, 'Representation']

['life',
 'want',
 'feel',
 'wa',
 'like',
 'know',
 'friend',
 'even',
 'people',
 'time']

topic 0 is not very clear, it might be life or friendship

In [49]:
sorted_topics.loc[2, 'Representation']

['anxiety',
 'panic',
 'attack',
 'wa',
 'like',
 'anxious',
 'feel',
 'get',
 'also',
 'time']

In [51]:
print("topic" + str(sorted_topics.loc[2, 'Topic']) +  "is most probably feelings")

topic1is most probably feelings


In [47]:
sorted_topics

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,MMR,Representative_Docs
1,0,1994,0_life_want_feel_wa,"[life, want, feel, wa, like, know, friend, eve...","[depression, depressed, suicidal, care, feel, ...","[life, friend, feel like, think, family, day, ...",[starter never really childhood wa parent woul...
2,1,843,1_anxiety_panic_attack_wa,"[anxiety, panic, attack, wa, like, anxious, fe...","[anxiety attack, anxiety, panic attack, zoloft...","[anxiety, panic, anxious, panic attack, sympto...",[whole life dealt trifecta depression anxiety ...
3,2,347,2_sleep_bed_awake_morning,"[sleep, bed, awake, morning, hour, ugh, night,...","[sleep, need sleep, sleep sleep, sleeping, abl...","[morning, woke, slept, going sleep, tired slee...","[cant sleep ugghhh, enough sleep, celesteclara..."
4,3,311,3_depression_co_battle_http,"[depression, co, battle, http, cure, real, sta...","[depression depression, depression, result dep...","[hide depression, depression real, depression ...","[rsk depression, depression arai, engysmohamed..."
5,4,244,4_work_working_bored_day,"[work, working, bored, day, office, boring, we...","[work, work work, working, going work, job, wo...","[day work, boring, productive, work today, wor...","[work, work, work]"
...,...,...,...,...,...,...,...
51,50,22,50_birthday_officially_chore_fucking,"[birthday, officially, chore, fucking, happy, ...","[birthday today, spend birthday, today birthda...","[birthday, birthday im, party, birthday today,...",[oh officially birthday happy rd birthday look...
81,80,21,80_facebook_apps_profile_view,"[facebook, apps, profile, view, dogbook, com, ...","[apps facebook, dogbook profile, profile view,...","[facebook, facebook com, apps facebook, apps, ...",[strider sick little puppy http apps facebook ...
77,76,20,76_puppy_dog_doggie_ur,"[puppy, dog, doggie, ur, poor, away, bout, dea...","[want puppy, puppy, dog, doggie, breed, really...","[puppy, poor, doggie, want puppy, study away, ...",[ill make fresh start promise xtra sad puppy f...
73,72,20,72_border_takeusbacktochina_restricted_fmwangy...,"[border, takeusbacktochina, restricted, fmwang...","[depression suicidal, word depression, stress ...","[border, suicidal attempt, word depression, de...",[border restricted victim day day getting engu...


In [55]:
sorted_topics[["Name","Count"]]

Unnamed: 0,Name,Count
1,0_life_want_feel_wa,1994
2,1_anxiety_panic_attack_wa,843
3,2_sleep_bed_awake_morning,347
4,3_depression_co_battle_http,311
5,4_work_working_bored_day,244
...,...,...
51,50_birthday_officially_chore_fucking,22
81,80_facebook_apps_profile_view,21
77,76_puppy_dog_doggie_ur,20
73,72_border_takeusbacktochina_restricted_fmwangy...,20


In [56]:
sorted_topics.loc[6:20,["Name","Count"]]

Unnamed: 0,Name,Count
6,5_twitter_tweet_tweetdeck_update,149
12,11_quot_song_album_fm,117
17,16_co_http_depression_mentalhealth,114
10,9_sick_ill_feeling_hope,105
7,6_kal_kutner_penn_episode,104
16,15_tomorrow_school_holiday_today,99
8,7_hungry_lunch_dinner_chicken,89
11,10_eye_hurt_leg_foot,85
14,13_job_interview_work_bos,83
18,17_waiting_train_traffic_bus,83


By looking at names if we make guesses we have the following topics:

In [57]:
Topic_names = ["life","anxiety","sleep","depression","work","twitter","music","mental health",
               "health and emotional well-being","TV shows","school","food","body parts",
               "job","public transport","french","weather"]