<a href="https://colab.research.google.com/github/andrea-mar/BBK_MScDataScience2reddit_posts/blob/main/BERTopic_reddit_posts.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

BERTopic  - https://pypi.org/project/bertopic/

Optimised code for large data sets (for GPU)
source code : https://colab.research.google.com/drive/1W7aEdDPxC29jP99GGZphUlqjMFFVKtBC?usp=sharing#scrollTo=swkRsYLdC9YE

In [1]:
!nvidia-smi

Sun Nov 19 10:54:21 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   68C    P8    12W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
%%capture
%pip install bertopic

In [3]:
%%capture
%pip install git+https://github.com/MaartenGr/BERTopic.git@master

%pip install cudf-cu11 dask-cudf-cu11 --extra-index-url=https://pypi.nvidia.com
%pip install cuml-cu11 --extra-index-url=https://pypi.nvidia.com
%pip install cugraph-cu11 --extra-index-url=https://pypi.nvidia.com
%pip install cupy-cuda11x -f https://pip.cupy.dev/aarch64

%pip install safetensors
%pip install datasets
%pip install datashader
%pip install adjustText

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from google.colab import drive

In [5]:
# from plotly.offline import init_notebook_mode
# init_notebook_mode(connected=True)

In [6]:
drive.mount('/content/drive', force_remount=False)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
data_path = '/content/drive/MyDrive/MS_DS_NLP_project/english_posts_cleaned.csv'

In [8]:
data = pd.read_csv(data_path)
data.head(1)

Unnamed: 0,id,main_submission_id,comment_parent_id,subreddit,post_type,text,datetime,month,year,text_length,language,language_ft
0,is4ft9s,y2q46p,t3_y2q46p,autism,comment,I don t think it works like that,2022-10-13 05:58:56,10,2022,32,en,en


In [9]:
# BERTopic on Large Datasets

In [10]:
len(data.text)

522377

In [11]:
# if error, see notebooke below:
# https://colab.research.google.com/drive/13sspqiEZwso4NYTbsflpPyNFaVAAxUgr#scrollTo=xgAFgI15ddf6
import cuml
cuml.__version__

'23.10.00'

In [12]:
from sentence_transformers import SentenceTransformer

# Create embeddings
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
embeddings = model.encode(data.text, show_progress_bar=True)

Batches:   0%|          | 0/16325 [00:00<?, ?it/s]

In [13]:
# save embedings
import numpy as np

with open('/content/drive/MyDrive/MS_DS_NLP_project/models/bert_topic_embeddings.npy', 'wb') as f:
    np.save(f, embeddings)

In [14]:
embeddings = np.load('/content/drive/MyDrive/MS_DS_NLP_project/models/bert_topic_embeddings.npy')

In [15]:
import collections
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer

# Extract vocab to be used in BERTopic
vocab = collections.Counter()
tokenizer = CountVectorizer().build_tokenizer()
for doc in tqdm(data.text):
  vocab.update(tokenizer(doc))
vocab = [word for word, frequency in vocab.items() if frequency >= 10] # only include words that appear in at least 10 documents ( = explude very rare and unique words )
len(vocab)

100%|██████████| 522377/522377 [00:21<00:00, 24256.38it/s]


38858

In [16]:
from cuml.manifold import UMAP
from cuml.cluster import HDBSCAN
from bertopic import BERTopic

# Prepare sub-models
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')   # https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
umap_model = UMAP(n_components=10, n_neighbors=50, random_state=42, metric="cosine", verbose=True) # reduce dimensionality of the input data to 10 dimentions
hdbscan_model = HDBSCAN(min_samples=20, gen_min_span_tree=True, prediction_data=True, min_cluster_size=20, verbose=True) # minimum number of samples required to form a cluster is 20
vectorizer_model = CountVectorizer(vocabulary=vocab, stop_words="english")

# Fit BERTopic without actually performing any clustering
bertopic_model= BERTopic(
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer_model,
        verbose=True
).fit(data.text, embeddings=embeddings)

[D] [11:06:29.216580] /__w/cuml/cuml/cpp/src/umap/runner.cuh:108 n_neighbors=50
[D] [11:06:29.217402] /__w/cuml/cuml/cpp/src/umap/runner.cuh:130 Calling knn graph run
[D] [11:07:45.003955] /__w/cuml/cuml/cpp/src/umap/runner.cuh:136 Done. Calling fuzzy simplicial set
[D] [11:07:45.105241] /__w/cuml/cuml/cpp/src/umap/fuzzy_simpl_set/naive.cuh:317 Smooth kNN Distances
[D] [11:07:45.106620] /__w/cuml/cuml/cpp/src/umap/fuzzy_simpl_set/naive.cuh:319 sigmas = [ 0.198672, 0.224045, 0.000720777, 0.153191, 0.209908, 0.220584, 0.0388853, 0.0235907, 0.0447377, 0.246539, 0.252918, 0.0359993, 0.0469379, 0.0135128, 0.0106627, 0.0368283, 0.0400955, 0.167955, 0.177165, 0.0183715, 0.0193916, 0.127753, 0.103453, 0.191617, 0.0293034 ]

[D] [11:07:45.109393] /__w/cuml/cuml/cpp/src/umap/fuzzy_simpl_set/naive.cuh:321 rhos = [ 4.76837e-07, 1.19209e-07, 0.00250971, 5.96046e-08, 1.19209e-07, 2.98023e-07, 0.208885, 0.439423, 0.322167, 2.38419e-07, 1.19209e-07, 0.206836, 0.343409, 0.234218, 0.357747, 0.166992, 0.

2023-11-19 11:10:04,799 - BERTopic - Reduced dimensionality
2023-11-19 11:11:35,802 - BERTopic - Clustered reduced embeddings
  idf = np.log((avg_nr_samples / df)+1)


In [17]:
len(bertopic_model.get_topic_info()) # 612 topics

612

In [18]:
bertopic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,404565,-1_like_people_just_autism,"[like, people, just, autism, don, autistic, kn...",[M definitely should not be texting demanding ...
1,0,4470,0_gender_conversion_trans_binary,"[gender, conversion, trans, binary, sex, dysph...","[That's not what Conversion Therapy is, When p..."
2,1,3826,1_thank_thanks_appreciate_sharing,"[thank, thanks, appreciate, sharing, comment, ...",[Thank you so much for your response That s wh...
3,2,3704,2_adhd_meds_medication_ketamine,"[adhd, meds, medication, ketamine, symptoms, p...",[Yes I have both adhd and ASD They re so diffe...
4,3,3665,3_friends_reunion_high_touch,"[friends, reunion, high, touch, friend, reunio...","[My two best friends are from high school, I a..."
...,...,...,...,...,...
607,606,20,606_minecraft_instruments_game_played,"[minecraft, instruments, game, played, explore...",[minecraft i am THAT stereotypical autistic gu...
608,607,20,607_soulmate_soulmates_introduction_soul,"[soulmate, soulmates, introduction, soul, sesh...","[No one is my soulmate and im gonna die alone,..."
609,608,20,608_notes_outline_unread_note,"[notes, outline, unread, note, taking, hugely,...",[I just don t take notes don t study and someh...
610,609,20,609_adhd_biggie_rumination_stimulation,"[adhd, biggie, rumination, stimulation, medica...",[I have two boys and year old is ASD ADHD and ...


In [21]:
bertopic_model.save(
    path='/content/drive/MyDrive/MS_DS_NLP_project/models/bertopic_model_dir',
    serialization="safetensors",
    save_ctfidf=True,
    save_embedding_model="sentence-transformers/all-MiniLM-L6-v2"
)

In [22]:
# load the saved model
folder_path = '/content/drive/MyDrive/MS_DS_NLP_project/models/bertopic_model_dir'
os.chdir(folder_path)

In [23]:
bertopic_model.visualize_topics() # topic 4 is about aba and inlcudes 3560 posts

In [32]:
bertopic_model.visualize_hierarchy(top_n_topics=20)

In [33]:
bertopic_model.visualize_barchart(top_n_topics=10)

In [34]:
# the ABA topic show medium correlation with topic 0 (gender conversion therapy), 2 (adhd medication), 10 (autism in adults), 11 (therapy in general) 13 (diagnostic), 14 (Aspergers sydrome - high functioning autism) and 21 (neurodirvengence), 17 (dog training)
# this is consistent with the expectations from how data / posts were colected : posts related to autism and ABA therapy
# as this current study is interested in the opinions about ABA therapy in particular, only the posts from topic 4 will be selected for futher analysis
bertopic_model.visualize_heatmap(top_n_topics=30, n_clusters=5, width=1000, height=1000)

In [19]:
# get the topics and the topic probabilities per document
document_topics, probs = bertopic_model.transform(data.text)

Batches:   0%|          | 0/16325 [00:00<?, ?it/s]

[D] [11:21:59.124739] /__w/cuml/cuml/cpp/src/umap/runner.cuh:347 Running transform
[D] [11:21:59.124874] /__w/cuml/cuml/cpp/src/umap/runner.cuh:349 Building KNN Graph
[D] [11:23:15.017537] /__w/cuml/cuml/cpp/src/umap/runner.cuh:382 Smoothing KNN distances
[D] [11:23:15.122283] /__w/cuml/cuml/cpp/src/umap/runner.cuh:414 Executing fuzzy simplicial set
[D] [11:23:15.142597] /__w/cuml/cuml/cpp/src/umap/runner.cuh:443 Performing L1 normalization


2023-11-19 11:23:16,383 - BERTopic - Reduced dimensionality


[D] [11:23:15.567508] /__w/cuml/cuml/cpp/src/umap/runner.cuh:479 n_epochs=30
[D] [11:23:15.684786] /__w/cuml/cuml/cpp/src/umap/runner.cuh:502 Computing # of epochs for training each sample
[D] [11:23:15.686682] /__w/cuml/cuml/cpp/src/umap/runner.cuh:509 Performing optimization


2023-11-19 11:23:27,873 - BERTopic - Predicted clusters


In [25]:
# get the all the documents and their respective topic
topics_df = bertopic_model.get_document_info(data.text)
topics_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 522377 entries, 0 to 522376
Data columns (total 8 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   Document                 522377 non-null  object 
 1   Topic                    522377 non-null  int64  
 2   Name                     522377 non-null  object 
 3   Representation           522377 non-null  object 
 4   Representative_Docs      522377 non-null  object 
 5   Top_n_words              522377 non-null  object 
 6   Probability              522377 non-null  float32
 7   Representative_document  522377 non-null  bool   
dtypes: bool(1), float32(1), int64(1), object(5)
memory usage: 30.4+ MB


In [26]:
topics_df.head(3)

Unnamed: 0,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document
0,I don t think it works like that,-1,-1_like_people_just_autism,"[like, people, just, autism, don, autistic, kn...",[M definitely should not be texting demanding ...,like - people - just - autism - don - autistic...,0.0,False
1,I do we have handicap add on to our government...,-1,-1_like_people_just_autism,"[like, people, just, autism, don, autistic, kn...",[M definitely should not be texting demanding ...,like - people - just - autism - don - autistic...,0.0,False
2,Hey u Starflarity thank you for your post at r...,37,37_message_moderators_compose_config,"[message, moderators, compose, config, sidebar...",[Hey u Shaydie thank you for your post at r au...,message - moderators - compose - config - side...,0.70282,False


In [27]:
# save the dataset
csv_path = '/content/drive/MyDrive/MS_DS_NLP_project/bertopics_reddit_aba_asd.csv'

topics_df.to_csv(csv_path, index=False)

In [29]:
# fileter the documents/row that belong to topic 4 (aba)
aba_topics = topics_df[topics_df['Topic'] == 4]
aba_topics # 3560 rows/posts about aba

Unnamed: 0,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document
108,As an autistic person please listen to us when...,4,4_aba_bcba_rbt_bcbas,"[aba, bcba, rbt, bcbas, rbts, field, hours, sl...","[I had ABA and have nothing bad to say of it, ...",aba - bcba - rbt - bcbas - rbts - field - hour...,1.000000,False
114,My daughter who has down syndrome and autism d...,4,4_aba_bcba_rbt_bcbas,"[aba, bcba, rbt, bcbas, rbts, field, hours, sl...","[I had ABA and have nothing bad to say of it, ...",aba - bcba - rbt - bcbas - rbts - field - hour...,1.000000,False
117,One bit of word from an autistic adult Stop AB...,4,4_aba_bcba_rbt_bcbas,"[aba, bcba, rbt, bcbas, rbts, field, hours, sl...","[I had ABA and have nothing bad to say of it, ...",aba - bcba - rbt - bcbas - rbts - field - hour...,1.000000,False
133,It s really terrible despite the growing evide...,4,4_aba_bcba_rbt_bcbas,"[aba, bcba, rbt, bcbas, rbts, field, hours, sl...","[I had ABA and have nothing bad to say of it, ...",aba - bcba - rbt - bcbas - rbts - field - hour...,0.916550,False
142,Thank you for sharing this So far our BCBA has...,4,4_aba_bcba_rbt_bcbas,"[aba, bcba, rbt, bcbas, rbts, field, hours, sl...","[I had ABA and have nothing bad to say of it, ...",aba - bcba - rbt - bcbas - rbts - field - hour...,1.000000,False
...,...,...,...,...,...,...,...,...
522283,going from level to level My year old was diag...,4,4_aba_bcba_rbt_bcbas,"[aba, bcba, rbt, bcbas, rbts, field, hours, sl...","[I had ABA and have nothing bad to say of it, ...",aba - bcba - rbt - bcbas - rbts - field - hour...,1.000000,False
522312,Missed school due to therapy Hello my son is a...,4,4_aba_bcba_rbt_bcbas,"[aba, bcba, rbt, bcbas, rbts, field, hours, sl...","[I had ABA and have nothing bad to say of it, ...",aba - bcba - rbt - bcbas - rbts - field - hour...,0.884826,False
522324,Why do autistic adults speak out against ABA a...,4,4_aba_bcba_rbt_bcbas,"[aba, bcba, rbt, bcbas, rbts, field, hours, sl...","[I had ABA and have nothing bad to say of it, ...",aba - bcba - rbt - bcbas - rbts - field - hour...,1.000000,False
522332,Discouraged In Home BCBA Hi folks I'm coming h...,4,4_aba_bcba_rbt_bcbas,"[aba, bcba, rbt, bcbas, rbts, field, hours, sl...","[I had ABA and have nothing bad to say of it, ...",aba - bcba - rbt - bcbas - rbts - field - hour...,1.000000,False


In [30]:
# save aba posts dataset as csv
csv_path = '/content/drive/MyDrive/MS_DS_NLP_project/bertopics_reddit_aba_posts.csv'

aba_topics.to_csv(csv_path, index=False)

In [None]:
# save aba posts dataset as csv
# csv_path = '/content/drive/MyDrive/MS_DS_NLP_project/bertopics_reddit_aba_posts_indexed.csv'

# aba_topics.to_csv(csv_path, index=True)