### Installing packages

In [None]:
!pip install datasets
!pip install bertopic
!pip install joblib==1.1.0
!pip install spacy_langdetect
!pip install lexicalrichness

#### All the basics

In [1]:
from bertopic import BERTopic
from hdbscan import HDBSCAN
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load data
raw_dataset = pd.read_csv("df_2c.csv") # sentiment classification dataset: negative, neutral or positive

# stop words
stopword_list = pd.read_csv("stopwords.csv")
stopword_list = stopword_list.word.tolist()

In [3]:
# Cleaning up dataset
reports = raw_dataset.report.to_list()
vectorizer_model = CountVectorizer(ngram_range=(2,2), stop_words = stopword_list) # more fancy way to remove stop words

In [16]:
hdbscan_model = HDBSCAN(min_cluster_size=15, metric='euclidean', 
                        cluster_selection_method='eom', prediction_data=True, min_samples=2) # Lowering min_samples will reduce outliers

In [17]:
topic_model = BERTopic(#language = "Danish", # this will just choose the multi-lingual
                       embedding_model="all-mpnet-base-v2", # specifying to use the v2 multilingual model
                       #embedding_model="paraphrase-multilingual-MiniLM-L12-v2",
                       #embedding_model="distiluse-base-multilingual-cased-v2", # this just creates a twitter class and a garbage class. Very weird
                       nr_topics = "auto", # this makes the model use something called HDBSCAN to merge topics that are quite similar
                       calculate_probabilities=True, 
                       vectorizer_model=vectorizer_model, # this is a way to remove our stop-words so they will not appear in the topic descriptions
                       min_topic_size = 10,
                       hdbscan_model = hdbscan_model,
                       #umap_model=umap_model # a way to make reproducible results
                       #embedding_model=sentence_model
                       diversity=0.2, # Whether to use MMR to diversify the resulting topic representations. If set to None, MMR will not be used.
                       top_n_words=20
                       )

topics, probs = topic_model.fit_transform(reports)

In [18]:
# viewing tweets dataset
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,34,-1_body load_fall asleep_eye visuals_feel body
1,0,100,0_body load_started feel_listening music_visua...
2,1,25,1_san francisco_depth perception_closed eyes_d...
3,2,20,2_hour mark_ecstasy feel_yellow capsules_trip ...


In [19]:
topic_model.visualize_barchart(n_words=10, width=300, height=400)

In [21]:
topic_model.visualize_topics()


k >= N for N * N square matrix. Attempting to use scipy.linalg.eigh instead.



TypeError: Cannot use scipy.linalg.eigh for sparse A with k >= N. Use scipy.linalg.eigh(A.toarray()) or reduce k.

In [22]:
topic_model.visualize_documents(reports, width=600, height=400, hide_annotations = True, hide_document_hover = True)

KeyboardInterrupt: 