### Installing packages

In [None]:
# Optional one-time setup: uncomment the lines below to install the notebook's dependencies.
# `%pip` is used instead of `!pip` so the packages land in the environment of the running kernel.
# NOTE(review): joblib is the only pinned package — presumably to work around an
# hdbscan/joblib compatibility issue; confirm before changing or removing the pin.
# %pip install datasets
# %pip install bertopic
# %pip install joblib==1.1.0
# %pip install spacy_langdetect
# %pip install lexicalrichness

#### All the basics

In [2]:
from bertopic import BERTopic
from hdbscan import HDBSCAN
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Load the raw dataset; the `report` column holds the documents used for topic modelling.
raw_dataset = pd.read_csv("df_lsd.csv")

# Load the custom stop words from the `word` column of stopwords.csv.
# keep_default_na=False stops pandas from converting literal tokens such as
# "null", "nan" or "n/a" into float NaN, which would otherwise end up inside the
# stop-word list handed to CountVectorizer.
stopwords_frame = pd.read_csv("stopwords.csv", keep_default_na=False)

# Keep the DataFrame and the final list under separate names, and drop blank
# entries so the vectorizer receives a clean list of non-empty strings.
stopword_list = [word for word in stopwords_frame["word"].astype(str) if word.strip()]

In [7]:
# Extract the report texts from the dataframe as a plain Python list of documents.
reports = raw_dataset["report"].tolist()

# Unigram bag-of-words vectorizer. Passing the custom stop words here is the
# "fancier" way of removing them: they are filtered while building the vocabulary.
vectorizer_model = CountVectorizer(
    stop_words=stopword_list,
    ngram_range=(1, 1),
)

In [9]:
# Clustering model that is handed to BERTopic in the next cell.
hdbscan_model = HDBSCAN(
    min_cluster_size=50,
    min_samples=6,  # lowering min_samples will reduce outliers
    metric="euclidean",
    cluster_selection_method="eom",
    prediction_data=True,
)

In [10]:
# Build and fit the topic model on the report texts.
#
# Notes on the configuration:
# - `language="Danish"` was tried earlier; it would just select the multilingual model,
#   so the embedding model is named explicitly instead.
# - No custom `umap_model` is supplied. Passing one is a way to make the results
#   reproducible; without it, topics may differ between runs.
topic_model = BERTopic(
    # "all-mpnet-base-v2" is an English sentence-transformers model (it is NOT multilingual).
    # Alternatives that were tried:
    #   "paraphrase-multilingual-MiniLM-L12-v2"
    #   "distiluse-base-multilingual-cased-v2" -> only produced a twitter class and a garbage class
    embedding_model="all-mpnet-base-v2",
    # "auto" lets BERTopic merge topics that are quite similar (it uses HDBSCAN for this).
    nr_topics="auto",
    calculate_probabilities=True,
    # Removes our stop words so they do not appear in the topic descriptions.
    vectorizer_model=vectorizer_model,
    # `min_topic_size` is intentionally not passed: BERTopic ignores it when a custom
    # `hdbscan_model` is given, so the minimum topic size is governed by the
    # `min_cluster_size` of `hdbscan_model` instead.
    hdbscan_model=hdbscan_model,
    # Use MMR to diversify the resulting topic representations (None disables MMR).
    diversity=0.2,
)

# `topics` holds one topic assignment per report; `probs` the probabilities
# requested via calculate_probabilities=True.
topics, probs = topic_model.fit_transform(reports)

In [11]:
# Summary table of the fitted topics: topic id, number of reports and generated name.
# NOTE(review): topic -1 is presumably the outlier bucket produced by HDBSCAN — confirm.
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,418,-1_time_trip_feel_experience
1,0,214,0_time_started_trip_friends
2,1,160,1_time_life_trip_experience
3,2,151,2_time_trip_experience_life
4,3,117,3_time_trip_started_looked
5,4,87,4_time_trip_feel_started
6,5,64,5_trip_time_experience_friends


In [12]:
# Small bar chart per topic, showing ten words for each.
topic_model.visualize_barchart(
    n_words=10,
    width=300,
    height=300,
)

In [1]:
# Plot the reports coloured by topic, with annotations and per-document hover text hidden.
# This cell needs the fitted `topic_model` and `reports`, so run the cells above first;
# executing it on a fresh kernel raises a NameError.
topic_model.visualize_documents(
    reports,
    width=600,
    height=400,
    hide_annotations=True,
    hide_document_hover=True,
)

NameError: name 'topic_model' is not defined

In [None]:
# Render BERTopic's topic heatmap (presumably pairwise topic similarity — confirm).
# Requires the fitted `topic_model` from the cells above.
topic_model.visualize_heatmap()