## Applying BERTopic to the data

In [1]:
#Import packages
import pandas as pd
from bertopic import BERTopic
import os
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP

#Loading working directory
cwd = os.getcwd()

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
#Import the data
df_post = pd.read_csv(cwd+ '/data/Post_dataset_processed.csv')

### Functions for BERTopic

In [3]:
#The data should already be processed based on the "Processing" notebook
#This preprocessing step is specifically for BERTopic
def BERTopic_preprocessing(df_file:pd.DataFrame, columns:list[str]):
    '''
    Assign the df to df_file
    Assign the columns you want to keep to columns
    Combines all columns per row into a single string
    Output: The df ready for BERTopic modelling to data:list[str]
    '''
    #Creating strings per row, for each column
    df_file['combined']:str = ""
    for col in columns:
        df_file['combined'] += df_file[col].astype(str) + " "
    df_file['combined'] = df_file['combined'].str.strip() #removing trailing spaces
    #Combining all strings into a list
    data = df_file['combined'].tolist()
    
    return data

In [11]:
#Applying the topic modelling to the data
def BERTopic_modelling(model_data:list[str]):
    '''
    Assign the data to model_data
    Fits the data to the BERTopic model
    Outputs the topics and probabilities
    Visualizes the topics based on total amount, frequency & intertopic distance map
    '''
    vectorizer_model = CountVectorizer(ngram_range=(1,2), stop_words="english") #Performing extra removal of stopwords
    umap_model = UMAP(random_state=25) #Making it more reproducible
    model = BERTopic(vectorizer_model=vectorizer_model, umap_model=umap_model)
    topics, probabilities = model.fit_transform(model_data)
    print(f'Total amount of topics: {len(model.get_topics().keys())}\n')
    print(f'Insight into the first five topics: \n {model.get_topic_freq().head()}')
    return model, probabilities

### Performing the topic modelling and visualizing the results

In [13]:
#Preprocessing the data
model_data = BERTopic_preprocessing(df_file = df_post, columns = [
                                                    'Title Post', 'Post Text', 'Text of URL Post'
                                                    ])
#Applying BERTopic to the data
model, probabilities = BERTopic_modelling(model_data = data)
model.save(cwd+ "/models/TopicModel")

Total amount of topics: 22

Insight into the first five topics: 
    Topic  Count
0     -1    469
1      0     52
2      1     43
3      2     43
4      3     41


In [12]:
#Getting the frequency of posts in topics
model.get_topic_freq()

Unnamed: 0,Topic,Count
0,-1,469
1,0,52
2,1,43
3,2,43
4,3,41
5,4,37
6,5,37
7,6,36
8,7,35
9,8,32


In [18]:
#Defining function to visualize the topics with frequent words and posts
def BERTopic_visualization(model, model_data:list[str]):
    '''
    Assign the model to model
    Assign the theprecessed data to model_data
    Visualizes the topics based on total amount, frequency & intertopic distance map
    '''
    #Creating a dataframe with the topics and their top words
    df_topic_word = pd.DataFrame(columns = ['Topic Number', 'Top Words', 'Weight'])
    df_topic_doc = pd.DataFrame(columns = ['Topic Number', 'Representative Docs'])
    for topic in model.get_topics().keys():
        topic_data = model.get_topic(topic)
        for index in range(len(topic_data)):
            df_topic_word = df_topic_word.append({'Topic Number': topic, 'Top Words': topic_data[index][0], 'Weight': topic_data[index][1]}, ignore_index=True)
    #Creating a dataframe with the topics and their representative documents    
        topic_docs = model.get_representative_docs(topic)
        for doc in topic_docs:
            df_topic_doc  = df_topic_doc.append({'Topic Number': topic, 'Representative Docs': doc}, ignore_index=True)
    return df_topic_word, df_topic_doc


In [39]:
#Calling the visualization and saving the results
df_topic_word1, df_topic_doc1 = BERTopic_visualization(model = model, model_data = data)

df_topic_word1.to_csv(cwd+ "/topic_result/T1word.csv")
df_topic_doc1.to_csv(cwd+ "/topic_result/T1doc.csv")

#Saving the results to combine them with the actual posts
document_info = model.get_document_info(docs=df_post['Title Post'])
document_info = document_info.rename(columns={'Document': 'Title Post'})
document_info.to_csv(cwd+ "/data/df_post_topic.csv")

### Graphs and further visualization of the topic results

In [22]:
#Visualzing the most common words per topic, for the first 10 topics
print('Barchart of most common words per topic')
model.visualize_barchart(n_words=10, top_n_topics=10, width=300, height=300)

Barchart of most common words per topic


In [23]:
#Plotting the similarity between topics through a heatmap
print('Heatmap based on similarity')
model.visualize_heatmap(n_clusters=10)

Heatmap based on similarity


In [24]:
#Plotting the differences and similarities between topics on two axis
print('Map of topics')
model.visualize_documents(model_data, reduced_embeddings=None, height=600)

Map of topics


In [25]:
#Visualziing the intertopic distance between topics
print('Intertopic Distance Map')
model.visualize_topics()

Intertopic Distance Map
