In [3]:
import os
import pickle
import pandas as pd
from bertopic import BERTopic
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

In [4]:
import nltk
# Fetch the NLTK stop-word corpus.
# NOTE(review): nothing visible in this notebook reads `stopwords`; the
# CountVectorizer used for the models is configured with stop_words="english"
# instead — confirm whether this download is still needed.
nltk.download('stopwords')

# Data Loading

In [4]:
# Path to the processed corpus CSV, relative to the notebook's working directory.
data_filepath = '../data/cache/en/corpus_processed.csv'
# Shown as the cell output: confirms the file exists before it is read.
os.path.isfile(data_filepath)

True

In [5]:
# Load the corpus into a DataFrame; later cells read its `body` and `id` columns.
data = pd.read_csv(data_filepath)
# Preview the first rows.
data.head()

In [6]:
# The `body` column as a plain Python list; this is the input passed to
# fit_transform() and hierarchical_topics() further down.
documents = data['body'].tolist()
# Number of documents.
len(documents)

10814

# Training



*   Language: `english` (`multilingual` for multi-lingual model)
*   Calculate topic probabilities 

In [7]:
# Vectorizer for the topic representations: unigrams and bigrams, with
# scikit-learn's built-in "english" stop-word list.
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 2))

# Model settings, collected in one place before the model is constructed.
bertopic_settings = dict(
    language='english',
    top_n_words=10,                    # number of words extracted per topic
    min_topic_size=15,                 # minimum size a topic can have
    calculate_probabilities=True,
    vectorizer_model=vectorizer_model,
    verbose=True,
)
topic_model = BERTopic(**bertopic_settings)

In [6]:
# Train the model on the full document list.
# NOTE(review): `topics` is presumably one topic id per document and `probs`
# the per-document topic probabilities (calculate_probabilities=True) —
# confirm against the BERTopic documentation.
topics, probs = topic_model.fit_transform(documents)

## Extracting Topics

Topic `-1` contains all outlier documents and should typically be ignored.

In [9]:
# Topic summary table of the fitted model; inspected in the following cells.
freq = topic_model.get_topic_info()

In [7]:
# Total number of identified topics.
# The topic table also lists the outlier bucket (topic -1) when outliers
# exist; it is excluded here so that only real topics are counted.
int((freq["Topic"] != -1).sum())

In [8]:
# Most frequent topics: first 20 rows of the topic table.
# NOTE(review): assumes get_topic_info() returns rows ordered by topic size,
# and the -1 outlier row may be among them — confirm.
freq.head(20)

In [9]:
# Select the most frequent topic and show its representation.
# NOTE(review): presumably topic ids are assigned by decreasing size, so 0 is
# the largest non-outlier topic — confirm.
topic_model.get_topic(0)

**NOTE**: BERTopic is stochastic, which means that the topics might differ across runs. This is mostly due to the stochastic nature of UMAP.

## Custom labels

In [10]:
# Formatting options handed to generate_topic_labels().
label_format = {
    "nr_words": 7,
    "topic_prefix": True,
    "word_length": 15,
    "separator": ", ",
}
topic_labels = topic_model.generate_topic_labels(**label_format)
topic_labels

In [14]:
# Register the generated labels on the model as its custom topic labels.
topic_model.set_topic_labels(topic_labels)

## **Visualization**

In [11]:
# Plot the fitted topics; the returned figure is rendered as the cell output.
topic_model.visualize_topics()

### Visualize Topic Hierarchy

In [12]:
# `custom_labels` is a flag, not the label list: True makes the plot use the
# labels previously registered on the model with set_topic_labels().
topic_model.visualize_hierarchy(top_n_topics=50, custom_labels=True)

#### Hierarchical labels



In [13]:
# Compute the topic hierarchy from the training documents; the result feeds
# both the hierarchy plot and the text-based topic tree.
hierarchical_topics = topic_model.hierarchical_topics(documents)

In [14]:
# `custom_labels` is a flag, not the label list: True makes the plot use the
# labels previously registered on the model with set_topic_labels().
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics, custom_labels=True)

#### Text-based topic tree

In [15]:
# Text rendering of the previously computed hierarchy.
tree = topic_model.get_topic_tree(hierarchical_topics)
# NOTE(review): presumably a multi-line string, hence print() rather than a
# bare expression (which would show the escaped repr) — confirm.
print(tree)

### Visualize Terms

In [16]:
# Term bar charts, limited to 20 topics via top_n_topics.
topic_model.visualize_barchart(top_n_topics=20)

### Visualize Topic Similarity

In [17]:
# Topic similarity heatmap drawn at width=1000, height=1000.
# NOTE(review): n_clusters=20 presumably groups similar topics together along
# the axes — confirm against the BERTopic documentation.
topic_model.visualize_heatmap(n_clusters=20, width=1000, height=1000)

## **Topic Representation**

Fine-tune the model after initial training. 

### Topic Reduction



In [22]:
# Dedicated vectorizer for the reduced model, with the same settings as the
# first model's. Sharing one CountVectorizer instance between two BERTopic
# models would let fitting this model refit the vectorizer that the first
# model still holds a reference to.
reduced_vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english")

# Same configuration as the first model, plus `nr_topics=40` to cap the
# number of topics after training.
reduced_topic_model = BERTopic(
    language='english',
    top_n_words=10,  # number of words per topic that should be extracted
    min_topic_size=15,  # specifies what the minimum size of a topic can be
    calculate_probabilities=True,
    vectorizer_model=reduced_vectorizer_model,
    nr_topics=40,
    verbose=True,
)

In [18]:
# Train the reduced model on the same documents. `updated_topics` is later
# written next to `data.id` in the annotation table.
updated_topics, updated_probs = reduced_topic_model.fit_transform(documents)

In [19]:
# Topic summary table of the reduced model.
# NOTE: this rebinds `freq`, which previously held the first model's table.
freq = reduced_topic_model.get_topic_info()
freq

In [20]:
# Label formatting options for the reduced model's topics.
reduced_label_format = dict(
    nr_words=7,
    topic_prefix=True,
    word_length=15,
    separator=", ",
)
updated_topic_labels = reduced_topic_model.generate_topic_labels(**reduced_label_format)
updated_topic_labels

## Analysis of Updated Topics

In [21]:
# Plot the reduced model's topics; the returned figure is rendered as the cell output.
reduced_topic_model.visualize_topics()

In [22]:
# Topic hierarchy of the reduced model, computed from the training documents.
new_hierarchical_topics = reduced_topic_model.hierarchical_topics(documents)

# The generated labels must be registered on the reduced model first;
# otherwise it has no custom labels and the plot falls back to default names.
reduced_topic_model.set_topic_labels(updated_topic_labels)

# `custom_labels` is a flag, not the label list: True selects the labels
# registered just above.
reduced_topic_model.visualize_hierarchy(hierarchical_topics=new_hierarchical_topics, custom_labels=True)

In [23]:
# Term bar charts for the reduced model, limited to 20 topics via top_n_topics.
reduced_topic_model.visualize_barchart(top_n_topics=20)

In [24]:
# Topic similarity heatmap for the reduced model, drawn at width=1000, height=1000.
# Uses n_clusters=5 here, versus 20 for the first model's heatmap.
reduced_topic_model.visualize_heatmap(n_clusters=5, width=1000, height=1000)

## **Model serialization**

In [31]:
# Destination for the serialized model, relative to the notebook's working directory.
model_filepath = '../data/cache/en/topic_model'

# Save model
# Only the reduced model is persisted; the first `topic_model` is not saved.
reduced_topic_model.save(model_filepath)

# Annotated dataset

In [25]:
# Annotation table: the corpus `id` column next to the topic the reduced
# model returned for each document.
annotation_columns = {
    "id": data["id"].to_list(),
    "topic": updated_topics,
}
df = pd.DataFrame(annotation_columns)
df

In [26]:
def _label_without_prefix(topic_id):
    """Return the model's label for `topic_id` with everything up to and including the first '_' removed."""
    full_label = reduced_topic_model.topic_labels_[topic_id]
    # partition() keeps the text after the first underscore (empty string if
    # there is none), which equals re-joining the split parts from index 1.
    _, _, label_words = full_label.partition('_')
    return label_words


df['topic_label'] = df['topic'].map(_label_without_prefix)
df

In [27]:
# Inspect the reduced model's topic sizes.
# NOTE(review): presumably a mapping of topic id -> number of documents — confirm.
reduced_topic_model.topic_sizes_

In [35]:
# Persist the per-document annotations (id, topic, topic_label); index=False
# leaves the DataFrame index out of the file.
df.to_csv('../data/cache/en/subtopic_annotations.csv', index=False)