# BERTopic Model
BERTopic is a topic modeling technique that leverages 🤗 transformers and c-TF-IDF to create dense clusters allowing for easily interpretable topics whilst keeping important words in the topic descriptions.
### This will be used for the Topic Modelling phase
It implements dynamic topic modelling (DTM) and multiple visualisations for the extracted topics.


Setup phase
- Installing third-party libraries and extensions
- Import data

In [None]:
# Install the third-party packages this notebook relies on (IPython shell magic).
!pip install bertopic swifter flair spacy-transformers psutil


In [None]:
# Mount Google Drive at /content/drive; the dataset is read from, and the
# model / DTM outputs are written to, "MyDrive/Colab Notebooks" below it.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
!python3 - m spacy download en_core_web_trf


In [None]:
from copy import deepcopy

import nltk
import pandas as pd
import plotly.express as px
import spacy
import swifter
import tensorflow_hub
from bertopic import BERTopic
from flair.embeddings import TransformerDocumentEmbeddings
from hdbscan import HDBSCAN
from keras.preprocessing.text import text_to_word_sequence
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
from umap import UMAP


In [None]:
# Fetch the NLTK stopword corpus that get_stopwords() reads via stopwords.words('english').
nltk.download('stopwords')

Adjust parameters

In [None]:
# Name of the CSV file (without extension) inside "MyDrive/Colab Notebooks".
dataset = 'collab_dataset_small_07_22_<220000'

# Available cleaning methods are:
# - emma: The cleaning pipeline used by Emma Tattershall in the paper:
#                     https://github.com/etattershall/burst-detection
#         (not implemented yet in the cleaning cell)
# - auto: (not implemented yet in the cleaning cell)
# - manual: lower-casing, tokenisation and stopword removal
cleaning_method = 'manual'

# Available embedding methods are:
# - sentence-transformers: https://www.sbert.net/docs/pretrained_models.html
# - flair: using flair library to utilize any hugging face model available
# - spacy: the en_core_web_trf pipeline
# - use: Universal Sentence Encoder from tensorflow
embedding = 'sentence-transformers'

# BERTopic parameter tuning

# Available hyperparameter tuning options:
# - default : uses the default BERTopic configuration
# - custom : uses separate submodules for each step of the algorithm
#           it requires extra parameters to be set (default values for each
#           parameter are the best based on the experiments ran)
# - full : uses a breakdown of BERTopic to drill further down into the separate
#          submodules for each step of the algorithm. (default values for each
#          parameter are the best based on the experiments ran)
#
# By default, the model will be selected to work on english and
# will compute probabilities for each corresponding topic

model_complexity = 'default'

# custom parameters (also applicable for full version)
top_n_words = 10
n_gram_range = (1, 2)
nr_topics = 100

# full parameters

# UMAP parameters
low_memory = False
n_neighbors = 15
n_components = 10

# HDBSCAN parameters
min_cluster_size = 10
min_samples = 1
metric = 'euclidean'
prediction_data = True
# Backward-compatible alias: the setting was originally spelt `precition_data`.
precition_data = prediction_data

# Suffix used in the file names of the saved model and DTM table. It encodes
# the settings of the selected complexity level so that runs with different
# configurations do not overwrite each other's outputs.
parameters_str = f'{dataset}_{cleaning_method}_{embedding}_{model_complexity}'
if model_complexity != 'default':
    # 'custom' and 'full' both carry the shared BERTopic settings.
    parameters_str += f'_{top_n_words}_{n_gram_range}_{nr_topics}'
    if model_complexity != 'custom':
        # 'full' additionally records the UMAP / HDBSCAN settings (previously
        # the name just ended in a dangling underscore).
        parameters_str += (f'_{n_neighbors}_{n_components}'
                           f'_{min_cluster_size}_{min_samples}_{metric}')


In [None]:
# Load the CSV named by `dataset` from the mounted Drive folder.
# Later cells read its `text` and `year` columns.
df = pd.read_csv(f'/content/drive/MyDrive/Colab Notebooks/{dataset}.csv')


Cleaning the dataset

In [None]:
# cleaning method - emma
def get_stopwords():
    """Return the set of tokens to drop from the corpus.

    The set is made of:
    - the NLTK English stopwords with apostrophes removed,
    - the years 1900-2019 as strings (to prevent spikes),
    - the numbers 0-99 as strings, plus zero-padded '00'-'09',
    - 'factors1' ... 'factors299',
    - a hand-picked list of extra domain words.
    """
    words = {w.replace("'", "") for w in stopwords.words('english')}

    # Years, to prevent spikes
    words.update(str(year) for year in range(1900, 2020))

    # Small numbers: plain form for 0-99, zero-padded form for single digits
    words.update(str(value) for value in range(100))
    words.update('0' + str(digit) for digit in range(10))

    # 'factors' fused with a trailing number
    words.update('factors' + str(suffix) for suffix in range(1, 300))

    # Extra stopwords
    # TODO: Look through the corpus and decide which are
    # extra stopwords needed for this specific domain
    words.update([
        'use', 'using', 'uses', 'used', 'based', 'including', 'include',
        'approach', 'factors', 'business', 'risk', 'factors16',
        'wa', 'ha', 'doe', 'item', '1a', 'factor', '1b', '1aitem', '10-k',
        'item', '1arisk', 'factors11', '1arisk', 'factors10k', 'factorsk13', 'could',
        'factorsk10', 'may', 'looking', 'forward', 'statement',
    ])

    return words


In [None]:
# cleaning method - manual
def get_stopwords():
    """Assemble the stopword set applied by the cleaning step.

    Combines the NLTK English stopwords (apostrophes stripped), the years
    1900-2019, the numbers 0-99 (single digits also in zero-padded form),
    the tokens 'factors1'-'factors299' and a list of extra domain words.
    All entries are strings; the result is a ``set``.
    """
    english = stopwords.words('english')
    stop = set(map(lambda entry: entry.replace("'", ""), english))

    # Add years to prevent spikes
    stop |= set(map(str, range(1900, 2020)))

    # Add small numbers: '00'-'09' and '10'-'99', then the bare digits '0'-'9'
    stop |= {f'{number:02d}' for number in range(100)}
    stop |= set(map(str, range(10)))

    # 'factors' immediately followed by a number
    stop |= {f'factors{index}' for index in range(1, 300)}

    # Add these extra stopwords to the list
    # TODO: Look through the corpus and decide which are
    # extra stopwords needed for this specific domain
    stop |= {
        'use', 'using', 'uses', 'used', 'based', 'including', 'include',
        'approach', 'factors', 'business', 'risk', 'factors16',
        'wa', 'ha', 'doe', 'item', '1a', 'factor', '1b', '1aitem', '10-k',
        'item', '1arisk', 'factors11', '1arisk', 'factors10k', 'factorsk13', 'could',
        'factorsk10', 'may', 'looking', 'forward', 'statement',
    }

    return stop


In [None]:
# cleaning method - auto

In [None]:
# Build df['clean_text'] from df['text'] using the selected cleaning method.
# Only 'manual' is implemented; the other methods just print a notice and
# leave df without a 'clean_text' column.
if cleaning_method == 'manual':
    # Rows with a missing text become empty strings so the tokenizer never
    # receives NaN (which has no string methods and would abort the apply).
    # Keeping the rows, rather than dropping them, keeps the texts aligned
    # with the `year` column used for the timestamps.
    df['clean_text'] = df['text'].fillna('').str.lower()

    # Tokenise and re-join with single spaces.
    df['clean_text'] = df['clean_text'].swifter.apply(
        lambda x: ' '.join(text_to_word_sequence(x)))

    # NOTE(review): get_stopwords() strips apostrophes from the NLTK words;
    # if the tokenizer keeps apostrophes, contractions such as "don't" will
    # not match their stopword entry -- confirm.
    stop_words = get_stopwords()
    df['clean_text'] = df['clean_text'].swifter.apply(
        lambda x: ' '.join(
            word for word in x.split() if word not in stop_words))
elif cleaning_method == 'emma':
    print('Not implemented yet')
else:
    print('Not implemented yet')


In [None]:
# Pull the per-document year and the cleaned text out of the frame as plain
# Python lists (same row order in both).
timestamps = df['year'].tolist()
text = df['clean_text'].tolist()


Sentence embedding

In [None]:
# Instantiate the document-embedding backend named by `embedding`.
if embedding == 'flair':
    sent_embedding = TransformerDocumentEmbeddings(
        'sentence-transformers/all-mpnet-base-v2'
    )
elif embedding == 'spacy':
    spacy.prefer_gpu()
    # Load the transformer pipeline with these components excluded.
    sent_embedding = spacy.load(
        "en_core_web_trf",
        exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'],
    )
elif embedding == 'use':
    sent_embedding = tensorflow_hub.load(
        "https://tfhub.dev/google/universal-sentence-encoder/4"
    )
else:
    # 'sentence-transformers' and any unrecognised value both end up here.
    sent_embedding = SentenceTransformer("all-mpnet-base-v2")

# Build the BERTopic model for the selected complexity level and fit it.
#
# The 'custom' and 'full' levels take their hyperparameters from the
# parameter cell (top_n_words, n_gram_range, nr_topics, and for 'full' also
# the UMAP / HDBSCAN settings). Previously these values were hard-coded here,
# so editing the parameter cell had no effect even though the output file
# names include them.
if model_complexity == 'default':
    topic_model = BERTopic(verbose=True, calculate_probabilities=True,
                           language="english", nr_topics="auto",
                           embedding_model=sent_embedding)
elif model_complexity == 'custom':
    topic_model = BERTopic(verbose=True, top_n_words=top_n_words,
                           n_gram_range=n_gram_range, nr_topics=nr_topics,
                           calculate_probabilities=True, language="english",
                           low_memory=True,
                           embedding_model=sent_embedding)
else:
    # Dimensionality reduction; cosine distance is kept fixed here because
    # the configurable `metric` belongs to the HDBSCAN settings.
    umap_model = UMAP(n_neighbors=n_neighbors, n_components=n_components,
                      metric='cosine', low_memory=low_memory)
    # prediction_data stays True: presumably required for the per-topic
    # probabilities requested via calculate_probabilities -- confirm.
    hdbscan_model = HDBSCAN(min_cluster_size=min_cluster_size,
                            min_samples=min_samples, metric=metric,
                            prediction_data=True)
    topic_model = BERTopic(verbose=True, top_n_words=top_n_words,
                           n_gram_range=n_gram_range, nr_topics=nr_topics,
                           calculate_probabilities=True,
                           umap_model=umap_model,
                           hdbscan_model=hdbscan_model,
                           embedding_model=sent_embedding)

# topics: one topic id per document; probs: the probabilities returned
# alongside them by fit_transform.
topics, probs = topic_model.fit_transform(text)


In [None]:
# Persist the fitted model to Drive; parameters_str makes the file name
# identify the run configuration.
topic_model.save(f"/content/drive/MyDrive/Colab Notebooks/BERTopic_model_{parameters_str}")


In [None]:
# Explore a single topic.
# NOTE(review): 21 is a hard-coded topic id; it may not exist in every run -- confirm.
topic_model.get_topic(21)


In [None]:
# Overview of the topics: keep the table in `topic_info` and show it as the
# cell output.
topic_info = topic_model.get_topic_info()
topic_info


In [None]:
# Hierarchy view of the fitted topics (default arguments).
topic_model.visualize_hierarchy()

In [None]:
# Bar-chart view of the fitted topics (default arguments).
topic_model.visualize_barchart()

In [None]:
# Heatmap view of the fitted topics (default arguments).
topic_model.visualize_heatmap()

In [None]:
# General topic overview plot (default arguments).
topic_model.visualize_topics()

DTM

In [None]:
# Running DTM on the entire dataset: the cleaned documents, the topic assigned
# to each of them by fit_transform, and their `year` values as timestamps.
topics_over_time = topic_model.topics_over_time(text, topics, timestamps)

In [None]:
# Save the DTM table to Drive. This happens before topic -1 is filtered out
# in the following cell, so the CSV still contains every topic.
topics_over_time.to_csv(f'/content/drive/MyDrive/Colab Notebooks/DTM_{parameters_str}.csv')

In [None]:
# Keep only the rows whose topic id is not -1 (BERTopic's outlier topic).
topics_over_time = topics_over_time.loc[topics_over_time['Topic'].ne(-1)]

In [None]:
# Preview the first 10 rows of the topics-over-time table.
# NOTE(review): head() keeps the table's existing row order; confirm whether
# that order is actually based on frequency.
topics_over_time.head(10)

In [None]:
# Plot the DTM result, restricted via top_n_topics=10.
topic_model.visualize_topics_over_time(topics_over_time,top_n_topics=10)

In [None]:
# Plot the DTM result without the top_n_topics restriction.
topic_model.visualize_topics_over_time(topics_over_time)