# BERTopic Model
BERTopic is a topic modeling technique that leverages 🤗 transformers and c-TF-IDF to create dense clusters allowing for easily interpretable topics whilst keeping important words in the topic descriptions.
### This will be used for the Topic Modelling phase
It implements dynamic topic modelling (DTM) and several visualisations of the extracted topics.


Setup phase
- Installing third-party libraries and extensions
- Import data

In [None]:
from copy import deepcopy

import nltk
import pandas as pd
import plotly.express as px
import swifter
import tensorflow_hub
from bertopic import BERTopic
from hdbscan import HDBSCAN
from keras.preprocessing.text import text_to_word_sequence
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
from umap import UMAP


In [None]:
# Fetch the NLTK stop-word corpus; get_stopwords() in the cleaning cells reads
# it through stopwords.words('english').
nltk.download('stopwords')

Adjust parameters

In [None]:
# Name of the input file without extension: '<dataset>.csv' is loaded below
dataset = 'collab_test'

# Available cleaning methods are:
# - emma: the cleaning pipeline used by Emma Tattershall in
#         https://github.com/etattershall/burst-detection
# - auto: (not described yet)
# - manual: lower-casing, tokenisation and stop-word removal
# NOTE: the cleaning cell below only implements 'manual'.
cleaning_method = 'manual'

# Available embedding methods are:
# - sentence-transformers: https://www.sbert.net/docs/pretrained_models.html
# - flair: using the flair library to utilise any available Hugging Face model
# - spacy: (not described yet)
# - use: Universal Sentence Encoder from TensorFlow Hub
embedding = 'sentence-transformers'

# Tag that identifies this run; it is embedded in the names of the files
# written by later cells (saved model, DTM csv)
parameters_str = '_'.join((dataset, cleaning_method, embedding))

# BERTopic parameter tuning
# Available model complexity options (see the model cell below):
# - default : BERTopic with its default configuration
# - custom  : BERTopic with the n-gram range set here
# - full    : like custom, plus explicitly configured UMAP and HDBSCAN
#             sub-models (any value other than 'default'/'custom' also
#             selects this branch)
model_complexity = 'default'
# Values intended for the BERTopic arguments of the same names
top_n_words = 15
n_gram_range = (1, 3)


In [None]:
# Load the corpus from '<dataset>.csv'; later cells read its 'text' and
# 'year' columns.
df = pd.read_csv(f'{dataset}.csv')

Cleaning the dataset

In [None]:
# cleaning method - emma
# NOTE: the "manual" cell further down defines get_stopwords() again; when the
# cells are run in order, that later definition replaces this one.
def get_stopwords():
    """Return the set of tokens to drop during cleaning.

    The set is made of the NLTK English stop words with apostrophes removed,
    the years 1900-2019, the numbers 0-99 (single digits also zero-padded),
    'factors1' ... 'factors299' and a hand-picked list of extra words.
    """
    stop = {word.replace("'", "") for word in stopwords.words('english')}

    # Add years to prevent spikes
    stop.update(str(year) for year in range(1900, 2020))

    # Add small numbers; single digits go in both as '7' and as '07'
    for num in range(100):
        stop.add(str(num))
        if num < 10:
            stop.add('0' + str(num))

    # 'factors1' up to and including 'factors299'
    stop.update('factors' + str(number) for number in range(1, 300))

    # Add these extra stopwords to the list
    # TODO: Look through the corpus and decide which are
    # extra stopwords needed for this specific domain
    stop.update((
        'use', 'using', 'uses', 'used', 'based', 'including', 'include',
        'approach', 'factors', 'business', 'risk', 'factors16',
        'wa', 'ha', 'doe', 'item', '1a', 'factor', '1b', '1aitem', '10-k',
        'item', '1arisk', 'factors11', '1arisk', 'factors10k', 'factorsk13', 'could',
        'factorsk10', 'may', 'looking', 'forward', 'statement',
    ))

    return stop


In [None]:
# cleaning method - manual
def get_stopwords():
    """Return the set of tokens removed by the 'manual' cleaning pipeline.

    The set contains:
    - the NLTK English stop words, each in its original spelling and with
      apostrophes removed ("don't" and "dont");
    - the years 1900-2019, so that year mentions do not cause spikes;
    - the numbers 0-99, single digits also in zero-padded form ('07');
    - 'factors1' ... 'factors299';
    - a hand-picked list of domain-specific extra words.

    Returns:
        set: the stop words as lower-case strings.
    """
    stop = set()
    for word in stopwords.words('english'):
        # The cleaning cell tokenises with keras' text_to_word_sequence, whose
        # default filter list does not contain the apostrophe, so tokens such
        # as "don't" reach the stop-word filter unchanged. Keeping only the
        # apostrophe-free spelling would let them through.
        stop.add(word)
        stop.add(word.replace("'", ""))

    # Add years to prevent spikes
    stop.update(str(year) for year in range(1900, 2020))

    # Add small numbers; single digits go in both as '7' and as '07'
    for num in range(100):
        stop.add(str(num))
        if num < 10:
            stop.add('0' + str(num))

    # 'factors1' up to and including 'factors299'
    stop.update('factors' + str(number) for number in range(1, 300))

    # Add these extra stopwords to the list
    # TODO: Look through the corpus and decide which are
    # extra stopwords needed for this specific domain
    stop.update((
        'use', 'using', 'uses', 'used', 'based', 'including', 'include',
        'approach', 'factors', 'business', 'risk', 'factors16',
        'wa', 'ha', 'doe', 'item', '1a', 'factor', '1b', '1aitem', '10-k',
        '1arisk', 'factors11', 'factors10k', 'factorsk13', 'could',
        'factorsk10', 'may', 'looking', 'forward', 'statement',
    ))

    return stop


In [None]:
# cleaning method - auto (placeholder: not implemented yet)

In [None]:
# Build df['clean_text'] from df['text'] with the selected cleaning method.
if cleaning_method == 'manual':
    stop_words = get_stopwords()

    def _clean_document(raw_text):
        """Lower-case and tokenise *raw_text*, then drop the stop words."""
        # text_to_word_sequence only splits on spaces (tabs and newlines are
        # covered by its filter list), so a token can still contain other
        # whitespace; joining and re-splitting keeps the original two-pass
        # tokenisation while the column is traversed only once.
        tokens = ' '.join(text_to_word_sequence(raw_text.lower())).split()
        return ' '.join(token for token in tokens if token not in stop_words)

    df['clean_text'] = df['text'].swifter.apply(_clean_document)
else:
    # Covers 'emma', 'auto' and any unknown value. Merely printing a message
    # would leave df without a 'clean_text' column and make the next cell fail
    # with an unrelated-looking error, so stop here with a clear one.
    raise NotImplementedError(
        f"Cleaning method {cleaning_method!r} is not implemented yet; "
        "only 'manual' is available")


In [None]:
# Parallel per-document lists used by the embedding, topic-model and DTM
# cells: the 'year' value and the cleaned text of every row
timestamps = df['year'].tolist()
text = df['clean_text'].tolist()

Sentence embedding

In [None]:
# Compute one embedding vector per document with the selected backend.
if embedding == 'sentence-transformers':
    sentence_model = SentenceTransformer("all-mpnet-base-v2")
    embeddings = sentence_model.encode(text, show_progress_bar=True)
elif embedding == 'use':
    # tensorflow_hub.load returns the encoder itself, not the vectors: the
    # model has to be called on the documents, and the resulting tensor is
    # converted to a NumPy array for BERTopic.
    # NOTE(review): the whole corpus is embedded in a single call; for a large
    # corpus this presumably needs batching — confirm memory use.
    use_model = tensorflow_hub.load(
        "https://tfhub.dev/google/universal-sentence-encoder/4")
    embeddings = use_model(text).numpy()
elif embedding in ('flair', 'spacy'):
    # Printing a message would leave `embeddings` undefined and make the
    # topic-model cell fail with a NameError, so fail here with a clear error.
    raise NotImplementedError(
        f"Embedding method {embedding!r} is not implemented yet")
else:
    raise ValueError(
        f"Unknown embedding method {embedding!r}; expected one of "
        "'sentence-transformers', 'flair', 'spacy', 'use'")


In [None]:
# Build the BERTopic model for the selected complexity level and fit it on the
# cleaned documents, reusing the embeddings computed above.
if model_complexity == 'default':
    # BERTopic's default sub-models and topic representation
    topic_model = BERTopic(calculate_probabilities=True, language="english",
                           nr_topics="auto")
elif model_complexity == 'custom':
    # top_n_words is set in the parameter cell but was never handed to
    # BERTopic; pass it together with n_gram_range so the setting takes effect
    topic_model = BERTopic(verbose=True, top_n_words=top_n_words,
                           n_gram_range=n_gram_range,
                           calculate_probabilities=True, language="english",
                           nr_topics="auto")
else:
    # 'full' (and any other value): additionally replace the dimensionality
    # reduction and clustering steps with explicitly configured models
    umap_model = UMAP(n_neighbors=15, n_components=10, metric='cosine',
                      low_memory=False)
    hdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean',
                            prediction_data=True)
    topic_model = BERTopic(verbose=True, top_n_words=top_n_words,
                           n_gram_range=n_gram_range,
                           calculate_probabilities=True,
                           nr_topics="auto", umap_model=umap_model,
                           hdbscan_model=hdbscan_model)
# topics: one topic id per document; probs: the matching probabilities
topics, probs = topic_model.fit_transform(text, embeddings=embeddings)


In [None]:
# Persist the fitted model; the file name carries this run's parameter tag
# (dataset, cleaning method, embedding).
topic_model.save(f"BERTopic_model_{parameters_str}")


In [None]:
# Explore a topic
# NOTE(review): 21 is a hard-coded topic id; presumably it has to be one of
# the ids listed by get_topic_info() — confirm for the dataset in use.
topic_model.get_topic(21)

In [None]:
# Overview of the topics
# The table is kept in topic_info and, as the last expression of the cell,
# shown as the cell output.
topic_info = topic_model.get_topic_info()
topic_info

In [None]:
# Cell output: BERTopic's hierarchy visualisation of the fitted model
topic_model.visualize_hierarchy()

In [None]:
# Cell output: BERTopic's bar-chart visualisation of the fitted model
topic_model.visualize_barchart()

In [None]:
# Cell output: BERTopic's heatmap visualisation of the fitted model
topic_model.visualize_heatmap()

In [None]:
# Cell output: BERTopic's topic-overview visualisation of the fitted model
topic_model.visualize_topics()

Dynamic topic modelling (DTM)

In [None]:
# Running DTM on the entire dataset
# The three arguments are parallel per-document lists: the cleaned text, the
# topic ids returned by fit_transform and the 'year' values.
topics_over_time = topic_model.topics_over_time(text, topics, timestamps)

In [None]:
# Save the complete DTM table; at this point it still contains the rows with
# Topic == -1, which are only filtered out in the next cell.
topics_over_time.to_csv(f'DTM_{parameters_str}.csv')

In [None]:
# Drop the rows of topic -1 from the DTM table for everything that follows
is_real_topic = topics_over_time['Topic'].ne(-1)
topics_over_time = topics_over_time[is_real_topic]

In [None]:
# First 10 rows of the DTM table (topic -1 already removed); head() keeps the
# existing row order, it does not rank by frequency
topics_over_time.head(10)

In [None]:
# Cell output: topics-over-time chart limited via top_n_topics=10
topic_model.visualize_topics_over_time(topics_over_time,top_n_topics=10)

In [None]:
# Cell output: topics-over-time chart without a top_n_topics limit
topic_model.visualize_topics_over_time(topics_over_time)

Year-over-year (YoY) changes in the importance of the extracted topics

In [None]:
# Wide table: one row per topic, the label columns 'Topic', 'Name', 'Words'
# and one frequency column per timestamp (column name = str(timestamp)).
timestamps_set = sorted(set(timestamps))

# Label columns: take Name/Words from the earliest time slice in which each
# topic occurs. Seeding the table from the first timestamp only would leave
# topics that first appear later without a name or words.
topic_labels = (
    topics_over_time.sort_values('Timestamp')
    .drop_duplicates(subset='Topic', keep='first')[['Topic', 'Name', 'Words']]
)

# Frequency columns: Topic x Timestamp; combinations that never occur stay
# NaN, exactly like the cells an outer merge would leave empty.
frequency_table = topics_over_time.pivot_table(
    index='Topic', columns='Timestamp', values='Frequency', aggfunc='sum')
# Keep a column for every timestamp of the corpus, in chronological order
frequency_table = frequency_table.reindex(columns=timestamps_set)
frequency_table.columns = [str(timestamp) for timestamp in timestamps_set]

topic_evolution = (
    topic_labels.merge(frequency_table.reset_index(), on='Topic', how='left')
    .sort_values('Topic')
    .reset_index(drop=True)
)


In [None]:
# A topic that does not occur in a given timestamp has frequency 0. Only the
# frequency columns are filled: a blanket fillna(0) would also write the
# number 0 into the text columns 'Name' and 'Words' wherever they are missing.
frequency_columns = [column for column in topic_evolution.columns
                     if column not in ('Topic', 'Name', 'Words')]
topic_evolution = topic_evolution.fillna(
    {column: 0 for column in frequency_columns})

In [None]:
# Cell output: first 10 rows of the per-topic, per-timestamp frequency table
topic_evolution.head(10)

In [None]:
# All columns of topic_evolution except the three label columns, i.e. the
# per-timestamp frequency columns in their existing order
columns_to_process = list(topic_evolution.columns)
for label_column in ('Topic', 'Name', 'Words'):
    columns_to_process.remove(label_column)
columns_to_process


In [None]:
# For every pair of consecutive frequency columns add a column named
# '<later>-<earlier>' that holds later minus earlier
consecutive_pairs = zip(columns_to_process, columns_to_process[1:])
for earlier, later in consecutive_pairs:
    difference_name = later.split('-')[0] + '-' + earlier.split('-')[0]
    topic_evolution[difference_name] = (
        topic_evolution[later] - topic_evolution[earlier])
topic_evolution.head(10)


In [None]:
# Change of every topic's frequency relative to its previous time slice.
#
# The computation is vectorised with groupby instead of growing a frame with
# DataFrame.append (removed in pandas 2.0). It also no longer rebinds
# `topics`, which holds the per-document topic ids returned by fit_transform
# and is needed again if the DTM cell is re-run.
change_in_topics = (
    topics_over_time[topics_over_time['Topic'] != -1]
    .sort_values(['Topic', 'Timestamp'])
)
frequency_by_topic = change_in_topics.groupby('Topic')['Frequency']

# Frequency of the same topic in the preceding time slice (NaN for the first)
change_in_topics['Previous_Frequency'] = frequency_by_topic.shift(1)
change_in_topics['Change'] = (
    change_in_topics['Frequency'] - change_in_topics['Previous_Frequency'])
# Change as a percentage of the topic's highest frequency over all slices
change_in_topics['%_Change'] = (
    change_in_topics['Change'] * 100 / frequency_by_topic.transform('max'))

# Removes the first slice of every topic, which has no predecessor
change_in_topics = change_in_topics.dropna()
change_in_topics = change_in_topics.groupby(by=['Topic', 'Timestamp']).agg({
    'Frequency': 'sum',
    'Name': 'count',
    'Previous_Frequency': 'sum',
    'Change': 'sum',
    '%_Change': 'mean',
    'Words': ' '.join
}).reset_index()
change_in_topics.head(10)


In [None]:
# Line chart with one line per topic: the 'Change' value at every timestamp
fig = px.line(
    change_in_topics,
    x="Timestamp",
    y="Change",
    color='Topic',
    title='YOY change in topic frequency',
)
fig.show()
