# BERTopic Model Analysis

BERTopic is a topic modeling technique that leverages 🤗 transformers and c-TF-IDF to create dense clusters allowing for easily interpretable topics whilst keeping important words in the topic descriptions.

### This will be used for the Topic Modelling phase
It implements dynamic topic modelling (DTM) and multiple visualisations of the extracted topics


### Setup phase
- Installing third-party libraries and extensions
- Import data

In [None]:
import re
import string
from copy import deepcopy
import tensorflow as tf
import nltk
import pandas as pd
import plotly.express as px
import spacy
import swifter
import tensorflow_hub
from bertopic import BERTopic
from flair.embeddings import TransformerDocumentEmbeddings
from hdbscan import HDBSCAN
from keras.preprocessing.text import text_to_word_sequence
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from pymongo import MongoClient
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from tqdm.notebook import tqdm
from umap import UMAP


In [None]:
# Ask spaCy to use the GPU when one is available
spacy.prefer_gpu()

# NLTK resources used by the cleaning pipelines below
nltk.download('stopwords')
# WordNetLemmatizer is instantiated later in this notebook; it needs the
# WordNet corpus itself, not only the Open Multilingual WordNet add-on
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')


### Adjust parameters

In [None]:
# For loading data directly from a local file
local_data = False
dataset = 'collab_dataset_07_22__220000'

# For extracting data from a MongoDB instance
# (both bounds are two-digit year strings)
start_year = '07'
end_year = '22'
size_cutoff = 220000  # Maximum value: 740000

# Available cleaning methods are:
# - emma: The cleaning pipeline used by Emma Tattershall in the paper:
#                     https://github.com/etattershall/burst-detection
# - nltk: A cleaning pipeline using the builtin methods
#         from NLTK library
# - custom: A cleaning pipeline built using several iterations
#           and experiments to get the best results for BERTopic
cleaning_method = 'custom'

# Available embedding methods are:
# - sentence-transformers: https://www.sbert.net/docs/pretrained_models.html
# - flair: using flair library to utilize any hugging face model available
# - spacy: using spaCy English Transformers model
# - use: Universal Sentence Encoder from tensorflow
embedding = 'sentence-transformers'

# BERTopic parameter tuning

# Available hyperparameter tuning options:
# - default : uses the default BERTopic configuration
# - custom : uses separate submodules for each step of the algorithm
#           it requires extra parameters to be set (default values for each
#           parameter are the best based on the experiments ran)
# - full : uses a breakdown of BERTopic to drill further down into the separate
#          submodules for each step of the algorithm. (default values for each
#          parameter are the best based on the experiments ran)
#
# By default, the model will be selected to work on english and
# will compute probabilities for each corresponding topic
# NOTE(review): no calculate_probabilities argument is passed where the model
# is built - confirm that probabilities are actually computed.

model_complexity = 'default'

# custom parameters (also applicable for full version)
top_n_words = 15
n_gram_range = (1, 3)
nr_topics = 100

# full parameters

# UMAP parameters
low_memory = False
n_neighbors = 15
n_components = 10

# HDBSCAN parameters
min_cluster_size = 10
min_samples = 1
metric = 'euclidean'
prediction_data = True

# Identifier embedded in the names of the saved model and of the DTM export.
# It lists every setting that distinguishes one run from another.
if model_complexity == 'default':
    parameters_str = f'local_{cleaning_method}_{embedding}_{model_complexity}'
elif model_complexity == 'custom':
    parameters_str = f'local_{cleaning_method}_{embedding}_{model_complexity}_{top_n_words}_{n_gram_range}_{nr_topics}'
else:
    # The full configuration also depends on the UMAP / HDBSCAN settings, so
    # they are part of the identifier; otherwise two "full" runs with
    # different settings would overwrite each other's artefacts.
    parameters_str = (
        f'local_{cleaning_method}_{embedding}_{model_complexity}_{top_n_words}_{n_gram_range}_{nr_topics}_'
        f'{n_neighbors}_{n_components}_{min_cluster_size}_{min_samples}_{metric}'
    )


### Loading the data

In [None]:
if not local_data:
    # Connect to the MongoDB instance listening on this machine
    client = MongoClient('127.0.0.1', 27017)
    db = client.frtp
    collection = db.documents

    # Only documents whose year lies in [start_year, end_year) and whose
    # size is below the cut-off are fetched
    result = collection.find({
        "year": {'$lt': end_year, '$gte': start_year},
        "size": {'$lt': size_cutoff}
    })
    df = pd.DataFrame(list(result))
    # The stored year is parsed as a two-digit year ('%y')
    df['year'] = pd.to_datetime(df['year'], format='%y')
else:
    # Read a previously exported dataset from the working directory
    df = pd.read_csv(f'{dataset}.csv')

### Cleaning the dataset

In [None]:
# cleaning method - emma (this is the preprocessing pipeline that was used in the burstiness model paper)

# Regex fragments used by the sentence splitter. They are raw strings so
# that sequences such as \s reach the regex engine without relying on
# Python's handling of unknown string escapes.
alphabets = r"([A-Za-z])"
prefixes = r"(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = r"(Inc|Ltd|Jr|Sr|Co)"
starters = r"(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = r"[.](com|net|org|io|gov)"
htmltags = r'<[^>]+>'
htmlspecial = r'&#?[xX]?[a-zA-Z0-9]{2,8};'

# Marker tokens inserted around documents and between sentences
start_delimiter = 'documentstart'
sent_delimiter = 'sentenceboundary'
end_delimiter = 'documentend'

# Each marker is repeated (ngram_length - 1) times when sentences are padded
ngram_length = 3

delimiters = [start_delimiter, sent_delimiter, end_delimiter]

# Instantiate the WordNet lemmatiser used by cleaning_pipeline
wnl = WordNetLemmatizer()

# Create a tokeniser: the analyser callable of a CountVectorizer that is
# configured to strip accents to ASCII
count = CountVectorizer(strip_accents='ascii', min_df=1)
tokeniser = count.build_analyzer()


def normalise_acronymns(text):
    '''
    Remove the period that follows a single letter which is not preceded by
    a word character, e.g. "U.S.A." becomes "USA" and "e.g." becomes "eg".
    Adapted from the method found at https://stackoverflow.com/a/40197005

    The character class is [A-Za-z] only: a class written as [A-Z, a-z]
    would also accept a comma or a space as the "letter" and delete the
    period after it.
    '''
    return re.sub(r'(?<!\w)([A-Za-z])\.', r'\1', text)


def normalise_decimals(text):
    '''
    Replace a period that sits between two digits with the word POINT,
    so "3.14" becomes "3POINT14".
    '''
    digit_dot_digit = re.compile(r'([0-9])\.([0-9])')
    return digit_dot_digit.sub(
        lambda found: found.group(1) + 'POINT' + found.group(2), text)


def split_into_sentences(text):
    '''
    Sentence splitter adapted from https://stackoverflow.com/a/31505798

    Takes a text string and returns a list of stripped sentence strings.
    Periods that do not end a sentence (titles, web domains, initials,
    acronyms, company suffixes) are neutralised first; the remaining
    ".", "?" and "!" are then treated as sentence boundaries. Fragments
    without two consecutive alphanumeric characters are discarded.

    The order of the substitutions matters: each step relies on the text
    produced by the previous ones.
    '''
    text = text.replace("\n", " ")
    # Protect the period after titles and before web domain endings.
    # These <prd> placeholders match the htmltags pattern, so the tag
    # removal just below replaces them with a space.
    text = re.sub(prefixes, "\\1<prd>", text)
    text = re.sub(websites, "<prd>\\1", text)

    # my addition
    text = re.sub(htmltags, " ", text)
    text = re.sub(htmlspecial, " ", text)

    if "Ph.D" in text:
        text = text.replace("Ph.D.", "PhD")

    # Single-letter initial followed by ". ": the period and the space after
    # it are both dropped, so the letter is joined to the following word.
    # NOTE(review): confirm that joining is intended.
    text = re.sub(r"\s" + alphabets + "[.] ", " \\1", text)
    # An acronym directly followed by a sentence starter ends a sentence
    text = re.sub(acronyms+" "+starters, "\\1<stop> \\2", text)
    # Remove the periods inside three-letter and two-letter acronyms
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]", "\\1\\2\\3", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]", "\\1\\2", text)
    # Remove the period after company/name suffixes and after lone letters
    text = re.sub(" "+suffixes+"[.] "+starters, " \\1 \\2", text)
    text = re.sub(" "+suffixes+"[.]", " \\1", text)
    text = re.sub(" " + alphabets + "[.]", " \\1", text)

    # Move terminal punctuation outside closing quotation marks so that the
    # quote stays with its sentence
    if "”" in text:
        text = text.replace(".”", "”.")
    if "\"" in text:
        text = text.replace(".\"", "\".")
    if "!" in text:
        text = text.replace("!\"", "\"!")
    if "?" in text:
        text = text.replace("?\"", "\"?")

    # Every remaining terminator marks a sentence boundary
    text = text.replace(".", "<stop>")
    text = text.replace("?", "<stop>")
    text = text.replace("!", "<stop>")

    sentences = text.split("<stop>")
    sentences = [s.strip() for s in sentences]

    non_empty = []

    for s in sentences:
        # we require that there be two alphanumeric characters in a row
        if len(re.findall("[A-Za-z0-9][A-Za-z0-9]", s)) > 0:
            non_empty.append(s)
    return non_empty


def pad_sentences(sentences):
    '''
    Join a list of sentences into one string framed by marker tokens:
        - start_delimiter is placed before the first sentence
        - sent_delimiter is placed between consecutive sentences
        - end_delimiter is placed after the last sentence

    Every marker is repeated (ngram_length - 1) times.
    '''
    repeats = ngram_length - 1

    opening = (start_delimiter + ' ') * repeats
    between = ' ' + (sent_delimiter + ' ') * repeats
    closing = (' ' + end_delimiter) * repeats

    return opening + between.join(sentences) + closing


def get_stopwords():
    '''
    Build the stop-word set used by the emma cleaning pipeline.

    The set contains the NLTK English stop words with apostrophes removed,
    the years 1900-2019, the numbers 0-99 (single digits also in their
    zero-padded form) and a list of dataset-specific terms.
    '''
    stop = {s.replace("'", "") for s in stopwords.words('english')}

    # Add years to prevent spikes
    stop.update(str(year) for year in range(1900, 2020))

    # Add small numbers; single digits are added both as "7" and as "07"
    for num in range(0, 100):
        stop.add(str(num))
        if num < 10:
            stop.add('0' + str(num))

    # Add several specific terms considered stopwords based on
    # manual dataset inspection.
    # A comma is required after every entry: adjacent string literals are
    # concatenated, which would turn 'risk' 'wa' into the single word 'riskwa'.
    extra = [
        'use', 'using', 'uses', 'used', 'based', 'including', 'include', 'approach', 'factors', 'business', 'risk',
        'wa', 'ha', 'doe', 'item', '1a', 'factor', '1b', '1aitem', '10-k', '1AITEM', 'could', 'regarding',
    ]
    stop.update(extra)
    return stop


def cleaning_pipeline(text):
    '''
    Clean one document and return it as a single space-separated string of
    lemmatised tokens, with delimiter tokens marking the document start,
    the sentence boundaries and the document end.
    '''
    stop_set = get_stopwords()

    # Neutralise acronym and decimal periods, then drop stop words. The
    # comparison happens before lowercasing, so it is case-sensitive.
    prepared = normalise_decimals(normalise_acronymns(text))
    prepared = ' '.join(token for token in prepared.split() if token not in stop_set)

    # Per sentence: hyphens and slashes become spaces, every other
    # punctuation character is removed, and the result is lowercased
    clean_sentences = [
        re.sub(r'[^\w\s]', '', re.sub(r'[-/]', ' ', sentence)).lower()
        for sentence in split_into_sentences(prepared)
    ]

    # Pad sentences with delimiters
    padded = pad_sentences(clean_sentences)

    # Lemmatise word by word
    return ' '.join(wnl.lemmatize(token) for token in tokeniser(padded))


def cleaning_pipeline_df(row):
    '''
    Row-wise adapter for cleaning_pipeline: stores the cleaned version of
    row['text'] in row['clean_text'] and returns the same row.
    '''
    cleaned = cleaning_pipeline(row['text'])
    row['clean_text'] = cleaned
    return row


In [None]:
# cleaning method - custom
def get_stopwords_manual():
    '''
    Build the stop-word set used by the custom cleaning method.

    The set contains the NLTK English stop words with apostrophes removed,
    the years 1900-2019, the numbers 0-99 (single digits also zero-padded),
    the strings "factors1" to "factors299" and a list of terms chosen by
    manual inspection of the dataset.
    '''
    # Terms considered stopwords based on manual dataset inspection
    extra = [
        'use', 'using', 'uses', 'used', 'based', 'including', 'include',
        'approach', 'factors', 'business', 'risk', 'factors16',
        'wa', 'ha', 'doe', 'item', '1a', 'factor', '1b', '1aitem', '10-k',
        'item', '1arisk', 'factors11', '1arisk', 'factors10k', 'factorsk13', 'could',
        'factorsk10', 'may', 'looking', 'forward', 'statement'
    ]

    english = stopwords.words('english')
    stop = {entry.replace("'", "") for entry in english}

    # Years, to prevent spikes
    stop.update(str(year) for year in range(1900, 2020))

    # Small numbers, plus the zero-padded form of the single digits
    stop.update(str(num) for num in range(0, 100))
    stop.update('0' + str(num) for num in range(0, 10))

    # "factors" fused with a number appears often as a reference
    stop.update('factors' + str(number) for number in range(1, 300))

    stop.update(extra)
    return stop


In [None]:
# cleaning method - nltk
# Characters treated as punctuation: everything in string.punctuation except
# the straight apostrophe, followed by the typographic apostrophe
punct = [symbol for symbol in string.punctuation if symbol != "'"]
punct.append('’')


def remove_punctuations(text):
    '''
    Return text with every character listed in punct replaced by a space.
    '''
    blank_out = {ord(symbol): ' ' for symbol in punct}
    return text.translate(blank_out)


def nlp(df):
    '''
    NLTK-based cleaning of the 'text' column of df.

    Adds three columns to df and returns it:
        - 'token': lowercased text without newlines, URLs, "&gt;" entities
          and punctuation
        - 'final_text': list of alphabetic, non-stop-word tokens
        - 'clean_text': the tokens of 'final_text' joined with spaces
    '''
    df['token'] = df['text'].swifter.apply(lambda x: x.lower())
    df['token'] = df['token'].swifter.apply(lambda x: x.replace('\n', ' '))
    # regex=True is explicit: pandas 2.0 changed the default to a literal
    # match, which would leave every URL in place
    df['token'] = df['token'].str.replace(r'http\S+|www.\S+', '', case=False, regex=True)
    df['token'] = df['token'].swifter.apply(lambda x: x.replace('&gt;', ''))
    df['token'] = df['token'].swifter.apply(remove_punctuations)
    # Drop the lone "s" left behind when a possessive apostrophe is removed
    df['token'] = df['token'].swifter.apply(lambda x: str(x).replace(" s ", " "))
    df['final_text'] = df['token'].swifter.apply(word_tokenize)
    my_stopwords = set(stopwords.words('english'))
    df['final_text'] = df['final_text'].swifter.apply(
        lambda text_list: [x for x in text_list if x not in my_stopwords])
    # Keep purely alphabetic tokens only
    df['final_text'] = df['final_text'].swifter.apply(lambda list_data: [x for x in list_data if x.isalpha()])
    df['clean_text'] = df['final_text'].swifter.apply(lambda row: ' '.join(row))
    return df


In [None]:
# Fill df['clean_text'] with the pipeline selected by cleaning_method;
# any value other than 'emma' or 'custom' falls back to the NLTK pipeline
if cleaning_method == 'emma':
    df['clean_text'] = ''
    df = df.swifter.apply(cleaning_pipeline_df, axis=1)
elif cleaning_method == 'custom':
    stop_words = get_stopwords_manual()
    df['clean_text'] = df['text'].str.lower()
    # Tokenise with Keras and re-join with single spaces
    df['clean_text'] = df['clean_text'].swifter.apply(
        lambda doc: ' '.join(text_to_word_sequence(doc)))
    # Drop the stop words
    df['clean_text'] = df['clean_text'].swifter.apply(
        lambda doc: ' '.join(token for token in doc.split() if token not in stop_words))
else:
    df = nlp(df)


In [None]:
# Extracting the text with timestamps: two plain Python lists taken from
# the 'year' and 'clean_text' columns of df
timestamps = df.year.to_list()
text = df.clean_text.to_list()

### Topic Modelling utilising BERTopic

In [None]:
# Select the embedding backend; an unrecognised value falls back to a
# sentence-transformers model
if embedding == 'sentence-transformers':
    embedding_model = SentenceTransformer("all-roberta-large-v1")
elif embedding == 'flair':
    embedding_model = TransformerDocumentEmbeddings(
        'sentence-transformers/all-MiniLM-L6-v2')
elif embedding == 'spacy':
    spacy.prefer_gpu()
    # The pipeline components that are not needed are excluded at load time
    embedding_model = spacy.load("en_core_web_trf", exclude=['tagger', 'parser', 'ner',
                                                             'attribute_ruler', 'lemmatizer'])
elif embedding == 'use':
    embedding_model = tensorflow_hub.load(
        "https://tfhub.dev/google/universal-sentence-encoder/4")
else:
    embedding_model = SentenceTransformer("all-mpnet-base-v1")

# Build the topic model for the chosen level of configuration
if model_complexity == 'default':
    topic_model = BERTopic(verbose=True, language="english", nr_topics="auto",
                           embedding_model=embedding_model)
elif model_complexity == 'custom':
    topic_model = BERTopic(verbose=True, n_gram_range=n_gram_range,
                           language="english", top_n_words=top_n_words,
                           nr_topics=nr_topics, low_memory=low_memory,
                           embedding_model=embedding_model)
else:
    umap_model = UMAP(n_neighbors=n_neighbors, n_components=n_components, metric=metric,
                      low_memory=low_memory)
    # min_samples is forwarded so that the configured HDBSCAN value is
    # actually applied
    hdbscan_model = HDBSCAN(min_cluster_size=min_cluster_size, min_samples=min_samples,
                            metric=metric, prediction_data=prediction_data)
    # top_n_words is documented as applying to the full configuration too
    # and is part of the run identifier, so it is passed here as well
    topic_model = BERTopic(verbose=True, n_gram_range=n_gram_range,
                           top_n_words=top_n_words,
                           nr_topics=nr_topics, umap_model=umap_model,
                           hdbscan_model=hdbscan_model,
                           embedding_model=embedding_model)

# Fit the model on the cleaned documents
topics, probs = topic_model.fit_transform(text)


In [None]:
# Due to a bug that I had to report on library's page: https://github.com/MaartenGr/BERTopic/issues/470
# The model is therefore not saved when the 'use' embedding is selected.
# NOTE(review): the "models" directory presumably has to exist already - confirm.
if embedding != 'use':
    topic_model.save(f"models/BERTopic_model_{parameters_str}")


In [None]:
# Explore a topic: display what the model returns for topic id 21
# (the id is hard-coded; change it to inspect another topic)
topic_model.get_topic(21)

In [None]:
# Overview of the topics: keep the table in topic_info and display it
topic_info = topic_model.get_topic_info()
topic_info

In [None]:
# Display the figure returned by the model's hierarchy visualisation
topic_model.visualize_hierarchy()

In [None]:
# Display the figure returned by the model's bar chart visualisation
topic_model.visualize_barchart()

In [None]:
# Display the figure returned by the model's heatmap visualisation
topic_model.visualize_heatmap()

In [None]:
# Display the figure returned by the model's topic visualisation
topic_model.visualize_topics()

### Utilising DTM to convert the resulting topics into a time series

In [None]:
# Running DTM on the entire dataset: the documents, their assigned topics
# and their timestamps are passed as three parallel lists
topics_over_time = topic_model.topics_over_time(text, topics, timestamps)

### Cleaning DTM results

In [None]:
# TODO: Decide if it is always excluded
# Rows labelled with topic -1 are removed
# (presumably the outlier bucket - confirm against the BERTopic docs)
topics_over_time = topics_over_time.loc[topics_over_time['Topic'].ne(-1)]

In [None]:
# topics_over_time = topics_over_time.drop(columns=['Unnamed: 0']).sort_values('Topic')

In [None]:
# Topics whose word list contains "smaller" at any timestamp are removed
# entirely, i.e. at every timestamp
smaller_reporting_topics = topics_over_time.loc[
    topics_over_time['Words'].str.contains("smaller"), 'Topic'].unique()
topics_over_time = topics_over_time.loc[
    ~topics_over_time['Topic'].isin(smaller_reporting_topics)]

In [None]:
# Topics whose word list contains the sequence ", , ," (several blank
# entries in a row) at any timestamp are removed entirely
spaces_topics = topics_over_time.loc[
    topics_over_time['Words'].str.contains(", , ,"), 'Topic'].unique()
topics_over_time = topics_over_time.loc[
    ~topics_over_time['Topic'].isin(spaces_topics)]

In [None]:
# First 10 entries in the timeseries generated by DTM (after the filters above)
topics_over_time.head(10)

In [None]:
# Plot the filtered DTM result, limited with top_n_topics=10
topic_model.visualize_topics_over_time(topics_over_time,top_n_topics=10)

In [None]:
# Plot the filtered DTM result without passing a topic limit
topic_model.visualize_topics_over_time(topics_over_time)

In [None]:
# Export the filtered DTM result; the file name carries the run identifier.
# NOTE(review): the output/DTM directory presumably has to exist already - confirm.
topics_over_time.to_csv(f'output/DTM/DTM_{parameters_str}.csv')

### YoY Changes in importance of topics extracted

In [None]:
# Build a wide table: one row per topic and one frequency column per
# timestamp (the column name is the timestamp converted to a string)
topic_evolution = pd.DataFrame()
timestamps_set = sorted(set(timestamps))
for timestamp in timestamps_set:
    frequency_column = {'Frequency': str(timestamp)}
    temp_df = topics_over_time[topics_over_time['Timestamp'] == timestamp]
    if topic_evolution.shape[0] != 0:
        # Later timestamps contribute only their frequency column; the outer
        # merge keeps topics that are missing on either side
        temp_df = temp_df[['Topic', 'Frequency']].rename(columns=frequency_column)
        topic_evolution = topic_evolution.merge(temp_df, on='Topic', how='outer')
    else:
        # While the table is still empty, the descriptive columns are kept too
        temp_df = temp_df[['Topic', 'Name', 'Words', 'Frequency']].rename(columns=frequency_column)
        topic_evolution = deepcopy(temp_df)


In [None]:
# Missing cells left by the outer merges are set to 0. This applies to every
# column, including 'Name' and 'Words'.
topic_evolution = topic_evolution.fillna(0)

In [None]:
# Display the first 10 rows of the wide per-topic table
topic_evolution.head(10)

In [None]:
# Everything except the descriptive columns is a per-timestamp frequency column
columns_to_process = topic_evolution.columns.to_list()
for descriptive_column in ('Topic', 'Name', 'Words'):
    columns_to_process.remove(descriptive_column)
columns_to_process


In [None]:
# For each pair of consecutive frequency columns add a difference column
# named "<later>-<earlier>", where each part is the text before the first
# '-' of the corresponding column name
for earlier, later in zip(columns_to_process, columns_to_process[1:]):
    new_column = later.split('-')[0] + '-' + earlier.split('-')[0]
    topic_evolution[new_column] = topic_evolution[later] - topic_evolution[earlier]
topic_evolution.head(10)


In [None]:
# Per topic, compute the change in frequency between consecutive timestamps.
# NOTE(review): this rebinds `topics`, which earlier held the per-document
# topic assignments returned by fit_transform; re-running the DTM cell after
# this one would therefore receive the wrong value.
topics = topics_over_time.Topic.unique()
per_topic_frames = []
for topic in topics:
    topic_df = topics_over_time[topics_over_time['Topic'] == topic]
    topic_df = topic_df.sort_values('Timestamp')
    topic_df['Previous_Frequency'] = topic_df.Frequency.shift(1)
    topic_df['Change'] = topic_df['Frequency'] - topic_df['Previous_Frequency']
    # The change is expressed relative to the topic's peak frequency
    max_freq = max(topic_df['Frequency'])
    topic_df['%_Change'] = topic_df['Change']*100/max_freq
    per_topic_frames.append(topic_df)

# DataFrame.append was removed in pandas 2.0; the frames are collected in a
# list and concatenated once instead
if per_topic_frames:
    change_in_topics = pd.concat(per_topic_frames, ignore_index=True)
else:
    change_in_topics = pd.DataFrame()

change_in_topics = change_in_topics[change_in_topics['Topic'] != -1]
# Drops every row with a missing value, which includes the first timestamp
# of each topic (it has no previous frequency)
change_in_topics = change_in_topics.dropna()
change_in_topics = change_in_topics.groupby(by=['Topic', 'Timestamp']).agg({
    'Frequency': 'sum',
    'Name': 'count',
    'Previous_Frequency': 'sum',
    'Change': 'sum',
    '%_Change': 'mean',
    'Words': ' '.join
}).reset_index()
change_in_topics.head(10)


In [None]:
# Line chart of the 'Change' column over 'Timestamp', one line per topic
fig = px.line(change_in_topics, x="Timestamp", y="Change", color='Topic', title='YOY change in topic frequency')
fig.show()
