In [None]:
import pandas as pd
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from copy import deepcopy
import plotly.express as px
from nltk.corpus import stopwords
from keras.preprocessing.text import text_to_word_sequence
import swifter
from pymongo import MongoClient

In [None]:
# Two-digit year bounds (as strings); they feed the Mongo query filter
# and the name of the exported CSV further down.
start_year, end_year = '07', '22'

In [2]:
# Open a client against the local MongoDB instance and pick the
# `documents` collection of the `frtp` database.
client = MongoClient(host='127.0.0.1', port=27017)
db = client['frtp']
collection = db['documents']

In [10]:
# Extract every document whose `year` lies in [start_year, end_year).
# The lower bound must be paired with $gte and the upper bound with $lt;
# with the bounds the other way round no document can satisfy both conditions.
# NOTE(review): end_year is exclusive here; switch '$lt' to '$lte' if documents
# from end_year itself should be part of the dataset.
# Assumes `year` is stored as a zero-padded two-digit string (it is parsed with
# format='%y' below), so the string comparison orders years correctly — confirm.
year_filter = {"year": {'$gte': start_year, '$lt': end_year}}
result = collection.find(year_filter)
df = pd.DataFrame(list(result))

# Fail with a clear message instead of a KeyError on the missing 'year' column.
if df.empty:
    raise ValueError(f'No documents found for years {start_year} to {end_year}')

df['year'] = pd.to_datetime(df['year'], format='%y')
df.to_csv(f'collab_dataset_{start_year}_{end_year}.csv')

In [None]:
# Peek at the first five rows of the loaded frame.
df.head(5)

In [None]:
def get_stopwords(first_year=1900, last_year=2030):
    """Build the stop-word set used when cleaning the document text.

    The set is the union of:
      * NLTK's English stop-words, both as provided and with apostrophes
        removed, so a token matches whether or not the tokenizer kept the
        apostrophe;
      * every four-digit year from ``first_year`` to ``last_year`` inclusive;
      * the numbers 0-99, plain and zero-padded to two digits;
      * the tokens ``factors1`` ... ``factors299``;
      * a hand-picked list of domain-specific tokens.

    Args:
        first_year: First year (inclusive) added as a stop-word.
        last_year: Last year (inclusive) added as a stop-word. The default
            reaches past the most recent documents so that recent years are
            filtered like the older ones.

    Returns:
        set[str]: The stop-words.
    """
    base_words = stopwords.words('english')
    stop = set(base_words)
    stop.update(word.replace("'", "") for word in base_words)

    # Add years to prevent spikes
    stop.update(str(year) for year in range(first_year, last_year + 1))

    # Add small numbers, both plain ('7') and zero-padded ('07')
    for num in range(100):
        stop.add(str(num))
        stop.add(f'{num:02d}')

    # Section-heading residue of the form 'factors<N>'
    stop.update(f'factors{number}' for number in range(1, 300))

    # Add these extra stopwords to the list
    # TODO: Look through the corpus and decide which are
    # extra stopwords needed for this specific domain
    extra = [
        'use', 'using', 'uses', 'used', 'based', 'including', 'include',
        'approach', 'factors', 'business', 'risk', 'factors16',
        'wa', 'ha', 'doe', 'item', '1a', 'factor', '1b', '1aitem', '10-k',
        '1arisk', 'factors11', 'factors10k', 'factorsk13', 'could',
        'factorsk10', 'may',
    ]
    stop.update(extra)

    return stop

In [None]:
# Build `clean_text`: lower-case, tokenise with Keras' text_to_word_sequence,
# re-join with single spaces, then drop every token found in the stop-word set.
stop_words = get_stopwords()

lowered = df['text'].str.lower()
tokenised = lowered.swifter.apply(lambda doc: ' '.join(text_to_word_sequence(doc)))
df['clean_text'] = tokenised.swifter.apply(
    lambda doc: ' '.join(token for token in doc.split() if token not in stop_words)
)

In [None]:
# One timestamp per document, taken from the parsed `year` column.
timestamps = df['year'].tolist()

# The raw `text` column (not `clean_text`) is what gets modelled below.
text = df['text'].tolist()

In [None]:
# Encode the documents once with the sentence-transformer, then fit BERTopic
# on the same documents, passing the vectors in as precomputed embeddings.
sentence_model = SentenceTransformer("all-mpnet-base-v2")
embeddings = sentence_model.encode(text, show_progress_bar=True)

topic_model = BERTopic(
    language="english",
    nr_topics=50,
    calculate_probabilities=True,
    verbose=True,
)
topics, probs = topic_model.fit_transform(text, embeddings=embeddings)

In [None]:
# Explore a single topic: display what get_topic returns for topic id 18.
topic_model.get_topic(18)

In [None]:
# Overview of the topics: fetch the topic summary table, save it as
# topic_info.csv (relative path), and display it as the cell output.
topic_info = topic_model.get_topic_info()
topic_info.to_csv('topic_info.csv')
topic_info

In [None]:
# Display the fitted model's topic-hierarchy visualisation as the cell output.
topic_model.visualize_hierarchy()

In [None]:
# Display the fitted model's bar-chart visualisation (default arguments).
topic_model.visualize_barchart()

In [None]:
# Display the fitted model's heatmap visualisation (default arguments).
topic_model.visualize_heatmap()

In [None]:
# Display the fitted model's topic-overview visualisation (default arguments).
topic_model.visualize_topics()

# Dynamic Topic Modeling (DTM)

In [None]:
# Running DTM (dynamic topic modeling) on the entire dataset: one call over all
# documents, their fitted topic assignments and their timestamps.
# NOTE(review): the positional order (docs, topics, timestamps) follows older
# BERTopic releases; newer releases appear to take `timestamps` as the second
# positional argument — confirm against the installed version.
topics_over_time = topic_model.topics_over_time(text, topics, timestamps)

In [None]:
# Display the topics-over-time table as the cell output.
topics_over_time

In [None]:
# First 10 rows of the topics-over-time table.
# NOTE(review): head() does not sort, so these are the first rows in whatever
# order topics_over_time() produced them — not necessarily the most frequent topics.
topics_over_time.head(10)

In [None]:
# Plot topics over time, restricted via top_n_topics=10.
topic_model.visualize_topics_over_time(topics_over_time,top_n_topics=10)

In [None]:
# Plot topics over time with the default arguments (no top_n_topics restriction).
topic_model.visualize_topics_over_time(topics_over_time)

In [None]:
# Reshape topics_over_time (long: one row per topic and timestamp) into
# topic_evolution (wide: one row per topic, columns Topic / Name / Words followed
# by one frequency column per timestamp, each named str(timestamp), in
# chronological order).
timestamps_set = sorted(set(timestamps))

# Take each topic's Name/Words from the earliest timestamp in which that topic
# occurs. Reading them only from the very first timestamp would leave the labels
# empty for every topic that first shows up later.
topic_labels = (
    topics_over_time
    .sort_values('Timestamp', kind='stable')
    .drop_duplicates(subset='Topic', keep='first')
    .set_index('Topic')[['Name', 'Words']]
)

# One frequency column per timestamp; (topic, timestamp) pairs without a row
# stay NaN. pivot() raises if a (Topic, Timestamp) pair occurs more than once.
topic_frequencies = (
    topics_over_time
    .pivot(index='Topic', columns='Timestamp', values='Frequency')
    .reindex(columns=timestamps_set)
)
topic_frequencies.columns = [str(timestamp) for timestamp in timestamps_set]

# Rows end up ordered by topic id (the pivot's index order).
topic_evolution = topic_labels.join(topic_frequencies, how='right').reset_index()

In [None]:
# Replace every missing value with 0. This applies to all columns of
# topic_evolution, not only to the per-timestamp frequency columns.
topic_evolution = topic_evolution.fillna(0)

In [None]:
# Show the first 10 rows of the wide topic-evolution table.
topic_evolution.head(10)

In [None]:
# Everything except the three label columns is a per-timestamp frequency column.
label_columns = ('Topic', 'Name', 'Words')
columns_to_process = topic_evolution.columns.to_list()
for label in label_columns:
    columns_to_process.remove(label)
columns_to_process

In [None]:
# For each pair of neighbouring frequency columns add a difference column
# (later minus earlier), named '<later>-<earlier>' from the part of each
# column name before its first '-'.
for earlier, later in zip(columns_to_process, columns_to_process[1:]):
    diff_label = later.split('-')[0] + '-' + earlier.split('-')[0]
    topic_evolution[diff_label] = topic_evolution[later] - topic_evolution[earlier]
topic_evolution.head(10)

In [None]:
# Per-topic change in frequency between consecutive timestamps.
#   Previous_Frequency : the topic's frequency at the preceding timestamp
#   Change             : Frequency - Previous_Frequency
#   %_Change           : Change as a percentage of the topic's maximum frequency
# The loop variable is deliberately not called `topics`, so the per-document
# topic assignments produced by fit_transform stay intact for re-runs.
per_topic_frames = []
for topic_id in topics_over_time['Topic'].unique():
    topic_df = topics_over_time[topics_over_time['Topic'] == topic_id].sort_values('Timestamp')
    topic_df['Previous_Frequency'] = topic_df['Frequency'].shift(1)
    topic_df['Change'] = topic_df['Frequency'] - topic_df['Previous_Frequency']
    topic_df['%_Change'] = topic_df['Change'] * 100 / topic_df['Frequency'].max()
    per_topic_frames.append(topic_df)

# Concatenate once at the end (DataFrame.append no longer exists in pandas 2.x).
# pd.concat rejects an empty list, so fall back to an empty frame with the same columns.
if per_topic_frames:
    change_in_topics = pd.concat(per_topic_frames, ignore_index=True)
else:
    change_in_topics = pd.DataFrame(
        columns=[*topics_over_time.columns, 'Previous_Frequency', 'Change', '%_Change']
    )

# Drop topic -1 (BERTopic's outlier topic) and rows containing NaN — which
# includes each topic's first timestamp, where there is no previous frequency.
change_in_topics = change_in_topics[change_in_topics['Topic'] != -1].dropna()
change_in_topics.head(10)

In [None]:
# One line per topic. Lines are grouped by 'Topic' rather than 'Words': the
# 'Words' value is stored per (topic, timestamp) row and can differ between
# timestamps of the same topic, which would split a single topic into several
# disconnected traces. 'Words' is kept as hover information instead.
fig = px.line(
    change_in_topics,
    x="Timestamp",
    y="%_Change",
    color='Topic',
    hover_data=['Words'],
    title='YOY change in topic frequency',
)
fig.show()