# Import packages and data

In [None]:
import numpy as np
import pandas as pd
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import KeyBERTInspired
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
from sentence_transformers import SentenceTransformer
import html
import datetime
import re
import seaborn as sns

import matplotlib
import matplotlib.pyplot as plt

In [None]:
# Load the final Mastodon dataset written by the data-collection stage.
mastodon_feather_path = "../1_data_collection/data/8_mastodon_final_data.feather"
mastodon_data = pd.read_feather(mastodon_feather_path)

In [None]:
# Clean the combined post text before topic modelling:
#   1. decode HTML entities (e.g. "&amp;" -> "&");
#   2. drop every case-insensitive occurrence of "chatgpt";
#   3. strip "#" characters;
#   4. strip the literal "@twitter.com" suffix.
# Steps 3 and 4 are literal replacements, so `regex=False` is passed
# explicitly: the default of `regex` differs between pandas versions, and when
# the pattern is treated as a regex the "." in "@twitter.com" matches any
# character.
text_clean = (
    mastodon_data["text_combine"]
    .map(html.unescape)
    .str.replace("chatgpt", "", case=False, regex=True)
    .str.replace("#", "", regex=False)
    .str.replace("@twitter.com", "", regex=False)
)
mastodon_data["text_combine"] = text_clean

In [None]:
# Documents handed to BERTopic: the cleaned, combined post text column.
docs = mastodon_data.loc[:, "text_combine"]

In [None]:
# Preview the first 15 cleaned documents.
docs.iloc[:15]

# Model train

First model fit

In [None]:
# Sentence-embedding model used to embed the documents.
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")

# Dimensionality reduction applied before clustering; the seed is fixed so
# repeated runs use the same random state.
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric="cosine",
    random_state=42,
)

# Bag-of-words vectorizer for the topic representations: uni- to tri-grams,
# English stop words removed, vocabulary capped at 10,000 features.
vectorizer_model = CountVectorizer(
    stop_words="english",
    ngram_range=(1, 3),
    max_features=10_000,
)

topic_model = BERTopic(
    embedding_model=sentence_model,
    umap_model=umap_model,
    vectorizer_model=vectorizer_model,
    min_topic_size=75,
    nr_topics="auto",
    verbose=True,
)

# Fit the model and keep the per-document topic assignments and probabilities.
topics, probs = topic_model.fit_transform(docs)

In [None]:
# Summary table of the topics found by the first fit.
topic_overview = topic_model.get_topic_info()
topic_overview

## Analyze first fit

Inter-topic distance map

In [None]:
# Topic summary table.
display(topic_model.get_topic_info())

# Text rendering of the topic hierarchy.
hierarchical_topics = topic_model.hierarchical_topics(docs)
topic_tree = topic_model.get_topic_tree(hierarchical_topics)
print(topic_tree)

# Inter-topic distance map, followed by the topic similarity heatmap.
fig = topic_model.visualize_topics()
similarity_heatmap = topic_model.visualize_heatmap()
display(fig)
display(similarity_heatmap)

In [None]:
# Look at the representative documents of topic 27.
inspected_topic = 27
representative_docs = topic_model.get_representative_docs(topic=inspected_topic)
display(representative_docs)

# Merge topics

1st merge

In [None]:
# Look at the representative documents of topic 22 before deciding on merges.
inspected_topic = 22
representative_docs = topic_model.get_representative_docs(topic=inspected_topic)
display(representative_docs)

In [None]:
# First merge: collapse topics 2, 39 and 34 into a single topic.
# Author's notes on the themes involved:
#   chatgpt_ai and code write
#   google_bard and google search
topics_to_merge = [[2, 39, 34]]
topic_model.merge_topics(docs, topics_to_merge)

In [None]:
# Topic summary after the first merge.
display(topic_model.get_topic_info())

# Recompute and print the topic hierarchy for the merged model.
hierarchical_topics = topic_model.hierarchical_topics(docs)
topic_tree = topic_model.get_topic_tree(hierarchical_topics)
print(topic_tree)

# Inter-topic distance map and topic similarity heatmap.
fig = topic_model.visualize_topics()
display(fig, topic_model.visualize_heatmap())

2nd merge

In [None]:
# Look at the representative documents of topic 17.
inspected_topic = 17
representative_docs = topic_model.get_representative_docs(topic=inspected_topic)
display(representative_docs)

In [None]:
# Token-level topic distributions for every document.
topic_distr, topic_token_distr = topic_model.approximate_distribution(
    docs,
    calculate_tokens=True,
)

# Show how the topics spread over the tokens of one example document.
example_idx = 1
df = topic_model.visualize_approximate_distribution(
    docs[example_idx], topic_token_distr[example_idx]
)
df

In [None]:
# Second merge: each inner list is a group of topic ids collapsed into one topic.
topics_to_merge = [
    [4, 19, 1],
    [2, 25, 18],
    [3, 11],
    [13, 26, 31, 32],
    [27, 35],
    [10, 34],
    [22, 36, 15, 12, 21, 5, 8, 17, 29, 33, 14, 28, 9, 23],
    [30, 7],
    [6, 24],
    [0, 37],
]

merge_docs = mastodon_data["text_combine"]
topic_model.merge_topics(merge_docs, topics_to_merge)

In [None]:
# Topic summary after the second merge.
display(topic_model.get_topic_info())

# Recompute and print the topic hierarchy for the merged model.
hierarchical_topics = topic_model.hierarchical_topics(docs)
topic_tree = topic_model.get_topic_tree(hierarchical_topics)
print(topic_tree)

# Inter-topic distance map and topic similarity heatmap.
fig = topic_model.visualize_topics()
display(fig)
display(topic_model.visualize_heatmap())

# Bar charts of the top 10 words for up to 35 topics.
terms = topic_model.visualize_barchart(
    top_n_topics=35,
    n_words=10,
    height=350,
)
display(terms)

In [None]:
# Look at the representative documents of topic 13.
inspected_topic = 13
representative_docs = topic_model.get_representative_docs(topic=inspected_topic)
display(representative_docs)

In [None]:
# Token-level topic distributions for every document, recomputed for the
# merged model.
topic_distr, topic_token_distr = topic_model.approximate_distribution(
    docs, calculate_tokens=True
)

# Visualize the token-level distribution of one document. The document and its
# token distribution must come from the same position: the previous version
# paired document 13 with the distribution of document 1. `.iloc` is used so
# the lookup is positional, matching how `topic_token_distr` is indexed.
doc_position = 13
df = topic_model.visualize_approximate_distribution(
    docs.iloc[doc_position], topic_token_distr[doc_position]
)
df

# Reduce outliers

In [None]:
# Reassign outlier documents to topics using the c-TF-IDF strategy.
topics = topic_model.topics_
new_topics = topic_model.reduce_outliers(docs, topics, strategy="c-tf-idf")

# Pass the new assignments by keyword: the second positional parameter of
# `update_topics` is not `topics` in every BERTopic release (newer releases
# put `images` there), so a positional call can silently ignore them.
topic_model.update_topics(docs, topics=new_topics)

# Refresh the stored topic sizes from the new assignments.
# NOTE(review): this calls a private BERTopic method; kept for compatibility
# with releases where `update_topics` does not refresh the sizes itself.
documents = pd.DataFrame({"Documents": docs, "Topic": new_topics})
topic_model._update_topic_size(documents)

In [None]:
# Summary table of the topics after outlier reduction.
topic_overview = topic_model.get_topic_info()
topic_overview

# Final update to topics with vectorizer

In [None]:
# Recompute the topic representations with the notebook's CountVectorizer.
topic_model.update_topics(
    docs,
    vectorizer_model=vectorizer_model,
)

In [None]:
# Summary table of the topics after the final representation update.
topic_overview = topic_model.get_topic_info()
topic_overview

# Visualize final results

In [None]:
# Recompute and print the topic hierarchy for the final model.
hierarchical_topics = topic_model.hierarchical_topics(docs)
topic_tree = topic_model.get_topic_tree(hierarchical_topics)
print(topic_tree)

# Inter-topic distance map.
fig = topic_model.visualize_topics()
display(fig)

# Bar charts of the top 10 words for up to 17 topics.
terms = topic_model.visualize_barchart(
    top_n_topics=17,
    n_words=10,
    height=300,
)
display(terms)

# Hierarchy plot; as the cell's last expression it is rendered as the output.
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

## Change topic names

In [None]:
# Human-readable names for topics 0-11; list position is the topic id.
custom_topic_names = [
    "AI & Big Tech",
    "Ask ChatGPT to Generate Text",
    "GPT Language Models",
    "Coding with ChatGPT",
    "Twitter & Mastodon",
    "AI-Related Media Content",
    "Alternative Access to ChatGPT",
    "ChatGPT Answers' Quality",
    "AI & Law & Trial",
    "Political Topics",
    "Data Privacy Issues",
    "Paid GPT Service",
]

# Map each topic id to its name and register the labels on the model.
topic_model.set_topic_labels(dict(enumerate(custom_topic_names)))

In [None]:
# Re-draw the final figures with the custom topic labels.

# Inter-topic distance map.
fig = topic_model.visualize_topics(custom_labels=True)
display(fig)

# Bar charts of the top 10 words for up to 17 topics.
terms = topic_model.visualize_barchart(
    top_n_topics=17, n_words=10, height=280, width=290, custom_labels=True
)
display(terms)

# Only the last expression of a cell is rendered automatically, so the
# hierarchy figure has to be displayed explicitly; previously it was built
# and then discarded.
display(
    topic_model.visualize_hierarchy(
        hierarchical_topics=hierarchical_topics, custom_labels=True
    )
)

# Topic similarity heatmap (rendered as the cell output).
topic_model.visualize_heatmap(custom_labels=True, width=800, height=650)

In [None]:
# Document scatter plot with the custom topic labels.
# `custom_labels` is a boolean flag: use the built-in `True`. The previous
# `pickle.TRUE` is a pickle opcode constant, not a boolean.
topic_model.visualize_documents(docs, custom_labels=True)

In [None]:
# Pair every document with its assigned topic. The loop previously read these
# columns from `df`, which at this point holds the output of
# `visualize_approximate_distribution` and has no `topic`/`document` columns.
doc_topics = pd.DataFrame(
    {"document": docs.to_numpy(), "topic": topic_model.topics_}
)

# Every topic id currently assigned, excluding the outlier id -1.
topic_ids = sorted(set(doc_topics["topic"]) - {-1})

for topic in topic_ids:
    topic_info = topic_model.get_topic(topic)
    # First 12 documents assigned to this topic, in dataset order.
    representative_docs = (
        doc_topics.loc[doc_topics["topic"] == topic, "document"].head(12).tolist()
    )

    print("Topic: ", topic)
    print("Topic Information: ")
    print(topic_info)
    print("Representative Documents: ")
    print(representative_docs)
    print("\n")

In [None]:
# Export the inter-topic distance map (held in `fig`) as a standalone HTML file.
inter_map_path = "./data/12_topic_inter_map.html"
fig.write_html(inter_map_path)

In [None]:
# Embed the documents with the same sentence model the topic model was built
# with; `embeddings` was not defined anywhere earlier in the notebook.
embeddings = sentence_model.encode(docs.tolist(), show_progress_bar=False)

# Reduce the embeddings once up front so the plot can be re-drawn quickly.
#   - n_components=2: the hierarchical documents plot is two-dimensional.
#   - min_dist=0.0: UMAP rejects a min_dist larger than `spread` (default 1.0),
#     so the previous value of 2.0 could not be fitted.
#   - random_state=42: same seed as the model's UMAP, for a repeatable layout.
reduced_embeddings = UMAP(
    n_neighbors=50, n_components=2, min_dist=0.0, metric="cosine", random_state=42
).fit_transform(embeddings)

topic_model.visualize_hierarchical_documents(
    docs, hierarchical_topics, reduced_embeddings=reduced_embeddings, custom_labels=True
)

# Save model and results

In [None]:
# Final topic summary table, shown before it is written to disk.
topic_overview = topic_model.get_topic_info()
topic_overview

In [None]:
# Write the topic summary table to CSV.
topic_info = topic_model.get_topic_info()
topic_info.to_csv("./data/topic_info.csv")

# Write the per-document topic assignment, keyed by the dataset's `id` column.
topic_assignment_columns = {
    "id": mastodon_data["id"],
    "topic_id": topic_model.topics_,
}
topic_results = pd.DataFrame(topic_assignment_columns)
topic_results.to_csv("./data/topic_results.csv")