In [16]:
import re
import pandas as pd
import numpy as np

from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from umap import UMAP
from bertopic.representation import MaximalMarginalRelevance

import pathlib

import plotly

In [17]:
# Prepare data
data_set_path = pathlib.Path("dataset/fsas/fsas_full_utf8_dated.csv")
trump = pd.read_csv(str(data_set_path))
trump.abstract = trump.apply(lambda row: re.sub(r"http\S+", "", row.abstract).lower(), 1)
trump.abstract = trump.apply(lambda row: " ".join(filter(lambda x: x[0] != "@", row.abstract.split())), 1)
trump.abstract = trump.apply(lambda row: " ".join(re.sub("[^a-zA-Z]+", " ", row.abstract).split()), 1)
# trump = trump.loc[(trump.isRetweet == "f") & (trump.abstract != ""), :]
timestamps = trump.date.to_list()
abstracts = trump.abstract.to_list()

data = abstracts

arr_multi = np.array(abstracts)
print(arr_multi)

['a concept of multi valued cognitive maps is introduced in this paper the concept expands the fuzzy one however all variables and weights are not linearly ordered in the concept but are only partially ordered such an approach allows us to operate in cognitive maps with partially ordered linguistic variables directly without vague fuzzification defuzzification methods hence we may consider more subtle differences in degrees of experts uncertainty than in the fuzzy case we prove the convergence of such cognitive maps and give two simple computational examples which demonstrate using such a partially ordered uncertainty degree scale compared to the fuzzy case'
 'in this paper we investigate the fixed time synchronization for fuzzy inertial neural networks with time varying coefficients and time delays the fuzzy inertial neural networks are transformed into two forms of first order differential systems and then two kinds of different controllers of time variable are designed in these sche

In [18]:
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer
from hdbscan import HDBSCAN

# Step 1 - Extract embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Step 2 - Reduce dimensionality
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=0
                  # random_state=42,
                  )

# Step 3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom', prediction_data=True,
                        min_samples=5)

# Step 4 - Tokenize topics
vectorizer_model = CountVectorizer(stop_words="english")

# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer()

# Step 6 - (Optional) Fine-tune topic representations with
# a `bertopic.representation` model
representation_model = KeyBERTInspired()

topic_model = BERTopic(
    embedding_model=embedding_model,  # Step 1 - Extract embeddings
    umap_model=umap_model,  # Step 2 - Reduce dimensionality
    hdbscan_model=hdbscan_model,  # Step 3 - Cluster reduced embeddings
    vectorizer_model=vectorizer_model,  # Step 4 - Tokenize topics
    ctfidf_model=ctfidf_model,  # Step 5 - Extract topic words
    representation_model=representation_model,  # Step 6 - (Optional) Fine-tune topic represenations
    min_topic_size=7  # Create more topics
)

topics, probs = topic_model.fit_transform(data)

# Use the "c-TF-IDF" strategy with a threshold
new_topics = topic_model.reduce_outliers(abstracts, topics, strategy="c-tf-idf", threshold=0.1)

# Reduce all outliers that are left with the "distributions" strategy
new_topics = topic_model.reduce_outliers(abstracts, new_topics, strategy="distributions")

topic_model.update_topics(abstracts, topics=new_topics)

KeyboardInterrupt: 

In [None]:
topic_model.update_topics(data, topics=new_topics)

In [None]:
topic_model.get_topic_info()

In [None]:
doc_img = topic_model.visualize_documents(abstracts, hide_annotations=True, width=910,
                                          height=585)
doc_img.show()

In [None]:
# sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
# embeddings = sentence_model.encode(abstracts, show_progress_bar=True)
# # We reduce our embeddings to 2D as it will allows us to quickly iterate later on
# reduced_embeddings = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine',
#                           # random_state=42,
#                           ).fit_transform(embeddings)
# doc_plt = topic_model.visualize_documents(abstracts, reduced_embeddings=reduced_embeddings,
#                                           hide_document_hover=True, hide_annotations=True)
# doc_plt.show()

In [None]:
distance_map = topic_model.visualize_topics()
distance_map.show()

In [None]:
# DTM
nr_bins = 30
topics_over_time = topic_model.topics_over_time(abstracts, timestamps, nr_bins=nr_bins)

In [None]:
# DTM可視化
import datetime

top_n_topics = 7

from_n = 0

img1 = topic_model.visualize_topics_over_time(topics_over_time, topics=[n + from_n for n in range(top_n_topics)],
                                              width=900, height=450)
img1.show()

now = datetime.datetime.now()

# file_png_name = "plt/plt_dtm_seed_" + str(random_state) + "_b_" + str(nr_bins) + "_t_" + str(
#     top_n_topics) + "_" + now.strftime(
#     '%Y-%m-%d%H-%M-%S') + ".png"
# file_html_name = "plt/plt_dtm_seed_" + str(random_state) + "_b_" + str(nr_bins) + "_t_" + str(
#     top_n_topics) + "_" + now.strftime(
#     '%Y-%m-%d%H-%M-%S') + ".html"
#
# path_dtm_png = pathlib.Path(file_png_name)
# path_dtm_html = pathlib.Path(file_html_name)
#
# img1.write_image(str(path_dtm_png))
# img1.write_html(str(path_dtm_html))