In [1]:
import re
from pathlib import Path

import numpy as np
import pandas as pd

# Prepare data
# NOTE(review): the frame is named `trump` although it is loaded from
# fsas_full_utf8_dated.csv and cleaned via its `abstract` column; the name is
# kept so that any other cell referring to it keeps working.
trump = pd.read_csv('dataset/fsas/fsas_full_utf8_dated.csv')

# Normalise the text with vectorised string operations instead of row-wise
# `apply`. Missing abstracts are turned into "" first, so the regex steps never
# receive a NaN float (which would raise TypeError).
trump["abstract"] = (
    trump["abstract"]
    .fillna("")
    .astype(str)
    .str.replace(r"http\S+", "", regex=True)       # strip URLs
    .str.lower()
    .str.replace(r"(?<!\S)@\S*", " ", regex=True)  # drop whitespace-delimited tokens starting with "@"
    .str.replace(r"[^a-zA-Z]+", " ", regex=True)   # keep letters only; any other run becomes one space
    .str.strip()                                   # only a single leading/trailing space can remain
)

# Rows whose abstract is empty after cleaning carry no text to model. Drop them
# before deriving the two parallel lists so `timestamps` and `abstracts` stay
# aligned index-for-index.
trump = trump.loc[trump["abstract"] != "", :].reset_index(drop=True)
timestamps = trump.date.to_list()
abstracts = trump.abstract.to_list()

arr_multi = np.array(abstracts)
# Preview a few cleaned abstracts rather than printing the whole corpus.
print(arr_multi[:3])

['a concept of multi valued cognitive maps is introduced in this paper the concept expands the fuzzy one however all variables and weights are not linearly ordered in the concept but are only partially ordered such an approach allows us to operate in cognitive maps with partially ordered linguistic variables directly without vague fuzzification defuzzification methods hence we may consider more subtle differences in degrees of experts uncertainty than in the fuzzy case we prove the convergence of such cognitive maps and give two simple computational examples which demonstrate using such a partially ordered uncertainty degree scale compared to the fuzzy case'
 'in this paper we investigate the fixed time synchronization for fuzzy inertial neural networks with time varying coefficients and time delays the fuzzy inertial neural networks are transformed into two forms of first order differential systems and then two kinds of different controllers of time variable are designed in these sche

In [4]:
import os

# CUDA_VISIBLE_DEVICES is read when the CUDA driver is initialised, so it has to
# be set before the GPU libraries below are imported; setting it afterwards may
# have no effect.
os.environ['CUDA_VISIBLE_DEVICES'] = "0"

from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from cuml.cluster import HDBSCAN
from cuml.manifold import UMAP

# NOTE: this cell uses the cuML (GPU) implementations of UMAP and HDBSCAN. The
# output recorded for this cell was a CudaSupportError (CUDA_ERROR_NO_DEVICE),
# so it needs to run on a machine with a visible CUDA device.

# Fixed seed so the stochastic UMAP projection (and therefore the topics) can be
# reproduced across runs.
SEED = 42

# Name of the sentence-transformers model. It is passed to BERTopic explicitly
# and stored as the pointer in the saved model, so both always refer to the
# same embedding model.
embedding_model = "sentence-transformers/all-MiniLM-L6-v2"

# Create instances of GPU-accelerated UMAP and HDBSCAN
umap_model = UMAP(n_components=5, n_neighbors=15, min_dist=0.0, random_state=SEED)
hdbscan_model = HDBSCAN(min_samples=10, gen_min_span_tree=True, prediction_data=True)

# Remove English stop words from the topic representations.
vectorizer_model = CountVectorizer(stop_words="english")

topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    nr_topics="auto",
    verbose=True,
)
topics, probs = topic_model.fit_transform(abstracts)

# Persist the fitted model (including c-TF-IDF) to a directory.
topic_model.save("./model_dir/", serialization="pytorch", save_ctfidf=True, save_embedding_model=embedding_model)

CudaSupportError: Error at driver init: 
Call to cuInit results in CUDA_ERROR_NO_DEVICE (100):

In [None]:
# Load from directory
# Use the same relative, forward-slash path that `topic_model.save(...)` writes
# to ("./model_dir/"). A backslash path such as "\\model_dir\\" points at the
# drive root on Windows and is not a directory path at all on POSIX systems.
loaded_model = BERTopic.load("./model_dir/")

# Load from file
# loaded_model = BERTopic.load("my_model")

In [None]:
# Number of time bins; passed to `topics_over_time` and embedded in the name of
# the exported HTML plot.
nr_bins = 30

In [None]:

# Compute the topic representations over time, grouping the document
# timestamps into `nr_bins` bins.
topics_over_time = topic_model.topics_over_time(
    abstracts,
    timestamps,
    nr_bins=nr_bins,
)

In [None]:
# Show only the top-n topics
top_n_topics = 7
img = topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=top_n_topics)
img.show()

# Build the output path with pathlib so it works on every OS (a hard-coded
# "plt\\..." string is only a directory path on Windows), and create the
# directory first so writing the file cannot fail because it is missing.
output_dir = Path("plt")
output_dir.mkdir(parents=True, exist_ok=True)
file_name = str(output_dir / f"plt_b_{nr_bins}_t_{top_n_topics}.html")
img.write_html(file_name)

In [None]:
# Display the topic overview of the fitted model (bare last expression so the
# notebook renders the returned object).
topic_model.get_topic_info()

In [None]:
# Build the topic visualisation figure, then render it.
intertopic_fig = topic_model.visualize_topics()
intertopic_fig.show()

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

def create_wordcloud(model, topic: int) -> None:
    """Draw a word cloud of the terms that represent one topic.

    Args:
        model: Fitted topic model exposing ``get_topic(topic)``, which is
            expected to yield ``(word, weight)`` pairs for a known topic.
        topic: Identifier of the topic to draw.

    Raises:
        ValueError: If the model has no such topic, or the topic has no
            non-empty words to plot.
    """
    terms = model.get_topic(topic)
    # BERTopic's `get_topic` returns False (not an empty list) for an unknown
    # topic id; fail with a clear message instead of a TypeError while iterating.
    if not terms:
        raise ValueError(f"Topic {topic!r} was not found in the model")

    # Skip blank placeholder words so they are not rendered in the cloud.
    frequencies = {word: weight for word, weight in terms if word}
    if not frequencies:
        raise ValueError(f"Topic {topic!r} has no words to plot")

    wc = WordCloud(background_color="white", max_words=1000)
    wc.generate_from_frequencies(frequencies)
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.show()

# Show the word cloud for topic 2 of the fitted model
create_wordcloud(topic_model, topic=2)