In [None]:
import numpy as np
from bertopic import BERTopic
from umap import UMAP
from sklearn.feature_extraction.text import CountVectorizer

# Load the previously saved BERTopic model.
topic_model = BERTopic.load("./0_topic_model")

cut_word_file_path = '../数据文件/切词数据/切词-论文数据.txt'
emb_file_path = '../数据文件/词向量/论文-acge_text_embedding.npy'

# Read the second-stage stop words inside a context manager so the file handle
# is always closed; empty entries (blank lines, trailing newline) are dropped
# because an empty string can never match a token.
with open('../stopwords/second_stopwords.txt', 'r', encoding='utf-8') as stopwords_file:
  second_stopwords_list = [word for word in stopwords_file.read().split('\n') if word]

# One document per line of the tokenized corpus file.
with open(cut_word_file_path, 'r', encoding='utf-8') as file:
  docs = file.readlines()
embeddings = np.load(emb_file_path)

# Fail early when the corpus and the precomputed embeddings are out of sync;
# otherwise the topic assignments below would not line up with `docs`.
if len(docs) != embeddings.shape[0]:
  raise ValueError(
    f"Document/embedding count mismatch: {len(docs)} documents vs "
    f"{embeddings.shape[0]} embedding rows"
  )

# Vectorizer reused by the (optional) update_topics() cells further down.
vectorizer_model = CountVectorizer(stop_words=second_stopwords_list)

# topics: the cluster (topic) assigned to each document;
# probs: the probability of each document for each topic.
topics, probs = topic_model.transform(docs, embeddings)

## 减少主题数

In [None]:
# # Reduce the number of topics; same purpose as the nr_topics parameter of BERTopic()
# # This method overwrites the existing topic_model

# topic_model.reduce_topics(docs, nr_topics=10)
# topic_model.get_topic_info()

## 减少离群值 方法1：用probs概率减少离群值

In [None]:
# # After reduce_outliers(), merging or reducing topics is not recommended; it may raise errors

# new_topics = topic_model.reduce_outliers(
#     docs, 
#     topics, 
#     strategy="probabilities", 
#     probabilities=probs, 
#     threshold=0.9                     # threshold for deciding outliers; below it a document counts as an outlier
#     )

# # Show the number of remaining outliers and the updated topic of every document
# print(new_topics.count(-1), new_topics)

In [None]:
# # Apply the update
# # update_topics() overwrites the existing topic_model

# topic_model.update_topics(docs, topics=new_topics, vectorizer_model=vectorizer_model)
# topic_info = topic_model.get_topic_info()
# topic_info

## 减少离群值 方法2：使用embedding减少离群值

In [None]:
# # After reduce_outliers(), merging or reducing topics is not recommended; it may raise errors

# new_topics = topic_model.reduce_outliers(
#   docs,
#   topics,
#   strategy="embeddings",
#   embeddings=embeddings,              # use the precomputed embeddings
#   threshold=0.5                      # the threshold can be tuned
# )

# # Show the number of remaining outliers and the updated topic of every document
# print(new_topics.count(-1), new_topics)

In [None]:
# # Apply the update
# # update_topics() overwrites the existing topic_model

# topic_model.update_topics(docs, topics=new_topics, vectorizer_model=vectorizer_model)
# topic_info = topic_model.get_topic_info()
# topic_info

# 总结：减少离群值的主要策略
1. HDBSCAN的min_samples；
2. UMAP的min_dist；
3. reduce_outliers()；
4. HDBSCAN的min_cluster_size；
5. UMAP的random_state；
6. 清理数据

# 主题可视化

## 条形图

In [None]:
# # Labels can be customized; see 21_vis_terms.ipynb for details

# # Arguments:
# #     topics: A selection of topics to visualize.
# #     top_n_topics: Only select the top n most frequent topics.
# #     n_words: Number of words to show in a topic
# #     custom_labels: Whether to use custom topic labels that were defined using
# #                `topic_model.set_topic_labels`.
# #     title: Title of the plot.
# #     width: The width of each figure.
# #     height: The height of each figure.

# topic_model.visualize_barchart()

## 主题分布图

In [None]:
# # Arguments:
# #           topics: A selection of topics to visualize
# #           top_n_topics: Only select the top n most frequent topics
# #           custom_labels: Whether to use custom topic labels that were defined using 
# #                      `topic_model.set_topic_labels`.
# #           title: Title of the plot.
# #           width: The width of the figure.
# #           height: The height of the figure.

# topic_model.visualize_topics()

# # An error is raised when there are too few topics


## 散点图

In [None]:
# # Labels can be customized; see 22_vis_umap.ipynb for details
# reduced_embeddings = UMAP(n_neighbors=15, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)
# topic_model.visualize_documents(
#     docs, reduced_embeddings=reduced_embeddings, hide_document_hover=True, hide_annotations=True
#     )

## 层次聚类图

In [None]:
# # Compute the topic hierarchy from docs, then plot it
# hierarchical_topics = topic_model.hierarchical_topics(docs)
# topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

In [None]:
# # Merging topics with the hierarchical clustering plot as a reference gives better results

# topic_model.merge_topics(docs, 

# # Edit this 2-D list according to the hierarchical clustering result
# # NOTE(review): the placeholder below contains an empty group and topic ids repeated
# # across groups — replace it with real, non-overlapping groups before running
# [[5,2,11,21,0,1,10,4,25,20],[],[5,13,3],[16,18,21,10,11,15,25,4,0,1,2]]
# )

# topic_info_new = topic_model.get_topic_info()
# topic_info_new