In [1]:
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertModel
from tqdm import tqdm
import numpy as np
from bertopic import BERTopic
from umap import UMAP
from transformers.pipelines import pipeline
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from transformers import logging

import re
import pandas as pd
from itertools import combinations
from collections import defaultdict, Counter
import math
import networkx as nx
# import pandas as pd
import jieba

# 数据清洗
# 1. 读取CSV文件  
df = pd.read_csv(r"C:\Users\zpzxm\OneDrive - HKUST (Guangzhou)\Desktop\科研论文\高速塌方事故对比研究\陕西高速正向.csv")  

# 第一列是'content'  
articles_tokens = df.iloc[:, 0].tolist()

# 分词步骤
articles_tokens = [" ".join(jieba.cut(article)) for article in articles_tokens]

# 读取同义词表并构建同义词替换字典
synonym_dict = {}
with open("data/synonyms.txt", "r", encoding="utf-8") as f:
    for line in f:
        synonyms = line.strip().split(",")
        for word in synonyms[1:]:
            synonym_dict[word] = synonyms[0]

# 替换同义词
for i in range(len(articles_tokens)):
    articles_tokens[i] = " ".join([synonym_dict.get(word, word) for word in articles_tokens[i].split()])

logging.set_verbosity_error()
# 转化成列表
docs_pre = [] 
for article in articles_tokens:
    lines = article.split()
    chinese_words = [line for line in lines if line.strip()]
    docs_pre.append(chinese_words)

# 去除停用词
stpwrd = open("data/stopwords-simple.txt", "r", encoding='utf-8').read()
stop_words = stpwrd.split("\n")
sentences = []
for words in docs_pre:
    doc = [token for token in words if token not in stop_words]
    docs_merge = " ".join(doc)
    sentences.append(docs_merge)

  from .autonotebook import tqdm as notebook_tqdm
Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\zpzxm\AppData\Local\Temp\jieba.cache
Loading model cost 0.293 seconds.
Prefix dict has been built successfully.


In [2]:
#加载预训练模型
model_name = "hfl/chinese-bert-wwm-ext"
model = BertModel.from_pretrained(model_name)

# 使用Tokenizer，就是为了将输入的句子加工为bert模型可以处理的格式
tokenizer = BertTokenizer.from_pretrained(model_name)
# 将模型放置在GPU上
device = torch.device("cuda")
# 把模型放到cpu或gpu
model.to(device)
# 将模型设置为评估模式，https://blog.csdn.net/weixin_45275599/article/details/131524189
model.eval()

# 切分数据
batch_size = 1  # 批大小
data_loader = DataLoader(sentences, batch_size=batch_size)
for batch in data_loader:
    print(len(batch), batch)

# 生成向量
cls_embeddings = []
for batch_sentences in tqdm(data_loader):# 使用tqdm显示处理进度
    inputs = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors="pt", max_length=512)
    inputs.to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    cls_embeddings.append(outputs.last_hidden_state[:, 0].cpu().numpy()) # 只取CLS对应的向量
    # print('numpy格式', type(outputs.last_hidden_state[:, 0].cpu().numpy()), outputs.last_hidden_state[:, 0].cpu().numpy().shape)
# 合并文档向量
print('batch数：', len(cls_embeddings))
cls_embeddings_np = np.vstack(cls_embeddings)
print('文档向量', type(cls_embeddings_np), cls_embeddings_np.shape)

# 保存文档向量到npy文件
output_file = "data/embedding.npy"
np.save(output_file, cls_embeddings_np)
print("词向量存储于: ", output_file)
embeddings = np.load(output_file)
print("加载回来，验证一下：", type(embeddings), embeddings.shape)

#导入向量
embeddings = np.load('data/embedding.npy')
print(type(embeddings), embeddings.shape)

1 ['刚刷 到 ， 清香 三烛 ， 愿 顺风 顺水 顺 财神 感谢 感谢 感谢 感谢 感谢 感谢 感谢 感谢 感谢']
1 ['一路 走 好 感谢 感谢 感谢 的 心酸 感谢 感谢 感谢']
1 ['清香 三柱 感谢 感谢 感谢 感谢 感谢 感谢 感谢 感谢 感谢 感谢 感谢 感谢']
1 ['感谢 感谢 感谢 感谢 顺']
1 ['愿 国泰民安 ！ 逝者 安息 ！ 感谢 感谢 感谢']
1 ['清香 三柱 ！ 逝者 安息 感谢 感谢 感谢']
1 ['感谢 感谢 感谢 感谢 感谢 可怜 的 孩子 感谢 感谢 感谢 感谢']
1 ['意外 永远 不 知道 那天 会到']
1 ['太 可惜 了 … … 一路 走 好 感谢 感谢 感谢']
1 ['刷到 视频 的 人 ， 全家 平安 健康 感谢 感谢 感谢 感谢 感谢 感谢']
1 ['愿 来世 没有 灾难 感谢 感谢 感谢']
1 ['清香 三注 ， 一路 走 好 感谢 感谢 感谢 感谢 感谢']
1 ['大 早上 刷到 上柱 香 感谢 感谢 感谢 感谢 感谢 感谢']
1 ['一路 走 好 ， 见者 全家 平安 健康 感谢 感谢 感谢 感谢 感谢 感谢 感谢 感谢']
1 ['路过 ， 大 早上 刷到 ， 希望 他们 在 另外 一个 世界 还是 一样 相亲相爱 一家人 感谢 感谢 感谢 感谢 感谢 感谢']
1 ['上 香 三柱 感谢 感谢 感谢 可怜 人']
1 ['愿 在 天堂 一切 安好 感谢 感谢 感谢']
1 ['愿 在 天堂 一切 安好 感谢 感谢 感谢']
1 ['感谢 感谢 感谢 感谢 感谢 感谢 感谢 比心 比心']
1 ['心痛 ， 一路 走 好']
1 ['心痛 ， 一路 走 好']
1 ['无意间 刷到 平安 果 平安 果 平安 果 愿世者 一路 走 好 ， 生 者 ， 顺风 顺水 顺顺利利 ， 吉祥如意 感谢 感谢 感谢 感谢 感谢 感谢 感谢 感谢 感谢 感谢 感谢 感谢']
1 ['三桂 清香 ， 一路 走 好 感谢 感谢 感谢 感谢 感谢 感谢 见者 全家 平安 健康 感谢 感谢 感谢 感谢 感谢 感谢']
1 ['上 香 三柱']
1 ['上 香 三柱']
1 ['比心 比心 比心 感谢 感谢 感谢']
1 ['比心 比心 比心 感谢 感谢 感谢']
1 ['无意 刷到 大吉大利 

100%|██████████| 6510/6510 [00:49<00:00, 132.23it/s]

batch数： 6510
文档向量 <class 'numpy.ndarray'> (6510, 768)
词向量存储于:  data/embedding.npy
加载回来，验证一下： <class 'numpy.ndarray'> (6510, 768)
<class 'numpy.ndarray'> (6510, 768)





In [3]:
# 把模型放到cpu或gpu
device = torch.device("cuda")
model.to('cuda')  # 将模型加载到GPU
# 将模型设置为评估模式，https://blog.csdn.net/weixin_45275599/article/details/131524189
model.eval()

# 1. 词向量模型，同时加载本地训练好的词向量
embedding_model = pipeline(
  "feature-extraction",
  #model="JIHUAI/bert-ancient-chinese",device=0
  model="hfl/chinese-bert-wwm-ext",device=0
)

# 2. 创建UMAP降维模型
umap_model = UMAP(
  n_neighbors= 15,
  n_components=5,
  min_dist=0.0,
  metric='cosine',
  random_state=64
)
reduced_embeddings = umap_model.fit_transform(embeddings)


# 3. 创建HDBSCAN聚类模型
# 如果要建设离群值，可以减小下面两个参数min_cluster_size min_samples
# https://hdbscan.readthedocs.io/en/latest/faq.html
hdbscan_model = HDBSCAN(
  min_cluster_size=4,
  min_samples=5,
  metric='euclidean'
)


# 4. 创建CountVectorizer模型
stpwrd=open(r"C:\Users\zpzxm\OneDrive - HKUST (Guangzhou)\Desktop\语义网络\stop_dic\stopwords.txt","r",encoding='utf-8').read()
stop_words= stpwrd.split("\n")
vectorizer_model =  TfidfVectorizer(token_pattern=r"(?u)\b\w+\b",max_df=0.95, min_df=2,stop_words=stop_words)

from bertopic.vectorizers import ClassTfidfTransformer
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)


# 5. 选择主题表示
from bertopic.representation import MaximalMarginalRelevance

main_representation = MaximalMarginalRelevance(diversity=.5, top_n_words=30)
representation_model = {"Main": main_representation}

topic_model = BERTopic(
  embedding_model=embedding_model,
  vectorizer_model=vectorizer_model,
  ctfidf_model=ctfidf_model,
  top_n_words=10,
  umap_model=umap_model,
  hdbscan_model=hdbscan_model,
  representation_model=representation_model,
  min_topic_size=10, 
  verbose=True,
)

    # 訓練並產生資料
topic_model.fit_transform(sentences, embeddings=embeddings)
# topics, probs = topic_model.fit_transform(ws)
    # 產生資料時間資料
    # topics_over_time = topic_model.topics_over_time(ws, timestamps, nr_bins=20)
    # topics_over_time = topic_model.topics_over_time(ws, nr_bins=20)
    # 各 Topic TF-IDF 關鍵字直方圖


2024-08-25 15:55:31,540 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-08-25 15:55:43,970 - BERTopic - Dimensionality - Completed ✓
2024-08-25 15:55:43,971 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-08-25 15:55:44,063 - BERTopic - Cluster - Completed ✓
2024-08-25 15:55:44,065 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-08-25 15:56:47,935 - BERTopic - Representation - Completed ✓


([-1,
  41,
  92,
  -1,
  -1,
  -1,
  41,
  -1,
  1,
  125,
  28,
  238,
  182,
  107,
  133,
  63,
  28,
  28,
  202,
  73,
  73,
  99,
  -1,
  137,
  137,
  -1,
  -1,
  182,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  7,
  36,
  71,
  141,
  36,
  36,
  37,
  132,
  174,
  114,
  7,
  26,
  39,
  39,
  166,
  71,
  -1,
  -1,
  213,
  36,
  1,
  -1,
  40,
  -1,
  38,
  143,
  246,
  -1,
  0,
  -1,
  0,
  65,
  121,
  180,
  47,
  47,
  47,
  47,
  47,
  47,
  47,
  47,
  47,
  47,
  47,
  47,
  47,
  47,
  47,
  47,
  47,
  47,
  47,
  47,
  47,
  47,
  47,
  -1,
  220,

In [4]:
top_n_topics=5
bar_fig = topic_model.visualize_barchart(
        n_words=10,
        top_n_topics=top_n_topics,
        title= '',
        width=600, 
        height=800
    )

# topic_info = topic_model.get_topic_info()

#保存聚类结果
# topic_docs = topic_model.get_document_info(sentences)
# topic_docs.to_csv('data/聚类结果n=8结果.csv')

# 设置字体为Times New Roman
bar_fig.update_layout(
    font=dict(
        family="Times New Roman",
        size=30,  # 可以根据需要调整字体大小
        color="Black"  # 可以根据需要调整字体颜色
    ),
    margin=dict(l=300, r=0, t=200, b=200),  # 设置边距为0
    autosize=True,  # 自动调整大小
    xaxis=dict(
        automargin=True,
        title=dict(
            standoff=20  # 调整标题与轴之间的距离
        )
    ),
    yaxis=dict(
        automargin=True,
        title=dict(
            standoff=20  # 调整标题与轴之间的距离
        )
    ),
    plot_bgcolor='rgba(0,0,0,0)',  # 设置背景颜色为透明
    paper_bgcolor='rgba(0,0,0,0)'  # 设置画布背景颜色为透明
)

for i, annotation in enumerate(bar_fig.layout.annotations):
    annotation.text = f"Topic {i + 1}"

html_file_path = "result/陕西高速正向bar_fig.html"
bar_fig.write_html(html_file_path)

js_component = """
<script src="https://res.zvo.cn/translate/translate.js"></script>
<script>
// 设置本地语种（当前网页的语种）。如果不设置，默认自动识别当前网页显示文字的语种。 可填写如 'english'、'chinese_simplified' 等，具体参见文档下方关于此的说明。
translate.language.setLocal('chinese_simplified');

// 新增功能：将所有文本字段转换为英文小写，并以两个空格结束
function convertTextFields() {
    var elements = document.querySelectorAll('body *');
    elements.forEach(function(element) {
        if (element.childNodes.length === 1 && element.childNodes[0].nodeType === Node.TEXT_NODE) {
            // 去掉末尾的空格
            element.textContent = element.textContent.trimEnd().toLowerCase() + '  ';
        }
    });
}

// 等待页面加载完成后执行翻译
document.addEventListener('DOMContentLoaded', function() {
    translate.execute();
});
</script>

"""


# 打开生成的 HTML 文件并读取内容
with open(html_file_path, "r", encoding="utf-8") as file:
    html_content = file.read()

# 添加 CSS 样式以设置所有元素的字体大小为 20px
style_tag = """
<style>
* {
    font-size: 30px !important;
}
</style>
"""

# 将 CSS 样式添加到 <head> 标签中
html_content = re.sub(r'(<head[^>]*>)', r'\1' + style_tag, html_content)


# 在 HTML 文件的尾部添加 JavaScript 组件和控件
control_panel = """
<div id="controlPanel">
    <label>
        <input type="checkbox" id="convertCheckbox"> 转换文本字段
    </label>
    <button onclick="convertTextFields()">转换</button>
</div>
"""

html_content += control_panel
html_content += js_component

# 保存修改后的 HTML 文件
with open(html_file_path, "w", encoding="utf-8") as file:
    file.write(html_content)

    
#     # 各 Topic 間距離圖
# topic_fig = topic_model.visualize_topics(
#         top_n_topics=top_n_topics,
#         width=1000,
#     )
# topic_fig.write_html("result/topic_fig.html")

#     # 各 Topic 時間序列圖
#     # tot_fig = topic_model.visualize_topics_over_time(
#     #     topics_over_time, top_n_topics=top_n_topics, width=1000
#     # )

# topic_similar_heatmap = topic_model.visualize_heatmap(n_clusters=11)
# topic_similar_heatmap.write_html('result/topic_similar_heatmap.html')
# topic_similar_heatmap

# topic_model_hierarchy = topic_model.visualize_hierarchy()
# topic_model_hierarchy.write_html('result/topic_model_hierarchy.html')
# topic_model_hierarchy

#     # 儲存成 html 檔案，供前端展示使用
    
#     # topic_model_distribution=topic_model.visualize_distribution(top_n_topics=20, width=1000, height=1000, custom_labels=True, hide_annotations=True)
#     # topic_model.distribution.write_html(topic_distribution.html")

#     # tot_fig.write_html(tot_fig.html")

# term_score_decline = topic_model.visualize_term_rank()
# term_score_decline.write_html('result/term_score_decline.html')
# term_score_decline

print("Task Done, Please check the export folder for the results.")

Task Done, Please check the export folder for the results.


In [None]:
topic_info = topic_model.get_topic_info()
print(topic_info)

In [None]:
hierarchical_topics = topic_model.hierarchical_topics(sentences)
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

In [None]:

# 训练模型
topic_model.fit_transform(sentences, embeddings=embeddings)


In [None]:

# 获取聚类结果
topic_info = topic_model.get_topic_info()

#保存聚类结果
topic_docs = topic_model.get_document_info(sentences)
topic_docs.to_csv('data/聚类结果n=6.csv')

# ⭐ UMAP可视化
reduced_embeddings = UMAP(n_neighbors=15, n_components=2, min_dist=0.0, metric='cosine',random_state=64).fit_transform(embeddings)
topic_model.visualize_documents(sentences,  reduced_embeddings=reduced_embeddings)

In [None]:
topic_model.visualize_barchart(top_n_topics=6, n_words=10,width=400, height=600)
bar_fig.write_html("result/bar-file.html")
# bar_fig.write_html(r"C:\Users\zpzxm\OneDrive - HKUST (Guangzhou)\Desktop\科研论文\高速塌方事故对比研究\bar_fig.html")

In [None]:
# 把模型放到cpu或gpu
model.to('cuda')  # 将模型加载到GPU
# 将模型设置为评估模式，https://blog.csdn.net/weixin_45275599/article/details/131524189
model.eval()

# 1. 词向量模型，同时加载本地训练好的词向量
embedding_model = pipeline(
  "feature-extraction",
  #model="JIHUAI/bert-ancient-chinese",device=0
  model="SIKU-BERT/sikubert",device=0
)

# 2. 创建UMAP降维模型
umap_model = UMAP(
  n_neighbors= 15,
  n_components=5,
  min_dist=0.0,
  metric='cosine',
  random_state=64
)
reduced_embeddings = umap_model.fit_transform(embeddings)


# 3. 创建HDBSCAN聚类模型
# 如果要建设离群值，可以减小下面两个参数min_cluster_size min_samples
# https://hdbscan.readthedocs.io/en/latest/faq.html
hdbscan_model = HDBSCAN(
  min_cluster_size=4,
  min_samples=4,
  metric='euclidean'
)


# 4. 创建CountVectorizer模型
stpwrd=open("data/stopwords.txt","r",encoding='utf-8').read()
stop_words= stpwrd.split("\n")
vectorizer_model =  TfidfVectorizer(token_pattern=r"(?u)\b\w+\b",max_df=0.95, min_df=2,stop_words=stop_words)

from bertopic.vectorizers import ClassTfidfTransformer
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)


# 5. 选择主题表示
from bertopic.representation import MaximalMarginalRelevance

main_representation = MaximalMarginalRelevance(diversity=.3, top_n_words=30)
representation_model = {"Main": main_representation}

topic_model = BERTopic(
  embedding_model=embedding_model,
  vectorizer_model=vectorizer_model,
  ctfidf_model=ctfidf_model,
  umap_model=umap_model,
  hdbscan_model=hdbscan_model,
  representation_model=representation_model,
  min_topic_size=10, 
  verbose=True,
)

# 训练模型
topic_model.fit_transform(sentences, embeddings=embeddings)

# 获取聚类结果
topic_info = topic_model.get_topic_info()

#保存聚类结果
topic_docs = topic_model.get_document_info(sentences)
topic_docs.to_csv('data/聚类结果n=10.csv')

# ⭐ UMAP可视化
reduced_embeddings = UMAP(n_neighbors=15, n_components=2, min_dist=0.0, metric='cosine',random_state=64).fit_transform(embeddings)
topic_model.visualize_documents(sentences,  reduced_embeddings=reduced_embeddings)

In [None]:
topic_model.visualize_barchart(top_n_topics=20, n_words=10,width=400, height=600)

In [None]:
# 把模型放到cpu或gpu
model.to('cuda')  # 将模型加载到GPU
# 将模型设置为评估模式，https://blog.csdn.net/weixin_45275599/article/details/131524189
model.eval()

# 1. 词向量模型，同时加载本地训练好的词向量
embedding_model = pipeline(
  "feature-extraction",
  #model="JIHUAI/bert-ancient-chinese",device=0
  model="SIKU-BERT/sikubert",device=0
)

# 2. 创建UMAP降维模型
umap_model = UMAP(
  n_neighbors= 15,
  n_components=5,
  min_dist=0.0,
  metric='cosine',
  random_state=64
)
reduced_embeddings = umap_model.fit_transform(embeddings)


# 3. 创建HDBSCAN聚类模型
# 如果要建设离群值，可以减小下面两个参数min_cluster_size min_samples
# https://hdbscan.readthedocs.io/en/latest/faq.html
hdbscan_model = HDBSCAN(
  min_cluster_size=4,
  min_samples=3,
  metric='euclidean'
)


# 4. 创建CountVectorizer模型
stpwrd=open("data/stopwords.txt","r",encoding='utf-8').read()
stop_words= stpwrd.split("\n")
vectorizer_model =  TfidfVectorizer(token_pattern=r"(?u)\b\w+\b",max_df=0.95, min_df=2,stop_words=stop_words)

from bertopic.vectorizers import ClassTfidfTransformer
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)


# 5. 选择主题表示
from bertopic.representation import MaximalMarginalRelevance

main_representation = MaximalMarginalRelevance(diversity=.5, top_n_words=30)
representation_model = {"Main": main_representation}

topic_model = BERTopic(
  embedding_model=embedding_model,
  vectorizer_model=vectorizer_model,
  ctfidf_model=ctfidf_model,
  umap_model=umap_model,
  hdbscan_model=hdbscan_model,
  representation_model=representation_model,
  min_topic_size=4, 
  verbose=True,
)

# 训练模型
topic_model.fit_transform(sentences, embeddings=embeddings)

# 获取聚类结果
topic_info = topic_model.get_topic_info()

#保存聚类结果
topic_docs = topic_model.get_document_info(sentences)
topic_docs.to_csv('data/聚类结果n=15.csv')

# ⭐ UMAP可视化
reduced_embeddings = UMAP(n_neighbors=15, n_components=2, min_dist=0.0, metric='cosine',random_state=64).fit_transform(embeddings)
topic_model.visualize_documents(sentences,  reduced_embeddings=reduced_embeddings)

In [None]:
hierarchical_topics = topic_model.hierarchical_topics(sentences)
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

In [None]:
# Run the visualization with the original embeddings
topic_model.visualize_hierarchical_documents(sentences, hierarchical_topics, embeddings=embeddings)

# Reduce dimensionality of embeddings, this step is optional but much faster to perform iteratively:
reduced_embeddings = UMAP(n_neighbors=15, n_components=2, min_dist=0.0, metric='cosine',random_state=64).fit_transform(embeddings)
fig=topic_model.visualize_hierarchical_documents(sentences, hierarchical_topics, reduced_embeddings=reduced_embeddings)
fig.write_html("data/hierarchical_documents.html")

In [None]:
from collections import defaultdict
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# 1. 读取CSV文件  
df = pd.read_csv('data/聚类结果n=6.csv')  
sentences = df['Document'].tolist()
themes = df['Topic'].tolist()
theme_names = df['Name'].tolist()

with open('data/book-name.txt', "r", encoding='utf-8') as file:
    labels = [line.strip() for line in file.readlines()]

# 找到所有唯一的主题和标签
unique_themes = list(set(themes))
unique_labels = list(set(labels))

# 创建一个嵌套字典来计数
theme_label_count = defaultdict(lambda: defaultdict(int))

# 统计每个label下每个主题的数量
for theme, label in zip(themes, labels):
    theme_label_count[theme][label] += 1

# 为每个主题准备数据
plot_data = {theme: [theme_label_count[theme][label] for label in unique_labels] for theme in unique_themes}

# 绘制柱形图
plt.figure(figsize=(6, 8))
# 设置中文字体
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# 使用颜色字典来保存每个主题的颜色
colors = plt.cm.get_cmap('Set3', len(unique_themes))

# 设置柱子的宽度
bar_width = 0.8 / len(unique_themes)

# 绘制每个主题的柱形图并保存每个主题的柱形对象和名称
bars = []
labels_for_legend = []
theme_name_map = dict(zip(unique_themes, theme_names))  # 主题到主题名称的映射

# 使用 numpy 来创建位置
indices = np.arange(len(unique_labels))

for idx, theme in enumerate(unique_themes):
    if theme > -1:
        counts = plot_data[theme]
      # 仅绘制非零数量的主题
        if any(counts):  # 仅绘制非零数量的主题
            bar = plt.bar(indices + idx * bar_width, counts, bar_width, label=theme_name_map[theme], color=colors(idx), alpha=0.8, linewidth=0.1, edgecolor='black')
            bars.append(bar)
            labels_for_legend.append(theme_name_map[theme])

# 绘制每个主题的折线并保存每个主题的线条对象和名称
#lines = []
labels_for_legend = []
theme_name_map = dict(zip(themes, theme_names))  # 主题到主题名称的映射

for idx, theme in enumerate(unique_themes):
    if theme > -1:  # 仅绘制非零数量的主题
        counts = plot_data[theme]
        if any(counts):  # 仅绘制非零数量的主题
            labels_for_legend.append(theme_name_map[theme])

plt.xlabel('Books')
plt.ylabel('Theme Count')
#plt.title('Theme Count by Labels')

# 设置 x 轴刻度
plt.xticks(indices + bar_width * (len(unique_themes) / 2 - 0.5), unique_labels)

# 设置图例在右侧
plt.legend(bars, labels_for_legend, title='Themes', loc='center left', bbox_to_anchor=(1, 0.5))
plt.grid(True)
plt.show()

In [None]:
#构建语义相似度矩阵

# 读取同义词表并构建同义词替换字典
synonym_dict = {}
with open("synonyms.txt", "r", encoding="utf-8") as f:
    for line in f:
        synonyms = line.strip().split("\t")
        for word in synonyms[0:]:
            synonym_dict[word] = synonyms[1]

# 替换同义词
for i in range(len(tokens1)):
    tokens1[i] = "/".join([synonym_dict.get(word, word) for word in tokens1[i].split("/")])



#轉化成列表
docs_pre=[] 
for article in tokens1:
    article_tokens=article.split('/')
    lines= [re.sub(r'[^\w\s]', '', seg) for seg in article_tokens]
    chinese_words = [line for line in lines if line.strip()]
    docs_pre.append(chinese_words)

# 去除停用詞
stpwrd=open("data/stopwords.txt","r",encoding='utf-8').read()
stop_words= stpwrd.split("\n")

docs=[]
for words in docs_pre:
    doc = [token for token in words if token not in stop_words]
    docs.append(doc)   

# 训练 word2vec 模型
model2 = word2vec.Word2Vec(docs, vector_size=300,window=5, min_count=3, workers=12,epochs=64)
vocab_pre = list(model2.wv.index_to_key) 
print("禮：",model2.wv.most_similar('禮', topn=20))
with open("./coreword.txt","r",encoding='utf-8') as f:
    words_core = [line.strip() for line in f.readlines()]
with open("./target.txt","r",encoding='utf-8') as f:
    words_target = [line.strip() for line in f.readlines()]
    
vocab_dlc=list(words_core + vocab_pre)
docs_new=[]
for words in docs:
    doc = [token for token in words if token in vocab_dlc]
    docs_new.append(doc)   

model = word2vec.Word2Vec(docs_new, vector_size=300,window=9, min_count=1, workers=12,epochs=72)

# # 保存模型
# model.save('word2vec_1.model')
# # # 加载模型
# model = Word2Vec.load('word2vec_1.model') 

# 将所有词转化成词向量并建立词典
word_vectors = []
word2ind = {}
for i, word in enumerate(word_tokens):
    word_vectors.append(model.wv[word])
    word2ind[word] = i
word_vectors = np.array(word_vectors)

words_list=[]
for target_word in words_core:
    similar_words =  [word for word, _ in model.wv.most_similar(target_word, topn=10)]
    words_list.append(similar_words)
words_dlc_pre=[token for sentence in words_list for token in sentence]
words_dlc = list(set(words_dlc_pre))
vocab=[token for token in words_dlc if token in vocab_pre]

merged_list = list(set(words_core + vocab))

# 创建一个空的相似性矩阵
similarity_matrix = np.zeros((len(merged_list ), len(merged_list )))

# 计算每两个词之间的相似性
for i in range(len(merged_list)):
        for j in range(len(merged_list)):
                similarity_matrix[i][j] = model.wv.similarity(merged_list[i], merged_list[j])
        

# 将相似性矩阵转换为DataFrame
df_similarity = pd.DataFrame(similarity_matrix, index=merged_list , columns=merged_list )


# 将相似度大于0.5的值设置为NaN
df_similarity_new = df_similarity*1
df_similarity_new[df_similarity_new  <= 0.75] = np.nan
df_similarity_new[df_similarity_new  >= 0.99] = np.nan


# 将相似度大于0.5的值设置为NaN
df_similarity_new = df_similarity*0.2
df_similarity_new[df_similarity_new  <= 0.12] = np.nan

# 保存DataFrame到CSV文件
df_similarity_new.to_csv("data/word_similarity_matrix.csv")