In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
import nltk
from nltk.corpus import stopwords
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.models import CoherenceModel
import re
import os
from matplotlib.colors import LinearSegmentedColormap

In [8]:
plt.rcParams["font.family"] = ["SimHei", "WenQuanYi Micro Hei", "Heiti TC"]
plt.rcParams['axes.unicode_minus'] = False  # 解决负号显示问题

In [10]:
# -------------------------------
# 3. 模型构建与训练
# -------------------------------

# 读取预处理后的数据
file_path = r'C:\Users\ASUS\Desktop\LAD\sentimentdataset_preprocessed.csv'
df = pd.read_csv(file_path)

# 将字符串形式的分词结果转换为列表
df['Lemmatized_Tokens'] = df['Lemmatized_Tokens'].apply(
    lambda x: re.findall(r"'([^']*)'", x) if isinstance(x, str) else []
)

# 构建词典
dictionary = Dictionary(df['Lemmatized_Tokens'])

# 过滤低频词和高频词
dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=10000)

# 生成语料库（词袋模型）
corpus = [dictionary.doc2bow(text) for text in df['Lemmatized_Tokens']]

In [11]:
print(corpus)

[[(0, 1), (1, 1)], [], [(2, 1)], [(3, 1), (4, 1)], [(5, 1), (6, 1), (7, 1)], [(8, 1), (9, 1)], [(0, 1)], [(5, 1), (10, 1)], [], [(0, 1), (11, 1), (12, 1)], [(5, 1)], [(8, 1)], [(13, 1), (14, 1)], [(5, 2), (15, 1), (16, 1), (17, 1)], [(18, 1)], [(19, 1), (20, 1), (21, 1)], [(22, 1)], [(22, 1)], [(23, 1)], [(8, 1), (24, 1)], [(1, 1), (25, 1), (26, 1)], [(13, 1), (27, 1)], [(0, 1)], [(2, 1), (28, 1)], [(29, 1), (30, 1), (31, 1)], [(22, 1)], [(5, 1), (32, 1), (33, 1)], [(25, 1), (26, 1), (34, 1), (35, 1)], [(21, 1)], [(5, 1)], [(4, 1), (13, 1), (36, 1), (37, 1)], [(1, 1), (38, 1), (39, 1)], [(5, 1), (40, 1), (41, 1)], [(8, 1), (23, 1)], [(0, 1), (24, 1), (42, 1)], [(4, 1), (43, 1), (44, 1)], [(18, 1), (23, 1), (45, 1), (46, 1)], [], [(5, 1), (6, 1), (7, 1)], [(3, 1)], [(47, 1)], [(22, 1), (29, 1), (48, 1)], [(0, 1), (8, 1)], [(10, 1), (26, 1), (34, 1)], [(4, 1), (13, 1), (42, 1), (49, 1)], [(5, 1), (25, 1), (50, 1)], [(23, 1)], [(21, 1), (51, 1), (52, 1)], [(53, 1), (54, 1)], [(8, 1), (19,

In [12]:
# 训练LDA模型
num_topics = 5  # 主题数量，可调整
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=num_topics,
    passes=20,
    alpha='auto',
    eta='auto',
    random_state=42
)


In [15]:
# 计算模型困惑度和一致性得分
perplexity = lda_model.log_perplexity(corpus)
coherence_model = CoherenceModel(model=lda_model, texts=df['Lemmatized_Tokens'], 
                                 dictionary=dictionary, coherence='c_v')
coherence = coherence_model.get_coherence()

In [16]:
print(f"主题数量: {num_topics}")
print(f"困惑度(Perplexity): {perplexity:.4f}")
print(f"一致性得分(Coherence Score): {coherence:.4f}")

主题数量: 5
困惑度(Perplexity): -5.9795
一致性得分(Coherence Score): 0.5458


In [17]:
# 4.1 pyLDAvis交互图
lda_vis = gensimvis.prepare(lda_model, corpus, dictionary)
pyLDAvis.save_html(lda_vis, r'C:\Users\ASUS\Desktop\LAD\lda_visualization.html')
print("pyLDAvis交互图已保存至: lda_visualization.html")

pyLDAvis交互图已保存至: lda_visualization.html


In [18]:
# 4.2 词云图
plt.figure(figsize=(15, 12))
for i in range(num_topics):
    plt.subplot(3, 2, i+1)  # 调整子图布局
    topic_words = dict(lda_model.show_topic(i, topn=30))
    wordcloud = WordCloud(width=800, height=600, 
                         background_color='white', 
                         max_words=30).generate_from_frequencies(topic_words)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(f'主题 {i+1} 的关键词')
    plt.axis('off')
plt.tight_layout()
plt.savefig(r'C:\Users\ASUS\Desktop\LAD\topic_wordclouds.png', dpi=300)
plt.close()
print("主题词云图已保存至: topic_wordclouds.png")

findfont: Font family 'WenQuanYi Micro Hei' not found.
findfont: Font family 'Heiti TC' not found.
findfont: Font family 'WenQuanYi Micro Hei' not found.
findfont: Font family 'Heiti TC' not found.
findfont: Font family 'WenQuanYi Micro Hei' not found.
findfont: Font family 'Heiti TC' not found.
findfont: Font family 'WenQuanYi Micro Hei' not found.
findfont: Font family 'Heiti TC' not found.
findfont: Font family 'WenQuanYi Micro Hei' not found.
findfont: Font family 'Heiti TC' not found.
findfont: Font family 'WenQuanYi Micro Hei' not found.
findfont: Font family 'Heiti TC' not found.
findfont: Font family 'WenQuanYi Micro Hei' not found.
findfont: Font family 'Heiti TC' not found.
findfont: Font family 'WenQuanYi Micro Hei' not found.
findfont: Font family 'Heiti TC' not found.
findfont: Font family 'WenQuanYi Micro Hei' not found.
findfont: Font family 'Heiti TC' not found.
findfont: Font family 'WenQuanYi Micro Hei' not found.
findfont: Font family 'Heiti TC' not found.
findfont: 

主题词云图已保存至: topic_wordclouds.png


In [19]:
# 4.3 热力图：文档-主题概率分布矩阵（可选）
def get_topic_distribution(model, corpus):
    """获取文档-主题概率分布矩阵"""
    doc_topic_dist = []
    for doc in corpus:
        topic_probs = model.get_document_topics(doc, minimum_probability=0)
        probs = [0] * model.num_topics
        for topic, prob in topic_probs:
            probs[topic] = prob
        doc_topic_dist.append(probs)
    return np.array(doc_topic_dist)

In [20]:
# 获取文档-主题分布矩阵
doc_topic_dist = get_topic_distribution(lda_model, corpus)

# 随机选择部分文档进行可视化（避免矩阵过大）
sample_size = min(50, len(doc_topic_dist))  # 最多显示50个文档
sample_indices = np.random.choice(len(doc_topic_dist), sample_size, replace=False)
sample_doc_topic_dist = doc_topic_dist[sample_indices]

# 创建自定义颜色映射
colors = ['#f7fbff', '#abd0e6', '#377eb8', '#023858']
cmap = LinearSegmentedColormap.from_list('custom_blue', colors, N=256)

# 绘制热力图
plt.figure(figsize=(12, 10))
sns.heatmap(sample_doc_topic_dist, cmap=cmap, annot=False, 
           xticklabels=[f'主题 {i+1}' for i in range(num_topics)],
           yticklabels=False)
plt.title('文档-主题概率分布矩阵')
plt.xlabel('主题')
plt.ylabel('文档')
plt.tight_layout()
plt.savefig(r'C:\Users\ASUS\Desktop\LAD\doc_topic_heatmap.png', dpi=300)
plt.close()
print("文档-主题热力图已保存至: doc_topic_heatmap.png")


findfont: Font family 'WenQuanYi Micro Hei' not found.
findfont: Font family 'Heiti TC' not found.
findfont: Font family 'WenQuanYi Micro Hei' not found.
findfont: Font family 'Heiti TC' not found.
findfont: Font family 'WenQuanYi Micro Hei' not found.
findfont: Font family 'Heiti TC' not found.
findfont: Font family 'WenQuanYi Micro Hei' not found.
findfont: Font family 'Heiti TC' not found.
findfont: Font family 'WenQuanYi Micro Hei' not found.
findfont: Font family 'Heiti TC' not found.
findfont: Font family 'WenQuanYi Micro Hei' not found.
findfont: Font family 'Heiti TC' not found.
findfont: Font family 'WenQuanYi Micro Hei' not found.
findfont: Font family 'Heiti TC' not found.
findfont: Font family 'WenQuanYi Micro Hei' not found.
findfont: Font family 'Heiti TC' not found.
findfont: Font family 'WenQuanYi Micro Hei' not found.
findfont: Font family 'Heiti TC' not found.
findfont: Font family 'WenQuanYi Micro Hei' not found.
findfont: Font family 'Heiti TC' not found.
findfont: 

文档-主题热力图已保存至: doc_topic_heatmap.png


In [21]:
# 5. 保存主题模型和结果
lda_model.save(r'C:\Users\ASUS\Desktop\LAD\lda_model')

# 提取每个文档的主要主题
df['Dominant_Topic'] = [np.argmax(probs) + 1 for probs in doc_topic_dist]
df['Topic_Probability'] = [max(probs) for probs in doc_topic_dist]

# 保存结果
result_path = r'C:\Users\ASUS\Desktop\LAD\sentimentdataset_with_topics.csv'
df.to_csv(result_path, index=False)
print(f"带主题标签的数据已保存至: {result_path}")

# 输出各主题的关键词
print("\n各主题的关键词:")
for idx, topic in lda_model.print_topics(-1):
    print(f"主题 {idx+1}: {topic}")

带主题标签的数据已保存至: C:\Users\ASUS\Desktop\LAD\sentimentdataset_with_topics.csv

各主题的关键词:
主题 1: 0.048*"new" + 0.024*"friend" + 0.021*"emotion" + 0.020*"creativity" + 0.020*"gratitude" + 0.019*"art" + 0.017*"sky" + 0.016*"project" + 0.016*"personal" + 0.016*"explore"
主题 2: 0.031*"beauty" + 0.024*"echo" + 0.024*"heart" + 0.024*"symphony" + 0.023*"nature" + 0.021*"moment" + 0.020*"wave" + 0.019*"dream" + 0.016*"solitude" + 0.015*"landscape"
主题 3: 0.039*"dance" + 0.037*"challenge" + 0.032*"find" + 0.028*"embrace" + 0.018*"moment" + 0.018*"book" + 0.018*"life" + 0.016*"mind" + 0.016*"love" + 0.014*"one"
主题 4: 0.028*"joy" + 0.027*"day" + 0.025*"experience" + 0.022*"laughter" + 0.019*"garden" + 0.018*"life" + 0.015*"like" + 0.015*"hope" + 0.014*"echo" + 0.014*"labyrinth"
主题 5: 0.037*"feel" + 0.033*"world" + 0.020*"embark" + 0.020*"new" + 0.018*"ancient" + 0.018*"explore" + 0.018*"school" + 0.017*"attend" + 0.017*"curiosity" + 0.017*"journey"
