In [26]:
import numpy as np
import glob
import os
import jieba
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import nltk
import time

# 定义一些超参数
K = 25  # 主题数目
alpha = 0.2  # 文档主题分布的超参数
beta = 0.01  # 主题词分布的超参数
num_iters = 1000  # 迭代次数

data_path = "/Desktop/中央一号文件"

# 读取所有文本文件
documents = []
total_word_count = 0 

# 添加一个空字典，用于存储每个单词出现的次数
word_freq = {}

for filename in os.listdir(data_path):
    if filename.endswith(".txt"):
        with open(os.path.join(data_path, filename), "r") as f:
            text = f.read()
            # 分词
            seg_list = jieba.cut(text)
            # 计算所有单词的概率，过滤掉概率小于0.01或大于0.99的单词
            words = []
            for word in seg_list:
                if word not in word_freq:
                    word_freq[word] = 1
                else:
                    word_freq[word] += 1
                p = word_freq[word] / sum(word_freq.values())
                if p>0.0001 and p < 0.9999:
                    words.append(word)
            # 进行词性标注
            tagged_words = []
            for word in words:
                word_tag = nltk.pos_tag([word], lang='eng')
                tagged_words.append(word_tag[0])
            # 去除停用词
            stop_words = set()
            with open("/Desktop/postgraduate/nlp/stop_words.txt", "r") as stop_f:
                for line in stop_f:
                    stop_words.add(line.strip())
            words = [word for word, pos in tagged_words if pos.startswith('N') and word not in stop_words]
            doc_str = " ".join(words)
            if len(doc_str) > 0:
                documents.append(doc_str)
            # 打印文档的词汇列表
            # print(doc_str)
            total_word_count += len(words)

print("The number of words in doc_str is:", total_word_count)

# 构建词表并特征提取
vectorizer = CountVectorizer()
word_freq = vectorizer.fit_transform(documents)
vocab = vectorizer.get_feature_names_out()
vocab_size = len(vocab)
word2idx = {w: i for i, w in enumerate(vocab)}
tf_transformer = TfidfTransformer(use_idf=False)
word_freq_tf = tf_transformer.fit_transform(word_freq)
data = np.asarray(word_freq_tf.toarray(), dtype=np.float32)

# 初始化文档-主题和主题-词语分布
doc_topic_dist = np.random.dirichlet([alpha] * K, len(documents))
topic_word_dist = np.random.dirichlet([beta] * vocab_size, K)
i = 0
# 迭代推断
for _ in range(num_iters):
    start_time = time.time()
#     E 步骤：计算每个词语属于每个主题的概率
    topic_word_prob = np.zeros((K, vocab_size))
    for k in range(K):
        for w in range(vocab_size):
            try:
                topic_word_prob[k, w] = np.sum([doc_topic_dist[d, k] * topic_word_dist[k, w] for d, doc in enumerate(documents) if vocab[w] in doc])
            except KeyError:
                continue
        topic_word_prob[k] /= topic_word_prob[k].sum()

    
    # M 步骤：更新主题-词语分布和文档-主题分布
    for k in range(K):
        for w in range(vocab_size):
            topic_word_dist[k, w] = (topic_word_prob[k, w] * np.array([doc_topic_dist[d, k] for d in range(len(documents))])).sum()
        topic_word_dist[k] /= topic_word_dist[k].sum()

    for d in range(len(documents)):
        for k in range(K):
            doc_topic_dist[d, k] = np.sum((topic_word_prob[k, [word2idx[word] for word in documents[d].split() if word in word2idx]][:, np.newaxis] * topic_word_dist[k]))
        doc_topic_dist[d] /= doc_topic_dist[d].sum()

    end_time = time.time()
    iter_time = end_time - start_time
    i = i+1
    print("Iteration {} time: {:.2f} seconds".format(i, iter_time))
    
# # 选取每个主题的主题词
# topic_words = []
# for k in range(K):
#     topic_word_prob = topic_word_dist[k]
#     sorted_prob_indices = np.argsort(topic_word_prob)[::-1]
#     topic_word_list = [vocab[i] for i in sorted_prob_indices] # 选取所有词
#     topic_words.append(topic_word_list)
    
# # 输出每个主题的主题词
# for k in range(K):
#     print("Topic {}: {}".format(k, ", ".join(topic_words[k])))




The number of words in doc_str is: 72847
Iteration 1 time: 18.72 seconds
Iteration 2 time: 18.58 seconds
Iteration 3 time: 18.51 seconds
Iteration 4 time: 18.49 seconds
Iteration 5 time: 18.48 seconds
Iteration 6 time: 18.50 seconds
Iteration 7 time: 18.52 seconds
Iteration 8 time: 18.58 seconds
Iteration 9 time: 18.52 seconds
Iteration 10 time: 18.52 seconds
Iteration 11 time: 18.49 seconds
Iteration 12 time: 18.51 seconds
Iteration 13 time: 18.53 seconds
Iteration 14 time: 18.52 seconds
Iteration 15 time: 18.51 seconds
Iteration 16 time: 18.51 seconds
Iteration 17 time: 18.51 seconds
Iteration 18 time: 18.52 seconds
Iteration 19 time: 18.50 seconds
Iteration 20 time: 18.51 seconds
Iteration 21 time: 18.52 seconds
Iteration 22 time: 18.59 seconds
Iteration 23 time: 18.50 seconds
Iteration 24 time: 18.51 seconds
Iteration 25 time: 18.48 seconds
Iteration 26 time: 18.53 seconds
Iteration 27 time: 18.49 seconds
Iteration 28 time: 18.53 seconds
Iteration 29 time: 18.50 seconds
Iteration 3

Iteration 244 time: 18.50 seconds
Iteration 245 time: 18.54 seconds
Iteration 246 time: 18.49 seconds
Iteration 247 time: 18.52 seconds
Iteration 248 time: 18.51 seconds
Iteration 249 time: 18.53 seconds
Iteration 250 time: 18.49 seconds
Iteration 251 time: 18.53 seconds
Iteration 252 time: 18.49 seconds
Iteration 253 time: 18.52 seconds
Iteration 254 time: 18.49 seconds
Iteration 255 time: 18.54 seconds
Iteration 256 time: 18.50 seconds
Iteration 257 time: 18.51 seconds
Iteration 258 time: 18.49 seconds
Iteration 259 time: 18.53 seconds
Iteration 260 time: 18.49 seconds
Iteration 261 time: 18.53 seconds
Iteration 262 time: 18.51 seconds
Iteration 263 time: 18.51 seconds
Iteration 264 time: 18.52 seconds
Iteration 265 time: 18.52 seconds
Iteration 266 time: 18.51 seconds
Iteration 267 time: 18.53 seconds
Iteration 268 time: 18.52 seconds
Iteration 269 time: 18.52 seconds
Iteration 270 time: 18.50 seconds
Iteration 271 time: 18.57 seconds
Iteration 272 time: 18.48 seconds
Iteration 273 

Iteration 485 time: 18.51 seconds
Iteration 486 time: 18.47 seconds
Iteration 487 time: 18.47 seconds
Iteration 488 time: 18.49 seconds
Iteration 489 time: 18.47 seconds
Iteration 490 time: 18.41 seconds
Iteration 491 time: 18.43 seconds
Iteration 492 time: 18.46 seconds
Iteration 493 time: 18.50 seconds
Iteration 494 time: 18.48 seconds
Iteration 495 time: 18.45 seconds
Iteration 496 time: 18.45 seconds
Iteration 497 time: 18.49 seconds
Iteration 498 time: 18.46 seconds
Iteration 499 time: 18.47 seconds
Iteration 500 time: 18.47 seconds
Iteration 501 time: 18.47 seconds
Iteration 502 time: 18.47 seconds
Iteration 503 time: 18.43 seconds
Iteration 504 time: 18.43 seconds
Iteration 505 time: 18.45 seconds
Iteration 506 time: 18.51 seconds
Iteration 507 time: 18.46 seconds
Iteration 508 time: 18.48 seconds
Iteration 509 time: 18.46 seconds
Iteration 510 time: 18.43 seconds
Iteration 511 time: 18.47 seconds
Iteration 512 time: 18.44 seconds
Iteration 513 time: 18.46 seconds
Iteration 514 

Iteration 726 time: 18.51 seconds
Iteration 727 time: 18.47 seconds
Iteration 728 time: 18.47 seconds
Iteration 729 time: 18.47 seconds
Iteration 730 time: 18.54 seconds
Iteration 731 time: 18.50 seconds
Iteration 732 time: 18.48 seconds
Iteration 733 time: 18.49 seconds
Iteration 734 time: 18.52 seconds
Iteration 735 time: 18.52 seconds
Iteration 736 time: 18.50 seconds
Iteration 737 time: 18.47 seconds
Iteration 738 time: 18.46 seconds
Iteration 739 time: 18.51 seconds
Iteration 740 time: 18.51 seconds
Iteration 741 time: 18.46 seconds
Iteration 742 time: 18.47 seconds
Iteration 743 time: 18.49 seconds
Iteration 744 time: 18.43 seconds
Iteration 745 time: 18.49 seconds
Iteration 746 time: 18.50 seconds
Iteration 747 time: 18.47 seconds
Iteration 748 time: 18.45 seconds
Iteration 749 time: 18.51 seconds
Iteration 750 time: 18.45 seconds
Iteration 751 time: 18.48 seconds
Iteration 752 time: 18.49 seconds
Iteration 753 time: 18.44 seconds
Iteration 754 time: 18.41 seconds
Iteration 755 

Iteration 967 time: 18.46 seconds
Iteration 968 time: 18.49 seconds
Iteration 969 time: 18.44 seconds
Iteration 970 time: 18.42 seconds
Iteration 971 time: 18.44 seconds
Iteration 972 time: 18.48 seconds
Iteration 973 time: 18.47 seconds
Iteration 974 time: 18.48 seconds
Iteration 975 time: 18.46 seconds
Iteration 976 time: 18.46 seconds
Iteration 977 time: 18.44 seconds
Iteration 978 time: 18.46 seconds
Iteration 979 time: 18.42 seconds
Iteration 980 time: 18.46 seconds
Iteration 981 time: 18.54 seconds
Iteration 982 time: 18.44 seconds
Iteration 983 time: 18.46 seconds
Iteration 984 time: 18.47 seconds
Iteration 985 time: 18.45 seconds
Iteration 986 time: 18.43 seconds
Iteration 987 time: 18.49 seconds
Iteration 988 time: 18.47 seconds
Iteration 989 time: 18.51 seconds
Iteration 990 time: 18.47 seconds
Iteration 991 time: 18.48 seconds
Iteration 992 time: 18.44 seconds
Iteration 993 time: 18.48 seconds
Iteration 994 time: 18.57 seconds
Iteration 995 time: 18.47 seconds
Iteration 996 

In [27]:
import pandas as pd

# 计算所有词的词频并存储在一个字典中
word_freq_dict = {}
for doc_str in documents:
    for word in doc_str.split():
        if word in word_freq_dict:
            word_freq_dict[word] += 1
        else:
            word_freq_dict[word] = 1

# 创建 Excel 文件并写入数据
writer = pd.ExcelWriter('/Desktop/postgraduate/nlp/topic_words_try.xlsx', engine='xlsxwriter')
for k in range(K):
    top_words_idx = np.argsort(-topic_word_dist[k])
    top_words = [vocab[i] for i in top_words_idx]
    
    # 计算每个单词的概率并将其存储在字典中
    word_prob_dict = {}
    prob_sum = 0
    for word in top_words:
        if word in word_freq_dict:
            prob = word_freq_dict[word] / total_word_count
            prob_sum += prob
            word_prob_dict[word] = prob
    prob_sum_str = f'P={prob_sum:.5f}'
    
    # 创建新的工作簿并写入数据
    sheet_name = f'Topic {k}'
    df = pd.DataFrame({'Word': top_words, 'Probability': [word_prob_dict.get(word, '') for word in top_words]})
    df.to_excel(writer, sheet_name=sheet_name, index=False)
    worksheet = writer.sheets[sheet_name]
    worksheet.write(len(top_words)+1, 0, prob_sum_str)

# 保存 Excel 文件并关闭 writer
writer.save()

