# sklearn-LDA

代码示例：https://mp.weixin.qq.com/s/hMcJtB3Lss1NBalXRTGZlQ （玉树芝兰） <br>
可视化：https://blog.csdn.net/qq_39496504/article/details/107125284  <br>
sklearn lda参数解读:https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html
<br>中文版参数解读：https://blog.csdn.net/TiffanyRabbit/article/details/76445909
<br>LDA原理-视频版：https://www.bilibili.com/video/BV1t54y127U8
<br>LDA原理-文字版：https://www.jianshu.com/p/5c510694c07e
<br>score的计算方法：https://github.com/scikit-learn/scikit-learn/blob/844b4be24d20fc42cc13b957374c718956a0db39/sklearn/decomposition/_lda.py#L729
<br>主题困惑度1：https://blog.csdn.net/weixin_43343486/article/details/109255165
<br>主题困惑度2：https://blog.csdn.net/weixin_39676021/article/details/112187210

## 1.预处理

In [1]:
import os
import pandas as pd
import re
import nltk
from nltk import FreqDist
# 加载停用词库
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')
import string
from nltk.stem import WordNetLemmatizer
from gensim.models.phrases import Phrases, Phraser
from sklearn.feature_extraction.text import TfidfVectorizer
import ast

[nltk_data] Downloading package stopwords to /Users/asa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/asa/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Read the CSV file
df = pd.read_csv('/Users/asa/VScode/patent/label6pca_patent.csv', index_col=0)
df_cluster_3 = df[df['cluster'] == 2]
df_cluster_3['keywords'] = df_cluster_3['keywords'].apply(ast.literal_eval)
df_cluster_3['processed_text'] = df_cluster_3['keywords'].apply(lambda x: ' '.join(['_'.join(item.split()) for item in x]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cluster_3['keywords'] = df_cluster_3['keywords'].apply(ast.literal_eval)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cluster_3['processed_text'] = df_cluster_3['keywords'].apply(lambda x: ' '.join(['_'.join(item.split()) for item in x]))


## 2.LDA分析

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [4]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

In [5]:
n_features = 25 #提取1000个特征词语
tf_vectorizer = CountVectorizer(strip_accents = 'unicode',
                                max_features=n_features,
                                stop_words='english',
                                max_df = 0.5,
                                min_df = 10,
                                ngram_range=(1, 3))
tf = tf_vectorizer.fit_transform(df_cluster_3.processed_text)

In [6]:
n_topics = 5
lda = LatentDirichletAllocation(n_components=n_topics, max_iter=50,
                                learning_method='batch',
                                learning_offset=50,
#                                 doc_topic_prior=0.1,
#                                 topic_word_prior=0.01,
                               random_state=0)
lda.fit(tf)

### 2.1输出每个主题对应词语 

In [7]:
n_top_words = 25
tf_feature_names = tf_vectorizer.get_feature_names_out()
topic_word = print_top_words(lda, tf_feature_names, n_top_words)

Topic #0:
remote_monitoring elderly_advantage neural_network mobile_telephone elderly_person smart_phone terminal_mobile monitoring_old communication_module mobile_terminal utility_model medicine_box novelty_method communication_unit
Topic #1:
mobile_terminal elderly_person communication_module monitoring_old utility_model remote_monitoring terminal_mobile mobile_telephone medicine_box neural_network elderly_advantage smart_phone novelty_method communication_unit
Topic #2:
communication_unit terminal_mobile elderly_person elderly_advantage medicine_box monitoring_old communication_module mobile_terminal utility_model mobile_telephone neural_network remote_monitoring smart_phone novelty_method
Topic #3:
novelty_method smart_phone elderly_advantage monitoring_old communication_module elderly_person mobile_terminal utility_model mobile_telephone medicine_box neural_network remote_monitoring communication_unit terminal_mobile
Topic #4:
medicine_box mobile_telephone utility_model elderly_ad

### 2.2输出每篇文章对应主题 

In [None]:
import numpy as np

In [None]:
topics=lda.transform(tf)

In [None]:
topic = []
for t in topics:
    topic.append("Topic #" + str(list(t).index(np.max(t))))

# Assigning the topic information to the DataFrame
df_cluster_3['概率最大的主题序号'] = topic  # Column for the topic with the highest probability
df_cluster_3['每个主题对应概率'] = list(topics)  # Column for probabilities of each topic

# Save the DataFrame to a CSV file
df.to_csv("data_topic.csv", index=False)


### 2.3可视化 

In [None]:
import pyLDAvis
import pyLDAvis.lda_model
print(pyLDAvis.__version__)


In [None]:
pyLDAvis.enable_notebook()
pic = pyLDAvis.lda_model.prepare(lda, tf, tf_vectorizer)
pyLDAvis.display(pic)
pyLDAvis.save_html(pic, 'lda_pass'+str(n_topics)+'.html')
pyLDAvis.display(pic)

### 2.4困惑度 

In [None]:
import matplotlib.pyplot as plt

In [None]:
plexs = []
scores = []
n_max_topics = 16
for i in range(1,n_max_topics):
    print(i)
    lda = LatentDirichletAllocation(n_components=i, max_iter=50,
                                    learning_method='batch',
                                    learning_offset=50,random_state=0)
    lda.fit(tf)
    plexs.append(lda.perplexity(tf))
    scores.append(lda.score(tf))

In [None]:
n_t=15#区间最右侧的值。注意：不能大于n_max_topics
x=list(range(1,n_t+1))
plt.plot(x,plexs[0:n_t])
plt.xlabel("number of topics")
plt.ylabel("perplexity")
plt.show()