In [11]:
import pandas as pd

In [12]:
df = pd.read_csv("comment.csv", encoding='gb18030')

In [13]:
df.shape

(8615, 1)

In [14]:
X = df[['comment']]

In [15]:
X.head()

Unnamed: 0,comment
0,今天突然想起来好久没有吃麦当劳的巨无霸了，临近中午，就直奔麦当劳了，这家店，以前来过几次，每...
1,和母上大人一起吃板烧，第一次喝了大鲜柠特饮，真的还不错，向一直没喝过的朋友推荐哦，另外注册会...
2,常营华联购物中心一层。人不少。排队，机器自助都可以。很久没吃麦当劳了，点了新品。自助点餐结账...
3,看电影快到时间了，赶紧买了一个麦香鱼套餐，可没想到这个汉堡挤变形了一样，也不像是新鲜的，国际...
4,基本上每天都来…近就是很方便！


In [16]:
import jieba

In [17]:
def chinese_word_cut(mytext):
    return " ".join(jieba.cut(mytext))

In [18]:
X["content_cutted"] = df.comment.apply(chinese_word_cut)

In [19]:
X.head()

Unnamed: 0,comment,content_cutted
0,今天突然想起来好久没有吃麦当劳的巨无霸了，临近中午，就直奔麦当劳了，这家店，以前来过几次，每...,今天 突然 想 起来 好久 没有 吃 麦当劳 的 巨无霸 了 ， 临近 中午 ， 就 直奔 ...
1,和母上大人一起吃板烧，第一次喝了大鲜柠特饮，真的还不错，向一直没喝过的朋友推荐哦，另外注册会...,和 母上 大人 一起 吃板 烧 ， 第一次 喝 了 大鲜 柠特 饮 ， 真的 还 不错 ， ...
2,常营华联购物中心一层。人不少。排队，机器自助都可以。很久没吃麦当劳了，点了新品。自助点餐结账...,常营 华联 购物中心 一层 。 人 不少 。 排队 ， 机器 自助 都 可以 。 很久没 吃...
3,看电影快到时间了，赶紧买了一个麦香鱼套餐，可没想到这个汉堡挤变形了一样，也不像是新鲜的，国际...,看 电影 快到 时间 了 ， 赶紧 买 了 一个 麦 香鱼 套餐 ， 可 没想到 这个 汉堡...
4,基本上每天都来…近就是很方便！,基本上 每天 都 来 … 近 就是 很 方便 ！


In [20]:
def get_custom_stopwords(stop_words_file):
    with open(stop_words_file) as f:
        stopwords = f.read()
    stopwords_list = stopwords.split('\n')
    custom_stopwords_list = [i for i in stopwords_list]
    return custom_stopwords_list

In [21]:
stop_words_file = "cs.txt"
stopwords = get_custom_stopwords(stop_words_file)

In [22]:
stopwords[-10:]

['呃', '呗', '咚', '咦', '喏', '啐', '喔唷', '嗬', '嗯', '嗳']

In [23]:
from sklearn.feature_extraction.text import CountVectorizer

In [24]:
vect = CountVectorizer()

In [25]:
term_matrix = pd.DataFrame(vect.fit_transform(X.content_cutted).toarray(), columns=vect.get_feature_names())

In [26]:
term_matrix.head()

Unnamed: 0,00,01,03,04,05,06,08,083,09,10,...,鼻祖,齊全,齐全,齐肩,齿间,龙大哥,龙德,龙游,龙眼,龟速
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
max_df = 0.8 # 在超过这一比例的文档中出现的关键词（过于平凡），去除掉。
min_df = 3 # 在低于这一数量的文档中出现的关键词（过于独特），去除掉。

In [28]:
tf_vectorizer = CountVectorizer(max_df = max_df,
                       min_df = min_df,
                       token_pattern=u'(?u)\\b[^\\d\\W]\\w+\\b',
                       stop_words=frozenset(stopwords))
tf = tf_vectorizer.fit_transform(df.comment)

  'stop_words.' % sorted(inconsistent))


In [29]:
term_matrix.head()

Unnamed: 0,00,01,03,04,05,06,08,083,09,10,...,鼻祖,齊全,齐全,齐肩,齿间,龙大哥,龙德,龙游,龙眼,龟速
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
from sklearn.decomposition import LatentDirichletAllocation

In [31]:
n_topics = 4

In [32]:
lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=50, learning_method='online', learning_offset=50.,
                                random_state=0)
result_topic = lda.fit(tf)
print(result_topic)



LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=50.0,
             max_doc_update_iter=100, max_iter=50, mean_change_tol=0.001,
             n_components=10, n_jobs=None, n_topics=4, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)


In [33]:
# 显示每个主题下的前若干关键词
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
        print()

In [34]:
n_top_words = 10

In [None]:
tf_feature_names = vect.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

Topic #0:
29 dates enough 一扬 一身 dz 一模一样 一半儿 一个样 mccafe

Topic #1:
一幅 yeyeye 三明治 一年四季 9012 jio me coming call 三件套

Topic #2:
一站 50 too 一塌糊涂 三思 一买 donalds 12d3 __ 一度

Topic #3:
promotions pink think want our 一手 niki your mini clean



In [None]:
import pyLDAvis.sklearn

In [None]:
# 主题可视化
f_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

Topic #0:
29 dates enough 一扬 一身 dz 一模一样 一半儿 一个样 mccafe

Topic #1:
一幅 yeyeye 三明治 一年四季 9012 jio me coming call 三件套

Topic #2:
一站 50 too 一塌糊涂 三思 一买 donalds 12d3 __ 一度

Topic #3:
promotions pink think want our 一手 niki your mini clean



In [None]:
vis = pyLDAvis.sklearn.prepare(lda, tf, tf_vectorizer)
pyLDAvis.show(vis)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))



Note: if you're in the IPython notebook, pyLDAvis.show() is not the best command
      to use. Consider using pyLDAvis.display(), or pyLDAvis.enable_notebook().
      See more information at http://pyLDAvis.github.io/quickstart.html .

You must interrupt the kernel to end this command

Serving to http://127.0.0.1:8889/    [Ctrl-C to exit]


127.0.0.1 - - [07/Jul/2019 08:36:43] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [07/Jul/2019 08:36:43] "GET /LDAvis.css HTTP/1.1" 200 -
127.0.0.1 - - [07/Jul/2019 08:36:43] "GET /d3.js HTTP/1.1" 200 -
127.0.0.1 - - [07/Jul/2019 08:36:43] "GET /LDAvis.js HTTP/1.1" 200 -
127.0.0.1 - - [07/Jul/2019 08:36:44] code 404, message Not Found
127.0.0.1 - - [07/Jul/2019 08:36:44] "GET /favicon.ico HTTP/1.1" 404 -
