In [106]:
import numpy as np
import pandas as pd
import scipy as sp
import sklearn
from gensim.models import ldamodel
from gensim.models import Word2Vec
import gensim.corpora
from ast import literal_eval

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.decomposition import NMF
from sklearn.preprocessing import normalize
import pickle

In [2]:
# Load the dataset from a CSV file in the working directory; the cells below
# read its `pseg_words` column.
data = pd.read_csv('./clean_jd_data.csv')

In [20]:
# Parse each `pseg_words` cell with literal_eval; the cells below iterate the
# parsed value as nested sequences of (word, tag) pairs.
# Cells that are already parsed pass through untouched, so re-running this cell
# in the same kernel no longer raises ValueError from literal_eval.
data['pseg_words'] = data['pseg_words'].apply(
    lambda cell: cell if isinstance(cell, (list, tuple)) else literal_eval(cell)
)

In [64]:
# Every tag that occurs anywhere in `pseg_words`, minus the four excluded tags.
attrs = list(
    {
        tag
        for segments in data.pseg_words
        for segment in segments
        for _word, tag in segment
    }
    - {'a', 'd', 'i', 'l'}
)

In [75]:
# Per document, keep the words whose tag is in `attrs` and that are longer than
# one character.
data['words'] = [
    [
        token
        for segment in segments
        for token, tag in segment
        if tag in attrs and len(token) > 1
    ]
    for segments in data.pseg_words
]
# Single-column frame holding only the filtered words.
data_text = data.loc[:, ['words']]
# One list of words per document, in row order.
train_headlines = data_text['words'].tolist()

# LDA implementation

In [None]:
# Number of topics requested from both the LDA and the NMF model in the cells below.
num_topics = 10

In [79]:
# Build the token <-> id dictionary from the documents, then convert every
# document with doc2bow.
id2word = gensim.corpora.Dictionary(train_headlines)
corpus = [id2word.doc2bow(text) for text in train_headlines]
# LDA training is stochastic; a fixed random_state makes the topic table below
# reproducible across kernel restarts.
lda = ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=num_topics,
    random_state=42,
)

In [80]:
def get_lda_topics(model, num_topics, topn):
    """Tabulate the top words of the first `num_topics` topics of `model`.

    Parameters
    ----------
    model : object exposing ``show_topic(topic_id, topn=...)`` that returns a
        sequence of entries whose first item is the word.
    num_topics : int
        Number of topics to report, starting from topic id 0.
    topn : int
        Number of words requested per topic.

    Returns
    -------
    pandas.DataFrame
        One column per topic, labelled 'Topic01', 'Topic02', ...
    """
    def top_terms(topic_id):
        # Keep only the word from each entry returned by show_topic.
        return [entry[0] for entry in model.show_topic(topic_id, topn=topn)]

    return pd.DataFrame({
        'Topic' + '{:02d}'.format(topic_id + 1): top_terms(topic_id)
        for topic_id in range(num_topics)
    })

In [81]:
# Show the 30 top words of each LDA topic.
get_lda_topics(lda, num_topics=num_topics, topn=30)

Unnamed: 0,Topic01,Topic02,Topic03,Topic04,Topic05,Topic06,Topic07,Topic08,Topic09,Topic10
0,手机,手机,手机,没有,手机,耳机,手机,手机,值得,指纹
1,颜值,运行,价格,问题,快递,没有,问题,屏幕,购买,解锁
2,手感,速度,颜色,手机,物流,手机,知道,华为,机子,魅族
3,vivo,拍照,支持,失望,收到,赠品,客服,荣耀,不行,手机
4,朋友,电池,苹果,想象,包装,声音,垃圾,王者,没有,系统
5,照片,充电,黑色,功能,购物,买来,出现,摄像头,充电器,屏幕
6,拿到,外观,国产,使用,速度,评论,不到,手感,手机,三星
7,像素,玩游戏,开心,钢化,发货,音乐,不能,使用,推荐,小米
8,入手,屏幕,白色,不会,服务,机器,售后,系统,能力,识别
9,拍照,手感,蓝色,屏幕,质量,客服,信号,性能,说好,体验


# NMF implementation

In [84]:
# Join each document's words into one space-separated string for CountVectorizer.
sentences = list(map(' '.join, train_headlines))

In [85]:
# CountVectorizer returns a (documents x features) matrix in which each cell
# holds the number of times that feature (word) appears in that document.
# The vocabulary is capped at 5000 features.
vectorizer = CountVectorizer(max_features=5000, analyzer='word')
x_counts = vectorizer.fit_transform(raw_documents=sentences)

In [87]:
# Fit a TF-IDF transformer (without IDF smoothing) on the counts, apply it,
# then rescale every row so its entries sum to one (L1 norm).
transformer = TfidfTransformer(smooth_idf=False)
transformer.fit(x_counts)
x_tfidf = transformer.transform(x_counts)
xtfidf_norm = normalize(x_tfidf, axis=1, norm='l1')

In [90]:
# Factorise the row-normalised TF-IDF matrix into `num_topics` components.
model = NMF(init='nndsvd', n_components=num_topics)
model.fit(X=xtfidf_norm)

NMF(alpha=0.0, beta_loss='frobenius', init='nndsvd', l1_ratio=0.0,
  max_iter=200, n_components=10, random_state=None, shuffle=False,
  solver='cd', tol=0.0001, verbose=0)

In [102]:
def get_nmf_topics(model, num_topics, topn, feat_names=None):
    """Tabulate the highest-weighted terms of the first `num_topics` components.

    Parameters
    ----------
    model : fitted estimator exposing ``components_``, indexed as
        ``components_[topic]`` -> per-feature weights.
    num_topics : int
        Number of leading components to report.
    topn : int
        Number of terms per topic (non-negative).
    feat_names : sequence of str, optional
        Vocabulary aligned with the feature axis of ``model.components_``.
        When omitted it is read from the notebook-level ``vectorizer``.

    Returns
    -------
    pandas.DataFrame
        One column per topic, labelled 'Topic 01', 'Topic 02', ..., with terms
        in order of decreasing weight.
    """
    if feat_names is None:
        # get_feature_names() was removed in scikit-learn 1.2; prefer its
        # replacement and fall back only on older installations.
        if hasattr(vectorizer, 'get_feature_names_out'):
            feat_names = vectorizer.get_feature_names_out()
        else:
            feat_names = vectorizer.get_feature_names()

    word_dict = {}
    for i in range(num_topics):
        # Feature indices sorted by descending weight, truncated to topn.
        top_ids = model.components_[i].argsort()[::-1][:topn]
        word_dict['Topic ' + '{:02d}'.format(i + 1)] = [feat_names[idx] for idx in top_ids]
    return pd.DataFrame(word_dict)

In [139]:
# NOTE(review): this rebinds the global `num_topics` that the LDA and NMF
# fitting cells above read; re-running those cells after this one would fit
# 5 topics instead of 10. Only the first 5 NMF components are tabulated here.
num_topics = 5
get_nmf_topics(model, num_topics=num_topics, topn=30)

Unnamed: 0,Topic 01,Topic 02,Topic 03,Topic 04,Topic 05
0,手机,屏幕,没有,物流,电池
1,收到,系统,问题,收到,充电
2,华为,反应,想象,包装,不行
3,玩游戏,起来,赠品,质量,耗电
4,快递,玩游戏,发票,服务,玩游戏
5,垃圾,分辨率,收到,购物,小时
6,问题,看着,客服,评价,外观
7,游戏,三星,钢化,客服,问题
8,性能,视频,发现,服务态度,能力
9,颜值,颜色,包装,使用,系统
