In [1]:
import numpy as np
import pandas as pd
import scipy as sp
import sklearn
from gensim.models import ldamodel
from gensim.models import Word2Vec
import gensim.corpora
from ast import literal_eval

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.decomposition import NMF
from sklearn.preprocessing import normalize
import pickle

paramiko missing, opening SSH/SCP/SFTP paths will be disabled.  `pip install paramiko` to suppress


In [4]:
data = pd.read_csv('./input/clean_jd_data.csv')

In [5]:
data.pseg_words = data.pseg_words.apply(literal_eval)

In [6]:
attrs = list(
    set([attr for ls in data.pseg_words for elem in ls for wd, attr in elem]) 
    - set(['a', 'd', 'i', 'l'])
)

In [7]:
data['words'] = [[wd for elem in ls for wd, attr in elem if attr in attrs and len(wd) > 1] for ls in data.pseg_words]
data_text = data[['words']]
train_headlines = [value[0] for value in data_text.values]

# LDA implement

In [27]:
num_topics = 7

In [86]:
id2word = gensim.corpora.Dictionary(train_headlines) 
id2word.filter_extremes(no_below=50, no_above=0.10) 
corpus = [id2word.doc2bow(text) for text in train_headlines] # wordId_wordcount pair
lda = ldamodel.LdaModel(corpus=corpus, id2word=id2word, num_topics=num_topics)

In [90]:
# check word frequency
id_word = pd.DataFrame.from_dict(id2word.id2token, orient='index', columns=['word']).reset_index()
id_freq = pd.DataFrame.from_dict(id2word.dfs, orient='index', columns=['freq']).reset_index()
id_word.merge(id_freq, how='left', on='index').sort_values(by=['freq']).head()

Unnamed: 0,index,word,freq
1870,1870,黑影,50
1866,1866,逗我,50
1865,1865,解决办法,50
1522,1522,条款,50
1429,1429,时刻,50


In [88]:
def get_lda_topics(model, num_topics, topn):
    word_dict = {}
    for i in range(num_topics):
        words = model.show_topic(i, topn=topn)
        word_dict['Topic' + '{:02d}'.format(i+1)] = [word[0] for word in words]
    return pd.DataFrame(word_dict)

In [109]:
get_lda_topics(lda, num_topics, 30)

Unnamed: 0,Topic01,Topic02,Topic03,Topic04,Topic05
0,三星,客服,手感,拍照,华为
1,知道,魅族,运行,游戏,支持
2,苹果,小米,速度,荣耀,内存
3,系统,问题,外观,王者,好评
4,朋友,售后,拍照,性能,希望
5,信号,不能,电池,玩游戏,vivo
6,垃圾,不行,值得,像素,红魔
7,充电,不到,颜值,摄像头,国产
8,使用,失望,颜色,使用,充电
9,体验,只能,性价比,手感,电量


# NMF implement

In [96]:
sentences = [' '.join(text) for text in train_headlines]

In [97]:
# The CountVectorizer module return a matrix of size(Documents X Features), where the value
# of a cell is going to be the number of times of the feature (word) appear in that document.
vectorizer = CountVectorizer(analyzer='word', max_features=5000)
x_counts = vectorizer.fit_transform(sentences)

In [104]:
# Set a TFIDF transformer, transform the counts with the model and normalize the values
transformer = TfidfTransformer(smooth_idf=False)
x_tfidf = transformer.fit_transform(x_counts)
xtfidf_norm = normalize(x_tfidf, norm='l1', axis=1)

In [105]:
model = NMF(n_components=num_topics, init='nndsvd')
model.fit(xtfidf_norm)

NMF(alpha=0.0, beta_loss='frobenius', init='nndsvd', l1_ratio=0.0,
  max_iter=200, n_components=5, random_state=None, shuffle=False,
  solver='cd', tol=0.0001, verbose=0)

In [106]:
def get_nmf_topics(model, num_topics, topn):
    feat_names = vectorizer.get_feature_names()
    word_dict = {}
    
    for i in range(num_topics):
        words_ids = model.components_[i].argsort()[:-topn-1:-1]
        words = [feat_names[key] for key in words_ids]
        word_dict['Topic ' + '{:02d}'.format(i+1)] = words
    return pd.DataFrame(word_dict)

In [108]:
num_topics = 5
get_nmf_topics(model, num_topics, 30)

Unnamed: 0,Topic 01,Topic 02,Topic 03,Topic 04,Topic 05
0,手机,屏幕,没有,物流,电池
1,收到,手感,耳机,速度,拍照
2,快递,拍照,问题,收到,充电
3,拍照,速度,想象,外观,外观
4,运行,指纹,赠品,手感,不行
5,手感,外观,发票,运行,耗电
6,问题,运行,知道,快递,玩游戏
7,华为,解锁,收到,包装,速度
8,玩游戏,系统,客服,质量,运行
9,外观,反应,充电,服务,问题
