In [1]:
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation as LDA

In [28]:
train = pd.read_csv("amazon_train.csv", header = 0)
train.dropna(axis=0, how="any", inplace=True)

In [29]:
train

Unnamed: 0,reviewText,summary,Category
0,This Hamper looks really good and is bigger th...,Great Hamper!,Home and Kitchen
1,"I needed a good zester, and although I already...",DISSAPOINTED,Home and Kitchen
2,I like this a lot for use in my kitchen and ba...,Great for the Price,Home and Kitchen
3,I am a big fan of lattes... and at $3 or $4 a ...,Love this,Home and Kitchen
4,"I haven't spilled anything on it, but I'm sure...",Good for the price,Home and Kitchen
5,I bought two of these pillows to fill pillowca...,"Comfy, basic pillow.",Home and Kitchen
6,"I ordered the wrong size, but that was my faul...",Pretty Good,Home and Kitchen
7,I loved all the information I found in this bo...,great book,Home and Kitchen
8,We've received great service from this robot. ...,My wife loves me for it! :-),Home and Kitchen
9,"seems to work ok, but it is not very precise. ...",im betting it will break inside a year.,Home and Kitchen


In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [19]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

In [20]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                   max_features=1000,
                                   stop_words='english')

In [30]:
tfidf = tfidf_vectorizer.fit_transform(train['reviewText'])

In [31]:
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=1000,
                                stop_words='english')

In [32]:
tf = tf_vectorizer.fit_transform(train['reviewText'])

In [42]:
nmf = NMF(n_components=10, random_state=1,
          alpha=.1, l1_ratio=.5).fit(tfidf)

In [43]:
print("\nTopics in NMF model:")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, 20)


Topics in NMF model:
Topic #0:
like just really don time used little ve use bought work does make small better water did using need got
Topic #1:
baby seat loves son old months car toy daughter stroller toys soft month play crib cute little sleep babies bought
Topic #2:
coffee cup water maker hot filter cups pot tea machine press brew carafe grounds filters mug drink grind beans espresso
Topic #3:
great works product price looks work recommend buy little fits highly worked perfectly bought value item perfect sturdy size addition
Topic #4:
easy use clean makes install food make recommend store sturdy assemble quick super kitchen highly cleaning easier wash perfect lot
Topic #5:
34 old case fit says wife green rifle legs storage description said husband loves style mattress short medium cup going
Topic #6:
knife blade knives sharp edge handle sheath steel cut pocket cutting hand carry grip blades clip opening box block excellent
Topic #7:
good quality product price nice looks recommend 

In [44]:
lda = LatentDirichletAllocation(n_topics=10, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)

In [45]:
lda.fit(tf)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=50.0,
             max_doc_update_iter=100, max_iter=5, mean_change_tol=0.001,
             n_jobs=1, n_topics=10, perp_tol=0.1, random_state=0,
             topic_word_prior=None, total_samples=1000000.0, verbose=0)

In [46]:
print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, 20)


Topics in LDA model:
Topic #0:
box baby air room night like bed light mattress pillow fan sleep play soft little watch really cute sight bright
Topic #1:
fit bag just like size seat use small fits great carry car little pocket big easy don easily baby hold
Topic #2:
just 34 time bought old great used got use did months ve new product using didn year like day really
Topic #3:
use easy clean food just make love great used pan time cooking like oven cook heat using oil stove kitchen
Topic #4:
bike don money plastic ve buy better bags just like products good best work pack ones ice used lot time
Topic #5:
coffee water unit machine cup pot filter press hot use maker make inches time model steam filters cons grill good
Topic #6:
knife good quality blade set like price steel sharp handle nice knives great use just edge cut stainless non iron
Topic #7:
scope mount rifle couldn handy bar pressure batteries shower throw device safety button 22 mounted set case rings tape 8217
Topic #8:
great go