In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import stop_words

### 1.1 Load the data

In [2]:
# Load data
amreviews = pd.read_csv("amazon-reviews.csv.bz2", sep='\t')
#Viewing the data
amreviews.sample(5)

Unnamed: 0,date,summary,review,rating
188381,2015-08-24,Another great product from Baxter; superb hand...,I have very dry hands and just spent 2 weeks i...,5
141672,2014-03-17,Good for babies who don't like bottles!,I purchased these nipples for my 3 month old w...,4
130527,2014-06-29,Great to keep stroller brand new!,Love it! Now I can keep the stroller in great ...,5
52227,2011-02-05,Ok but doesn't stick to the tray,The bowl works good to catch dropped food but ...,2
93422,2013-02-20,"Not perfect, but usable","The fabric is soft, but really thin. One of t...",4


In [3]:
# view dimensions
amreviews.shape

(205331, 4)

### 1.2 Remove missing and empty observations for review and rating

In [4]:
#Check for na values
amreviews.review.isna().sum()

80

In [5]:
#drop na 
amreviews_mod = amreviews.drop(amreviews[amreviews.review.isna()].index).reset_index()

In [6]:
#Check data
amreviews_mod.sample(3)

Unnamed: 0,index,date,summary,review,rating
63752,63779,2013-12-02,"My son loves it, I love it but be warned for t...","I wish I could give more stars than 3, my son ...",3
84637,84671,2013-12-09,super soft,I absolutely love these blankets!!! They are s...,5
142369,142423,2013-08-12,Wonderful car seat,My little girl was not a fan of her infant car...,5


In [7]:
# drop previous index column
amreviews_mod.drop('index', axis = 1, inplace= True)

In [8]:
#Check for empty strings 
np.where(amreviews.review.apply(lambda x: x == ''))

(array([], dtype=int64),)

In [9]:
#Check for value counts for rating
amreviews.rating.value_counts(dropna=False)

5    120434
4     42916
3     21911
2     10939
1      9131
Name: rating, dtype: int64

There are no missing values/empty values for review and rating in the sample.

### Topic modeling using LDA (Latent Dirichlet Allocation) and Count Vectorizer

In [42]:
# convert in BOW
vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words=stop_words.ENGLISH_STOP_WORDS, binary=True)
cv_X = vectorizer.fit_transform(amreviews_mod.review.values)

In [43]:
from sklearn.decomposition import LatentDirichletAllocation

In [44]:
#Initialize LDA with number of topics and random seed
LDA = LatentDirichletAllocation(n_components=8, random_state= 4321)
LDA.fit(cv_X)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=8, n_jobs=None,
                          perp_tol=0.1, random_state=4321,
                          topic_word_prior=None, total_samples=1000000.0,
                          verbose=0)

From what I understood the number of components are the number of topics you think the dataset contains. This initial estimate of number of topics will be informed by domain knowledge or other business related insights that you have. 

In my case, a previous assignment showed decent results when we selected the number of clusters as 15 (again another unsupervised learning exercise)

In [45]:
# Get vocabulary of words
lda_all_words = vectorizer.get_feature_names()

In [46]:
#show sample
print(lda_all_words[-20:])

['zoomed', 'zooming', 'zooms', 'zoon', 'zooper', 'zoos', 'zorb', 'zoya', 'zoyas', 'zt', 'zucchini', 'zuccini', 'zulily', 'zurich', 'zutano', 'zwilling', 'zz', 'zzz', 'zzzipme', 'zzzz']


In [47]:
#Get topics and words (top 15)
for tpno, topic in enumerate(LDA.components_):
    print("Words for Topic", tpno+1)
    print([lda_all_words[index] for index in topic.argsort()[-25:]])
    print()

Words for Topic 1
['time', 'nail', 'looking', 'price', 'polish', 'use', 'perfect', 'pretty', 'beautiful', 'light', 'pink', 'little', 'quality', 'just', 'really', 'look', 'colors', 'good', 'product', 'looks', 'nice', 'great', 'like', 'love', 'color']

Words for Topic 2
['using', 'make', 'gate', 'nice', 'quality', 'used', 'need', 've', 'open', 'better', 'sound', 'really', 'little', 'does', 'easy', 'don', 'work', 'price', 'works', 'guitar', 'like', 'use', 'just', 'good', 'great']

Words for Topic 3
['easily', 'doesn', 'water', 'using', 'bought', 'time', 'good', 'cups', 'son', 'cup', 'old', 'really', 'food', 'don', 'diaper', 'bag', 'little', 'love', 'baby', 'like', 'just', 'clean', 'easy', 'great', 'use']

Words for Topic 4
['recommend', 'good', 'likes', 'easy', 'use', 'got', 'play', 'cute', 'bought', 'time', 'really', 'love', 'toys', 'month', 'like', 'toy', 'daughter', 'just', 'great', 'months', 'little', 'son', 'loves', 'old', 'baby']

Words for Topic 5
['milk', 'better', 'pump', 'good',

In [48]:
#Find which topic review belongs to
topic_prediction = LDA.transform(cv_X)

In [49]:
#Add topic number to review dataset
amreviews_mod['topic'] = topic_prediction.argmax(axis=1) + 1

In [50]:
amreviews.sample(10)

Unnamed: 0,date,summary,review,rating
101573,2013-05-28,a good mattress for the price,"This obviously isn't a luxury mattress, but it...",4
184632,2016-01-23,Very Good Results,I've used a prescription retinol that irritate...,5
125492,2013-11-11,Good Deal,Great to keep in diaper bag. Never have the ba...,5
111606,2014-04-01,Works Great,My son says that this seat is comfortable and ...,5
79381,2013-10-03,Good,Works good for intended purpose. Sometimes I d...,4
136987,2013-10-18,"Purple = ""Boy Cup""?",Apparently I missed the memo and purple is now...,1
152584,2014-04-26,Cures standing-in-crib-bawling-at-midnight,All I can say is that I wish someone had told ...,5
187345,2016-02-19,Impossible combination skin TAMED!,I am a 48-year-old woman who has combination s...,5
95523,2013-10-23,CROTCH DANGLER: DAMAGING to your baby's SPINE!,"i received this as a hand me down, it was in g...",1
123841,2013-02-08,Great for all types of bottles!,"Being pickle-minded, I've all brands of baby b...",5


### Topic Modeling with Non Negative Matrix Factorization and TF-IDF

NMF - https://mlexplained.com/2017/12/28/a-practical-introduction-to-nmf-nonnegative-matrix-factorization/

In [51]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words=stop_words.ENGLISH_STOP_WORDS)
tdf_X = tfidf_vectorizer.fit_transform(amreviews_mod.review.values)

In [52]:
from sklearn.decomposition import NMF

model = NMF(n_components=8,random_state=4321)
model.fit(tdf_X)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
    n_components=8, random_state=4321, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

In [53]:
# Get vocabulary of words
tdf_all_words = tfidf_vectorizer.get_feature_names()
print(tdf_all_words[-30:])

['zombie', 'zombies', 'zone', 'zones', 'zonk', 'zonks', 'zoo', 'zoo18m', 'zooley', 'zoom', 'zoomed', 'zooming', 'zooms', 'zoon', 'zooper', 'zoos', 'zorb', 'zoya', 'zoyas', 'zt', 'zucchini', 'zuccini', 'zulily', 'zurich', 'zutano', 'zwilling', 'zz', 'zzz', 'zzzipme', 'zzzz']


In [54]:
#Get topics and words (top 15)
for tpno, topic in enumerate(model.components_):
    print("Words for Topic", tpno+1)
    print([tdf_all_words[index] for index in topic.argsort()[-25:]])
    print()

Words for Topic 1
['good', 'sleep', '34', 'cute', 'soft', 'got', 'easy', 'use', 'play', 'toys', 'crib', 'time', 'month', 'bought', 'really', 'daughter', 'like', 'toy', 'just', 'little', 'son', 'months', 'loves', 'old', 'baby']

Words for Topic 2
['fits', 'strap', 'son', 'easily', 'tray', 'potty', 'carseat', 'use', 'fit', 'comfortable', 'base', 'rear', 'booster', 'infant', 'install', 'child', 'chair', 'facing', 'straps', 'britax', 'seats', 'easy', 'stroller', 'car', 'seat']

Words for Topic 3
['darker', 'goes', 'bright', 'good', 'dark', 'neutral', 'blue', 'light', 'favorite', 'coat', 'colors', 'like', 'perfect', 'looks', 'summer', 'nails', 'coats', 'pink', 'nail', 'pretty', 'essie', 'nice', 'beautiful', 'polish', 'color']

Words for Topic 4
['ve', 'makeup', 'feels', 'smell', 'just', 'used', 'nice', 'don', 'sensitive', 'feel', 'soap', 'using', 'dry', 'products', 'does', 'really', 'scent', 'cream', 'use', 'hair', 'face', 'like', 'good', 'product', 'skin']

Words for Topic 5
['formula', 'j

In [56]:
#Find which topic review belongs to
nmf_topic_prediction = model.transform(tdf_X)

In [57]:
#Add topic number to review dataset
amreviews_mod['nmf_topic'] = nmf_topic_prediction.argmax(axis=1) + 1

In [58]:
amreviews_mod.sample(10)

Unnamed: 0,date,summary,review,rating,topic,nmf_topic
192467,2016-11-07,Smells great!,"This shampoo smells sooooooo good. Seriously,...",5,7,4
154097,2013-07-08,"An acceptable upgrade, still some issues",I should start by saying this was purchased to...,4,6,1
41639,2013-11-29,our little one wouldnt sit in this seat for mo...,Our little one didnt like this chair. We had ...,2,4,2
4288,2014-02-14,Worked well,These worked well and were easy to use but we ...,4,8,4
43257,2011-06-04,Another terrific Skip Hop product,I love this bag to pieces - I bought the 'Pop ...,5,4,8
22019,2011-07-06,A wonderful product!,"This bib has a soft pleasant feel, but seems t...",5,3,6
157605,2013-10-02,Keep This With You!,These waterproof liners are great to keep with...,4,3,6
40100,2014-05-01,SUPER THICK,These are much thicker than other ones I have ...,5,3,6
88823,2012-12-26,Great low profile cover,First off a lot of reviewers here are complain...,5,8,2
74468,2014-04-08,Great washclothes,A must for baby. Great colors. Quality is go...,4,5,7


## LDA or NMF

Just on the basis of comparing the words selected within a topic, I feel that NMF along with TF-IDF does a better job of finding topics. However in order to make the results more fair let's run LDA with TF-IDF and see the results

In [59]:
 LDA.fit(tdf_X)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=8, n_jobs=None,
                          perp_tol=0.1, random_state=4321,
                          topic_word_prior=None, total_samples=1000000.0,
                          verbose=0)

In [60]:
#Get topics and words (top 15)
for tpno, topic in enumerate(LDA.components_):
    print("Words for Topic", tpno+1)
    print([tdf_all_words[index] for index in topic.argsort()[-25:]])
    print()

Words for Topic 1
['cnd', 'blue', 'really', 'summer', 'quality', 'shade', 'like', 'coat', 'perfect', 'coats', 'looks', 'pretty', 'nice', 'colors', 'essie', 'pink', 'beautiful', 'good', 'nails', 'product', 'nail', 'great', 'polish', 'love', 'color']

Words for Topic 2
['need', 'bought', 'nice', 'quality', 'pedal', 'work', 'don', 'does', 'really', 'little', 'sound', 'door', 'price', 'open', 'like', 'just', 'good', 'use', 'works', 'strings', 'easy', 'great', 'gate', 'guitar', 'stroller']

Words for Topic 3
['bought', 'hold', 'clean', 'cute', 'month', 'chair', 'really', 'months', 'daughter', 'use', 'diaper', 'just', 'like', 'son', 'toys', 'love', 'food', 'little', 'easy', 'old', 'great', 'loves', 'toy', 'bag', 'baby']

Words for Topic 4
['product', 'easy', 'pacifier', 'got', 'used', 'bought', 'loves', 'comfortable', 'really', 'daughter', 'time', 'head', 'love', 'seat', 'great', 'little', 'old', 'months', 'like', 'just', 'son', 'use', 'carrier', 'pillow', 'baby']

Words for Topic 5
['clean'

Based on above results, TF-IDF works better than count vectorizer. While NMF and LDA identify the same set of topics, NMF performs faster and hence my preferred method of choice would still be NMF with TFIDF