In [2]:
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import re
import numpy as np
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import gensim
from IPython.display import display, HTML

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import NMF

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


 **Load data file**

In [3]:
data = pd.read_csv('amazon_reviews.txt', sep="\t",  error_bad_lines=False)
x_raw = data['REVIEW_TEXT']
x_raw = data['REVIEW_TEXT'].values.tolist()
x_raw = [a.replace("<br />", "") for a in x_raw]
x_raw = [a.replace("good", "") for a in x_raw]
x_raw = [a.replace("great", "") for a in x_raw]
x_raw = [a.replace("use", "") for a in x_raw]
x_raw = [a.replace("just", "") for a in x_raw]
x_raw = [a.replace("really", "") for a in x_raw]
x_raw = [a.replace("like", "") for a in x_raw]
x_raw = [a.replace("product", "") for a in x_raw]

In [4]:
data[['DOC_ID', 'PRODUCT_CATEGORY', 'PRODUCT_TITLE', 'REVIEW_TITLE', 'REVIEW_TEXT']].head()

Unnamed: 0,DOC_ID,PRODUCT_CATEGORY,PRODUCT_TITLE,REVIEW_TITLE,REVIEW_TEXT
0,1,PC,"Targus PAUK10U Ultra Mini USB Keypad, Black",useful,"When least you think so, this product will sav..."
1,2,Wireless,Note 3 Battery : Stalion Strength Replacement ...,New era for batteries,Lithium batteries are something new introduced...
2,3,Baby,"Fisher-Price Papasan Cradle Swing, Starlight",doesn't swing very well.,I purchased this swing for my baby. She is 6 m...
3,4,Office Products,Casio MS-80B Standard Function Desktop Calculator,Great computing!,I was looking for an inexpensive desk calcolat...
4,5,Beauty,Shine Whitening - Zero Peroxide Teeth Whitenin...,Only use twice a week,I only use it twice a week and the results are...


**Preprocess**

In [5]:
#Preprocess corpus
import string
from string import punctuation
from nltk.corpus import stopwords

def preprocess(text):
    # word tokenisation, including punctuation removal
    tokenizer = RegexpTokenizer(r'\w+')

    # lowercasing
    text = [t.lower() for t in text]

    # stopword removal
    stop = set(stopwords.words('english'))
    text = [t for t in text if t not in stop]
    
    # lemmatisation
    lemmatiser = WordNetLemmatizer()
    text = [lemmatiser.lemmatize(t) for t in text]

    # remove numbers and empty space
    digits = ' 0123456789'
    text = [t for t in text if t not in digits]

    #tokenize sentences
    for sent in text:
        yield(gensim.utils.simple_preprocess(str(sent)))
        
    return text

**Ngrams**

**Define Model and parameters**

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

model = ['LDA', 'NMF']
vectorizer = ['count', 'tfidf']

def topic_model(data, model, vectorizer, n_topics=10):
    """
    model: method for topic model
    vectorizer: method for representation of words
    n_topics = number of topics
    """

    if vectorizer == 'count':
        vect = CountVectorizer(max_df=0.8, min_df=2, stop_words='english')
        x_train = vect.fit_transform(x)

    if vectorizer == 'tfidf':
        vect = TfidfVectorizer(max_df=0.8, min_df=2, stop_words='english')
        x_train = vect.fit_transform(x)

    if model == 'LDA':
        LDA = LatentDirichletAllocation(n_components=n_topics, learning_decay=0.5, random_state=42)
        _model = LDA.fit(x_train)
    
    if model == 'NMF':
        nmf = NMF(n_components=n_topics, random_state=42)
        _model = nmf.fit(x_train)

    return _model, vect

**Prepare data to train**

In [7]:
x = list(preprocess(x_raw))
bigram = gensim.models.Phrases(x, min_count=3, threshold=30)
x = [bigram[line] for line in x]
x = [' '.join(i) for i in x]



In [8]:
LDA, vect = topic_model(x, model='LDA', vectorizer='tfidf')
#LDA.get_params()
nmf, vect = topic_model(x, model='NMF', vectorizer='tfidf')
#nmf.get_params()

**Show topics infered**

In [11]:
def create_topics_table(model, vectorizer, n_topics):
    keywords = np.array(vectorizer.get_feature_names())
    topics = []
    for weights in model.components_:
        top_index = (-weights).argsort()[:n_topics]
        #topic = keywords.take(top_index)
        #w = np.sort(-weights)[:5]
        #w = w.round(1)
        #w = w.astype(str)
        #topics.append(ind)
        #ind = map(' '.join, zip(topic, w))
        topics.append(keywords.take(top_index))
    return topics

print('Top 10 words for each topic with LDA')
topic_keywords = create_topics_table(model=LDA, vectorizer=vect, n_topics=10)       
df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
display(df_topic_keywords)

print('Top 10 words for each topic with NMF')
topic_keywords2 = create_topics_table(model=nmf, vectorizer=vect, n_topics=10)
df_topic_keywords2 = pd.DataFrame(topic_keywords2)
df_topic_keywords2.columns = ['Word '+str(i) for i in range(df_topic_keywords2.shape[1])]
df_topic_keywords2.index = ['Topic '+str(i) for i in range(df_topic_keywords2.shape[0])]
display(df_topic_keywords2)



Top 10 words for each topic with LDA


Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9
Topic 0,book,love,bought,hair,time,read,little,got,beca,gift
Topic 1,price,love,nice,quality,bag,easy,time,fit,looks,bought
Topic 2,sound,tv,works,quality,work,easy,price,time,bought,love
Topic 3,skin,dog,taste,food,love,time,eat,little,dogs,best
Topic 4,love,watch,necklace,beautiful,quality,nice,perfect,price,chain,looks
Topic 5,cartridges,screen_protector,love,cap,screen_protectors,job,does,protector,nozzle,printer
Topic 6,movie,game,story,games,film,series,love,characters,movies,fun
Topic 7,weight_loss,book,guitar,taking,energy,better,weight,love,supplement,bottle
Topic 8,hair,works,crate,love,liners,wished,mats,comb,book,fitbit
Topic 9,coffee,game,tea,book,cup,fun,taste,love,cups,flavor


Top 10 words for each topic with NMF


Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9
Topic 0,time,work,beca,don,got,little,did,bought,does,didn
Topic 1,watch,looks,band,time,watches,wrist,face,wear,movie,look
Topic 2,size,nice,fit,perfect,looks,wear,small,comfortable,color,look
Topic 3,price,quality,great,buy,good,happy,cheap,worth,recommend,sound
Topic 4,love,color,beautiful,kids,buy,design,cute,movie,amazing,necklace
Topic 5,easy,works,install,light,set,clean,recommend,sturdy,assemble,camera
Topic 6,bag,carry,leather,bags,inside,travel,pockets,lot,laptop,zipper
Topic 7,case,phone,iphone,protection,camera,ipad,protect,screen,plastic,battery
Topic 8,game,play,fun,games,kids,playing,played,graphics,son,movie
Topic 9,tv,sound,picture,set,samsung,box,screen,remote,speakers,cable


**Words for each topics (Human defined) LDA vs NMF**

In [16]:
from IPython.display import display, HTML

topics_lda = np.array([['Reading', 'Shopping', 'Entertainment', 'Pet food', 'Accessories', 'Technology', 'Entertainment', 'Health', 'Products', 'Drink']])
df = pd.DataFrame(data=topics_lda)
df.index = ['Words']
df.columns = ['Topic '+str(i) for i in range(10)]
print('LDA')
display(df)
topics_nmf = np.array([['Activities', 'Accessories', 'Clothes', 'Prices', 'Accessories', 'Technology', 'Travel products', 'Technology', 'Games', 'Technology']])
df2 = pd.DataFrame(data=topics_nmf)
df2.index = ['Words']
df2.columns = ['Topic '+str(i) for i in range(10)]
print('\n NMF')
display(df2)

LDA


Unnamed: 0,Topic 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9
Words,Reading,Shopping,Entertainment,Pet food,Accessories,Technology,Entertainment,Health,Products,Drink



 NMF


Unnamed: 0,Topic 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9
Words,Activities,Accessories,Clothes,Prices,Accessories,Technology,Travel products,Technology,Games,Technology


**Show reviews and topic assigned**

LDA

In [17]:
# Topic ID for each review in dataset: review ID, review text, topic ID

def show_results(model, vectorizer):
    x_train = vectorizer.fit_transform(x)
    topic_values = model.transform(x_train)
    data['Topic'] = topic_values.argmax(axis=1)
    #data[['REVIEW_TEXT', 'Topic']]

    result = data[['REVIEW_TEXT', 'Topic']][:50]
    from IPython.display import display, HTML

    display(HTML(result.to_html()))

In [18]:
lda_result = show_results(LDA, vect)

Unnamed: 0,REVIEW_TEXT,Topic
0,"When least you think so, this product will save the day. Just keep it around just in case you need it for something.",1
1,Lithium batteries are something new introduced in the market there average developing cost is relatively high but Stallion doesn't compromise on quality and provides us with the best at a low cost.<br />There are so many in built technical assistants that act like a sensor in their particular forté. The battery keeps my phone charged up and it works at every voltage and a high voltage is never risked.,2
2,"I purchased this swing for my baby. She is 6 months now and has pretty much out grown it. It is very loud and doesn't swing very well. It is beautiful though. I love the colors and it has a lot of settings, but I don't think it was worth the money.",1
3,I was looking for an inexpensive desk calcolatur and here it is. It works and does everything I need. Only issue is that it tilts slightly to one side so when I hit any keys it rocks a little bit. Not a big deal.,1
4,"I only use it twice a week and the results are great. I have used other teeth whitening solutions and most of them, for the same results I would have to use it at least three times a week. Will keep using this because of the potency of the solution and also the technique of the trays, it keeps everything in my teeth, in my mouth.",6
5,I'm not sure what this is supposed to be but I would recommend that you do a little more research into the culture of using pipes if you plan on giving this as a gift or using it yourself.,1
6,"Pleased with ping pong table. 11 year old and 13 year old having a blast, plus lots of family entertainment too. Plus better than kids sitting on video games all day. A friend put it together. I do believe that was a challenge, but nothing they could not handle",2
7,"Great vitamin C serum... I really like the oil feeling, not too sticky. I used it last week on some of my recent bug bites and it helps heal the skin faster than normal.",3
8,"I've used tide pods laundry detergent for many years,its such a great detergent to use having a nice scent and leaver the cloths smelling fresh.",8
9,"Everybody wants to fall for their promises. But this is a relatively unheard of brand, some even say a non existant company. Look at how amateur their labels and products are. You have to ask yourself if you would trust this kind of amateur stuff? No way! Don't waste your money.",2


NMF

In [None]:
nmf_result = show_results(nmf, vect)

Unnamed: 0,REVIEW_TEXT,Topic
0,"When least you think so, this product will save the day. Just keep it around just in case you need it for something.",7
1,Lithium batteries are something new introduced in the market there average developing cost is relatively high but Stallion doesn't compromise on quality and provides us with the best at a low cost.<br />There are so many in built technical assistants that act like a sensor in their particular forté. The battery keeps my phone charged up and it works at every voltage and a high voltage is never risked.,7
2,"I purchased this swing for my baby. She is 6 months now and has pretty much out grown it. It is very loud and doesn't swing very well. It is beautiful though. I love the colors and it has a lot of settings, but I don't think it was worth the money.",3
3,I was looking for an inexpensive desk calcolatur and here it is. It works and does everything I need. Only issue is that it tilts slightly to one side so when I hit any keys it rocks a little bit. Not a big deal.,0
4,"I only use it twice a week and the results are great. I have used other teeth whitening solutions and most of them, for the same results I would have to use it at least three times a week. Will keep using this because of the potency of the solution and also the technique of the trays, it keeps everything in my teeth, in my mouth.",0
5,I'm not sure what this is supposed to be but I would recommend that you do a little more research into the culture of using pipes if you plan on giving this as a gift or using it yourself.,0
6,"Pleased with ping pong table. 11 year old and 13 year old having a blast, plus lots of family entertainment too. Plus better than kids sitting on video games all day. A friend put it together. I do believe that was a challenge, but nothing they could not handle",8
7,"Great vitamin C serum... I really like the oil feeling, not too sticky. I used it last week on some of my recent bug bites and it helps heal the skin faster than normal.",2
8,"I've used tide pods laundry detergent for many years,its such a great detergent to use having a nice scent and leaver the cloths smelling fresh.",2
9,"Everybody wants to fall for their promises. But this is a relatively unheard of brand, some even say a non existant company. Look at how amateur their labels and products are. You have to ask yourself if you would trust this kind of amateur stuff? No way! Don't waste your money.",0


In [None]:
!pip install pyLDAvis
from pyLDAvis import sklearn as sklearn_lda
import os
import pickle 
import pyLDAvis
LDAvis_data_filepath = os.path.join('./ldavis_prepared_'+str(10))

if 1 == 1:
    x_train = vect.fit_transform(x)
    LDAvis_prepared = sklearn_lda.prepare(LDA, x_train, vect)
with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)

with open(LDAvis_data_filepath, 'rb') as f:
     LDAvis_prepared = pickle.load(f)
pyLDAvis.save_html(LDAvis_prepared, './ldavis_prepared_'+ str(10) +'.html')

pyLDAvis.display(LDAvis_prepared)

