# Importing Libraries

In [1]:
# import libraries  
import numpy as np
import pandas as pd
import seaborn as sns
import string
import pprint

import matplotlib.pyplot as plt
%matplotlib inline

# For lemmatisation
import spacy          
import nltk

# gensim for LDA 
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# Plotting tools pyLDA visualization
import pyLDAvis
import pyLDAvis.gensim  
#from pyLDAvis import gensim_models as pg

# Ignore warning
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

# Checking Dataset

In [2]:
# Load the raw product-review export; the path is relative to the notebook's working directory.
df = pd.read_csv('Product Review Large Data.csv')
# Preview ten random rows. The fixed seed keeps the preview identical on every
# Restart & Run All instead of drawing a new sample each time.
df.sample(10, random_state=100)

Unnamed: 0,id,asins,brand,categories,colors,dateAdded,dateUpdated,dimension,ean,keys,...,reviews.rating,reviews.sourceURLs,reviews.text,reviews.title,reviews.userCity,reviews.userProvince,reviews.username,sizes,upc,weight
3091,ACCFHGZFS7GB9CVM,,Flipkart,"Flipkart Headphone, Devices & Accessories, Blu...",,2019-07-02T14:40:43Z,2020-08-33T08:28:46Z,,,,...,1.0,,Mike prblm,Waste of money!,,,,,,
2751,ACCFHGZFS7GB9CVM,,Flipkart,"Flipkart Headphone, Devices & Accessories, Blu...",,2019-07-02T14:40:43Z,2020-08-33T08:28:46Z,,,,...,5.0,,"This is an amazing product, the sound quality ...",Super!,,,,,,
9605,ACCFSDGXX3S6DVBG,,Flipkart,"Flipkart Headphone, Devices & Accessories, Blu...",,2019-07-02T14:40:43Z,2020-08-33T08:28:46Z,,,,...,5.0,,Not bad,Worth every penny,,,,,,
1900,ACCFZGAQJGYCYDCM,,Flipkart,"Flipkart Headphone, Devices & Accessories, Blu...",,2019-07-02T14:40:43Z,2020-08-33T08:28:46Z,,,,...,5.0,,Love it,Best in the market!,,,,,,
8372,ACCFSKBJYWZKXGCP,,Flipkart,"Flipkart Headphone, Devices & Accessories, Blu...",,2019-07-02T14:40:43Z,2020-08-33T08:28:46Z,,,,...,5.0,,Battery backup is very good 😉 nice products 👍🙂,Fabulous!,,,,,,
6270,ACCFVA3KZ2EYMYX3,,Flipkart,"Flipkart Headphone, Devices & Accessories, Blu...",,2019-07-02T14:40:43Z,2020-08-33T08:28:46Z,,,,...,5.0,,Osm prodect,Super!,,,,,,
2193,ACCFZGAQJGYCYDCM,,Flipkart,"Flipkart Headphone, Devices & Accessories, Blu...",,2019-07-02T14:40:43Z,2020-08-33T08:28:46Z,,,,...,4.0,,Its a nice product can be used easily I loved ...,Nice product,,,,,,
5115,ACCFKYE2ARGG67WC,,Flipkart,"Flipkart Headphone, Devices & Accessories, Blu...",,2019-07-02T14:40:43Z,2020-08-33T08:28:46Z,,,,...,5.0,,it's not run by battery ... The sound and bass...,Classy product,,,,,,
2583,ACCFHGZFS7GB9CVM,,Flipkart,"Flipkart Headphone, Devices & Accessories, Blu...",,2019-07-02T14:40:43Z,2020-08-33T08:28:46Z,,,,...,4.0,,Pros-Very good battery backup(10hrs) ...,Good choice,,,,,,
5823,ACCFVA3KZ2EYMYX3,,Flipkart,"Flipkart Headphone, Devices & Accessories, Blu...",,2019-07-02T14:40:43Z,2020-08-33T08:28:46Z,,,,...,1.0,,Horrible experience please don't buy this prod...,Did not meet expectations,,,,,,


In [23]:
# Summarise the loaded frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10971 entries, 0 to 10970
Data columns (total 27 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    10971 non-null  object 
 1   asins                 1597 non-null   object 
 2   brand                 10971 non-null  object 
 3   categories            10971 non-null  object 
 4   colors                774 non-null    object 
 5   dateAdded             10971 non-null  object 
 6   dateUpdated           10971 non-null  object 
 7   dimension             565 non-null    object 
 8   ean                   898 non-null    float64
 9   keys                  1597 non-null   object 
 10  manufacturer          965 non-null    object 
 11  manufacturerNumber    902 non-null    object 
 12  name                  1597 non-null   object 
 13  prices                1597 non-null   object 
 14  reviews.date          1217 non-null   object 
 15  reviews.doRecommend

In [3]:
# Report the frame's dimensions as a (rows, columns) tuple.
print(df.shape)

(10971, 27)


# Preprocessing

1. Tokenize each review (using gensim)
2. Remove stop words (including punctuations)
3. Lemmatize (using spacy)

In [4]:
# Tokenize each review with gensim's simple_preprocess
def sent_to_words(sentences, deacc=True):
    """Lazily tokenize an iterable of documents.

    Each item is coerced with ``str()`` and handed to
    ``gensim.utils.simple_preprocess``.

    Args:
        sentences: Iterable of documents; non-string items are stringified.
        deacc: Forwarded to ``simple_preprocess``. When true, accent marks are
            stripped from the tokens (punctuation is dropped by the tokenizer
            regardless of this flag).

    Yields:
        One list of token strings per input document, in input order.
    """
    for sentence in sentences:
        # Pass deacc through explicitly so the parameter actually takes effect.
        yield gensim.utils.simple_preprocess(str(sentence), deacc=deacc)

# Collect the review texts as a plain list. Missing reviews are replaced with
# empty strings first: sent_to_words() calls str() on every item, so a NaN
# would otherwise turn into the spurious token 'nan'.
data = df['reviews.text'].fillna('').values.tolist()
# Materialise the tokenizer generator: one token list per review, in row order.
data_words = list(sent_to_words(data))

In [8]:
# Inspect the tokens produced for the fourth review (index 3).
print(data_words[3])

['bought', 'one', 'of', 'the', 'first', 'paperwhites', 'and', 'have', 'been', 'very', 'pleased', 'with', 'it', 'its', 'been', 'constant', 'companion', 'and', 'suppose', 'ive', 'read', 'on', 'average', 'book', 'every', 'three', 'days', 'for', 'the', 'past', 'however', 'many', 'years', 'on', 'it', 'wouldnt', 'give', 'it', 'up', 'youd', 'have', 'to', 'pry', 'it', 'from', 'my', 'cold', 'dead', 'fingers', 'for', 'sundry', 'logistical', 'reasons', 'ive', 'also', 'made', 'good', 'use', 'of', 'amazons', 'kindle', 'app', 'on', 'my', 'iphone', 'no', 'paperwhite', 'screen', 'naturally', 'and', 'all', 'the', 'cool', 'usability', 'that', 'delivers', 'but', 'it', 'works', 'well', 'and', 'has', 'its', 'own', 'attractions', 'as', 'companion', 'to', 'the', 'kindle', 'of', 'course', 'there', 'are', 'aspects', 'of', 'the', 'paperwhite', 'which', 'would', 'like', 'to', 'critique', 'ah', 'you', 'knew', 'that', 'was', 'coming', 'somewhere', 'didnt', 'you', 'as', 'member', 'of', 'bookbub', 'get', 'daily', 'l

The code below creates a list of stop words. Python's `string` module provides the punctuation characters (`string.punctuation`), which we append to NLTK's built-in English stop words.

In [9]:
# Build the stop-word list: NLTK's English stop words followed by every
# single punctuation character from string.punctuation.
from nltk.corpus import stopwords
stop_words = [*stopwords.words('english'), *string.punctuation]


In [10]:
# Removing stopwords and lemmatization
def remove_stopwords(texts):
    """Re-tokenize each document and drop every token found in ``stop_words``.

    Args:
        texts: Iterable of documents. Each one is stringified with ``str()``
            and passed through ``simple_preprocess`` before filtering (the
            notebook passes lists of tokens).

    Returns:
        A list with one list of surviving tokens per document, in input order.
    """
    # Snapshot the module-level stop-word list as a set once, so every
    # membership test is a hash lookup rather than a scan of the whole list.
    stop_set = set(stop_words)
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_set] for doc in texts]

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Relies on the module-level spaCy pipeline ``nlp``, which must be loaded
    before this function is called.

    Args:
        texts: Iterable of token lists; each list is joined with single spaces
            before being run through ``nlp``.
        allowed_postags: Coarse POS tags (compared against ``token.pos_``) to
            keep. Any container supporting ``in`` works; the default is an
            immutable tuple so it cannot be altered between calls.

    Returns:
        A list with one list of lemmas (``token.lemma_``) per input document,
        in input order.
    """
    joined_docs = (" ".join(sent) for sent in texts)
    # nlp.pipe streams the texts through the pipeline in batches and yields the
    # parsed documents in input order, avoiding one full nlp() call per review.
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        for doc in nlp.pipe(joined_docs)
    ]

In [11]:
# Run the preprocessing pipeline

# Load spaCy's small English pipeline with the parser and NER components
# disabled, since neither is used in this notebook.
# Install the model once with: python -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Strip stop words from the tokenized reviews
data_words_nostops = remove_stopwords(data_words)

# Lemmatize, keeping only nouns and adjectives
data_lemmatized = lemmatization(data_words_nostops, allowed_postags=['NOUN', 'ADJ'])

In [12]:
# Inspect the lemmas kept for the fourth review (index 3).
print(data_lemmatized[3])

['first', 'constant', 'companion', 'average', 'book', 'day', 'many', 'year', 'd', 'pry', 'cold', 'dead', 'finger', 'sundry', 'logistical', 'reason', 'good', 'use', 'amazon', 'paperwhite', 'screen', 'cool', 'usability', 'deliver', 'attraction', 'companion', 'critique', 'member', 'bookbub', 'daily', 'list', 'alert', 'book', 'deal', 'genre', 'many', 'good', 'world', 'day', 'book', 'good', 'stuff', 'accumulative', 'effect', 'number', 'book', 'paperwhite', 'upward', 'time', 'mind', 'page', 'action', 'kindle', 'glacial', 'slow', 'slow', 'think', 'general', 'consensus', 'many', 'book', 'kindle', 'manner', 'mad', 'amazon', 'state', 'thousand', 'book', 'figure', 'second', 'paperwhite', 'read', 'action', 'read']


Comparing the stop-word-free, lemmatised version of a review with the original tokens.

In [13]:
# Original tokens of the fourth review (index 3), joined back into one string for comparison.
print(' '.join(data_words[3]), '\n')

bought one of the first paperwhites and have been very pleased with it its been constant companion and suppose ive read on average book every three days for the past however many years on it wouldnt give it up youd have to pry it from my cold dead fingers for sundry logistical reasons ive also made good use of amazons kindle app on my iphone no paperwhite screen naturally and all the cool usability that delivers but it works well and has its own attractions as companion to the kindle of course there are aspects of the paperwhite which would like to critique ah you knew that was coming somewhere didnt you as member of bookbub get daily list of alerts and book deals in my chosen genres take on many of them however ive found that even with the best will in the world cant keep up some days it seems that for every book read ive bought two theres just so much good stuff out there the accumulative effect of this is that the number of books actually on my paperwhite has been creeping ever upwa

In [14]:
# The same review (index 3) after stop-word removal and lemmatization.
print(' '.join(data_lemmatized[3]))

first constant companion average book day many year d pry cold dead finger sundry logistical reason good use amazon paperwhite screen cool usability deliver attraction companion critique member bookbub daily list alert book deal genre many good world day book good stuff accumulative effect number book paperwhite upward time mind page action kindle glacial slow slow think general consensus many book kindle manner mad amazon state thousand book figure second paperwhite read action read


# Creating Dictionary and Corpus

Gensim's LDA requires the data in a certain format. Firstly, it needs a dictionary that maps each word in the corpus to a unique numeric ID; this is for computational efficiency. Secondly, it needs the corpus as a term-document frequency matrix, which contains the frequency of each word in each document.

In [15]:
# Build the two inputs gensim's LDA needs

# Vocabulary: assigns an integer id to every distinct lemma
id2word = corpora.Dictionary(data_lemmatized)

# Bag-of-words corpus: one list of (token_id, count) pairs per review
corpus = list(map(id2word.doc2bow, data_lemmatized))

In [16]:
# Bag-of-words vector of the fourth review (index 3) as (token_id, count) pairs.
print(corpus[3])

[(1, 2), (8, 2), (27, 1), (28, 3), (37, 1), (46, 1), (50, 1), (55, 6), (77, 2), (81, 3), (97, 1), (98, 1), (102, 1), (103, 2), (104, 1), (105, 1), (106, 1), (107, 1), (108, 1), (109, 2), (110, 1), (111, 1), (112, 1), (113, 1), (114, 1), (115, 1), (116, 1), (117, 1), (118, 1), (119, 1), (120, 1), (121, 1), (122, 1), (123, 1), (124, 1), (125, 1), (126, 3), (127, 1), (128, 1), (129, 1), (130, 1), (131, 1), (132, 1), (133, 1), (134, 1), (135, 2), (136, 1), (137, 1), (138, 2), (139, 1), (140, 1), (141, 1), (142, 1), (143, 1), (144, 1), (145, 1)]


Each `(id, count)` pair above means that the word with that id appears `count` times in the fourth document (`corpus[3]`): for example, `(1, 2)` says that word id 1 appears twice, and `(55, 6)` that word id 55 appears six times. The nested list below shows the word frequencies of the first document.

In [17]:
# Human-readable view of the first document: (word, count) instead of (id, count)
[[(id2word[token_id], count) for token_id, count in bow] for bow in corpus[:1]]

[[('adjustment', 1),
  ('amazon', 1),
  ('auto', 1),
  ('basis', 1),
  ('case', 1),
  ('certain', 1),
  ('change', 1),
  ('custom', 1),
  ('day', 2),
  ('delivery', 1),
  ('dollar', 1),
  ('easy', 1),
  ('expense', 1),
  ('extra', 1),
  ('fine', 1),
  ('friend', 1),
  ('glad', 1),
  ('great', 1),
  ('hard', 1),
  ('international', 1),
  ('jump', 1),
  ('level', 1),
  ('light', 3),
  ('model', 1),
  ('money', 1),
  ('need', 1),
  ('option', 1),
  ('page', 1),
  ('paperwhite', 4),
  ('party', 1),
  ('press', 1),
  ('pricey', 1),
  ('reading', 1),
  ('receptive', 1),
  ('regardless', 1),
  ('regret', 1),
  ('review', 1),
  ('screen', 1),
  ('sensitive', 1),
  ('service', 1),
  ('setting', 2),
  ('shipping', 2),
  ('specific', 2),
  ('spending', 1),
  ('thing', 1),
  ('third', 1),
  ('time', 3),
  ('touch', 1),
  ('tracking', 1),
  ('trouble', 1),
  ('use', 1),
  ('voyage', 3),
  ('week', 1),
  ('worry', 1)]]

# Building Topic Model 

We'll start with 6 topics (`num_topics=6`). 
The hyperparameter **alpha** affects the sparsity of the document-topic (**theta**) distributions; gensim's default is a symmetric prior of 1/`num_topics`, and here it is set explicitly to 0.1. 
Similarly, the hyperparameter **eta** can also be specified, which affects the topic-word distribution's sparsity.



In [18]:
#help(gensim.models.ldamodel.LdaModel)

In [19]:
# Build LDA model
# Trains on the bag-of-words corpus, using id2word to map token ids back to words.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=6,  # number of latent topics to fit
                                           random_state=100,  # fixed seed so the topics are reproducible
                                           update_every=1,  # NOTE(review): per gensim docs, 1 selects online (chunk-by-chunk) updates — confirm
                                           chunksize=1000,  # documents per training chunk
                                           passes=100,  # full passes over the corpus; the main driver of training time
                                           alpha=0.1,  # document-topic prior, set explicitly instead of gensim's default
                                           per_word_topics=True)  # also track per-word topic assignments

Topics found in the dataset.

In [20]:
# Print the top words of each topic (the model was built with num_topics=6)
pprint.pprint(lda_model.print_topics())
# Apply the trained model to the whole corpus; doc_lda is not used again in the cells shown here.
doc_lda = lda_model[corpus]

[(0,
  '0.080*"headphone" + 0.078*"nice" + 0.064*"ear" + 0.039*"product" + '
  '0.032*"bud" + 0.027*"sound" + 0.020*"earbud" + 0.019*"people" + '
  '0.018*"noise" + 0.017*"apple"'),
 (1,
  '0.041*"product" + 0.039*"bad" + 0.020*"money" + 0.018*"worth" + '
  '0.018*"item" + 0.017*"option" + 0.016*"review" + 0.015*"fire" + '
  '0.014*"first" + 0.013*"price"'),
 (2,
  '0.025*"day" + 0.024*"time" + 0.024*"month" + 0.022*"mic" + 0.020*"problem" '
  '+ 0.018*"work" + 0.018*"bad" + 0.017*"side" + 0.017*"button" + '
  '0.016*"thing"'),
 (3,
  '0.135*"good" + 0.074*"quality" + 0.060*"sound" + 0.058*"product" + '
  '0.051*"bass" + 0.030*"price" + 0.024*"battery" + 0.018*"awesome" + '
  '0.018*"nice" + 0.016*"backup"'),
 (4,
  '0.026*"kindle" + 0.020*"great" + 0.018*"screen" + 0.017*"new" + '
  '0.015*"book" + 0.015*"speaker" + 0.015*"tablet" + 0.011*"easy" + '
  '0.011*"fire" + 0.011*"music"'),
 (5,
  '0.027*"year" + 0.027*"device" + 0.023*"fire" + 0.020*"prime" + '
  '0.019*"video" + 0.014*"tv"

Evaluate the model using the coherence score (a value of about 0.5 is a basic baseline).

In [21]:
# Score the fitted topics with gensim's 'c_v' coherence measure
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=data_lemmatized,
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.5793929521504352


The **pyLDAvis** library provides an excellent interactive visualization of the topics.

In [22]:
# visualize the topics
# Enable inline rendering of pyLDAvis output in the notebook.
pyLDAvis.enable_notebook()
# Build the visualization data from the trained model, the bag-of-words corpus and the dictionary.
# NOTE(review): newer pyLDAvis releases appear to expose this module as `pyLDAvis.gensim_models`
# (see the commented-out import in the imports cell) — confirm against the installed version.
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
# Bare last expression so the notebook displays the prepared visualization.
vis