# Topic Modeling

In [5]:
import os
os.environ['KAGGLE_CONFIG_DIR'] = '/content'
!kaggle datasets download -d snap/amazon-fine-food-reviews

Downloading amazon-fine-food-reviews.zip to /content
 97% 235M/242M [00:01<00:00, 118MB/s]
100% 242M/242M [00:01<00:00, 160MB/s]


In [6]:
!unzip amazon-fine-food-reviews.zip

Archive:  amazon-fine-food-reviews.zip
  inflating: Reviews.csv             
  inflating: database.sqlite         
  inflating: hashes.txt              


In [None]:
import pandas as pd
import numpy as np
import nltk
nltk.download()
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from gensim import corpora, models
from gensim.models import CoherenceModel

In [12]:
#!pip install pyLDAvis
import pyLDAvis.gensim_models

In [13]:
data = pd.read_csv('Reviews.csv')

In [14]:
data.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [15]:
np.random.seed(51)
df = data[['Text']].sample(n = 10005)
test = df[10000:10005]
df = df[:10000]

In [16]:
data.shape, df.shape

((568454, 10), (10000, 1))

In [17]:
df = df.reset_index(drop=True)
test = test.reset_index(drop=True)


## Preprocessing

In [18]:
def lemmatize_stemming(text):
    stemmer = SnowballStemmer('english')
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [19]:
doc_sample = df['Text'][0]

print('original text: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized text: ')
print(preprocess(doc_sample))

original text: 
['DESCRIPTION:<br', '/>>', 'Blue', 'Diamond', 'Roasted-Salted', 'Almonds', 'are', 'moderately', 'roasted', 'with', 'canola,', 'safflower,', 'or', 'sunflower', 'oil', '(but', 'not', 'oily', 'or', 'greasy),', 'and', 'lightly', 'salted.', '', 'These', 'are', '"heart', 'healthy"', 'oils', 'which', 'are', 'actually', 'good', 'for', 'you,', 'as', 'is', 'the', 'almond', 'oil', 'itself.', '', 'Roasted', 'Salted', 'Almonds', 'are', 'available', 'in', 'bags', 'or', 'small', 'cans', 'small.<br', '/>>', 'Blue', 'Diamond', 'Smokehouse', 'Almonds', 'are', 'similar', 'with', 'wonderful', 'smokey', 'flavor.', '', 'At', 'this', 'time', 'Smokehouse', 'Almonds', 'are', 'available', 'from', 'Amazon', 'only', 'in', '16', 'oz', '(and', 'smaller)', 'bags.<br', '/>>', 'I', 'have', 'not', 'tried', 'the', '"lightly', 'salted"', 'variety', 'but', 'I', 'presume', 'that', 'they', 'have', 'almost', 'no', 'salt.<br', '/><br', '/>CANS', 'vs', 'BAGS<br', '/>Various', 'types', 'of', 'Blue', 'Diamond', '

In [20]:
processed_text = df['Text'].map(preprocess)

In [21]:
processed_text[:10]

0    [descript, blue, diamond, roast, salt, almond,...
1    [postum, discontinu, look, substitut, come, re...
2    [order, pack, shelton, turkey, jerki, great, a...
3        [buy, freez, dri, chicken, treat, year, love]
4    [husband, natur, peanut, butter, want, complai...
5    [work, heat, microwav, cheap, tasti, lunch, co...
6    [wait, amazon, start, carri, french, fajita, s...
7    [crazi, thing, tast, super, sweet, thing, deli...
8    [expect, pleasant, surpris, empti, pouch, saut...
9    [prefer, brand, green, lipton, avail, groceri,...
Name: Text, dtype: object

## Bag of words on the dataset

In [22]:
dictionary = gensim.corpora.Dictionary(processed_text)

In [23]:
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 actual
1 add
2 afterward
3 almond
4 amazon
5 arguabl
6 artifici
7 assur
8 avail
9 bag
10 bake


In [24]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [25]:
bow_corpus = [dictionary.doc2bow(text) for text in processed_text]
bow_corpus[10]

[(0, 1),
 (38, 1),
 (70, 1),
 (85, 1),
 (95, 1),
 (116, 1),
 (124, 1),
 (158, 1),
 (197, 1),
 (199, 1),
 (200, 1),
 (212, 1),
 (221, 1),
 (222, 1),
 (223, 1),
 (224, 1),
 (225, 1),
 (226, 1),
 (227, 1),
 (228, 1),
 (229, 1),
 (230, 1),
 (231, 2),
 (232, 1)]

In [26]:
bow_doc_10 = bow_corpus[10]

for i in range(len(bow_doc_10)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_10[i][0], 
                                                     dictionary[bow_doc_10[i][0]], 
                                                     bow_doc_10[i][1]))

Word 0 ("actual") appears 1 time.
Word 38 ("fresh") appears 1 time.
Word 70 ("product") appears 1 time.
Word 85 ("tast") appears 1 time.
Word 95 ("come") appears 1 time.
Word 116 ("tasti") appears 1 time.
Word 124 ("love") appears 1 time.
Word 158 ("want") appears 1 time.
Word 197 ("delici") appears 1 time.
Word 199 ("expect") appears 1 time.
Word 200 ("jasmin") appears 1 time.
Word 212 ("green") appears 1 time.
Word 221 ("aroma") appears 1 time.
Word 222 ("bottl") appears 1 time.
Word 223 ("caffein") appears 1 time.
Word 224 ("categori") appears 1 time.
Word 225 ("comment") appears 1 time.
Word 226 ("correct") appears 1 time.
Word 227 ("despit") appears 1 time.
Word 228 ("earlier") appears 1 time.
Word 229 ("need") appears 1 time.
Word 230 ("remain") appears 1 time.
Word 231 ("tea") appears 2 time.
Word 232 ("unsweeten") appears 1 time.


## TF-IDF

In [27]:
tfidf = models.TfidfModel(bow_corpus)

In [28]:
corpus_tfidf = tfidf[bow_corpus]

In [29]:
from pprint import pprint

for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.022046237481401464),
 (1, 0.02212010095287557),
 (2, 0.7990672278008627),
 (3, 0.04234319605788876),
 (4, 0.030180786324129273),
 (5, 0.04506328315275939),
 (6, 0.10469868783323924),
 (7, 0.1177918964281708),
 (8, 0.028310384927122085),
 (9, 0.01570652035458812),
 (10, 0.04164407798136751),
 (11, 0.10251572594613813),
 (12, 0.09865149176461999),
 (13, 0.026463498246191975),
 (14, 0.08328815596273502),
 (15, 0.13258625616030392),
 (16, 0.043833075763101595),
 (17, 0.02707753095114022),
 (18, 0.07636582372488611),
 (19, 0.03901747862210049),
 (20, 0.02987661987624866),
 (21, 0.04544633587056744),
 (22, 0.054330748908750295),
 (23, 0.03755311577002567),
 (24, 0.029186362066006555),
 (25, 0.043028918377835616),
 (26, 0.026021778029643713),
 (27, 0.03493859865525314),
 (28, 0.12143668923780415),
 (29, 0.03429078549488982),
 (30, 0.02184327627051476),
 (31, 0.030406006518590413),
 (32, 0.04740841667090137),
 (33, 0.04506328315275939),
 (34, 0.04253779466840864),
 (35, 0.02385298362510

## LDA

### LDA using Bag of Words

In [None]:
lda_model_bow = gensim.models.ldamodel.LdaModel(bow_corpus, num_topics=10, id2word=dictionary, passes=4)

In [31]:
for idx, topic in lda_model_bow.print_topics(-1):
    print('Topic: {} \nWords: {} \n\n'.format(idx, topic))

Topic: 0 
Words: 0.069*"tast" + 0.043*"chip" + 0.032*"flavor" + 0.027*"like" + 0.026*"coconut" + 0.024*"good" + 0.014*"brand" + 0.013*"pasta" + 0.012*"tri" + 0.012*"bag" 


Topic: 1 
Words: 0.037*"chocol" + 0.015*"love" + 0.015*"store" + 0.015*"flavor" + 0.015*"candi" + 0.014*"tast" + 0.013*"amazon" + 0.013*"buy" + 0.012*"local" + 0.011*"like" 


Topic: 2 
Words: 0.032*"tast" + 0.023*"like" + 0.022*"water" + 0.021*"flavor" + 0.020*"drink" + 0.013*"good" + 0.012*"product" + 0.011*"sugar" + 0.008*"sauc" + 0.008*"sweet" 


Topic: 3 
Words: 0.062*"food" + 0.021*"ingredi" + 0.014*"natur" + 0.012*"product" + 0.011*"organ" + 0.011*"protein" + 0.010*"healthi" + 0.010*"diet" + 0.010*"high" + 0.009*"health" 


Topic: 4 
Words: 0.052*"great" + 0.041*"love" + 0.019*"flavor" + 0.018*"tast" + 0.017*"good" + 0.016*"product" + 0.013*"price" + 0.012*"salt" + 0.011*"amazon" + 0.011*"time" 


Topic: 5 
Words: 0.035*"product" + 0.024*"order" + 0.023*"amazon" + 0.015*"time" + 0.014*"packag" + 0.014*"ship" 

Top 10 keywords that contribute to Topic 0 are: "tast", "chip", "flavor".. and so on

#### Coherence Score

In [32]:
coherence_model_bow = CoherenceModel(model=lda_model_bow, texts=processed_text, dictionary=dictionary, coherence='c_v')
coherence_bow = coherence_model_bow.get_coherence()
print('\nCoherence Score: ', coherence_bow)


Coherence Score:  0.44752474208157017


In [33]:
#### Coherence Score

#### Visualize the topics-keywords

In [34]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model_bow, bow_corpus, dictionary)
vis

  default_term_info = default_term_info.sort_values(


Each bubble on the left-hand side plot represents a topic. The larger the bubble, the more prevalent is that topic.

A good topic model will have fairly big, non-overlapping bubbles scattered throughout the chart instead of being clustered in one quadrant.

If you move the cursor over one of the bubbles, the words and bars on the right-hand side will update. These words are the salient keywords that form the selected topic.

### LDA using TF-IDF

In [None]:
lda_model_tfidf = gensim.models.ldamodel.LdaModel(corpus_tfidf, num_topics=10, id2word=dictionary, passes=4)

In [36]:
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} \nWord: {} \n\n'.format(idx, topic))

Topic: 0 
Word: 0.008*"product" + 0.007*"store" + 0.007*"food" + 0.006*"price" + 0.006*"amazon" + 0.006*"good" + 0.005*"great" + 0.005*"order" + 0.005*"bread" + 0.005*"free" 


Topic: 1 
Word: 0.009*"cracker" + 0.009*"great" + 0.008*"chocol" + 0.008*"tast" + 0.008*"love" + 0.007*"cooki" + 0.007*"snack" + 0.007*"almond" + 0.007*"good" + 0.006*"flavor" 


Topic: 2 
Word: 0.021*"treat" + 0.013*"dog" + 0.010*"love" + 0.009*"chew" + 0.007*"food" + 0.006*"product" + 0.006*"smell" + 0.005*"time" + 0.005*"great" + 0.005*"teeth" 


Topic: 3 
Word: 0.040*"coffe" + 0.012*"flavor" + 0.011*"drink" + 0.010*"tast" + 0.009*"strong" + 0.009*"cup" + 0.009*"brew" + 0.008*"like" + 0.008*"bitter" + 0.007*"blend" 


Topic: 4 
Word: 0.013*"chip" + 0.013*"snack" + 0.011*"ginger" + 0.010*"mint" + 0.010*"flavor" + 0.009*"cooki" + 0.008*"chocol" + 0.008*"sweet" + 0.008*"bar" + 0.008*"great" 


Topic: 5 
Word: 0.019*"food" + 0.017*"cat" + 0.010*"order" + 0.009*"fast" + 0.009*"servic" + 0.008*"product" + 0.008*"de

**Coherence Score**


In [37]:
coherence_model_tfidf = CoherenceModel(model=lda_model_tfidf, texts=processed_text, dictionary=dictionary, coherence='c_v')
coherence_tfidf = coherence_model_tfidf.get_coherence()
print('\nCoherence Score: ', coherence_tfidf)


Coherence Score:  0.4283271142535156


**Visualize the topics-keywords**


In [38]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model_tfidf, corpus_tfidf, dictionary)
vis

  default_term_info = default_term_info.sort_values(


### Testing model on unseen document

In [39]:
unseen_document = test['Text'][0]
bow_vector = dictionary.doc2bow(preprocess(unseen_document))

for index, score in sorted(lda_model_bow[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model_bow.print_topic(index, 5)))

Score: 0.4686657786369324	 Topic: 0.052*"great" + 0.041*"love" + 0.019*"flavor" + 0.018*"tast" + 0.017*"good"
Score: 0.28577226400375366	 Topic: 0.062*"food" + 0.021*"ingredi" + 0.014*"natur" + 0.012*"product" + 0.011*"organ"
Score: 0.21055391430854797	 Topic: 0.035*"like" + 0.028*"cooki" + 0.027*"tast" + 0.022*"good" + 0.018*"snack"
