<h2 id="Latent-Dirichlet-Allocation(LDA)-for-Topic-Modelling" style="text-align: center;">Latent Dirichlet Allocation(LDA) for Topic Modelling</h2>
<h2 id="
"></h2>
<h3 id="Using-the-Amazon-fine-food-reviews-dataset" style="text-align: center;">Using the Amazon fine food reviews dataset</h3>
<h5 id="Link-:-https://www.kaggle.com/snap/amazon-fine-food-reviews-" style="text-align: center;">Link : https://www.kaggle.com/snap/amazon-fine-food-reviews</h5>
<p style="text-align: center;">..........................................................................................................</p>
<h6 id="For-performing-LDA-based-topic-modelling-,--I-will-be-using-the-gensim-package-for-LDA-topic-modelling-&amp;--pyLDAvis-for-visualization-of-LDA-topic-model" style="text-align: center;">For performing LDA based topic modelling , I will be using the gensim package for LDA topic modelling &amp; pyLDAvis for visualization of LDA topic model</h6>

### Import Libraries

In [1]:
import pandas as pd
import numpy as np

import re
import string

import spacy

import gensim
from gensim import corpora

# libraries for visualization
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### Reading the data

In [2]:
review_data= pd.read_csv("Reviews.csv")

# print(review_data.head(2))

review_data.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [3]:
print("Length of the data is :" , len(review_data))

print('Unique Products : ' , len(review_data.groupby('ProductId')))


print('Unique Users : ', len(review_data.groupby('UserId')))


Length of the data is : 568454
Unique Products :  74258
Unique Users :  256059


### Cleaning Text

In [4]:
def clean_text(text ): 
    delete_dict = {sp_character: '' for sp_character in string.punctuation} 
    delete_dict[' '] = ' ' 
    table = str.maketrans(delete_dict)
    text1 = text.translate(table)
    #print('cleaned:'+text1)
    textArr= text1.split()
    text2 = ' '.join([w for w in textArr if ( not w.isdigit() and  ( not w.isdigit() and len(w)>3))]) 
    
    return text2.lower()


In [5]:
import nltk
# nltk.download('stopwords') 

In [6]:
review_data.dropna(axis = 0, how ='any',inplace=True) 


review_data['Text'] = review_data['Text'].apply(clean_text)
review_data['Num_words_text'] = review_data['Text'].apply(lambda x:len(str(x).split())) 

print('-------Dataset --------')

print(review_data['Score'].value_counts())
print(len(review_data))

print('-------------------------')
max_review_data_sentence_length  = review_data['Num_words_text'].max()

mask = (review_data['Num_words_text'] < 100) & (review_data['Num_words_text'] >=20)
df_short_reviews = review_data[mask]
df_sampled = df_short_reviews.groupby('Score').apply(lambda x: x.sample(n=20000)).reset_index(drop = True)

print('No of Short reviews')
print(len(df_short_reviews))


-------Dataset --------
Score
5    363102
4     80654
1     52264
3     42638
2     29743
Name: count, dtype: int64
568401
-------------------------
No of Short reviews
373279


### Pre-Process the Text

In [8]:
from nltk.corpus import stopwords

stop_words = stopwords.words('english')

# function to remove stopwords
def remove_stopwords(text):
    textArr = text.split(' ')
    rem_text = " ".join([i for i in textArr if i not in stop_words])
    return rem_text

# remove stopwords from the text
df_sampled['Text']=df_sampled['Text'].apply(remove_stopwords)


### Lemmetization

In [9]:
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

def lemmatization(texts,allowed_postags=['NOUN', 'ADJ']): 
       output = []
       for sent in texts:
             doc = nlp(sent) 
             output.append([token.lemma_ for token in doc if token.pos_ in allowed_postags ])
       return output


In [10]:
text_list=df_sampled['Text'].tolist()
print(text_list[1])

print('-'*60)

tokenized_reviews = lemmatization(text_list)
print(tokenized_reviews[1])

wasnt cracklin like rocklin every full rocks pork rinds literally chipped tooth eating avoid itits worth vending machine quality
------------------------------------------------------------
['full', 'rock', 'pork', 'rind', 'tooth', 'avoid', 'itit', 'worth', 'machine', 'quality']


### Create vocabulary dictionary and document term matrix

In [11]:
dictionary = corpora.Dictionary(tokenized_reviews)

doc_term_matrix = [dictionary.doc2bow(rev) for rev in tokenized_reviews]


### Creating the object for LDA model using gensim library


In [None]:
LDA = gensim.models.ldamodel.LdaModel

# Build LDA model
lda_model = LDA(corpus=doc_term_matrix, 
                id2word=dictionary, 
                num_topics=10, 
                random_state=100,
                chunksize=1000, 
                passes=50,
                iterations=100)

In [12]:
lda_model.print_topics()

[(0,
  '0.044*"price" + 0.042*"store" + 0.032*"product" + 0.029*"great" + 0.028*"good" + 0.024*"amazon" + 0.022*"order" + 0.019*"time" + 0.018*"local" + 0.015*"year"'),
 (1,
  '0.069*"vanilla" + 0.047*"morning" + 0.030*"yummy" + 0.030*"french" + 0.029*"pill" + 0.029*"seed" + 0.023*"combination" + 0.019*"espresso" + 0.016*"salad" + 0.015*"nuts"'),
 (2,
  '0.060*"chocolate" + 0.049*"chip" + 0.042*"good" + 0.023*"taste" + 0.023*"sweet" + 0.022*"bar" + 0.021*"flavor" + 0.021*"great" + 0.019*"smooth" + 0.018*"protein"'),
 (3,
  '0.030*"time" + 0.028*"cookie" + 0.026*"good" + 0.015*"little" + 0.013*"great" + 0.013*"bread" + 0.012*"piece" + 0.011*"work" + 0.010*"thing" + 0.010*"minute"'),
 (4,
  '0.066*"product" + 0.016*"bottle" + 0.013*"great" + 0.012*"wheat" + 0.011*"fiber" + 0.011*"flour" + 0.011*"plastic" + 0.011*"free" + 0.010*"container" + 0.010*"recipe"'),
 (5,
  '0.070*"snack" + 0.047*"salt" + 0.037*"candy" + 0.037*"butter" + 0.031*"great" + 0.025*"peanut" + 0.024*"popcorn" + 0.023*"c

### Visualize the topics

In [13]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, doc_term_matrix, dictionary)
vis

###  measuring how good the model is. lower the better.

In [14]:
print('\nPerplexity: ', lda_model.log_perplexity(doc_term_matrix,total_docs=10000))


Perplexity:  -8.899764404994013


### Computing Coherence Score

In [None]:
from gensim.models.coherencemodel import CoherenceModel

coherence_model_lda = CoherenceModel(model=lda_model, texts=tokenized_reviews, dictionary=dictionary , coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()

print('\nCoherence Score: ', coherence_lda)

#### Model perplexity and topic coherence provide a convenient measure to judge how good a given topic model is. In my experience, topic coherence score, in particular, has been more helpful.

### Method to find optimal number of topics 

In [None]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    coherence_values = []
    model_list = []
    
    for num_topics in range(start, limit, step):
        model = gensim.models.ldamodel.LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary)
        model_list.append(model)
        
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [None]:
model_list, coherence_values = compute_coherence_values(dictionary=dictionary, corpus=doc_term_matrix, texts=tokenized_reviews, start=2, limit=50, step=1)

### Visualizatoin with Graph

In [None]:
limit=50; start=2; step=1;
x = range(start, limit, step)

plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(("coherence_values"), loc='best')

# Print the coherence scores
plt.show()



### Printing the coherence scores

In [None]:
for m, cv in zip(x, coherence_values):
    print("Num Topics =", m, " has Coherence Value of", round(cv, 4))


### Selecting the model and printing the topics


In [None]:
optimal_model = model_list[7]

model_topics = optimal_model.show_topics(formatted=False)

optimal_model.print_topics(num_words=10)

### Visualize the topics

In [None]:
pyLDAvis.enable_notebook()

vis = pyLDAvis.gensim.prepare(optimal_model, doc_term_matrix, dictionary)

vis

### Saving to PDF

In [None]:
!jupyter nbconvert --to webpdf --allow-chromium-download Topic_Modeling(LDA).ipynb