# Topic Modeling using LDA

In [78]:
import pandas as pd
import spacy
import en_core_web_sm
import gensim
from gensim import corpora
import pyLDAvis
import pyLDAvis.gensim
import nltk
from nltk import FreqDist


### Read in the data

In [44]:
# The file is in JSON Lines format (one review object per line), hence lines=True.
df = pd.read_json(path_or_buf='Automotive_5.json', lines=True)

# Show the first five rows to confirm the columns loaded as expected.
df.head(n=5)

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A3F73SC1LY51OO,B00002243X,Alan Montgomery,"[4, 4]",I needed a set of jumper cables for my new car...,5,Work Well - Should Have Bought Longer Ones,1313539200,"08 17, 2011"
1,A20S66SKYXULG2,B00002243X,alphonse,"[1, 1]","These long cables work fine for my truck, but ...",4,Okay long cables,1315094400,"09 4, 2011"
2,A2I8LFSN2IS5EO,B00002243X,Chris,"[0, 0]",Can't comment much on these since they have no...,5,Looks and feels heavy Duty,1374710400,"07 25, 2013"
3,A3GT2EWQSO45ZG,B00002243X,DeusEx,"[19, 19]",I absolutley love Amazon!!! For the price of ...,5,Excellent choice for Jumper Cables!!!,1292889600,"12 21, 2010"
4,A3ESWJPAVRPWB4,B00002243X,E. Hernandez,"[0, 0]",I purchased the 12' feet long cable set and th...,5,"Excellent, High Quality Starter Cables",1341360000,"07 4, 2012"


Check how many reviews there are

In [45]:
# One row per review, so the row count is the review count.
print("The number of reviews in the file are :", len(df))

The number of reviews in the file are : 20473


### Basic NLP analysis

Load the English NLP model from SpaCy

In [58]:
# Load the small English pipeline by its package name.
nlp = spacy.load("en_core_web_sm")

Tokenize each review, remove stop words, and keep the lemma of every remaining token

In [60]:
def valid_token(token):
    """Decide whether a spaCy token should be kept for topic modeling.

    NOTE(review): this helper was called below but never defined in the
    notebook, so a fresh-kernel run stopped with a NameError. The rule here is
    a reconstruction from the stated intent (remove stop words) -- confirm it
    against the original definition if that is still available.

    Parameters
    ----------
    token : spacy.tokens.Token
        A token produced by the spaCy pipeline.

    Returns
    -------
    bool
        True when the token is purely alphabetic and is not a stop word.
    """
    return token.is_alpha and not token.is_stop


# Review bodies as a plain Python list, ready to feed to the pipeline.
text = df['reviewText'].tolist()

# One list of lemmas per review, in the same order as `text`.
all_tokens = []

# Run every review through the spaCy pipeline and keep the lemmas of the
# tokens that pass the filter.
for doc in nlp.pipe(text):
    tokens = [token.lemma_ for token in doc if valid_token(token)]
    all_tokens.append(tokens)

In [69]:
# Sanity check: show the lemma lists produced for the last two reviews.
all_tokens[-2:]

[['mask',
  'course',
  'describe',
  'half',
  'face',
  'long',
  'protection',
  'neck',
  'great',
  'cut',
  'wind',
  'jacket',
  'tuck'],
 ['good',
  'light',
  'weight',
  'cool',
  'night',
  'rain',
  'half',
  'helmet',
  'like',
  'wear',
  'light']]

### Topic Modeling using LDA (Latent Dirichlet Allocation)

Create a dictionary of every term in the corpus

In [70]:
# Build the term dictionary from the tokenized reviews.
dictionary = corpora.Dictionary(documents=all_tokens)

Convert the list of tokenized reviews into a Document-Term-Matrix

In [73]:
# Convert each tokenized review to its bag-of-words form via the dictionary.
doc_term_matrix = list(map(dictionary.doc2bow, all_tokens))

Create the LDA model

In [85]:
# Short alias for gensim's LDA implementation.
LDA = gensim.models.ldamodel.LdaModel

# Fit a 5-topic model on the bag-of-words corpus. random_state is fixed so
# repeated runs start from the same seed.
LDA_model = LDA(
    corpus=doc_term_matrix,
    id2word=dictionary,
    num_topics=5,
    passes=50,
    chunksize=1000,
    random_state=100,
)

In [86]:
# Show each topic as a weighted combination of its top terms.
LDA_model.print_topics()

[(0,
  '0.067*"light" + 0.025*"bulb" + 0.019*"bright" + 0.014*"look" + 0.011*"led" + 0.009*"white" + 0.008*"instal" + 0.008*"color" + 0.007*"Jeep" + 0.007*"replace"'),
 (1,
  '0.026*"car" + 0.019*"product" + 0.016*"clean" + 0.015*"use" + 0.013*"towel" + 0.012*"good" + 0.012*"like" + 0.010*"work" + 0.010*"wash" + 0.009*"water"'),
 (2,
  '0.019*"battery" + 0.019*"car" + 0.012*"use" + 0.011*"power" + 0.011*"charge" + 0.010*"plug" + 0.010*"oil" + 0.009*"work" + 0.008*"device" + 0.008*"filter"'),
 (3,
  '0.012*"use" + 0.010*"work" + 0.009*"easy" + 0.009*"need" + 0.009*"small" + 0.009*"fit" + 0.007*"like" + 0.007*"place" + 0.007*"cover" + 0.007*"hold"'),
 (4,
  '0.016*"work" + 0.015*"blade" + 0.014*"good" + 0.012*"wiper" + 0.011*"buy" + 0.010*"price" + 0.010*"great" + 0.009*"tire" + 0.008*"easy" + 0.008*"like"')]

### Visualize the LDA model

In [87]:
# Make pyLDAvis render its output inline in the notebook.
pyLDAvis.enable_notebook()

# Prepare the visualization from the fitted model, the corpus and the term
# dictionary; as the cell's last expression it is displayed directly.
pyLDAvis.gensim.prepare(
    topic_model=LDA_model,
    corpus=doc_term_matrix,
    dictionary=dictionary,
)