In [1]:
import numpy as np
import pandas as pd
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import spacy
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
import warnings
%matplotlib inline
warnings.filterwarnings("ignore")

In [2]:
# Load the review dataset; lines=True makes pandas read the file as one JSON record per line.
df = pd.read_json('reviews_Cell_Phones_and_Accessories_5.json', lines=True)

In [3]:
# Preview the first rows to see which columns were loaded
df.head()

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,120401325X,"[0, 0]",4,They look good and stick good! I just don't li...,"05 21, 2014",A30TL5EWN6DFXT,christina,Looks Good,1400630400
1,120401325X,"[0, 0]",5,These stickers work like the review says they ...,"01 14, 2014",ASY55RVNIL0UD,emily l.,Really great product.,1389657600
2,120401325X,"[0, 0]",5,These are awesome and make my phone look so st...,"06 26, 2014",A2TMXE2AFO7ONB,Erica,LOVE LOVE LOVE,1403740800
3,120401325X,"[4, 4]",4,Item arrived in great time and was in perfect ...,"10 21, 2013",AWJ0WZQYMYFQ4,JM,Cute!,1382313600
4,120401325X,"[2, 3]",5,"awesome! stays on, and looks great. can be use...","02 3, 2013",ATX7CZYFXI1KW,patrice m rogoza,leopard home button sticker for iphone 4s,1359849600


<b>From the text analysis that we did for "sentiment_analysis_ml" and "sentiment_analysis_ml", we already know the nature of the text, so we will now proceed directly with text processing and build a model for topic extraction.</b>

In [4]:
# Keep only the columns needed for topic modelling by discarding the
# product/reviewer metadata listed below.
df = df.drop(
    columns=[
        'asin', 'helpful', 'reviewTime', 'reviewerID',
        'reviewerName', 'summary', 'unixReviewTime',
    ],
)

### Let's Preprocess the text

In [5]:
# #Removing punctuation from the text

# punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\×™√²—–&'

# def fix_punc(sentence, punct):
#     for p in punct:
#         sentence = sentence.replace(p, '')
#     return sentence

In [6]:
# df['reviewText'] = df['reviewText'].apply(lambda x: fix_punc(x, punct))

In [7]:
# Flag reviews made up solely of whitespace characters.
# Note: str.isspace() is False for an empty string, so empty reviews are not flagged.
df['issapce'] = df['reviewText'].map(lambda review: review.isspace())

In [8]:
# Tally how many reviews were flagged as whitespace-only versus not
df['issapce'].value_counts()

False    194439
Name: issapce, dtype: int64

In [9]:
# Convert to list
import re

# Only the first 500 reviews are used from here on
data = df['reviewText'].head(500).values.tolist()

# Removing Emails: drop any token containing '@' plus one trailing whitespace character.
# Raw strings keep the regex escapes from being read as (invalid) string escapes.
data = [re.sub(r'\S*@\S*\s?', '', sent) for sent in data]

# Collapse every run of whitespace (new lines included) into a single space
data = [re.sub(r'\s+', ' ', sent) for sent in data]

# Remove single quotes (a literal replacement, so no regex is needed)
data = [sent.replace("'", "") for sent in data]

In [10]:
# Show the first five cleaned reviews to sanity-check the regex cleanup
print(data[:5])

['They look good and stick good! I just dont like the rounded shape because I was always bumping it and Siri kept popping up and it was irritating. I just wont buy a product like this again', 'These stickers work like the review says they do. They stick on great and they stay on the phone. They are super stylish and I can share them with my sister. :)', 'These are awesome and make my phone look so stylish! I have only used one so far and have had it on for almost a year! CAN YOU BELIEVE THAT! ONE YEAR!! Great quality!', 'Item arrived in great time and was in perfect condition. However, I ordered these buttons because they were a great deal and included a FREE screen protector. I never received one. Though its not a big deal, it wouldve been nice to get it since they claim it comes with one.', 'awesome! stays on, and looks great. can be used on multiple apple products. especially having nails, it helps to have an elevated key.']


<b>Tokenizing the corpus</b>

In [11]:
def tokenize(sentences):
    """Lazily convert each sentence into a list of word tokens.

    Every item is coerced to ``str`` and passed to gensim's
    ``simple_preprocess``. Punctuation is dropped by the tokenizer itself;
    ``deacc=True`` additionally strips accent marks from the tokens.

    Args:
        sentences: Iterable of sentences (any objects convertible with ``str``).

    Yields:
        One token list per input sentence, in input order.
    """
    yield from (
        gensim.utils.simple_preprocess(str(text), deacc=True)
        for text in sentences
    )

# Materialize the lazy tokenizer output: one token list per review
data_words = [tokens for tokens in tokenize(data)]

In [12]:
# Inspect the tokens of the first review
data_words[:1]

[['they',
  'look',
  'good',
  'and',
  'stick',
  'good',
  'just',
  'dont',
  'like',
  'the',
  'rounded',
  'shape',
  'because',
  'was',
  'always',
  'bumping',
  'it',
  'and',
  'siri',
  'kept',
  'popping',
  'up',
  'and',
  'it',
  'was',
  'irritating',
  'just',
  'wont',
  'buy',
  'product',
  'like',
  'this',
  'again']]

#### Let's move ahead by creating a Bi-gram model

This model will find two words frequently occurring together in the document.

In [13]:
# Train a phrase (bigram) detector on the tokenized reviews.
# min_count and threshold control how selective the detection is
# (presumably: rarer pairs are ignored and a higher threshold yields fewer phrases; confirm in the gensim Phrases docs).
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)

# Faster way to get a sentence clubbed as bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)

<b>Time to remove stopwords, lemmatize the corpus and prepare bigrams</b>

In [14]:
import nltk
# One-time setup: uncomment the next line if the NLTK stopwords corpus is not available locally.
# nltk.download('stopwords')
from nltk.corpus import stopwords
# English stopword list from NLTK; read by remove_stopwords()
stop_words = stopwords.words('english')

def remove_stopwords(texts):
    """Drop stopwords from every document in ``texts``.

    Each document is converted with ``str()`` and re-tokenized by
    ``simple_preprocess`` before filtering (the notebook passes token lists,
    whose string form is tokenized again).

    Args:
        texts: Iterable of documents.

    Returns:
        A list holding one token list per document, without any token that
        appears in the module-level ``stop_words``.
    """
    # Build the lookup once: set membership is O(1), whereas testing against
    # the stop_words list rescans it for every single token.
    stopword_set = frozenset(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stopword_set]
        for doc in texts
    ]

def make_bigrams(texts):
    """Apply the module-level ``bigram_mod`` phrase model to every document.

    Args:
        texts: Iterable of token lists.

    Returns:
        A list with the result of ``bigram_mod[doc]`` for each document,
        in input order.
    """
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram_mod[tokens])
    return merged_docs

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize each document, keeping only tokens with an allowed POS tag.

    Every token list is joined with single spaces and run through the
    module-level ``nlp`` pipeline, which must be defined before this
    function is called.

    Args:
        texts: Iterable of token lists.
        allowed_postags: Collection of ``token.pos_`` values to keep. The
            default is an immutable tuple so it cannot be altered between
            calls; lists passed by callers work as before.

    Returns:
        A list with one list of lemmas per input document.
    """
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [15]:
# Drop stopwords, then apply the trained bigram model to the remaining tokens
data_words_nostops = remove_stopwords(data_words)

data_words_bigrams = make_bigrams(data_words_nostops)

# Load spaCy's small English pipeline by its full package name: the 'en'
# shortcut link was removed in spaCy 3 and no longer resolves there.
# The parser and NER components are disabled because only POS tags and lemmas are used.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

<b>The above series of steps is very time consuming, so we save the lemmatized data to a pickle file; in case of any failure we don't have to run these steps again.</b>

In [16]:
# import pickle

# file = open('lemma', 'wb')
# pickle.dump(data_lemmatized, file)                      
# file.close() 

In [17]:
# with open("lemma", "rb") as input_file:
#     data_lemmatized = pickle.load(input_file)

In [18]:
# Inspect the lemmas kept for the first review
data_lemmatized[:1]

[['look',
  'good',
  'stick',
  'good',
  'do',
  'not',
  'like',
  'rounded',
  'shape',
  'always',
  'bump',
  'siri',
  'keep',
  'pop',
  'irritate',
  'not',
  'buy',
  'product']]

Let's build the vocab from corpus and create Term Document Frequency

In [19]:
# Create Dictionary
# Build the vocabulary (gensim Dictionary) from the lemmatized documents
id2word = corpora.Dictionary(data_lemmatized)
texts = data_lemmatized

# Term Document Frequency: encode every document with the dictionary's doc2bow
corpus = list(map(id2word.doc2bow, texts))

### Time to build a model for performing Topic Modelling

<b> We will be using an LDA model for topic modelling.
    LDA uses the <a href="https://en.wikipedia.org/wiki/Dirichlet_distribution"> __Dirichlet Distribution__</a> to
    find the latent topics in the documents.
    Details regarding how the algorithm performs topic modelling can be found at this <a href="http://www.jmlr.org/papers/volume3/blei03a/blei03a.pdf">link</a>.</b>

In [20]:
# Build LDA model
# Build LDA model: 10 topics, fitted on the bag-of-words corpus with the
# dictionary as the id -> word mapping. random_state is fixed to 17.
# NOTE(review): presumably passes/chunksize/update_every govern how often and in
# what batch size the corpus is revisited, and alpha='auto' lets gensim learn the
# document-topic prior; confirm against the gensim LdaModel documentation.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=10, 
                                           random_state=17,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

#### Very computationally expensive: training took around 2 hours on the local CPU

In [21]:
# file = open('lda', 'wb')

# pickle.dump(lda_model, file)                      
# file.close() 

In [22]:
# with open("lda", "rb") as input_file:
#     lda_model = pickle.load(input_file)

In [23]:
# List every topic as an (id, weighted-keyword string) pair
list(lda_model.print_topics())

[(0,
  '0.043*"want" + 0.041*"give" + 0.039*"design" + 0.032*"color" + 0.031*"tablet" + 0.027*"receive" + 0.027*"small" + 0.020*"star" + 0.016*"item" + 0.015*"excellent"'),
 (1,
  '0.126*"charge" + 0.066*"device" + 0.060*"charger" + 0.041*"power" + 0.031*"review" + 0.027*"plug" + 0.021*"light" + 0.021*"car" + 0.018*"ipad" + 0.016*"samsung"'),
 (2,
  '0.044*"use" + 0.042*"not" + 0.034*"phone" + 0.029*"get" + 0.028*"battery" + 0.026*"work" + 0.025*"good" + 0.025*"do" + 0.022*"well" + 0.019*"be"'),
 (3,
  '0.132*"iphone" + 0.124*"product" + 0.045*"note" + 0.035*"mount" + 0.035*"fast" + 0.021*"picture" + 0.019*"everything" + 0.015*"holder" + 0.014*"bulk" + 0.013*"pad"'),
 (4,
  '0.097*"case" + 0.051*"phone" + 0.041*"great" + 0.033*"look" + 0.033*"screen" + 0.027*"fit" + 0.023*"easy" + 0.022*"love" + 0.020*"feel" + 0.019*"nice"'),
 (5,
  '0.118*"cable" + 0.103*"usb" + 0.089*"port" + 0.032*"apple" + 0.024*"wall" + 0.023*"connect" + 0.022*"sample" + 0.021*"micro" + 0.018*"simple" + 0.015*"lig

<b>Interpreting topics:</b><br>
* Topic 0 is represented by '0.043*"want" + 0.041*"give" + 0.039*"design" + 0.032*"color" + 0.031*"tablet" + 0.027*"receive" + 0.027*"small" + 0.020*"star" + 0.016*"item" + 0.015*"excellent"'<br>
    
__It means the top 10 keywords that contribute to this topic are: ‘want’, ‘give’, ‘design’.. and so on and the weight of ‘want’ on topic 0 is 0.043.__

<b>Let's check the coherence score; it is a measure used to judge how good the model is.</b>
    
*    Interpretation of Coherence Score is present in this StackOverflow post <a href="https://stackoverflow.com/questions/54762690/coherence-score-0-4-is-good-or-bad">Link</a>

In [24]:
# Score the fitted topics with the 'c_v' coherence measure, computed from the
# lemmatized texts and the dictionary built earlier
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.40262278606849095


<b>Note: Scope of Improvement:</b><br>
* The score is not bad, but it indicates that the number of topics chosen is not optimal; we can experiment further with the number of topics to obtain a better coherence score.<br>
* Tri-grams and other n-gram models can also be experimented with.

<a href="https://en.wikipedia.org/wiki/Elbow_method_(clustering)">__Elbow Method__ </a>can be used to find the right number of topics.

### Let's Visualize our Topics

(Warning: Long Running Process)

In [25]:
# Enable pyLDAvis output inside the notebook
pyLDAvis.enable_notebook()
# Prepare the visualization from the fitted model, the bag-of-words corpus and the dictionary.
# NOTE(review): newer pyLDAvis releases appear to expose this module as `pyLDAvis.gensim_models`; confirm the installed version.
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
# Bare expression so the notebook displays the result
vis

<b>The visualization shows the distribution of the different topics, along with the most frequent words that form those topics.</b>

Thanks for Reading!! The End.