### Libraries

In [1]:
import pandas as pd
import numpy as np
import nltk
import seaborn as sns
nltk.download('tagsets')

[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\anand\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping help\tagsets.zip.


True

### Reading the data

In [2]:
df = pd.read_csv('K8 Reviews v0.2.csv')
df.head()

Unnamed: 0,sentiment,review
0,1,Good but need updates and improvements
1,0,"Worst mobile i have bought ever, Battery is dr..."
2,1,when I will get my 10% cash back.... its alrea...
3,1,Good
4,0,The worst phone everThey have changed the last...


In [4]:
## Checking the overall sentiment of all the reviews

In [3]:
df['sentiment'].value_counts()

0    7712
1    6963
Name: sentiment, dtype: int64

#### Converting everything into lower

In [7]:
reviews = list(df['review'].values)
reviews = [review.lower() for review in reviews]
print("Reviews in lower case:")
print (reviews[0])

Reviews in lower case:
good but need updates and improvements


### Tokenizing the reviews

In [9]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\anand\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [11]:
review_tokens = [nltk.word_tokenize(i) for i in reviews]
print ("Preview of the first review")
review_tokens[0]

Preview of the first review


['good', 'but', 'need', 'updates', 'and', 'improvements']

### Part of speech tagging

In [15]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\anand\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [17]:
pos_tagged = [nltk.pos_tag(review) for review in review_tokens]
print ("Words with tag") 
pos_tagged[0]

Words with tag


[('good', 'JJ'),
 ('but', 'CC'),
 ('need', 'VBP'),
 ('updates', 'NNS'),
 ('and', 'CC'),
 ('improvements', 'NNS')]

### We only take nouns for topic model

In [18]:
reviews_noun = []
for review in pos_tagged:
    reviews_noun.append([postag for postag in review if postag[1].startswith('NN')])

In [20]:
# taking only nouns
reviews_noun[0]

[('updates', 'NNS'), ('improvements', 'NNS')]

### Lemmatization

In [22]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\anand\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [23]:
lem = nltk.WordNetLemmatizer()
lem_reviews =[]
for  review in reviews_noun:
    lem_reviews.append([lem.lemmatize(postag[0]) for postag in review])

In [24]:
lem_reviews[0]

['update', 'improvement']

### Removing stopword if any

In [26]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\anand\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [37]:
from nltk.corpus import stopwords
from string import punctuation
clean_reviews = []
for review in lem_reviews:
    clean_reviews.append([word for word in review if word not in (stopwords.words('English') + list(punctuation))])

### Create a topic model using LDA on the cleaned-up data

In [34]:
#pip install gensim

In [31]:
# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel


In [38]:
# Create Dictionary
id2word = corpora.Dictionary(clean_reviews)
print ("Please preview the corpora dictionary")
print (id2word)

Please preview the corpora dictionary
Dictionary(9263 unique tokens: ['improvement', 'update', 'amazon', 'backup', 'battery']...)


In [44]:
# Create Corpus
texts = clean_reviews
clean_reviews[11]

['phone', 'charger', 'damage', 'month']

In [49]:
clean_reviews[1]

['mobile',
 'battery',
 'hell',
 'backup',
 'hour',
 'us',
 'idle',
 'discharged.this',
 'lie',
 'amazon',
 'lenove',
 'battery',
 'charger',
 'hour']

In [47]:
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]
print ("Please preview the document frequency")
print (corpus[1])

Please preview the document frequency
[(2, 1), (3, 1), (4, 2), (5, 1), (6, 1), (7, 1), (8, 2), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1)]


In [68]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=5, 
                                       random_state=42,
                                       passes=10,
                                       per_word_topics=True)

In [69]:
topics = lda_model.show_topics(formatted=False)

In [70]:
topics

[(0,
  [('battery', 0.09446416),
   ('phone', 0.059518974),
   ('mobile', 0.053178284),
   ('camera', 0.043486353),
   ('price', 0.043251663),
   ('backup', 0.025694003),
   ('range', 0.01674236),
   ('day', 0.014820264),
   ('performance', 0.014253314),
   ('issue', 0.012682653)]),
 (1,
  [('phone', 0.15663216),
   ('time', 0.03171112),
   ('issue', 0.024254732),
   ('service', 0.022964116),
   ('amazon', 0.022695575),
   ('day', 0.021630546),
   ('problem', 0.019731885),
   ('charger', 0.018641122),
   ('month', 0.01707269),
   ('mobile', 0.011383156)]),
 (2,
  [('camera', 0.0924362),
   ('phone', 0.080211505),
   ('quality', 0.04725482),
   ('..', 0.04316735),
   ('note', 0.028529173),
   ('performance', 0.022421695),
   ('feature', 0.018987993),
   ('battery', 0.017650802),
   ('sound', 0.013644137),
   ('k8', 0.011680217)]),
 (3,
  [('money', 0.050411116),
   ('network', 0.04562543),
   ('phone', 0.026861923),
   ('sim', 0.022047102),
   ('value', 0.018796293),
   ('hai', 0.018622

In [71]:
0.09446416 + 0.059518974+0.053178284+0.043486353+ 0.043251663+ 0.025694003+ 0.01674236+ 0.014820264+ 0.014253314+0.012682653

0.3780920280000001