## Mounting Google Drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Loading Required Libraries

In [2]:
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

import sqlite3
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')
nltk.download('stopwords')
import re
import string
import matplotlib.pyplot as plt
import seaborn as sns

from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.models import LdaMulticore

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Reading Data

In [3]:
# using the SQLite Table to read data.
con = sqlite3.connect(F'/content/drive/MyDrive/Aazon_Review_Data/database.sqlite') 

#filtering only positive and negative reviews i.e. ignoring neutral reviews with Score = 3
df = pd.read_sql_query(""" SELECT * FROM Reviews WHERE Score != 3""", con)
print(df.shape)
df.head()

(525814, 10)


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [4]:
#proprtion of review scores
df.Score.value_counts()

5    363122
4     80655
1     52268
2     29769
Name: Score, dtype: int64

In [5]:
# Give reviews with Score>3 a positive rating, and reviews with a score<3 a negative rating.
df['Score'] = np.where(df['Score'] > 3, 1, 0)
df.head(3)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,1,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,0,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,1,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...


## Data Cleaning

In [6]:
#Sorting data according to ProductId in ascending order
df.sort_values('ProductId', axis = 0, ascending = True, inplace = True, kind = 'quicksort', na_position = 'last')

In [7]:
#Deduplication of entries
df.drop_duplicates(subset={"UserId","ProfileName","Time","Text"}, keep ='first', inplace = True)
df.shape

(364173, 10)

<b>Insight:-</b> In some rows, value of HelpfulnessNumerator can be greater than HelpfulnessDenominator which is not practically possible hence these two rows too are removed from calcualtions.
It is not possible that more number of people found these reviews useful compared to number of people who have seen this review.

In [8]:
#removing records with HelpfulnessNumerator > HelpfulnessDenominator
df = df[df.HelpfulnessNumerator <= df.HelpfulnessDenominator]
df.shape

(364171, 10)

In [9]:
#selecting required columns
df = df[['Text', 'Score']]
df.head()

Unnamed: 0,Text,Score
138706,this witty little book makes my son laugh at l...,1
138688,"I grew up reading these Sendak books, and watc...",1
138689,This is a fun way for children to learn their ...,1
138690,This is a great little book to read aloud- it ...,1
138691,This is a book of poetry about the months of t...,1


In [10]:
#Distribution of +ve and -ve Reviews
df['Score'].value_counts()

1    307061
0     57110
Name: Score, dtype: int64

## Text Preprocessing

### Converting to Lower Case

In [11]:
#converting text reviews to lower case
df['Text'] = df['Text'].str.lower()
df['Text'].head()

138706    this witty little book makes my son laugh at l...
138688    i grew up reading these sendak books, and watc...
138689    this is a fun way for children to learn their ...
138690    this is a great little book to read aloud- it ...
138691    this is a book of poetry about the months of t...
Name: Text, dtype: object

### Remove URLS

In [12]:
#remove urls from text reviews
df['Text'] = df['Text'].apply(lambda x : re.sub('http[s]?://\S+', ' ', x))
df['Text'].head()

138706    this witty little book makes my son laugh at l...
138688    i grew up reading these sendak books, and watc...
138689    this is a fun way for children to learn their ...
138690    this is a great little book to read aloud- it ...
138691    this is a book of poetry about the months of t...
Name: Text, dtype: object

### Remove HTML Tags

In [13]:
#removing html tags
df['Text'] = df['Text'].apply(lambda x: re.sub('<[^<]+?>', ' ', x))
df['Text'][5857]

'you will love this jerky if you are a beer drinker. recommend this product for sure for them nights around the campfires!'

### Remove Alphanumeric Words

In [14]:
#remove words with numbers python
df['Text'] = df['Text'].apply(lambda x: re.sub('\S*\d\S*', ' ', x))
df['Text'][5857]

'you will love this jerky if you are a beer drinker. recommend this product for sure for them nights around the campfires!'

### Remove Special Characters

In [15]:
#remove special character
df['Text'] = df['Text'].apply(lambda x: re.sub('[^A-Za-z]+', ' ', x))
df['Text'][5857]

'you will love this jerky if you are a beer drinker recommend this product for sure for them nights around the campfires '

### Remove Single Length Words

In [16]:
#removing single character words
df['Text'] = df['Text'].apply(lambda x: re.sub(r'\b[a-zA-Z]\b', ' ', x))
df['Text'][5857]

'you will love this jerky if you are   beer drinker recommend this product for sure for them nights around the campfires '

### Remove Multiple White Spaces

In [17]:
#replace multi white spaces
df['Text'] = df['Text'].apply(lambda x: re.sub('\s{2,}', ' ', x))
df['Text'][5857]

'you will love this jerky if you are beer drinker recommend this product for sure for them nights around the campfires '

#### Remove Spaces from Start & End of Text

In [18]:
#removing spaces from start and end of text reviews
df['Text'] = df['Text'].str.strip()  #lstrip for start and rstrip for end
df['Text'].head()

138706    this witty little book makes my son laugh at l...
138688    grew up reading these sendak books and watchin...
138689    this is fun way for children to learn their mo...
138690    this is great little book to read aloud it has...
138691    this is book of poetry about the months of the...
Name: Text, dtype: object

### Word Tokenization

In [19]:
#word tokenization
df['Text'] = df['Text'].apply(lambda x: word_tokenize(x)) 
df['Text'].head()

138706    [this, witty, little, book, makes, my, son, la...
138688    [grew, up, reading, these, sendak, books, and,...
138689    [this, is, fun, way, for, children, to, learn,...
138690    [this, is, great, little, book, to, read, alou...
138691    [this, is, book, of, poetry, about, the, month...
Name: Text, dtype: object

### Stop Word Removal

In [20]:
#stopwords 
stop_words = stopwords.words('english')  #to remove stopwords
negative = ["no", "nor", "not", 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn',
          "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'mightn',
          "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't",
          'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't", 'don', "don't"]
stop_words = [x for x in stop_words if x not in negative]
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', '

In [21]:
#stopword removal
df['Text'] = df['Text'].apply(lambda x: [item for item in x if item not in stop_words])
df['Text'].head()

138706    [witty, little, book, makes, son, laugh, loud,...
138688    [grew, reading, sendak, books, watching, reall...
138689    [fun, way, children, learn, months, year, lear...
138690    [great, little, book, read, aloud, nice, rhyth...
138691    [book, poetry, months, year, goes, month, cute...
Name: Text, dtype: object

### Replacing Negative Words by "Not"

In [22]:
#replace negative words by not to reduce the dimension of data
def replace_by_not(x, y):
    for i, n in enumerate(x):
        if n in y:
            x[i] = 'not'
    return x

df['Text'] = df['Text'].apply(lambda x: replace_by_not(x, negative))
df['Text'].head()

138706    [witty, little, book, makes, son, laugh, loud,...
138688    [grew, reading, sendak, books, watching, reall...
138689    [fun, way, children, learn, months, year, lear...
138690    [great, little, book, read, aloud, nice, rhyth...
138691    [book, poetry, months, year, goes, month, cute...
Name: Text, dtype: object

### Stemming

In [23]:
#stemming
stemmer = PorterStemmer()  #for stemming

df['Text'] = df['Text'].apply(lambda x: [stemmer.stem(item) for item in x])
df['Text'].head()

138706    [witti, littl, book, make, son, laugh, loud, r...
138688    [grew, read, sendak, book, watch, realli, rosi...
138689    [fun, way, children, learn, month, year, learn...
138690    [great, littl, book, read, aloud, nice, rhythm...
138691    [book, poetri, month, year, goe, month, cute, ...
Name: Text, dtype: object

In [24]:
#Extracting X & y for Training
X = df['Text']
y = df['Score']

## Featurization - Bag of Words, TF-IDF

### Bag of Word: Uni-gram (Count)

In [25]:
#creating bag of words vector
bow = Dictionary(X)
bow.filter_extremes(no_below = 20, no_above = 0.9, keep_n = 5000)
bow_corpus = [bow.doc2bow(doc) for doc in X]

### TF-IDF

In [26]:
#creating tf-idf vector
tfidf = TfidfModel(bow_corpus)
tfidf_corpus = tfidf[bow_corpus]

## LDA on Bag of Words

In [30]:
#building lda model on tf-idf vector
lda_bow = LdaMulticore(bow_corpus, num_topics = 10, id2word = bow, passes = 5, workers = 3, random_state = 1)

In [31]:
#topics from model
for idx, topic in lda_bow.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.045*"not" + 0.021*"packag" + 0.020*"order" + 0.020*"product" + 0.018*"bag" + 0.018*"box" + 0.012*"would" + 0.012*"one" + 0.011*"receiv" + 0.010*"open"
Topic: 1 
Words: 0.046*"price" + 0.035*"store" + 0.034*"amazon" + 0.025*"buy" + 0.023*"great" + 0.022*"product" + 0.021*"find" + 0.019*"not" + 0.017*"local" + 0.014*"order"
Topic: 2 
Words: 0.070*"coffe" + 0.045*"tea" + 0.035*"cup" + 0.023*"flavor" + 0.022*"not" + 0.019*"drink" + 0.016*"tast" + 0.013*"like" + 0.010*"good" + 0.010*"tri"
Topic: 3 
Words: 0.029*"love" + 0.017*"not" + 0.014*"year" + 0.012*"tri" + 0.011*"one" + 0.010*"friend" + 0.009*"time" + 0.008*"candi" + 0.008*"order" + 0.008*"get"
Topic: 4 
Words: 0.032*"chocol" + 0.026*"cooki" + 0.023*"bar" + 0.023*"not" + 0.023*"sugar" + 0.016*"use" + 0.015*"free" + 0.014*"mix" + 0.014*"butter" + 0.013*"peanut"
Topic: 5 
Words: 0.043*"food" + 0.036*"dog" + 0.029*"not" + 0.020*"treat" + 0.020*"cat" + 0.017*"love" + 0.014*"eat" + 0.012*"like" + 0.011*"one" + 0.010*"che

## LDA on TF-IDF

In [32]:
#building lda model on tf-idf vector
lda_tfidf = LdaMulticore(tfidf_corpus, num_topics = 10, id2word = bow, passes = 5, workers = 3, random_state = 1)

In [33]:
#topics from model
for idx, topic in lda_tfidf.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.011*"price" + 0.011*"order" + 0.011*"amazon" + 0.009*"product" + 0.009*"store" + 0.009*"ship" + 0.007*"buy" + 0.007*"box" + 0.006*"find" + 0.006*"purchas"
Topic: 1 
Words: 0.010*"candi" + 0.009*"gift" + 0.008*"keurig" + 0.008*"cooki" + 0.007*"love" + 0.007*"chocol" + 0.006*"order" + 0.005*"great" + 0.004*"bought" + 0.004*"box"
Topic: 2 
Words: 0.036*"tea" + 0.015*"drink" + 0.009*"flavor" + 0.008*"tast" + 0.007*"water" + 0.007*"ginger" + 0.006*"like" + 0.006*"energi" + 0.006*"green" + 0.006*"not"
Topic: 3 
Words: 0.014*"sauc" + 0.007*"chicken" + 0.006*"use" + 0.006*"soup" + 0.006*"flavor" + 0.006*"salt" + 0.006*"noodl" + 0.006*"cook" + 0.006*"chees" + 0.006*"oil"
Topic: 4 
Words: 0.009*"coconut" + 0.009*"tast" + 0.009*"water" + 0.008*"sugar" + 0.008*"flavor" + 0.007*"drink" + 0.007*"chocol" + 0.006*"like" + 0.006*"syrup" + 0.006*"sweet"
Topic: 5 
Words: 0.012*"bar" + 0.012*"snack" + 0.009*"cereal" + 0.009*"peanut" + 0.008*"cooki" + 0.008*"chocol" + 0.007*"butter" + 0.

## Checking Performance on Unseen Data

### Performance on BoW LDA

In [34]:
#predicting topic on unseen bow vector
unseen_document = ['dog', 'food', 'not', 'taste', 'great']
bow_vector = bow.doc2bow(unseen_document)
for index, score in sorted(lda_bow[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_bow.print_topic(index, 5)))

Score: 0.8199504613876343	 Topic: 0.043*"food" + 0.036*"dog" + 0.029*"not" + 0.020*"treat" + 0.020*"cat"
Score: 0.02001105062663555	 Topic: 0.046*"price" + 0.035*"store" + 0.034*"amazon" + 0.025*"buy" + 0.023*"great"
Score: 0.020007941871881485	 Topic: 0.025*"not" + 0.021*"snack" + 0.018*"eat" + 0.016*"like" + 0.016*"tast"
Score: 0.020005034282803535	 Topic: 0.070*"coffe" + 0.045*"tea" + 0.035*"cup" + 0.023*"flavor" + 0.022*"not"
Score: 0.02000500075519085	 Topic: 0.072*"not" + 0.047*"tast" + 0.044*"like" + 0.035*"flavor" + 0.021*"tri"
Score: 0.020004939287900925	 Topic: 0.024*"use" + 0.022*"sauc" + 0.016*"cook" + 0.016*"make" + 0.012*"flavor"
Score: 0.02000434324145317	 Topic: 0.029*"love" + 0.017*"not" + 0.014*"year" + 0.012*"tri" + 0.011*"one"
Score: 0.020004283636808395	 Topic: 0.032*"chocol" + 0.026*"cooki" + 0.023*"bar" + 0.023*"not" + 0.023*"sugar"
Score: 0.020003588870167732	 Topic: 0.045*"not" + 0.021*"packag" + 0.020*"order" + 0.020*"product" + 0.018*"bag"
Score: 0.0200033485

By 82% probability, this sentence is assigned topic related to "Domestic Animal Products"

### Performance on TF-IDF LDA

In [35]:
#predicting topic on unseen tf-idf vector
unseen_document = ['dog', 'food', 'not', 'taste', 'great']
bow_vector = bow.doc2bow(unseen_document)
tfidf_vector = tfidf[bow_vector]
for index, score in sorted(lda_tfidf[tfidf_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_tfidf.print_topic(index, 5)))

Score: 0.6783929467201233	 Topic: 0.027*"dog" + 0.017*"food" + 0.016*"cat" + 0.014*"treat" + 0.009*"love"
Score: 0.035735245794057846	 Topic: 0.012*"bar" + 0.012*"snack" + 0.009*"cereal" + 0.009*"peanut" + 0.008*"cooki"
Score: 0.03573523461818695	 Topic: 0.011*"price" + 0.011*"order" + 0.011*"amazon" + 0.009*"product" + 0.009*"store"
Score: 0.03573481738567352	 Topic: 0.015*"gluten" + 0.013*"bread" + 0.011*"free" + 0.010*"flour" + 0.010*"pasta"
Score: 0.03573479503393173	 Topic: 0.054*"coffe" + 0.024*"cup" + 0.011*"flavor" + 0.009*"roast" + 0.008*"brew"
Score: 0.03573440760374069	 Topic: 0.014*"sauc" + 0.007*"chicken" + 0.006*"use" + 0.006*"soup" + 0.006*"flavor"
Score: 0.03573383390903473	 Topic: 0.010*"candi" + 0.009*"gift" + 0.008*"keurig" + 0.008*"cooki" + 0.007*"love"
Score: 0.03573360666632652	 Topic: 0.036*"tea" + 0.015*"drink" + 0.009*"flavor" + 0.008*"tast" + 0.007*"water"
Score: 0.03573327884078026	 Topic: 0.009*"coconut" + 0.009*"tast" + 0.009*"water" + 0.008*"sugar" + 0.008

By 68% probability, this sentence is assigned topic related to "Domestic Animal Products"