## Mounting Google Drive

In [1]:
# Colab-only step: mount Google Drive at /content/drive so that the
# database file read later in the notebook is reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Loading Required Libraries

In [2]:
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

import sqlite3
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')
nltk.download('stopwords')
import re
import string
import matplotlib.pyplot as plt
import seaborn as sns

from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from gensim.corpora import Dictionary
from gensim.models import TfidfModel

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Reading Data

In [3]:
# Read the review table straight from the SQLite database on the mounted drive.
con = sqlite3.connect('/content/drive/MyDrive/Aazon_Review_Data/database.sqlite')

# Only positive and negative reviews are wanted, so neutral reviews
# (Score = 3) are excluded by the query itself.
df = pd.read_sql_query(sql = """ SELECT * FROM Reviews WHERE Score != 3""", con = con)
print(df.shape)
df.head()

(525814, 10)


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [4]:
#number of reviews for each score
df['Score'].value_counts()

5    363122
4     80655
1     52268
2     29769
Name: Score, dtype: int64

In [5]:
# Binary label: 1 (positive) when Score > 3, otherwise 0 (negative).
# NOTE: the column is overwritten, so this cell is not idempotent - running it
# a second time on the 0/1 labels would turn every label into 0.
df['Score'] = np.where(df['Score'].gt(3), 1, 0)
df.head(3)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,1,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,0,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,1,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...


## Data Cleaning

In [6]:
#Sorting data by ProductId, in place (ascending order, quicksort and NaNs last are the pandas defaults)
df.sort_values('ProductId', inplace = True)

In [7]:
#Deduplication: rows that share user, profile name, timestamp and text are
#treated as the same review and only the first occurrence is kept
df.drop_duplicates(subset = ["UserId", "ProfileName", "Time", "Text"], keep = 'first', inplace = True)
df.shape

(364173, 10)

<b>Insight:-</b> In some rows, HelpfulnessNumerator is greater than HelpfulnessDenominator, which is not practically possible, so these two rows are also removed from the calculations.
It is not possible for more people to have found a review useful than the number of people who have seen that review.

In [8]:
#keeping only records where HelpfulnessNumerator does not exceed HelpfulnessDenominator
df = df.loc[df['HelpfulnessNumerator'].le(df['HelpfulnessDenominator'])]
df.shape

(364171, 10)

In [9]:
#only the review text and its label are needed from here on
df = df.loc[:, ['Text', 'Score']]
df.head()

Unnamed: 0,Text,Score
138706,this witty little book makes my son laugh at l...,1
138688,"I grew up reading these Sendak books, and watc...",1
138689,This is a fun way for children to learn their ...,1
138690,This is a great little book to read aloud- it ...,1
138691,This is a book of poetry about the months of t...,1


In [10]:
#number of positive (1) and negative (0) reviews
df.loc[:, 'Score'].value_counts()

1    307061
0     57110
Name: Score, dtype: int64

## Text Preprocessing

In [11]:
#negation words: these are taken out of the stop-word list so that they
#survive stop-word removal
negative = ["no", "nor", "not", 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn',
          "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'mightn',
          "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't",
          'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't", 'don', "don't"]

#stopwords: the NLTK english list without the negation words above
stop_words = [word for word in stopwords.words('english') if word not in negative]

#stemming
stemmer = PorterStemmer()

#function to collapse every negation word into the single token 'not' to reduce the dimension of data
def replace_by_not(x):
    """Return 'not' when ``x`` is one of the words in ``negative``, else ``x`` unchanged."""
    return 'not' if x in negative else x

In [None]:
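#reference only (disabled): text preprocessing function with stemming for a regular NLP use case; it is kept inside a string literal so it is never executed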
'''
def preprocess(text):
    text = text.lower()                                                         #to lower case
    text = re.sub('http[s]?://\S+', ' ', text)                                  #removing urls
    text = re.sub('<[^<]+?>', ' ', text)                                        #removing html tags
    text = re.sub('\S*\d\S*', ' ', text)                                        #removing alphanumeric words
    text = re.sub('[^A-Za-z]+', ' ', text)                                      #removing special characters
    text = re.sub(r'\b[a-zA-Z]\b', ' ', text)                                   #removing single character (length) words
    text = re.sub('\s{2,}', ' ', text)                                          #removing multiple white spaces
    text = text.strip()                                                         #removing spaces from start & end of the text
    text_tokenized = word_tokenize(text)                                        #tokenization
    
    tokens = []
    for token in text_tokenized:                              
        if token not in stop_words:                                             #removing stopwords  
            token = replace_by_not(token)                                       #replacing negative words by "NOT" to reduce the dimension of data       
            token = stemmer.stem(token)                                         #stemming
            tokens.append(token)                           
    #return " ".join(tokens) 
'''

For word2vec, stemming and lemmatization are not advised, so we will not use them in our text pre-processing function.

In [13]:
#text preprocessing function
def preprocess(text):
    """Clean a raw review and return it as a list of tokens (no stemming).

    The text is lower-cased, stripped of HTML tags, URLs, words containing
    digits, non-letter characters and single-letter words, then tokenized.
    Stop words are dropped and every negation word is replaced by 'not'.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    list of str
        The cleaned tokens, in their original order.
    """
    text = text.lower()                                                         #to lower case
    # HTML tags are removed before URLs on purpose: in '<a href="http://...">word</a>'
    # the URL pattern (\S+) would otherwise also swallow '">word</a>' and leave a
    # dangling '<a href="' behind, which ends up as a junk 'href' token.
    text = re.sub(r'<[^<]+?>', ' ', text)                                       #removing html tags
    text = re.sub(r'http[s]?://\S+', ' ', text)                                 #removing urls
    text = re.sub(r'\S*\d\S*', ' ', text)                                       #removing alphanumeric words
    text = re.sub(r'[^A-Za-z]+', ' ', text)                                     #removing special characters
    text = re.sub(r'\b[a-zA-Z]\b', ' ', text)                                   #removing single character (length) words
    text = re.sub(r'\s{2,}', ' ', text)                                         #removing multiple white spaces
    text = text.strip()                                                         #removing spaces from start & end of the text

    tokens = []
    for token in word_tokenize(text):                                           #tokenization
        if token not in stop_words:                                             #removing stopwords
            tokens.append(replace_by_not(token))                                #replacing negative words by "not" to reduce the dimension of data
    return tokens                                                               #returns tokenized text as a list

In [14]:
# Replace every raw review by its token list.
# NOTE: the column is overwritten, so this cell cannot be re-run on its own output.
df['Text'] = df['Text'].apply(preprocess)
df.head()

Unnamed: 0,Text,Score
138706,"[witty, little, book, makes, son, laugh, loud,...",1
138688,"[grew, reading, sendak, books, watching, reall...",1
138689,"[fun, way, children, learn, months, year, lear...",1
138690,"[great, little, book, read, aloud, nice, rhyth...",1
138691,"[book, poetry, months, year, goes, month, cute...",1


In [15]:
#Extracting X (tokenized reviews) & y (binary labels) for Training
X, y = df['Text'], df['Score']

### TF-IDF

In [16]:
#tf-idf
# X already holds token lists, so the preprocessor and tokenizer are identity
# functions and the default token_pattern is switched off.
tfidf_vect = TfidfVectorizer(
    preprocessor = lambda doc: doc,
    tokenizer = lambda doc: doc,
    token_pattern = None,
)
tfidf_vect.fit(X)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2',
                preprocessor=<function <lambda> at 0x7f738a9e5b90>,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern=None,
                tokenizer=<function <lambda> at 0x7f738a9e5cb0>, use_idf=True,
                vocabulary=None)

In [17]:
#dictionary mapping each vocabulary word to its IDF weight (idf_ holds the
#inverse document frequency of a term, not a per-document tf-idf value)
#NOTE: get_feature_names() was removed in scikit-learn 1.2; newer versions need get_feature_names_out()
tfidf_dict = {term: idf for term, idf in zip(tfidf_vect.get_feature_names(), tfidf_vect.idf_)}

## Word2Vec

### Building Word2Vec Model

In [18]:
#word2vec model using gensim library
# 64-dimensional vectors, context window of 10 tokens, words seen fewer than
# 10 times are ignored; trained with 3 worker threads and no explicit seed.
word2vec = Word2Vec(
    sentences = X,
    size = 64,
    window = 10,
    min_count = 10,
    workers = 3,
)

### Vocabulary of Word2Vec Model

In [None]:
#vocabulary of the word2vec model; the document-vector functions below use it
#to test whether a token has a word vector
vocabulary = word2vec.wv.vocab

### Creating Vector Representation of a Word

In [20]:
#look up the learned vector of a single word
word2vec.wv.get_vector('dog')

array([ 0.7229124 , -3.4830492 ,  4.244896  ,  0.70068   ,  0.07882409,
       -2.6090617 , -1.3388602 ,  0.35897344, -3.4839792 , -1.2036916 ,
       -0.20898417,  3.1573386 ,  1.6266875 ,  2.5189881 ,  1.8964871 ,
       -6.4068894 ,  2.113712  ,  4.3559256 ,  1.5529048 , -2.9751668 ,
       -5.738968  ,  1.7389332 , -0.30703536, -4.9913597 ,  0.9612307 ,
        0.9580144 ,  3.6220937 ,  1.46552   ,  0.8251408 , -5.970856  ,
       -6.1367397 , -1.924054  , -3.1767282 ,  5.8738804 , -4.216028  ,
        0.36617166, -1.631965  ,  2.5191867 ,  5.2866163 ,  4.262648  ,
        1.418561  ,  3.0736923 , -0.32477742, -0.3761637 , -0.4259393 ,
        4.8278246 , -2.2038004 , -0.07478376, -1.202969  , -1.4905683 ,
        3.5475652 , -2.2692435 ,  1.3682714 ,  0.7110651 ,  0.04366476,
       -1.1621705 , -2.278641  ,  8.123048  ,  5.2455087 ,  1.7950321 ,
        1.8934386 ,  2.6983702 , -0.01424707,  1.9505115 ], dtype=float32)

### Similar Word

In [21]:
#ten nearest neighbours of 'dog' in the embedding space
word2vec.wv.most_similar(positive = ['dog'], topn = 10)

[('pup', 0.875423789024353),
 ('dogs', 0.8712745904922485),
 ('puppy', 0.8593417406082153),
 ('pooch', 0.8139731884002686),
 ('pups', 0.7978197336196899),
 ('gsd', 0.7693132162094116),
 ('yorkie', 0.7663488984107971),
 ('doggies', 0.7636624574661255),
 ('doggie', 0.7470970749855042),
 ('cat', 0.7420973181724548)]

In [22]:
#ten nearest neighbours of 'mango' in the embedding space
word2vec.wv.most_similar(positive = ['mango'], topn = 10)

[('pineapple', 0.8814773559570312),
 ('peach', 0.8462610840797424),
 ('melon', 0.8063310384750366),
 ('kiwi', 0.7611942291259766),
 ('honeydew', 0.7581152319908142),
 ('pear', 0.7445927858352661),
 ('pomegranate', 0.7336922883987427),
 ('watermelon', 0.732054591178894),
 ('tangerine', 0.729391872882843),
 ('berry', 0.7151153683662415)]

### Odd One Out

In [23]:
#finding the odd one out
# Queried through the keyed vectors (word2vec.wv) like every other lookup in
# this notebook; the model-level doesnt_match wrapper is deprecated in gensim 3
# and removed in gensim 4.
word2vec.wv.doesnt_match(["mango", "dog", "peach"])

'dog'

### Average of Word2Vec Vectors for Document Vector

In [24]:
#avg word2vec function
def avg_word2vec(x):
    """Return the plain average of the Word2Vec vectors of a document's tokens.

    Parameters
    ----------
    x : iterable of str
        Tokens of one document. Tokens that are not in ``vocabulary`` are
        skipped.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of all in-vocabulary tokens. When the
        document has no in-vocabulary token (empty review or only rare words)
        a zero vector of the model's dimensionality is returned, so every
        document gets a vector of the same shape instead of a scalar NaN.
    """
    vectors = [word2vec.wv[item] for item in x if item in vocabulary]

    if not vectors:
        # np.mean([]) would return a scalar nan; keep the output shape uniform
        return np.zeros(word2vec.wv.vector_size, dtype = np.float32)

    return np.mean(vectors, axis = 0)            #average of all available vectors

In [25]:
#one averaged word2vec document vector per review
df['avg_word2vec'] = df['Text'].apply(avg_word2vec)
df.head()

Unnamed: 0,Text,Score,avg_word2vec
138706,"[witty, little, book, makes, son, laugh, loud,...",1,"[0.7342949, 0.30224502, -0.05032634, 0.4428327..."
138688,"[grew, reading, sendak, books, watching, reall...",1,"[0.7397334, 0.7307607, 0.5771166, -0.9406805, ..."
138689,"[fun, way, children, learn, months, year, lear...",1,"[0.86246383, 0.059025608, 1.039951, -0.3798154..."
138690,"[great, little, book, read, aloud, nice, rhyth...",1,"[0.5747234, 0.5307194, 0.32028013, -1.3253409,..."
138691,"[book, poetry, months, year, goes, month, cute...",1,"[0.56683445, 0.6151396, 0.89789563, 0.1481977,..."


### Average of Word2Vec Vectors with TF-IDF for Document Vector

In [26]:
def tfidf_word2vec(x):
    """Return the IDF-weighted average of the Word2Vec vectors of a document's tokens.

    Every in-vocabulary token occurrence contributes its word vector scaled by
    the token's weight from ``tfidf_dict``; the sum is divided by the total
    weight. Because repeated tokens contribute once per occurrence, a token's
    effective weight is its count in the document times its ``tfidf_dict`` value.

    Parameters
    ----------
    x : iterable of str
        Tokens of one document. Tokens that are not in ``vocabulary`` are
        skipped.

    Returns
    -------
    numpy.ndarray
        The weighted average vector. When the document has no in-vocabulary
        token a zero vector of the model's dimensionality is returned instead
        of the scalar NaN that a 0/0 division would produce.
    """
    vectors = []
    weight_sum = 0
    for item in x:
        if item in vocabulary:                                    #checking if word is in word2vec vocabulary
            weight = tfidf_dict.get(item)                         #weight of the word from tfidf_dict
            vectors.append(word2vec.wv[item] * weight)            #word vector multiplied by its weight
            weight_sum += weight                                  #sum of all weights as denominator

    if not vectors:
        # nothing to average: avoid 0/0 and keep the output shape uniform
        return np.zeros(word2vec.wv.vector_size, dtype = np.float32)

    return np.sum(vectors, axis = 0)/weight_sum                   #weighted average of all word vectors

In [27]:
#one tf-idf weighted word2vec document vector per review
df['tfidf_word2vec'] = df['Text'].apply(tfidf_word2vec)
df.head()

Unnamed: 0,Text,Score,avg_word2vec,tfidf_word2vec
138706,"[witty, little, book, makes, son, laugh, loud,...",1,"[0.7342949, 0.30224502, -0.05032634, 0.4428327...","[0.5169127, 0.3292227, -0.1576288, 0.6622015, ..."
138688,"[grew, reading, sendak, books, watching, reall...",1,"[0.7397334, 0.7307607, 0.5771166, -0.9406805, ...","[0.62105983, 0.62610567, 0.40871078, -0.538037..."
138689,"[fun, way, children, learn, months, year, lear...",1,"[0.86246383, 0.059025608, 1.039951, -0.3798154...","[0.5883172, 0.086780876, 0.85090166, -0.059307..."
138690,"[great, little, book, read, aloud, nice, rhyth...",1,"[0.5747234, 0.5307194, 0.32028013, -1.3253409,...","[0.32820088, 0.32443893, 0.10845523, -0.725164..."
138691,"[book, poetry, months, year, goes, month, cute...",1,"[0.56683445, 0.6151396, 0.89789563, 0.1481977,...","[0.3901421, 0.6132452, 0.65380883, 0.4601233, ..."
