# Amazon Fine Food Reviews Analysis

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
import nltk
import string
import matplotlib.pyplot as plt
import seaborn as sns
import os

os.chdir("/kaggle/input/amazon-reviews/")
from dataset import *
from utils import *
os.chdir("/kaggle/working/")

# Modules for handling text data
from sklearn.feature_extraction.text import TfidfTransformer , TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer 
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer 
from nltk.stem.wordnet import WordNetLemmatizer
#Evaluation Metrics
from sklearn.metrics import auc , roc_curve , confusion_matrix


# 1.Loading the Data

In [2]:
sqldatapath = "../input/amazon-fine-food-reviews/database.sqlite"

filter_data= SQLdata(sqldatapath)

In [3]:
filter_data.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


# 2. Data Prepocessing
* Converting the **Score Columm** in to Positive ( score >3 ) and Negative ( score <3 )

In [4]:
actualScore = filter_data['Score']
positiveNegative = actualScore.map(partition) 
filter_data['Score'] = positiveNegative

# 3. Data Cleaning: Deduplication
There are lot of entries which are duplicate and should be removed. 

In [5]:
filter_data[filter_data['UserId']=='AZY10LLTJ71NX']

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
32313,35174,B001ATMQK2,AZY10LLTJ71NX,"undertheshrine ""undertheshrine""",1,1,Positive,1296691200,Have you seen how much Ranch 99 is trying to c...,I bought this 6 pack because for the price tha...
306745,332195,B001P7AXXG,AZY10LLTJ71NX,"undertheshrine ""undertheshrine""",1,1,Positive,1303776000,BEST MICROWAVE POPCORN EVER!!!!,This popcorn is probably the best microwave po...
307530,333057,B000MYW2ZA,AZY10LLTJ71NX,"undertheshrine ""undertheshrine""",0,0,Positive,1334707200,works for me. lost 10-15 pounds my first month,I was recommended to try green tea extract to ...
314733,340773,B0043CVIBG,AZY10LLTJ71NX,"undertheshrine ""undertheshrine""",4,4,Positive,1303776000,girl scout thin mint in disguise,if you love thin mint cookies that the girl sc...
374253,404703,B006P7E5ZI,AZY10LLTJ71NX,"undertheshrine ""undertheshrine""",0,0,Positive,1334707200,works for me. lost 10-15 pounds my first month,I was recommended to try green tea extract to ...


* In the above example we see that **UserID , ProfileName , Time , Summary , Text** are same. This is because the user has given review of a product which has different varieties. For exampple : A user has reviewed a potato chips of one flavour but the the chips has different flavour on the same page.

In [6]:
# Sorting the data by Product ID 
sorted_data = filter_data.sort_values('ProductId' , ascending=True , axis=0 , inplace=False)

# Dropping the duplicates
final = sorted_data.drop_duplicates(subset={'UserId' , 'ProfileName', 'Time' , 'Text'} , inplace=False , keep='first')

**Observation**:- It was also seen that in two rows given below the value of HelpfulnessNumerator is greater than HelpfulnessDenominator which is not practically possible hence these two rows too are removed from calcualtions

In [7]:
filter_data[filter_data['HelpfulnessNumerator']> filter_data['HelpfulnessDenominator']]

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
41159,44737,B001EQ55RW,A2V0I904FH7ABY,Ram,3,2,Positive,1212883200,Pure cocoa taste with crunchy almonds inside,It was almost a 'love at first bite' - the per...
59301,64422,B000MIDROQ,A161DK06JJMCYF,"J. E. Stephens ""Jeanne""",3,1,Positive,1224892800,Bought This for My Son at College,My son loves spaghetti so I didn't hesitate or...


In [8]:
final = final[final['HelpfulnessNumerator']< final['HelpfulnessDenominator']]

In [9]:
print('The final shape of the data is',final.shape)
print('The count of positive and negative reviews \n', final['Score'].value_counts())

The final shape of the data is (68271, 10)
The count of positive and negative reviews 
 Positive    40027
Negative    28244
Name: Score, dtype: int64


# 4. Text Processing: Stemming , Stop Words removal , Lemmatization

Hence in the Preprocessing phase we do the following in the order below:-

1. Begin by removing the html tags
2. Remove any punctuations or limited set of special characters like , or . or # etc.
3. Check if the word is made up of english letters and is not alpha-numeric
4. Check to see if the length of the word is greater than 2 (as it was researched that there is no adjective in 2-letters)
5. Convert the word to lowercase
6. Remove Stopwords
7. Finally Snowball Stemming the word (it was obsereved to be better than Porter Stemming)

In [10]:
# Get all the stopwords from English Language
stop = set(stopwords.words('english'))

# Initialize the Snowball stemming
snow = SnowballStemmer('english')

In [11]:
print(stop)

{'same', 'couldn', 'too', 'of', 'will', 'after', 'again', 'him', 'now', 'mightn', 'did', 'yourselves', 'mustn', 'no', "it's", 'its', 'then', 'wouldn', 'won', "wouldn't", 'before', 'own', 'here', 'don', 'been', 'yourself', 'while', 'few', 'them', 'that', "shouldn't", 'why', 'does', 'd', 's', 'these', 'hasn', 'any', 'where', "should've", 'between', 'other', 'some', 'her', 'ain', 'the', 'be', 'all', 'at', 'which', 'it', 'if', 'with', 'off', 'didn', 'there', 'has', 'had', 'an', "couldn't", 'needn', 'so', 't', 'just', 'wasn', 'should', 'shan', 'myself', 'above', 'out', 'shouldn', 'more', 'below', 'our', "mightn't", "you've", 'being', 'about', 've', 'll', 'but', 'i', 'hadn', "hasn't", "weren't", "you're", 'those', 'or', 'doesn', 'each', 'from', 'because', 'were', "you'll", 'by', 'his', "aren't", "you'd", "haven't", 'over', 'hers', 'y', 'for', 'she', 'you', 'most', 'me', 'we', 'what', 'themselves', 'further', 'is', "that'll", 'ma', 'he', 'himself', 'in', 'do', 'than', "wasn't", 'o', 'when', "

In [12]:
print(snow.stem('tasty'))

tasti


In [13]:
i =0
str1 = ' '
final_string = []
all_pos_words = []
all_neg_words = []
s = " "
import time
from tqdm import tqdm
start = time.time()

for sent in tqdm(final['Text'].values):
    filtered_sent =[]
    sent = cleanhtml(sent)  # remove HTML tags
    for w in sent.split():  
        for clean_words in cleanpunc(w).split():
            if ((clean_words.isalpha()) & (len(clean_words)>2)):
                if (clean_words.lower() not in stop):
                    s = (snow.stem(clean_words.lower())).encode('utf8')
                    filtered_sent.append(s)   # storing all filterd words 
                    if (final['Score'].values[i] == 'Positive'):
                        all_pos_words.append(s)  # storing all pos words
                    if (final['Score'].values[i] == 'Negative'):
                        all_neg_words.append(s)  # storing all neg words
                else:
                    continue
            else:
                continue
    str1 = b" ".join(filtered_sent)
    final_string.append(str1)
    
    i+=1

end = time.time()

print('The total time to run the cell: ', (end-start))

100%|██████████| 68271/68271 [01:47<00:00, 634.79it/s]

The total time to run the cell:  107.55213165283203





In [14]:
final['CleanedText'] = final_string

In [15]:
final.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,CleanedText
138691,150509,6641040,A3CMRKGE0P909G,Teresa,3,4,Positive,1018396800,A great way to learn the months,This is a book of poetry about the months of t...,b'book poetri month year goe month cute littl ...
138707,150525,6641040,A2QID6VCFTY51R,Rick,1,2,Positive,1025481600,"In December it will be, my snowman's anniversa...","My daughter loves all the ""Really Rosie"" books...",b'daughter love realli rosi book introduc real...
138708,150526,6641040,A3E9QZFE9KXH8J,R. Mitchell,11,18,Negative,1129507200,awesome book poor size,This is one of the best children's books ever ...,b'one best children book ever written mini ver...
138709,150529,6641040,A25ACLV5KPB4W,"Matt Hetling ""Matt""",0,1,Positive,1108425600,"Nice cadence, catchy rhymes",In June<br />I saw a charming group<br />of ro...,b'june saw charm group rose begin droop pep ch...
138676,150493,6641040,AMX0PJKV4PPNJ,"E. R. Bird ""Ramseelbird""",71,72,Positive,1096416000,Read it once. Read it twice. Reading Chicken S...,"These days, when a person says, ""chicken soup""...",b'day person say chicken soup probabl go follo...


In [16]:
# conn = sqlite3.connect('/kaggle/working/final.sqlite')
# c= conn.cursor()
# conn.text_factory = str
# final.to_sql('Amazon Reviews' , conn , if_exists='replace')

# 5. Bag of Words
* n this model, a text (such as a sentence or a document) is represented as the bag (multiset) of its words, disregarding grammar and even word order but keeping multiplicity.

In [17]:
count_vect = CountVectorizer()
final_count = count_vect.fit_transform(final['Text'].values)

In [18]:
print("The type of vectors", type(final_count))
print('THE shape of the vector', final_count.get_shape())

The type of vectors <class 'scipy.sparse.csr.csr_matrix'>
THE shape of the vector (68271, 59870)


# 6.Bi-Grams and n-grams

* an n-gram is a contiguous sequence of n items from a given sample of text or speech. The items can be phonemes, syllables, letters, words or base pairs according to the application. The n-grams typically are collected from a text or speech corpus. When the items are words, n-grams may also be called shingles

In [19]:
#computing frequently occuring words in positive rewiew and negative rewiew
freq_pos = nltk.FreqDist(all_pos_words)
freq_neg = nltk.FreqDist(all_neg_words)

print("Most Common positive words: " , freq_pos.most_common(20))
print("\nMost Common positive words: " , freq_neg.most_common(20))

Most Common positive words:  [(b'like', 20581), (b'tast', 19019), (b'good', 16179), (b'use', 15308), (b'flavor', 14467), (b'one', 14230), (b'product', 14100), (b'great', 13101), (b'coffe', 12531), (b'love', 12030), (b'tri', 11641), (b'tea', 11516), (b'get', 10756), (b'make', 10541), (b'would', 8536), (b'food', 8365), (b'time', 7862), (b'realli', 7651), (b'eat', 7613), (b'buy', 7464)]

Most Common positive words:  [(b'tast', 16926), (b'like', 16111), (b'product', 14802), (b'one', 10266), (b'would', 9005), (b'tri', 8972), (b'flavor', 8720), (b'use', 7665), (b'food', 7460), (b'good', 7428), (b'coffe', 7179), (b'get', 7123), (b'buy', 6905), (b'order', 6404), (b'tea', 5831), (b'even', 5635), (b'amazon', 5524), (b'box', 5516), (b'make', 5154), (b'eat', 4989)]


**OBSERVATION:** As it can be seen many words in positive and negative rewiew overlap. So its a good idea to consider the pair of words.

In [20]:
# Bi-gram 

count_vect = CountVectorizer(ngram_range=(1,2))
final_bi_gram_count = count_vect.fit_transform(final['Text'].values)

In [21]:
print("The shape of new vector matrix after Bi_gram " , final_bi_gram_count.get_shape())

The shape of new vector matrix after Bi_gram  (68271, 1163345)


# 7. TF-IDF

* In information retrieval, tf–idf or TFIDF, short for term frequency–inverse document frequency, is a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus. It is often used as a weighting factor in searches of information retrieval, text mining, and user modeling. The tf–idf value increases proportionally to the number of times a word appears in the document and is offset by the number of documents in the corpus that contain the word, which helps to adjust for the fact that some words appear more frequently in general

In [22]:
tf_idf_vect = TfidfVectorizer(ngram_range=(1,2))
fina__tfidf = tf_idf_vect.fit_transform(final['Text'].values)

In [23]:
print("the shape of the vector after TF-IDF: ", fina__tfidf.get_shape())

the shape of the vector after TF-IDF:  (68271, 1163345)


In [24]:
features = tf_idf_vect.get_feature_names()
len(features)

1163345

In [25]:
# checking few features 
features[109000:109010]

['bake so',
 'bake solved',
 'bake some',
 'bake something',
 'bake special',
 'bake stuff',
 'bake such',
 'bake sugar',
 'bake sweets',
 'bake than']

**OBSERVATION**: These words are Bi-grams but there may be some Uni-grams as well.

In [26]:
# convert a row of sparsematrix into array
print(fina__tfidf[3,:].toarray()[0])

[0. 0. 0. ... 0. 0. 0.]


In [27]:
# getting top n features of TF-IDF

def top_n_feat(row , features , top=25):
    
    topn_ids= np.argsort(row)[::-1][:top]  #sorting the vector and reversing and pick top 25
    topn_feat = [(features[i] , row[i]) for i in topn_ids]
    df = pd.DataFrame(topn_feat)
    df.columns = [ 'Features' , 'tfidf']
    return df

In [28]:
top_tfidf = top_n_feat(fina__tfidf[2,:].toarray()[0] , features)

In [29]:
top_tfidf

Unnamed: 0,Features,tfidf
0,bewilderment to,0.190952
1,is mini,0.190952
2,bewilderment,0.190952
3,my bewilderment,0.190952
4,mini version,0.190952
5,books ever,0.190952
6,best children,0.190952
7,children books,0.190952
8,not portrayed,0.190952
9,email regarding,0.184183


# 8. Word2Vec

* Word2vec takes as its input a large corpus of text and produces a vector space, typically of several hundred dimensions, with each unique word in the corpus being assigned a corresponding vector in the space. Word vectors are positioned in the vector space such that words that share common contexts in the corpus are located in close proximity to one another in the space

In [30]:
from gensim.models import word2vec , KeyedVectors

In [31]:
model = KeyedVectors.load_word2vec_format('../input/gnewsvector/GoogleNews-vectors-negative300.bin' , binary=True)

In [32]:
model.wv['computer']

  """Entry point for launching an IPython kernel.


array([ 1.07421875e-01, -2.01171875e-01,  1.23046875e-01,  2.11914062e-01,
       -9.13085938e-02,  2.16796875e-01, -1.31835938e-01,  8.30078125e-02,
        2.02148438e-01,  4.78515625e-02,  3.66210938e-02, -2.45361328e-02,
        2.39257812e-02, -1.60156250e-01, -2.61230469e-02,  9.71679688e-02,
       -6.34765625e-02,  1.84570312e-01,  1.70898438e-01, -1.63085938e-01,
       -1.09375000e-01,  1.49414062e-01, -4.65393066e-04,  9.61914062e-02,
        1.68945312e-01,  2.60925293e-03,  8.93554688e-02,  6.49414062e-02,
        3.56445312e-02, -6.93359375e-02, -1.46484375e-01, -1.21093750e-01,
       -2.27539062e-01,  2.45361328e-02, -1.24511719e-01, -3.18359375e-01,
       -2.20703125e-01,  1.30859375e-01,  3.66210938e-02, -3.63769531e-02,
       -1.13281250e-01,  1.95312500e-01,  9.76562500e-02,  1.26953125e-01,
        6.59179688e-02,  6.93359375e-02,  1.02539062e-02,  1.75781250e-01,
       -1.68945312e-01,  1.21307373e-03, -2.98828125e-01, -1.15234375e-01,
        5.66406250e-02, -

In [33]:
model.wv.similarity('woman' , 'man')

  """Entry point for launching an IPython kernel.


0.76640123

In [34]:
model.wv.most_similar('woman')

  """Entry point for launching an IPython kernel.


[('man', 0.7664012312889099),
 ('girl', 0.7494640946388245),
 ('teenage_girl', 0.7336829900741577),
 ('teenager', 0.631708562374115),
 ('lady', 0.6288785934448242),
 ('teenaged_girl', 0.6141784191131592),
 ('mother', 0.607630729675293),
 ('policewoman', 0.6069462299346924),
 ('boy', 0.5975908041000366),
 ('Woman', 0.5770983099937439)]

In [35]:
import gensim
from tqdm import tqdm
i=0
list_of_sent =[]

for sent in tqdm(final['Text'].values):
    filtered_sent=[]
    sent= cleanhtml(sent)
    for w in sent.split():
        for cleanW in cleanpunc(w).split():
            if (cleanW.isalpha()):
                filtered_sent.append(cleanW.lower())
            else:
                continue
    list_of_sent.append(filtered_sent)
    

100%|██████████| 68271/68271 [00:20<00:00, 3393.78it/s]


In [36]:
print(final['Text'].values[0])

This is a book of poetry about the months of the year.  It goes through each month and has a cute little poem to go along with it.  I love this book because it is a really fun way to learn the months and the poems are very creative. The author's purpose for writing this book was to give children a fun way to learn the months.  The children can also learn things about poetry and rhythm through reading this book.


In [37]:
print(list_of_sent[0])

['this', 'is', 'a', 'book', 'of', 'poetry', 'about', 'the', 'months', 'of', 'the', 'year', 'it', 'goes', 'through', 'each', 'month', 'and', 'has', 'a', 'cute', 'little', 'poem', 'to', 'go', 'along', 'with', 'it', 'i', 'love', 'this', 'book', 'because', 'it', 'is', 'a', 'really', 'fun', 'way', 'to', 'learn', 'the', 'months', 'and', 'the', 'poems', 'are', 'very', 'creative', 'the', 'author', 's', 'purpose', 'for', 'writing', 'this', 'book', 'was', 'to', 'give', 'children', 'a', 'fun', 'way', 'to', 'learn', 'the', 'months', 'the', 'children', 'can', 'also', 'learn', 'things', 'about', 'poetry', 'and', 'rhythm', 'through', 'reading', 'this', 'book']


In [38]:
word2vecmodel = gensim.models.Word2Vec(list_of_sent , min_count=5, size=50 , workers=4)
words = list(word2vecmodel.wv.vocab)
print(len(words))

18438


In [39]:
word2vecmodel.wv.most_similar('tasty')

[('yummy', 0.862500011920929),
 ('satisfying', 0.8510292172431946),
 ('delicious', 0.8106536865234375),
 ('filling', 0.803415834903717),
 ('versatile', 0.7963504791259766),
 ('flavorful', 0.7885638475418091),
 ('nutritious', 0.7743555307388306),
 ('moist', 0.7722365856170654),
 ('crunchy', 0.7326051592826843),
 ('salty', 0.7304224967956543)]

In [40]:
word2vecmodel.wv.most_similar('like')

[('okay', 0.6167913675308228),
 ('resemble', 0.604689359664917),
 ('alright', 0.5949361324310303),
 ('prefer', 0.5874310731887817),
 ('mean', 0.5840886831283569),
 ('gross', 0.5762603282928467),
 ('think', 0.574055016040802),
 ('burnt', 0.572217583656311),
 ('dislike', 0.5696036219596863),
 ('fake', 0.5673271417617798)]

In [41]:
count_vect_feat = count_vect.get_feature_names()
count_vect_feat.index('like')
print(count_vect_feat[574544])

like


# 9. Average Word2Vec , TF-IDF weighted Word2Vec

In [42]:
# Compute Word to vector

sent_vectors = []

for sent in tqdm(list_of_sent):    # looping over all the rewiew
    sent_vec = np.zeros(50)         # len of the word vector
    count_word =0                  #count the valid word in the sentence
    for word in sent:              #looping over each word in the sentence
        try:
            vec =word2vecmodel.wv[word] #converting each word to vector
            sent_vec+=vec                #adding vector to the defined vector
            count_word+=1               #counting the number of words converted. used later for average
        except:
            continue
    sent_vec/= count_word
    sent_vectors.append(sent_vec)

  from ipykernel import kernelapp as app
100%|██████████| 68271/68271 [00:27<00:00, 2517.91it/s]


In [43]:
print(len(sent_vectors))
print(len(sent_vectors[0]))

68271
50


# TF-IDF weighted W2V

In [44]:
tfidf_sent_vectors =[]
row=0
for sent in tqdm(list_of_sent[:10]):
    sent_vec= np.zeros(50)
    weight_sum=0
    for word in sent:
        try:
            vec = word2vecmodel.wv[word]
            tfidf = fina__tfidf[row , features.index(word)]
            sent_vec+= (vec*tfidf)
            weight_sum +=tfidf
        except:
            pass
    sent_vec/=weight_sum
    tfidf_sent_vectors.append(sent_vec)
    row+=1

100%|██████████| 10/10 [01:13<00:00,  5.37s/it]
