***William Su*** <br>
CMPE256 - S22<br>
Final Project test <br>
Amazon food reviews sentiment analysis <br>

In [20]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import string
nltk.download('stopwords')
nltk.download('punkt')
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from IPython.display import clear_output

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/wilson289296/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

[nltk_data] Downloading package punkt to
[nltk_data]     /home/wilson289296/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# **Data prep**

In [2]:
data = pd.read_csv('Reviews.csv')
data.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


***Drop unneeded columns***

In [3]:
data = data.loc[:, [col in ['Score', 'Summary', 'Text'] for col in data.columns]]
data.head()

Unnamed: 0,Score,Summary,Text
0,5,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,4,"""Delight"" says it all",This is a confection that has been around a fe...
3,2,Cough Medicine,If you are looking for the secret ingredient i...
4,5,Great taffy,Great taffy at a great price. There was a wid...


***Combine Summary and Text features***

In [4]:
combined = data['Summary'] + ' ' + data['Text']
data = data.drop(['Summary', 'Text'], axis = 1)
data['Text'] = combined
data

Unnamed: 0,Score,Text
0,5,Good Quality Dog Food I have bought several of...
1,1,Not as Advertised Product arrived labeled as J...
2,4,"""Delight"" says it all This is a confection tha..."
3,2,Cough Medicine If you are looking for the secr...
4,5,Great taffy Great taffy at a great price. The...
...,...,...
568449,5,Will not do without Great for sesame chicken.....
568450,2,disappointed I'm disappointed with the flavor....
568451,5,Perfect for our maltipoo These stars are small...
568452,5,Favorite Training and reward treat These are t...


***Turn Score into Pos/Neg/Neu Sentiment***

In [5]:
def calcSentiment(score):
    if score > 3:
        return 1
    elif score < 3:
        return -1
    else:
        return 0

In [6]:
data['Sentiment'] = data['Score'].apply(lambda x: calcSentiment(x))
data = data.drop(['Score'], axis = 1)
data

Unnamed: 0,Text,Sentiment
0,Good Quality Dog Food I have bought several of...,1
1,Not as Advertised Product arrived labeled as J...,-1
2,"""Delight"" says it all This is a confection tha...",1
3,Cough Medicine If you are looking for the secr...,-1
4,Great taffy Great taffy at a great price. The...,1
...,...,...
568449,Will not do without Great for sesame chicken.....,1
568450,disappointed I'm disappointed with the flavor....,-1
568451,Perfect for our maltipoo These stars are small...,1
568452,Favorite Training and reward treat These are t...,1


# **NLP word processing**
(These take a while to run, especially stemming)

In [7]:
data['CleanText'] = data['Text'].apply(str) #some reviews are floats for some reason

In [8]:
#make lowercase
data['CleanText'] = data['CleanText'].apply(lambda x: x.lower())

In [9]:
#remove punctuation
data['CleanText'] = data['CleanText'].apply(lambda x: "".join([char for char in x if char not in string.punctuation]))

In [10]:
#tokenization
data['CleanText'] = data['CleanText'].apply(lambda x: nltk.word_tokenize(x))

In [11]:
#stopword removal
stop_words = stopwords.words('english')
data['CleanText'] = data['CleanText'].apply(lambda x: [word for word in x if word not in stop_words])

In [12]:
#stemming
porter = PorterStemmer()
data['CleanText'] = data['CleanText'].apply(lambda x: [porter.stem(word) for word in x])

In [13]:
#turn all reviews into sets, as duplicate words aren't needed
data['CleanText'] = data['CleanText'].apply(set)

In [14]:
#replace and clean up
data = data.drop(['Text'], axis=1)
data['Text'] = data['CleanText']
data = data.drop(['CleanText'], axis=1)
data

Unnamed: 0,Sentiment,Text
0,1,"{better, found, qualiti, like, meat, smell, vi..."
1,-1,"{actual, repres, product, error, peanutsth, ar..."
2,1,"{tini, filbert, lion, case, around, delight, c..."
3,-1,"{extract, secret, cough, made, robitussin, sod..."
4,1,"{lover, great, quick, deliveri, deal, wide, pr..."
...,...,...
568449,1,"{great, itwil, good, recip, without, better, e..."
568450,-1,"{worth, note, small, time, especi, use, thicke..."
568451,1,"{star, train, made, make, like, small, littl, ..."
568452,1,"{dog, groom, seem, reward, good, calori, doggi..."


***Separate data into positive/negative datasets***

In [15]:
pos = data.loc[data['Sentiment'] == 1]
neg = data.loc[data['Sentiment'] == -1]

In [16]:
pos
neg

Unnamed: 0,Sentiment,Text
0,1,"{better, found, qualiti, like, meat, smell, vi..."
2,1,"{tini, filbert, lion, case, around, delight, c..."
4,1,"{lover, great, quick, deliveri, deal, wide, pr..."
5,1,"{mani, melon, enjoy, husband, favorit, five, o..."
6,1,"{well, great, fraling, happen, beachthem, love..."
...,...,...
568448,1,"{dont, 5, ground, amount, good, spice, there, ..."
568449,1,"{great, itwil, good, recip, without, better, e..."
568451,1,"{star, train, made, make, like, small, littl, ..."
568452,1,"{dog, groom, seem, reward, good, calori, doggi..."


Unnamed: 0,Sentiment,Text
1,-1,"{actual, repres, product, error, peanutsth, ar..."
3,-1,"{extract, secret, cough, made, robitussin, sod..."
12,-1,"{put, bowl, sit, need, similar, touch, relat, ..."
16,-1,"{tv, love, lock, stay, like, time, fresh, movi..."
26,-1,"{flavor, plan, chewi, red, would, buy, candi, ..."
...,...,...
568433,-1,"{put, aftertast, fail, close, noth, like, spoo..."
568434,-1,"{bean, bad, ive, kick, find, noth, like, rice,..."
568435,-1,"{low, bowl, coupl, 3, like, realli, tortellini..."
568446,-1,"{great, back, badbr, anis, wrong, sent, order,..."


# **Naive Bayes**

In [23]:
def get_probs(word, pos, neg): #gets probabilities of each individual word in positive and negative datasets
    pres = sum(pos['Text'].apply(lambda x: word in x))/len(pos)
    nres = sum(neg['Text'].apply(lambda x: word in x))/len(neg)
    return {
        'pos': pres,
        'neg': nres
    }

def get_total_probs(wordset, pos, neg):
    posprob = 1
    negprob = 1
    for word in wordset: #get probabilities of each individual word and multiply all together
        prob = get_probs(word, pos, neg)
        posprob *= prob['pos']
        negprob *= prob['neg']
    return {
        'pos': posprob,
        'neg': negprob,
        'pred': 1 if posprob > negprob else -1
    }

In [18]:
#grab non-neutral data to use as test data
non_neutral = data.loc[abs(data['Sentiment']) == 1]

In [25]:
#sample some data from test set to run tests on
num_samples = 100
test_set = non_neutral.sample(num_samples)

In [26]:
#test naive bayes algorithm on test set
results = []
count = 0
for index, row in test_set.iterrows():
    review = row['Text']
    probs = get_total_probs(review, pos, neg)
    probs['tru'] = row['Sentiment']
    results.append(probs)
    count += 1
    clear_output(wait=True)
    print(f"Prediction {count} of {num_samples} done.")

Prediction 100 of 100 done.


***Evaluate accuracy of results***

In [27]:
correct = 0
for attempt in results:
    if attempt['tru'] == attempt['pred']:
        correct += 1
correct/len(results)

0.88