In [36]:
import numpy as np
import pandas as pd
import re
import nltk
import sklearn
import warnings
from platform import python_version
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import Perceptron
from sklearn.metrics import precision_recall_fscore_support
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
print(python_version())

3.9.5


In [37]:
#! pip install bs4 # in case you don't have it installed

# Dataset: https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Kitchen_v1_00.tsv.gz

## Read Data

In [38]:
# Could not load it directly from the url:
ratings_df = pd.read_csv("./data/amazon_reviews_us_Kitchen_v1_00.tsv", sep="\t",
                         error_bad_lines=False, warn_bad_lines=False)

In [39]:
ratings_df.head()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,37000337,R3DT59XH7HXR9K,B00303FI0G,529320574,Arthur Court Paper Towel Holder,Kitchen,5.0,0.0,0.0,N,Y,Beautiful. Looks great on counter,Beautiful. Looks great on counter.,2015-08-31
1,US,15272914,R1LFS11BNASSU8,B00JCZKZN6,274237558,Olde Thompson Bavaria Glass Salt and Pepper Mi...,Kitchen,5.0,0.0,1.0,N,Y,Awesome & Self-ness,I personally have 5 days sets and have also bo...,2015-08-31
2,US,36137863,R296RT05AG0AF6,B00JLIKA5C,544675303,Progressive International PL8 Professional Man...,Kitchen,5.0,0.0,0.0,N,Y,Fabulous and worth every penny,Fabulous and worth every penny. Used for clean...,2015-08-31
3,US,43311049,R3V37XDZ7ZCI3L,B000GBNB8G,491599489,Zyliss Jumbo Garlic Press,Kitchen,5.0,0.0,1.0,N,Y,Five Stars,A must if you love garlic on tomato marinara s...,2015-08-31
4,US,13763148,R14GU232NQFYX2,B00VJ5KX9S,353790155,"1 X Premier Pizza Cutter - Stainless Steel 14""...",Kitchen,5.0,0.0,0.0,N,Y,Better than sex,Worth every penny! Buy one now and be a pizza ...,2015-08-31


## Keep Reviews and Ratings

In [40]:
ratings_df = ratings_df.loc[:, ["review_body", "star_rating"]]
ratings_df.head()

Unnamed: 0,review_body,star_rating
0,Beautiful. Looks great on counter.,5.0
1,I personally have 5 days sets and have also bo...,5.0
2,Fabulous and worth every penny. Used for clean...,5.0
3,A must if you love garlic on tomato marinara s...,5.0
4,Worth every penny! Buy one now and be a pizza ...,5.0


# Labelling Reviews:
## The reviews with rating 4,5 are labelled to be 1 and 1,2 are labelled as 0. Discard the reviews with rating 3'

In [41]:
# Map star rating to sentiment rating:
d_ = {4:1, 5:1, 1:0, 2:0, 3:0}
ratings_df["sentiment"] = ratings_df["star_rating"].map(d_)

In [42]:
# Counts of each rating:
print(ratings_df["star_rating"].value_counts())

5.0    3124759
4.0     731733
1.0     426900
3.0     349547
2.0     241948
Name: star_rating, dtype: int64


In [43]:
# Discard ratings of 3:
count_3 = len(ratings_df[ratings_df["star_rating"] == 3])
ratings_df = ratings_df[~(ratings_df["star_rating"] == 3)]

 ## We select 200000 reviews randomly with 100,000 positive and 100,000 negative reviews.



In [44]:
pos_reviews = ratings_df[ratings_df["sentiment"] == 1].sample(100000)
neg_review = ratings_df[ratings_df["sentiment"] == 0].sample(100000)

ratings_sampled_df = pd.concat([pos_reviews, neg_review])
ratings_sampled_df.reset_index(drop=True, inplace=True)

In [45]:
# Counts of each sentiment rating:
print("Class 0 count:", ratings_df["sentiment"].value_counts()[0], end=", ")
print("Class 1 count:", ratings_df["sentiment"].value_counts()[1], end=", ")
print("Discarded (3) Class:", count_3)

Class 0 count: 668848, Class 1 count: 3856492, Discarded (3) Class: 349547


In [46]:
train, test = train_test_split(ratings_sampled_df, test_size=0.2)
train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

In [47]:
# Print 3 reviews BEFORE Data-Cleaning:
print(train["review_body"][62])
print("Rating:", train["star_rating"][62])
print("\b")
print(train["review_body"][245])
print("Rating:", train["star_rating"][245])
print("\b")
print(train["review_body"][97765])
print("Rating:", train["star_rating"][97765])

Ordered this product based on reviews describing color as black (also shown in picture). Color is off white not black. I would have kept product if I could have gotten black.  After further investigation determined product is discontinued.
Rating: 1.0

It started off very well, but after a couple of months of heavy use (mostly smoothies, occasional in-pot pureeing of hot soups) the blender attachment broke and the blade fell out of the casing. The rubber washer just underneath the blade too broke. I am pretty sure that food entered the tube that holds the shaft. The motor is fine, and I can still use it with the other attachment, but not for its primary purpose. As another reviewer said, these companies seem to be building them to fail.    Any suggestions for a durable hand blender?
Rating: 2.0

Took one out of the package and gave to a friend.  He calls me 10 minutes later saying that the knob came completely out.  So I took the last two out of the package and all of those fell off 

# Data Cleaning

## Convert the all reviews into the lower case.

In [48]:
# Find average length of reviews (in characters) before:
print("Average Training Before:", np.mean(train["review_body"].str.len()), end=", ")
print("Average Testing Before:", np.mean(test["review_body"].str.len()))

Average Training Before: 322.96810385778934, Average Testing Before: 326.3098077451936


In [49]:
train["review_body"] = train["review_body"].str.lower()
test["review_body"] = test["review_body"].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train["review_body"] = train["review_body"].str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test["review_body"] = test["review_body"].str.lower()


In [50]:
train.head()

Unnamed: 0,review_body,star_rating,sentiment
0,"these are the real deal, people. i previously ...",5.0,1.0
1,this product is a definitely &#34;looks can be...,2.0,0.0
2,"will not use old k-cups, coffee grounds in cof...",1.0,0.0
3,the spinning part hit the sides of the pan. pr...,1.0,0.0
4,round bowls almost seem silly in a rectangular...,5.0,1.0


In [51]:
test.head()

Unnamed: 0,review_body,star_rating,sentiment
0,"the glass is too thin for a tea pot that size,...",2.0,0.0
1,this product was definitely not what i was exp...,1.0,0.0
2,this was a piece of junk! it came to me in two...,1.0,0.0
3,"it's a great idea in theory, but everything le...",1.0,0.0
4,i couldn't wait to start making real milkshake...,2.0,0.0


## remove the HTML and URLs from the reviews

In [52]:
train["review_body"].str.match(r'http\S+').sum()

1

In [53]:
train["review_body"].str.match(r'www\S+').sum()

0

In [54]:
train["review_body"] = train["review_body"].replace(r'http\S+|www.\S+', '', regex=True)
test["review_body"] = test["review_body"].replace(r'http\S+|www.\S+', '', regex=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train["review_body"] = train["review_body"].replace(r'http\S+|www.\S+', '', regex=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test["review_body"] = test["review_body"].replace(r'http\S+|www.\S+', '', regex=True)


In [55]:
train["review_body"].str.match(r'http\S+').sum()

0

In [56]:
train["review_body"].str.match(r'www\S+').sum()

0

## remove non-alphabetical characters

In [57]:
train["review_body"] = train["review_body"].replace(r'[^a-z|\s]', '', regex=True)
test["review_body"] = test["review_body"].replace(r'[^a-z|\s]', '', regex=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train["review_body"] = train["review_body"].replace(r'[^a-z|\s]', '', regex=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test["review_body"] = test["review_body"].replace(r'[^a-z|\s]', '', regex=True)


## Remove the extra spaces between the words

In [58]:
train["review_body"] = train["review_body"].replace(r'\s\s+', ' ', regex=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train["review_body"] = train["review_body"].replace(r'\s\s+', ' ', regex=True)


In [59]:
test["review_body"] = test["review_body"].replace(r'\s\s+', ' ', regex=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test["review_body"] = test["review_body"].replace(r'\s\s+', ' ', regex=True)


## perform contractions on the reviews.

In [60]:
test["review_body"].str.match("wont").sum()

10

In [61]:
train["review_body"].str.match("wont").sum()

26

In [62]:
def contractionfunction(s):
    contraction_d = {"wont":"will not", "dont":"do not", "hasnt":"has not", "havent":"have not",
                     "im":"i am", "id":"i would", "itll": "it will", "ive": "i have", "isnt":"is not",
                     "lets":"let us", "mustve": "must have", "shed":"she would", "shell": "she will",
                     "thats": "that is", "theyd": "they had", "theyll": "they will", "weve": "we have"}
    for k_ in contraction_d.keys():
        if type(s) != str:
            s = str(s)
        s = s.replace(k_, contraction_d[k_])
    return s

test["review_body"] = test["review_body"].apply(contractionfunction)
train["review_body"] = train["review_body"].apply(contractionfunction)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test["review_body"] = test["review_body"].apply(contractionfunction)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train["review_body"] = train["review_body"].apply(contractionfunction)


In [63]:
test["review_body"].str.match("wont").sum()

0

In [64]:
train["review_body"].str.match("wont").sum()

0

In [65]:
# Find average length of reviews (in characters) AFTER Cleaning (but before stopwords/lemmatization):
print("Average Training After:", np.mean(train["review_body"].str.len()))
print("Average Testing After:", np.mean(test["review_body"].str.len()))

Average Training After: 313.9336875
Average Testing After: 316.952775


# Pre-processing

## remove the stop words 

In [66]:
stop_words = set(stopwords.words('english'))

In [67]:
def remove_stopwords(s):
    tokens = word_tokenize(s)
    filtered_tokens = [word for word in tokens if word not in stop_words]
    return filtered_tokens

train["review_body"] = train["review_body"].apply(remove_stopwords)
test["review_body"] = test["review_body"].apply(remove_stopwords)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train["review_body"] = train["review_body"].apply(remove_stopwords)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test["review_body"] = test["review_body"].apply(remove_stopwords)


## perform lemmatization  

In [68]:
from nltk.stem import WordNetLemmatizer
lem_obj = WordNetLemmatizer()
def lemmatize(s):
    #Should already be tokenized from above:
    lemmatized_tokens = [lem_obj.lemmatize(word) for word in s]
    return lemmatized_tokens

train["review_body"] = train["review_body"].apply(lemmatize)
test["review_body"] = test["review_body"].apply(lemmatize)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train["review_body"] = train["review_body"].apply(lemmatize)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test["review_body"] = test["review_body"].apply(lemmatize)


In [69]:
# Convert the tokenized values back into strings:
def stringify(s):
    str_ = " ".join([word for word in s])
    return str_

train["review_body"] = train["review_body"].apply(stringify)
test["review_body"] = test["review_body"].apply(stringify)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train["review_body"] = train["review_body"].apply(stringify)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test["review_body"] = test["review_body"].apply(stringify)


In [70]:
# Find average length of reviews (in characters) AFTER stopwords/lemmatize:
print("Average Training After:", np.mean(train["review_body"].str.len()))
print("Average Testing After:", np.mean(test["review_body"].str.len()))

Average Training After: 197.4351625
Average Testing After: 199.278775


In [71]:
# Print 3 reviews AFTER Data Cleaning/Pre Processing:
print(train["review_body"][62])
print("\b")
print(train["review_body"][245])
print("\b")
print(train["review_body"][97765])

ordered product based review describing color black also shown picture color white black would kept product could gotten black investigation determined product discontinued

started well couple month heavy use mostly smoothy occasional inpot pureeing hot soup blender attachment broke blade fell casing rubber washer underneath blade broke pretty sure food entered tube hold shaft motor fine still use attachment pri amary purpose another reviewer sai would company seem building fail suggestion durable hand blender

took one package gave friend call minute later saying knob came completely took last two package fell well waste money buy


# TF-IDF Feature Extraction

In [72]:
### DO NOT try to turn sparse matrix into numpy array:
reviews_train = train["review_body"].values
reviews_test = test["review_body"].values

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(reviews_train)
#print(vectorizer.get_feature_names())
X_test = vectorizer.transform(reviews_test)
print(X_train.shape)
print(X_test.shape)

(160000, 85727)
(40000, 85727)


In [73]:
Y_train = train["sentiment"].values
Y_test = test["sentiment"].values

# Perceptron

In [74]:
perceptron = Perceptron(tol=1e-3, random_state=42)
perceptron.fit(X_train, Y_train)
print("Training Accuracy:", perceptron.score(X_train, Y_train))

y_pred = perceptron.predict(X_train)
precision, recall, f1, support = precision_recall_fscore_support(Y_train, y_pred, average="macro")
print("Precision (Training):", precision)
print("Recall (Training):", recall)
print("F1 (Training):", f1)

print("\b")

print("Test Accuracy:", perceptron.score(X_test, Y_test))
y_pred_test = perceptron.predict(X_test)
precision, recall, f1, support = precision_recall_fscore_support(Y_test, y_pred_test, average="macro")
print("Precision (Test):", precision)
print("Recall (Test):", recall)
print("F1 (Test):", f1)

Training Accuracy: 0.9168125
Precision (Training): 0.9168741145848449
Recall (Training): 0.9167970757074642
F1 (Training): 0.9168068376653886

Test Accuracy: 0.8596
Precision (Test): 0.8595989543839582
Recall (Test): 0.8596393490772624
F1 (Test): 0.8595959058166136


# SVM

In [75]:
svm = LinearSVC(max_iter=1000)
svm.fit(X_train, Y_train)
print("Training Accuracy:", svm.score(X_train, Y_train))

y_pred = svm.predict(X_train)
precision, recall, f1, support = precision_recall_fscore_support(Y_train, y_pred, average="macro")
print("Precision (Training):", precision)
print("Recall (Training):", recall)
print("F1 (Training):", f1)

print("\b")

print("Test Accuracy:", svm.score(X_test, Y_test))
y_pred_test = svm.predict(X_test)
precision, recall, f1, support = precision_recall_fscore_support(Y_test, y_pred_test, average="macro")
print("Precision (Test):", precision)
print("Recall (Test):", recall)
print("F1 (Test):", f1)

Training Accuracy: 0.94015625
Precision (Training): 0.9401555293852055
Recall (Training): 0.9401583969498426
F1 (Training): 0.9401560808450846

Test Accuracy: 0.898025
Precision (Test): 0.8980309085197598
Recall (Test): 0.8979936635680386
F1 (Test): 0.8980091276067395


# Logistic Regression

In [76]:
lr = LogisticRegression()
lr.fit(X_train, Y_train)
print("Training Accuracy:", lr.score(X_train, Y_train))

y_pred = lr.predict(X_train)
precision, recall, f1, support = precision_recall_fscore_support(Y_train, y_pred, average="macro")
print("Precision (Training):", precision)
print("Recall (Training):", recall)
print("F1 (Training):", f1)

print("\b")

print("Test Accuracy:", lr.score(X_test, Y_test))
y_pred_test = lr.predict(X_test)
precision, recall, f1, support = precision_recall_fscore_support(Y_test, y_pred_test, average="macro")
print("Precision (Test):", precision)
print("Recall (Test):", recall)
print("F1 (Test):", f1)

Training Accuracy: 0.9146625
Precision (Training): 0.9146720068835619
Recall (Training): 0.9146699832699575
F1 (Training): 0.9146624863459979

Test Accuracy: 0.8999
Precision (Test): 0.8999230245323118
Recall (Test): 0.8998564178671116
F1 (Test): 0.8998813456420234


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Naive Bayes

In [43]:
mnb = MultinomialNB()
mnb.fit(X_train, Y_train)
print("Training Accuracy:", mnb.score(X_train, Y_train))

y_pred = mnb.predict(X_train)
precision, recall, f1, support = precision_recall_fscore_support(Y_train, y_pred, average="macro")
print("Precision (Training):", precision)
print("Recall (Training):", recall)
print("F1 (Training):", f1)

print("\b")

print("Test Accuracy:", mnb.score(X_test, Y_test))
y_pred_test = mnb.predict(X_test)
precision, recall, f1, support = precision_recall_fscore_support(Y_test, y_pred_test, average="macro")
print("Precision (Test):", precision)
print("Recall (Test):", recall)
print("F1 (Test):", f1)

Training Accuracy: 0.8906875
Precision (Training): 0.8909527051463344
Recall (Training): 0.8907027408887509
F1 (Training): 0.8906715592983733

Test Accuracy: 0.872075
Precision (Test): 0.872466702587781
Recall (Test): 0.872000041070888
F1 (Test): 0.8720229094048294
