In [53]:
import numpy as np
import pandas as pd
import re
import nltk
import sklearn
import warnings
from platform import python_version
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
print(python_version())

3.9.5


### Read Data:

In [2]:
# Could not load it directly from the url:
ratings_df = pd.read_csv("./data/amazon_reviews_us_Kitchen_v1_00.tsv", sep="\t",
                         error_bad_lines=False, warn_bad_lines=False)

In [3]:
ratings_df.head()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,37000337,R3DT59XH7HXR9K,B00303FI0G,529320574,Arthur Court Paper Towel Holder,Kitchen,5.0,0.0,0.0,N,Y,Beautiful. Looks great on counter,Beautiful. Looks great on counter.,2015-08-31
1,US,15272914,R1LFS11BNASSU8,B00JCZKZN6,274237558,Olde Thompson Bavaria Glass Salt and Pepper Mi...,Kitchen,5.0,0.0,1.0,N,Y,Awesome & Self-ness,I personally have 5 days sets and have also bo...,2015-08-31
2,US,36137863,R296RT05AG0AF6,B00JLIKA5C,544675303,Progressive International PL8 Professional Man...,Kitchen,5.0,0.0,0.0,N,Y,Fabulous and worth every penny,Fabulous and worth every penny. Used for clean...,2015-08-31
3,US,43311049,R3V37XDZ7ZCI3L,B000GBNB8G,491599489,Zyliss Jumbo Garlic Press,Kitchen,5.0,0.0,1.0,N,Y,Five Stars,A must if you love garlic on tomato marinara s...,2015-08-31
4,US,13763148,R14GU232NQFYX2,B00VJ5KX9S,353790155,"1 X Premier Pizza Cutter - Stainless Steel 14""...",Kitchen,5.0,0.0,0.0,N,Y,Better than sex,Worth every penny! Buy one now and be a pizza ...,2015-08-31


In [23]:
# Simplify Dataset: Ensure all have reviews
ratings_df = ratings_df.loc[:, ["review_body", "star_rating"]]
ratings_df = ratings_df[ratings_df["review_body"].notnull()]
ratings_df.head()

Unnamed: 0,review_body,star_rating
0,Beautiful. Looks great on counter.,5.0
1,I personally have 5 days sets and have also bo...,5.0
2,Fabulous and worth every penny. Used for clean...,5.0
3,A must if you love garlic on tomato marinara s...,5.0
4,Worth every penny! Buy one now and be a pizza ...,5.0


In [24]:
# Gather 50k of each rating through random selection:
rating_1 = ratings_df[ratings_df["star_rating"] == 1].sample(50000)
rating_2 = ratings_df[ratings_df["star_rating"] == 2].sample(50000)
rating_3 = ratings_df[ratings_df["star_rating"] == 3].sample(50000)
rating_4 = ratings_df[ratings_df["star_rating"] == 4].sample(50000)
rating_5 = ratings_df[ratings_df["star_rating"] == 5].sample(50000)

ratings_sampled_df = pd.concat([rating_1, rating_2, rating_3, rating_4, rating_5])
ratings_sampled_df = ratings_sampled_df.sample(frac=1)
ratings_sampled_df.reset_index(drop=True, inplace=True)
ratings_sampled_df.head()

Unnamed: 0,review_body,star_rating
0,This microwave is not only gorgeous but its al...,5.0
1,"These work great, but are way too big. Waste ...",2.0
2,"This coffee machine is v.good, u'll enjoy the ...",4.0
3,I kept them in the freezer for a week before u...,2.0
4,Best cocktail ice cube trays out there on the ...,5.0


In [25]:
# Map star rating to sentiment rating: We will assign -1 as neutral
d_ = {4:1, 5:1, 1:0, 2:0, 3:-1}
ratings_sampled_df["sentiment"] = ratings_sampled_df["star_rating"].map(d_)
ratings_sampled_df.head()

Unnamed: 0,review_body,star_rating,sentiment
0,This microwave is not only gorgeous but its al...,5.0,1
1,"These work great, but are way too big. Waste ...",2.0,0
2,"This coffee machine is v.good, u'll enjoy the ...",4.0,1
3,I kept them in the freezer for a week before u...,2.0,0
4,Best cocktail ice cube trays out there on the ...,5.0,1


In [26]:
# Train-test Split
train, test = train_test_split(ratings_sampled_df, test_size=0.2)
train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

### Word Embedding:

In [37]:
# Load in the word2vec:
import gensim
word2vec = gensim.models.KeyedVectors.load_word2vec_format('./data/GoogleNews-vectors-negative300.bin', binary=True)

In [46]:
#Check Smeantic Similarities: (close enough!)
result_vec = word2vec["king"] - word2vec["man"] + word2vec["woman"]
word2vec.most_similar(positive=result_vec, topn=3)

[('king', 0.844939112663269),
 ('queen', 0.7300516366958618),
 ('monarch', 0.6454660296440125)]

In [84]:
word2vec.most_similar(positive="excellent", topn=3)

[('terrific', 0.7409728765487671),
 ('superb', 0.7062715888023376),
 ('exceptional', 0.681470513343811)]

In [49]:
# Train a Word2Vec model using own dataset: embedding_size=300, window_size=11, min_word_cnt=10
# Clean data:
train["review_body"] = train["review_body"].str.lower()
test["review_body"] = test["review_body"].str.lower()

train["review_body"] = train["review_body"].replace(r'http\S+|www.\S+', '', regex=True)
test["review_body"] = test["review_body"].replace(r'http\S+|www.\S+', '', regex=True)


train["review_body"] = train["review_body"].replace(r'[^a-z|\s]', '', regex=True)
test["review_body"] = test["review_body"].replace(r'[^a-z|\s]', '', regex=True)

train["review_body"] = train["review_body"].replace(r'\s\s+', ' ', regex=True)
test["review_body"] = test["review_body"].replace(r'\s\s+', ' ', regex=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train["review_body"] = train["review_body"].str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test["review_body"] = test["review_body"].str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train["review_body"] = train["review_body"].replace(r'http\S+|www.\S+', '', regex=True)
A value

In [54]:
stop_words = set(stopwords.words('english'))

def remove_stopwords(s):
    tokens = word_tokenize(s)
    filtered_tokens = [word for word in tokens if word not in stop_words]
    return filtered_tokens

train["review_body"] = train["review_body"].apply(remove_stopwords)
test["review_body"] = test["review_body"].apply(remove_stopwords)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train["review_body"] = train["review_body"].apply(remove_stopwords)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test["review_body"] = test["review_body"].apply(remove_stopwords)


In [58]:
train_sentences = np.array(train["review_body"])
test_sentences = np.array(test["review_body"])

In [60]:
model = gensim.models.Word2Vec(train_sentences, min_count=10, vector_size=200, window=11)

In [70]:
# Test semantic similarities:
result_vec = model.wv["king"] - model.wv["man"] + model.wv["woman"]
model.wv.similar_by_vector(result_vec)

[('king', 0.6559873819351196),
 ('trio', 0.5477477312088013),
 ('saucepans', 0.519656240940094),
 ('mc', 0.518571674823761),
 ('allclads', 0.5153471231460571),
 ('hardanodized', 0.512095034122467),
 ('wearever', 0.5071931481361389),
 ('scanpan', 0.5062934756278992),
 ('multiclad', 0.4969087243080139),
 ('duncan', 0.48269400000572205)]

In [75]:
model.wv.similar_by_word("excellent")

[('outstanding', 0.7673604488372803),
 ('wonderful', 0.7341710925102234),
 ('fantastic', 0.7265551090240479),
 ('terrific', 0.7205982804298401),
 ('superb', 0.6952676773071289),
 ('exceptional', 0.6444137096405029),
 ('great', 0.596105694770813),
 ('exceeds', 0.572962760925293),
 ('pleased', 0.5425746440887451),
 ('reasonable', 0.5404192805290222)]

From the Semantic Similarities above we can see that our model actually performs relatively well given the results of the top 10 closest words to Excellent. However we can see the problem emerges that the reviews do not necessarily contain all the words we might want such as "Queen", which may or may not be in the data based on the randomness involved in selecting reviews. One would prefer to utilize the all encompassing Google word2vec to help with unorthodox words, and we can trust that the performance is better. Yet, the create w2v does produce results faster.

### Simple Models: