## Bag of Words

Take the review data and preprocess it to generate a bag of words for each ASIN.

In [1]:
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer

from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Read the training reviews (one JSON object per line) into a DataFrame.
# Heads-up: this read is slow, so avoid re-running the cell needlessly.
df = pd.read_json(
    path_or_buf='data/Sports_and_Outdoors_Reviews_training.json',
    lines=True,
)

In [3]:
# Drop every review that is missing its text or its ASIN, then order the
# remaining rows by ASIN. The bare name at the end displays the sorted frame.
df = df.dropna(subset=['reviewText', 'asin'])
df_sorted = df.sort_values('asin')
df_sorted

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,image,vote
2045683,3,True,"08 22, 2015",C2411D15F4A20416FCAFE7A43A56A281,00018C9635D55E22BF157AA13E91226F,"{'Size:': ' 10 M US', 'Color:': ' Black Blue'}",01DEFBA5E93EEA664091896F8CDB6C25,The inner sole for one boot separated after ab...,Durability is questionable,1440201600,,
2045682,3,True,"08 26, 2015",047026268E0D0964A9FC3A50ACFAB5C2,00018C9635D55E22BF157AA13E91226F,"{'Size:': ' 10 M US', 'Color:': ' Black Blue'}",86FAF3FA09A7B2BDBF294E6F8C8D1D5F,Don't like those weird little tabs on both sid...,Eh.,1440547200,,
2045681,5,True,"09 3, 2015",B18B5DBB1F7ABA0FFE5B6ADB5731B15E,00018C9635D55E22BF157AA13E91226F,"{'Size:': ' 9 M US', 'Color:': ' Black Blue'}",1DC25EE2576C4AB06C2BA15A1D1444AB,These fit like a glove! Comfortable & easy on ...,Gotta Love these!,1441238400,,
2045680,5,True,"10 22, 2015",5C7E5E823C3844AB68555EEC453C076A,00018C9635D55E22BF157AA13E91226F,"{'Size:': ' 7 M US', 'Color:': ' Black Blue'}",760C63E8E5E8DC3FAA01878D37BA5678,"My daughter loved these, they fit perfect and ...",Good water shoes,1445472000,,
2045679,5,True,"11 26, 2015",C418CD6FE9C8A03BFD20E42FCA708437,00018C9635D55E22BF157AA13E91226F,"{'Size:': ' 7 M US', 'Color:': ' Black Blue'}",78A397887ECE7A37D36C566BE5600CC8,GOOD,Five Stars,1448496000,,
...,...,...,...,...,...,...,...,...,...,...,...,...
2059208,2,True,"04 12, 2013",066BC853F9AC93102314A1D1A84EAA79,FFFF9DFFBC5FB24D6AB70F651206363C,,5108C2162963C0091D28956C37FE60D1,The vanes are decent but never got a chance to...,Glue,1365724800,,2
2059207,1,True,"06 6, 2013",56B1A216DB3CEC28A424208F212F6B5D,FFFF9DFFBC5FB24D6AB70F651206363C,,EB4A046EACEBBD6F7CFE4E7AE4B6EABA,The glue they included is common super glue wi...,I am sadly disappointed with this product.,1370476800,,
2059206,1,True,"06 10, 2013",F3D81F74EE5EB9EC69F5B7ED6D3AEAE0,FFFF9DFFBC5FB24D6AB70F651206363C,,7834811EC2A447475C8D6A833C243329,"These don't stick well to the shaft, even with...",Cruddy Vanes,1370822400,,
2059205,5,True,"06 17, 2013",39800C5B4C05ACE64CEFBF7DC94C2DFC,FFFF9DFFBC5FB24D6AB70F651206363C,,A3ECFCAB0B49CC91EBA790B28B1B57AF,these fletches are great and i do mean great. ...,what can i say,1371427200,,


In [4]:
# NOTE(review): disabled NLTK preprocessing setup (stemmer, stop-word list, tokenizer).
# The active pipeline further down builds its own RegexpTokenizer and relies on
# CountVectorizer's built-in English stop words instead of these objects.
# # some NLTK things
# stemmer = SnowballStemmer("english", ignore_stopwords=True)
# stop_words = stopwords.words("english")
# tokenizer = RegexpTokenizer(r'\w+')

In [5]:
# NOTE(review): disabled helper that lowercases, tokenizes, strips stop words and
# stems a review. If it is ever re-enabled, fix the stop-word loop first: it calls
# words.remove(w) while iterating over `words`, so the element right after each
# removal is skipped and consecutive stop words are not all removed. Building a new
# list (e.g. a comprehension filtering on `w not in stop_words`) avoids this.
# def process_text(text):
#     words = tokenizer.tokenize(text.lower())
#     for w in words:
#         if w in stop_words:
#             words.remove(w)
#     words = [stemmer.stem(w) for w in words]
#     return words

In [6]:
# NOTE(review): disabled. Would map process_text over the 'reviewText' column only,
# passing every other column of df_sorted through unchanged.
# df_processed = df_sorted.apply(lambda x: x.map(process_text) if x.name == 'reviewText' else x)

In [7]:
# df_processed

In [8]:
# Tally how many reviews carry each value of the `overall` rating column.
df_sorted['overall'].value_counts()

5    1544003
4     397940
3     168761
1      89173
2      81544
Name: overall, dtype: int64

In [9]:
# Build the document-term matrix (DTM): one row per review, one column per token.
# Tokens are runs of ASCII letters/digits, restricted to unigrams, with
# scikit-learn's built-in English stop-word list removed.
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = CountVectorizer(
    stop_words='english',
    ngram_range=(1, 1),
    tokenizer=token.tokenize,
    # A custom tokenizer overrides token_pattern. Passing None makes that explicit
    # and avoids the "token_pattern will not be used" UserWarning that newer
    # scikit-learn releases emit when both are set.
    token_pattern=None,
)
# NOTE(review): the vocabulary is fit on every review, including the rows that the
# train/test split later holds out for evaluation. Consider fitting on the training
# rows only and calling cv.transform on the held-out rows.
text_counts = cv.fit_transform(df_sorted['reviewText'])

In [10]:
# Hold out 25% of the rows for evaluation so the model can be scored on data it
# was not trained on. The fixed random_state keeps the shuffle reproducible.
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(
    text_counts,
    df_sorted['overall'],
    test_size=0.25,
    random_state=5,
)

In [11]:
# Train a multinomial Naive Bayes classifier on the training counts and ratings.
# The fit call is kept as the final expression so the cell echoes the estimator.
MNB = MultinomialNB()
MNB.fit(X=X_train, y=Y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [12]:
# Score the fitted model on the held-out rows and print the accuracy as a percentage.
from sklearn import metrics
predicted = MNB.predict(X_test)
# accuracy_score takes (y_true, y_pred). Accuracy itself is symmetric, but keeping
# the documented order means swapping in an asymmetric metric (precision, recall,
# confusion matrix) later will not silently give transposed results.
accuracy_score = metrics.accuracy_score(Y_test, predicted)
print('{:04.2f}'.format(accuracy_score*100) + '%')

70.27%
