In [1]:
import pandas
import re
import nltk

#nltk.download()  

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.decomposition import PCA

In [2]:
#Read in data set as a tab-delimited file

data = pandas.read_csv("amazon_alexa.tsv", delimiter = "\t")

In [3]:
#Look at dataset

data

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1
5,5,31-Jul-18,Heather Gray Fabric,I received the echo as a gift. I needed anothe...,1
6,3,31-Jul-18,Sandstone Fabric,"Without having a cellphone, I cannot use many ...",1
7,5,31-Jul-18,Charcoal Fabric,I think this is the 5th one I've purchased. I'...,1
8,5,30-Jul-18,Heather Gray Fabric,looks great,1
9,5,30-Jul-18,Heather Gray Fabric,Love it! I’ve listened to songs I haven’t hear...,1


In [6]:
def text_cleaner(data):
    #Removes non-letter characters 
    letters_only = re.sub("[^a-zA-Z]"," ",data)
    lower_case = letters_only.lower()
    words = lower_case.split()
    #Takes out unmeaningful stop words
    filtered = []
    for w in words:
        if w not in stopwords.words('english'):
            filtered.append(w)
    #Creates a long string with words 
    clean_text = " ".join(words)
    return clean_text

In [7]:
#Cleans the reviews column
data["verified_reviews"] = data["verified_reviews"].apply(text_cleaner)

In [8]:
#Split into training & testing set

(train_reviews, test_reviews, train_target, test_target) = \
train_test_split(data['verified_reviews'], data['rating'],\
test_size = 0.2)

In [9]:
#Bag of Words with 5000 most common words

vectorizer = CountVectorizer(analyzer='word', \
max_features = 5000)

#Find the 5000 most common words in the train set
vectorizer.fit(train_reviews)

#Look at which words are most common
print(vectorizer.get_feature_names())



In [10]:
#Tranform the data

train_word_columns = vectorizer.transform(train_reviews).toarray()
test_word_columns = vectorizer.transform(test_reviews).toarray()

#Take a look at what one of these data sets looks like now
print(train_word_columns)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [11]:
#Feed into Multinomial Naive Bayes

mnb = MultinomialNB()
mnb.fit(train_word_columns,train_target)
preds = mnb.predict(test_word_columns)
print(accuracy_score(preds,test_target))

0.753968253968254


In [26]:
#Alpha is a smoothing parameter for Multinomial Naive Bayes

mnb = MultinomialNB(alpha = 0.3)
mnb.fit(train_word_columns,train_target)
preds = mnb.predict(test_word_columns)
print(accuracy_score(preds,test_target))

0.7777777777777778


In [13]:
#Feed into Gaussian Naive Bayes

gnb = GaussianNB()
gnb.fit(train_word_columns,train_target)
preds = gnb.predict(test_word_columns)
print(accuracy_score(preds,test_target))

0.4031746031746032


In [27]:
#Feed into Bernoulli Naive Bayes

bnb = BernoulliNB()
bnb.fit(train_word_columns,train_target)
preds = bnb.predict(test_word_columns)
print(accuracy_score(preds,test_target))

0.7158730158730159


In [28]:
#Feed into Support Vector Machine

svc = SVC()
svc.fit(train_word_columns,train_target)
preds = svc.predict(test_word_columns)
print(accuracy_score(preds,test_target))

0.7158730158730159


In [16]:
#Support Vector Machines with PCA

pca = PCA(n_components=400, whiten = 'True')
pca.fit(train_word_columns)
train_transformed = pca.transform(train_word_columns)
test_transformed = pca.transform(test_word_columns)

svc = SVC()
svc.fit(train_transformed, train_target)
preds = svc.predict(test_transformed)
print(accuracy_score(preds,test_target))

0.7571428571428571


In [17]:
#Gaussian Naive Bayes with PCA

gnb = GaussianNB()
gnb.fit(train_transformed, train_target)
preds = gnb.predict(test_transformed)
print(accuracy_score(preds,test_target))

0.6095238095238096


In [18]:
#Bernoulli Naive Bayes with PCA

bnb = BernoulliNB()
bnb.fit(train_transformed, train_target)
preds = bnb.predict(test_transformed)
print(accuracy_score(preds,test_target))

0.665079365079365


In [19]:
#Random Forest Classifier

from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()
rfc.fit(train_word_columns,train_target)
preds = rfc.predict(test_word_columns)
print(accuracy_score(preds,test_target))

  from numpy.core.umath_tests import inner1d


0.8031746031746032


In [20]:
#Find optimal number of decision trees in the random forest

rfc = RandomForestClassifier(n_estimators = 100)
rfc.fit(train_word_columns,train_target)
preds = rfc.predict(test_word_columns)
print(accuracy_score(preds,test_target))

0.8095238095238095


In [21]:
#Find optimal number of decision trees in the random forest

rfc = RandomForestClassifier(n_estimators = 50)
rfc.fit(train_word_columns,train_target)
preds = rfc.predict(test_word_columns)
print(accuracy_score(preds,test_target))

0.8063492063492064
