#Imports


In [232]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import string # Imports the library
import re




import nltk  #imports the natural language toolkit
from nltk.corpus import stopwords #stopwords
from sklearn import datasets

#Vectorizers
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

#OverSamplers
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE

#UnderSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.under_sampling import TomekLinks
from imblearn.under_sampling import NearMiss

#ModelSelection/Validation
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

#Pipeline
from sklearn.pipeline import Pipeline
from imblearn.pipeline import make_pipeline

#Models

#Multiclass
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

#One Vs One
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.svm import SVC

#One Vs Rest
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression


#Metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

from collections import Counter

nltk.download('wordnet')

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

#from gensim.utils import lemmatize as gs_lemmatize
#from pattern.en import parse


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [186]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

# Function Definitions


In [243]:
#DEFINE STOPWORD LIST
stopwords =['in', 'of', 'at', 'a','the', 'and']

def rm_stopwords(review):
  tokens = nltk.word_tokenize(review)
  stopworded = ' '.join(word for word in tokens if word.lower() not in stopwords)
  return stopworded

def rm_punct(review):
  #remove punctuation
  tokens = nltk.word_tokenize(review)
  cleaned_review = ' '.join(token for token in tokens if token not in string.punctuation)
  return cleaned_review

def trim_repeats(review):
  tokens = nltk.word_tokenize(review)
  trimmed = ' '.join(re.sub(r'(.)\1+', r'\1\1', word) for word in tokens)
  return trimmed

wnl = nltk.WordNetLemmatizer()
def lemmatize(review):
  tokens = nltk.word_tokenize(review)
  lemmatized = ' '.join(wnl.lemmatize(word) for word in tokens)
  return lemmatized
def tag(review):
  tokens = nltk.word_tokenize(review)
  taggedtokens = nltk.pos_tag(tokens)
  tpls= []
  for tpl in taggedtokens:
    tpls.append('/'.join(tpl))
  tagged_text = ' '.join(tpl for tpl in tpls)
  return tagged_text

def cross_validate(k, X,Y, pipeline):
  kfold = StratifiedKFold(n_splits=k, shuffle=True, random_state=23)
  accuracy = []
  precision = []
  recall = []
  i = 0
  f1 = []
  for train, test in kfold.split(X, Y):

      fit = pipeline.fit(X[train], Y[train])
      prediction = fit.predict(X[test])
      scores = fit.score(X[test],Y[test])
        
      accuracy.append(scores * 100)
      
      
      precision.append(precision_score(Y[test], prediction, average='macro')*100)
      print('Confusion Matrix: \n', confusion_matrix(Y[test], prediction))
      print('\nAccuracy: ', accuracy_score(Y[test], prediction))
      print('Per Class Metrics:')
      print('                1           2          3         4          5')
      print('Precision:',precision_score(Y[test], prediction, average=None))
      recall.append(recall_score(Y[test], prediction, average='macro')*100)
      print('Recall:   ',recall_score(Y[test], prediction, average=None))
      f1.append(f1_score(Y[test], prediction, average='macro')*100)
      print('F1 Score: ',f1_score(Y[test], prediction, average=None))
      print('-'*67)

  print("Average Macro Accuracy : %.2f ± %.2f%%" % (np.mean(accuracy), np.std(accuracy)))
  print("Average Macro Precision: %.2f ± %.2f%%" % (np.mean(precision), np.std(precision)))
  print("Average Macro Recall   : %.2f ± %.2f%%" % (np.mean(recall), np.std(recall)))
  print("Average Macro F1 score : %.2f ± %.2f%%" % (np.mean(f1), np.std(f1)))




#1) Read in Reviews

In [239]:
unaltered = pd.read_csv('unaltered10000reviews.csv')
unaltered.dropna()

Unnamed: 0,stars,text
0,5,My wife took me here on my birthday for breakf...
1,5,I have no idea why some people give bad review...
2,4,love the gyro plate. Rice is so good and I als...
3,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!..."
4,5,General Manager Scott Petello is a good egg!!!...
...,...,...
9995,3,First visit...Had lunch here today - used my G...
9996,4,Should be called house of deliciousness!\n\nI ...
9997,4,I recently visited Olive and Ivy for business ...
9998,2,My nephew just moved to Scottsdale recently so...


In [240]:
expanded = pd.read_csv('expanded10000reviews.csv')
expanded.dropna()

Unnamed: 0,text,stars
0,My wife took me here on my birthday for breakf...,5
1,I have no idea why some people give bad review...,5
2,love the gyro plate. Rice is so good and I als...,4
3,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",5
4,General Manager Scott Petello is a good egg!!!...,5
...,...,...
9995,Should be called house of deliciousness!\n\nI ...,4
9996,I recently visited Olive and Ivy for business ...,4
9997,My nephew just moved to Scottsdale recently so...,2
9998,4-5 locations.. all 4.5 star average.. I think...,5


In [241]:
ourdata = pd.read_csv('unaltered1000reviews.csv')
ourdata.dropna()
ourdata = ourdata.drop(columns=['author'])
ourdata = ourdata.drop(columns=['date'])
ourdata = ourdata.drop(columns=['restaurant'])

ourdata['review']= ourdata['review'].str[2:-2]
ourdata['rating']=ourdata['rating'].str[2:-2]
ourdata['rating']=ourdata['rating'].str[:1]

ourdata = ourdata.rename(columns={'review': 'text', 'rating' : 'stars'})
ourdata.head()

Unnamed: 0,stars,text
0,5,Went for my birthday all covid 19 precautions ...
1,5,Had lunch here yesterday with my fiance. I'm v...
2,5,My new favorite local hangout. I am almost emb...
3,2,I really wanted to like this place. I came her...
4,4,"Visited for the first time, Sunday breakfast/b..."


In [242]:
#set preferred dataset
base = expanded.copy()

# |OPTIONAL| See Most common Words in Reviews (Used to determine potential stopwords)

**See Most common Words in All Reviews**

In [195]:
all = base[(base['stars'] == 5)]
Counter(" ".join(all['text'].str.lower()).split()).most_common(100)

[('the', 18606),
 ('and', 13628),
 ('i', 11897),
 ('a', 10066),
 ('to', 8477),
 ('is', 7377),
 ('of', 6062),
 ('it', 4766),
 ('was', 4713),
 ('in', 4394),
 ('for', 4275),
 ('have', 3793),
 ('you', 3736),
 ('my', 3527),
 ('not', 3383),
 ('that', 3254),
 ('with', 3196),
 ('this', 3155),
 ('they', 2849),
 ('are', 2813),
 ('we', 2563),
 ('on', 2370),
 ('but', 2353),
 ('had', 1965),
 ('so', 1843),
 ('as', 1747),
 ('at', 1736),
 ('great', 1632),
 ('place', 1499),
 ('be', 1462),
 ('were', 1402),
 ('very', 1342),
 ('if', 1318),
 ('their', 1310),
 ('all', 1285),
 ('will', 1241),
 ('there', 1236),
 ('just', 1219),
 ('like', 1185),
 ('food', 1176),
 ('one', 1147),
 ('do', 1134),
 ('good', 1104),
 ('am', 1081),
 ('from', 1081),
 ('get', 1080),
 ('me', 1055),
 ('love', 1027),
 ('or', 1023),
 ('out', 1013),
 ('your', 975),
 ('about', 970),
 ('can', 957),
 ('been', 948),
 ('here', 942),
 ('when', 938),
 ('would', 935),
 ('our', 920),
 ('go', 897),
 ('an', 890),
 ('some', 884),
 ('best', 859),
 ('-', 

**See Most common Words in 5 Star Reviews**

In [None]:
fives = base[(base['stars'] == 5)]
Counter(" ".join(fives['text'].str.lower()).split()).most_common(100)

**See Most common Words in 4 Star Reviews**

In [191]:
fours = base[(base['stars'] == 4)]
Counter(" ".join(fours['text'].str.lower()).split()).most_common(100)

NameError: ignored

**See Most common Words in 3 Star Reviews**

In [None]:
threes = base[(base['stars'] == 3)]
Counter(" ".join(threes['text'].str.lower()).split()).most_common(100)

**See Most common Words in 2 Star Reviews**

In [None]:
two = two[(two['stars'] == 2)]
Counter(" ".join(two['text'].str.lower()).split()).most_common(100)

**See Most common Words in 1 Star Reviews**

In [None]:
one = one[(one['stars'] == 1)]
Counter(" ".join(one['text'].str.lower()).split()).most_common(100)

# 2) Remove Stopwords

In [244]:
nostopwords = base.copy()
nostopwords['text'] = nostopwords['text'].apply(rm_stopwords)
#print(nostopwords['text'][3418])
#print(nostopwords['text'][6639])

# (OPTIONAL) Create Other Preprocessed Data Sets

In [None]:
print(base['text'][3418])

In [199]:
nopunct = base.copy()
nopunct['text'] = nopunct['text'].apply(rm_punct)
print(nopunct['text'][3418])
print(nopunct['text'][6639])

Very nice place with extremely friendly staff The food was similar quality to other Mexican food places in the NW Valley ... .good but not spectacular Very good portions for the price I do not live close by but will certainly go back when I am in the area
Looove this place I loved the sexy yet simple atmosphere I felt as though the air was cleaner somehow just walking inside Let me first say that Happy Hour prices were outstanding I have to admit I think my favorite Happy Hour spot StingRay has just been outdone The food was delicious All of it ALL ... OF ... IT From the cucumber salad to their famous appetizer the mustard leaf wrap it is flavorful and fresh The service was pretty good but there was some confusion I think more than once so it delayed the service a bit When I have to stop and say `` Hey where is my fill in the blank I know the service is lacking although I would say not enough to keep me away I was really really happy when I left here might have been my company but we a

In [201]:
trimmed = base.copy()
trimmed['text'] = trimmed['text'].apply(trim_repeats)
print(trimmed['text'][3418])
print(trimmed['text'][6639])

Very nice place with extremely friendly staff . The food was similar quality to other Mexican food places in the NW Valley .. .good but not spectacular . Very good portions for the price . I do not live close by but will certainly go back when I am in the area .
Loove this place ! I loved the sexy , yet simple atmosphere . I felt as though the air was cleaner somehow just walking inside . Let me first say that Happy Hour prices were outstanding ! I have to admit , I think my favorite Happy Hour spot , StingRay , has just been outdone ! The food was delicious ! All of it ! ALL .. OF .. IT ! From the cucumber salad to their famous appetizer , the mustard leaf wrap , it is flavorful and fresh . The service was pretty good , but there was some confusion I think more than once , so it delayed the service a bit . When I have to stop and say , `` Hey , where is my [ fill in the blank ] , I know the service is lacking , although I would say not enough to keep me away ! I was really , really ha

In [202]:
lemmatized = base.copy()
lemmatized['text'] = lemmatized['text'].apply(lemmatize)
print(lemmatized['text'][3418])
print(lemmatized['text'][6639])

Very nice place with extremely friendly staff . The food wa similar quality to other Mexican food place in the NW Valley ... .good but not spectacular . Very good portion for the price . I do not live close by but will certainly go back when I am in the area .
Looove this place ! I loved the sexy , yet simple atmosphere . I felt a though the air wa cleaner somehow just walking inside . Let me first say that Happy Hour price were outstanding ! I have to admit , I think my favorite Happy Hour spot , StingRay , ha just been outdone ! The food wa delicious ! All of it ! ALL ... OF ... IT ! From the cucumber salad to their famous appetizer , the mustard leaf wrap , it is flavorful and fresh . The service wa pretty good , but there wa some confusion I think more than once , so it delayed the service a bit . When I have to stop and say , `` Hey , where is my [ fill in the blank ] , I know the service is lacking , although I would say not enough to keep me away ! I wa really , really happy whe

In [203]:
tagged = base.copy()
tagged['text'] = tagged['text'].apply(tag)
print(tagged['text'][3418])
print(tagged['text'][6639])

Very/RB nice/JJ place/NN with/IN extremely/RB friendly/JJ staff/NN ./. The/DT food/NN was/VBD similar/JJ quality/NN to/TO other/JJ Mexican/JJ food/NN places/NNS in/IN the/DT NW/NNP Valley/NNP .../: .good/NN but/CC not/RB spectacular/JJ ./. Very/RB good/JJ portions/NNS for/IN the/DT price/NN ./. I/PRP do/VBP not/RB live/VB close/RB by/IN but/CC will/MD certainly/RB go/VB back/RB when/WRB I/PRP am/VBP in/IN the/DT area/NN ./.
Looove/VB this/DT place/NN !/. I/PRP loved/VBD the/DT sexy/NN ,/, yet/RB simple/JJ atmosphere/NN ./. I/PRP felt/VBD as/IN though/IN the/DT air/NN was/VBD cleaner/JJR somehow/RB just/RB walking/VBG inside/RB ./. Let/VB me/PRP first/JJ say/VBP that/IN Happy/JJ Hour/NNP prices/NNS were/VBD outstanding/JJ !/. I/PRP have/VBP to/TO admit/VB ,/, I/PRP think/VBP my/PRP$ favorite/JJ Happy/JJ Hour/NNP spot/NN ,/, StingRay/NNP ,/, has/VBZ just/RB been/VBN outdone/RB !/. The/DT food/NN was/VBD delicious/JJ !/. All/DT of/IN it/PRP !/. ALL/DT .../: OF/IN .../: IT/NNP !/. From/IN 

In [204]:
nopunctANDnostopwords = base.copy()
nopunctANDnostopwords['text'] = nopunctANDnostopwords['text'].apply(rm_stopwords)
nopunctANDnostopwords['text'] = nopunctANDnostopwords['text'].apply(rm_punct)

print(nopunctANDnostopwords['text'][3418])
print(nopunctANDnostopwords['text'][6639])

Very nice place with extremely friendly staff food was similar quality to other Mexican food places NW Valley ... .good but not spectacular Very good portions for price I do not live close by but will certainly go back when I am area
Looove this place I loved sexy yet simple atmosphere I felt as though air was cleaner somehow just walking inside Let me first say that Happy Hour prices were outstanding I have to admit I think my favorite Happy Hour spot StingRay has just been outdone food was delicious All it ALL ... ... IT From cucumber salad to their famous appetizer mustard leaf wrap it is flavorful fresh service was pretty good but there was some confusion I think more than once so it delayed service bit When I have to stop say `` Hey where is my fill blank I know service is lacking although I would say not enough to keep me away I was really really happy when I left here might have been my company but we all loved it so that is what mattered


In [205]:
lemmatizedANDnostopwords = base.copy()
lemmatizedANDnostopwords['text'] = lemmatizedANDnostopwords['text'].apply(rm_stopwords)
lemmatizedANDnostopwords['text'] = lemmatizedANDnostopwords['text'].apply(lemmatize)
print(lemmatizedANDnostopwords['text'][3418])
print(lemmatizedANDnostopwords['text'][6639])

Very nice place with extremely friendly staff . food wa similar quality to other Mexican food place NW Valley ... .good but not spectacular . Very good portion for price . I do not live close by but will certainly go back when I am area .
Looove this place ! I loved sexy , yet simple atmosphere . I felt a though air wa cleaner somehow just walking inside . Let me first say that Happy Hour price were outstanding ! I have to admit , I think my favorite Happy Hour spot , StingRay , ha just been outdone ! food wa delicious ! All it ! ALL ... ... IT ! From cucumber salad to their famous appetizer , mustard leaf wrap , it is flavorful fresh . service wa pretty good , but there wa some confusion I think more than once , so it delayed service bit . When I have to stop say , `` Hey , where is my [ fill blank ] , I know service is lacking , although I would say not enough to keep me away ! I wa really , really happy when I left here , might have been my company , but we all loved it so that is w

In [206]:
trimmedANDnostopwords = base.copy()
trimmedANDnostopwords['text'] = trimmedANDnostopwords['text'].apply(trim_repeats)
trimmedANDnostopwords['text'] = trimmedANDnostopwords['text'].apply(rm_stopwords)

print(trimmedANDnostopwords['text'][3418])
print(trimmedANDnostopwords['text'][6639])

Very nice place with extremely friendly staff . food was similar quality to other Mexican food places NW Valley .. .good but not spectacular . Very good portions for price . I do not live close by but will certainly go back when I am area .
Loove this place ! I loved sexy , yet simple atmosphere . I felt as though air was cleaner somehow just walking inside . Let me first say that Happy Hour prices were outstanding ! I have to admit , I think my favorite Happy Hour spot , StingRay , has just been outdone ! food was delicious ! All it ! ALL .. .. IT ! From cucumber salad to their famous appetizer , mustard leaf wrap , it is flavorful fresh . service was pretty good , but there was some confusion I think more than once , so it delayed service bit . When I have to stop say , `` Hey , where is my [ fill blank ] , I know service is lacking , although I would say not enough to keep me away ! I was really , really happy when I left here , might have been my company , but we all loved it so th

In [207]:
all = base.copy()
all['text'] = all['text'].apply(trim_repeats)
all['text'] = all['text'].apply(lemmatize)
all['text'] = all['text'].apply(rm_stopwords)
all['text'] = all['text'].apply(rm_punct)
# all['text'] = all['text'].apply(tag)

print(all['text'][3418])
print(all['text'][6639])

Very nice place with extremely friendly staff food wa similar quality to other Mexican food place NW Valley .. .good but not spectacular Very good portion for price I do not live close by but will certainly go back when I am area
Loove this place I loved sexy yet simple atmosphere I felt though air wa cleaner somehow just walking inside Let me first say that Happy Hour price were outstanding I have to admit I think my favorite Happy Hour spot StingRay ha just been outdone food wa delicious All it ALL .. .. IT From cucumber salad to their famous appetizer mustard leaf wrap it is flavorful fresh service wa pretty good but there wa some confusion I think more than once so it delayed service bit When I have to stop say `` Hey where is my fill blank I know service is lacking although I would say not enough to keep me away I wa really really happy when I left here might have been my company but we all loved it so that is what mattered


# 3) Initialize Pipeline Components


In [245]:
#SET PARAMETERS HERE
maxdf = 0.65
mindf = 1
multiclassLSVC = 'crammer_singer'
multiclassLR = 'ovr'
ngrams = (1,3)
nmversion = 1
rndmst = 23
trees = 1000
clswgt = 'balanced'
sgdloss = 'hinge'
lrslvr = 'liblinear'
#================================================================================
#MODELS
#================================================================================

#one vs one
svc = SVC(random_state=rndmst)

#multiclass
mlp = MLPClassifier(random_state=rndmst)
mlp.out_activation_ = 'softmax'
gnb = GaussianNB()
rfc = RandomForestClassifier(random_state=rndmst, n_estimators=trees)

#one vs rest
lsvc = LinearSVC(random_state=rndmst, multi_class=multiclassLSVC, max_iter=100000000,class_weight=clswgt)
lr = LogisticRegression(multi_class = multiclassLR, random_state=rndmst, max_iter=100000000, C=0.1, penalty = 'l2', solver = lrslvr)
sgd = SGDClassifier(random_state=rndmst,loss=sgdloss,early_stopping=True, class_weight=clswgt)

#================================================================================
#VECTORIZERS
#================================================================================
tvec = TfidfVectorizer(ngram_range=ngrams, max_df=maxdf, min_df = mindf)
cvec = CountVectorizer(ngram_range=ngrams, max_df=maxdf, min_df = mindf)

rus = RandomUnderSampler(random_state=rndmst)
smt = SMOTE(random_state=rndmst)

#================================================================================
#RESAMPLERS
#================================================================================

#Undersamplers
rus = RandomUnderSampler(random_state=rndmst)
enn = EditedNearestNeighbours(random_state=rndmst)
tm = TomekLinks(random_state=rndmst)
nm = NearMiss(random_state=rndmst,version=nmversion)

#Oversamplers
smt = SMOTE(random_state=rndmst)



#4) Construct a Pipeline

In [246]:
pipelinename = "tvec, sgd, ngrams: 1,3"
pipeline = make_pipeline(tvec,sgd)

# 5) Run pipeline on Dataset with Stopwords Removed

In [247]:
cross_validate(5, nostopwords['text'], nostopwords['stars'], pipeline)

Confusion Matrix: 
 [[ 82  41   9   9   8]
 [ 38  71  48  16  12]
 [ 10  33 121 100  29]
 [ 17  22  77 374 215]
 [ 13  10  19 180 446]]

Accuracy:  0.547
Per Class Metrics:
                1           2          3         4          5
Precision: [0.5125     0.40112994 0.44160584 0.55081001 0.62816901]
Recall:    [0.55033557 0.38378378 0.41296928 0.53049645 0.66766467]
F1 Score:  [0.53074434 0.39226519 0.42680776 0.54046243 0.64731495]
-------------------------------------------------------------------
Confusion Matrix: 
 [[ 89  29  14   7  11]
 [ 39  68  53  19   6]
 [ 21  43 116  94  18]
 [  6  19  71 389 220]
 [ 11   3  26 164 464]]

Accuracy:  0.563
Per Class Metrics:
                1           2          3         4          5
Precision: [0.53614458 0.41975309 0.41428571 0.57800892 0.64534075]
Recall:    [0.59333333 0.36756757 0.39726027 0.55177305 0.69461078]
F1 Score:  [0.56329114 0.39193084 0.40559441 0.56458636 0.66906994]
------------------------------------------------------

# (OPTIONAL) Run on Other Preprocessed Data Set(s)

In [175]:
cross_validate(5, base['text'], base['stars'], pipeline)

Confusion Matrix: 
 [[ 81  43   7   7  11]
 [ 42  61  53  20   9]
 [ 13  32 114 107  27]
 [ 10  21  55 394 225]
 [  8   9  13 183 455]]

Accuracy:  0.5525
Per Class Metrics:
                1           2          3         4          5
Precision: [0.52597403 0.36746988 0.47107438 0.55414909 0.6258597 ]
Recall:    [0.54362416 0.32972973 0.3890785  0.55886525 0.68113772]
F1 Score:  [0.53465347 0.34757835 0.42616822 0.55649718 0.65232975]
-------------------------------------------------------------------
Confusion Matrix: 
 [[ 96  31  10   6   7]
 [ 46  69  49  16   5]
 [ 15  45 116  89  27]
 [ 11  22  80 355 237]
 [ 15   4  28 155 466]]

Accuracy:  0.551
Per Class Metrics:
                1           2          3         4          5
Precision: [0.52459016 0.40350877 0.40989399 0.57165862 0.62803235]
Recall:    [0.64       0.37297297 0.39726027 0.5035461  0.69760479]
F1 Score:  [0.57657658 0.38764045 0.40347826 0.53544495 0.66099291]
-----------------------------------------------------

In [148]:
cross_validate(5, nopunct['text'], nopunct['stars'], pipeline)

Confusion Matrix: 
 [[ 76  46   9   9   9]
 [ 42  64  53  14  12]
 [ 14  30 124 100  25]
 [ 12  20  68 371 234]
 [  8  11  18 165 466]]

Accuracy:  0.5505
Per Class Metrics:
                1           2          3         4          5
Precision: [0.5        0.37426901 0.45588235 0.5629742  0.62466488]
Recall:    [0.51006711 0.34594595 0.42320819 0.52624113 0.69760479]
F1 Score:  [0.50498339 0.35955056 0.43893805 0.54398827 0.65912306]
-------------------------------------------------------------------
Confusion Matrix: 
 [[ 86  36  11   8   9]
 [ 40  72  47  21   5]
 [ 15  40 109 105  23]
 [  8  19  70 384 224]
 [ 13   2  24 183 446]]

Accuracy:  0.5485
Per Class Metrics:
                1           2          3         4          5
Precision: [0.5308642  0.4260355  0.41762452 0.54778887 0.63083451]
Recall:    [0.57333333 0.38918919 0.37328767 0.54468085 0.66766467]
F1 Score:  [0.55128205 0.40677966 0.39421338 0.54623044 0.64872727]
----------------------------------------------------

Confusion Matrix: 
 [[ 81  41  11   7   9]
 [ 41  67  50  14  13]
 [ 16  33 125  93  26]
 [ 14  21  71 373 226]
 [  8   8  15 173 464]]

Accuracy:  0.555
Per Class Metrics:
                1           2          3         4          5
Precision: [0.50625    0.39411765 0.45955882 0.56515152 0.62872629]
Recall:    [0.54362416 0.36216216 0.42662116 0.52907801 0.69461078]
F1 Score:  [0.52427184 0.37746479 0.44247788 0.54652015 0.66002845]
-------------------------------------------------------------------
Confusion Matrix: 
 [[ 91  33  10   8   8]
 [ 40  73  45  19   8]
 [ 14  35 109 109  25]
 [ 10  20  58 402 215]
 [ 11   3  23 176 455]]

Accuracy:  0.565
Per Class Metrics:
                1           2          3         4          5
Precision: [0.54819277 0.44512195 0.44489796 0.56302521 0.63994374]
Recall:    [0.60666667 0.39459459 0.37328767 0.57021277 0.68113772]
F1 Score:  [0.57594937 0.41833811 0.40595903 0.56659619 0.65989848]
------------------------------------------------------

In [150]:
cross_validate(5, trimmed['text'], trimmed['stars'], pipeline)

Confusion Matrix: 
 [[ 78  43  10   9   9]
 [ 44  60  54  15  12]
 [ 14  31 125 101  22]
 [ 13  20  69 377 226]
 [  7  11  18 177 455]]

Accuracy:  0.5475
Per Class Metrics:
                1           2          3         4          5
Precision: [0.5        0.36363636 0.45289855 0.55522828 0.62845304]
Recall:    [0.52348993 0.32432432 0.42662116 0.53475177 0.68113772]
F1 Score:  [0.51147541 0.34285714 0.43936731 0.54479769 0.65373563]
-------------------------------------------------------------------
Confusion Matrix: 
 [[ 89  33  11   8   9]
 [ 43  64  50  23   5]
 [ 16  36 109 108  23]
 [  8  15  69 394 219]
 [ 12   3  24 187 442]]

Accuracy:  0.549
Per Class Metrics:
                1           2          3         4          5
Precision: [0.5297619  0.42384106 0.41444867 0.54722222 0.63323782]
Recall:    [0.59333333 0.34594595 0.37328767 0.55886525 0.66167665]
F1 Score:  [0.55974843 0.38095238 0.39279279 0.55298246 0.64714495]
-----------------------------------------------------

In [151]:
cross_validate(5, lemmatized['text'], lemmatized['stars'], pipeline)

Confusion Matrix: 
 [[ 80  36  14   8  11]
 [ 41  58  61  14  11]
 [ 17  26 130  98  22]
 [ 11  13  81 378 222]
 [ 12  11  18 173 454]]

Accuracy:  0.55
Per Class Metrics:
                1           2          3         4          5
Precision: [0.49689441 0.40277778 0.42763158 0.5633383  0.63055556]
Recall:    [0.53691275 0.31351351 0.44368601 0.53617021 0.67964072]
F1 Score:  [0.51612903 0.35258359 0.43551089 0.5494186  0.65417867]
-------------------------------------------------------------------
Confusion Matrix: 
 [[ 88  37  10   6   9]
 [ 43  69  55  14   4]
 [ 13  32 124  91  32]
 [ 10  21  82 350 242]
 [ 13   2  28 163 462]]

Accuracy:  0.5465
Per Class Metrics:
                1           2          3         4          5
Precision: [0.52694611 0.42857143 0.41471572 0.56089744 0.61682243]
Recall:    [0.58666667 0.37297297 0.42465753 0.4964539  0.69161677]
F1 Score:  [0.55520505 0.39884393 0.41962775 0.52671181 0.65208186]
------------------------------------------------------

In [125]:
cross_validate(5, lemmatizedANDnostopwords['text'], lemmatizedANDnostopwords['stars'], pipeline)

Confusion Matrix: 
 [[ 81  43  13   3   9]
 [ 41  76  45  12  11]
 [ 13  34 124  95  27]
 [ 14  17  87 365 222]
 [ 12  11  20 171 454]]

Accuracy:  0.55
Per Class Metrics:
                1           2          3         4          5
Precision: [0.50310559 0.4198895  0.42906574 0.56501548 0.62793914]
Recall:    [0.54362416 0.41081081 0.42320819 0.5177305  0.67964072]
F1 Score:  [0.52258065 0.41530055 0.42611684 0.54034049 0.65276779]
-------------------------------------------------------------------
Confusion Matrix: 
 [[ 90  31  12   7  10]
 [ 34  69  54  21   7]
 [ 19  46 114  93  20]
 [  9  21  73 390 212]
 [ 11   2  27 178 450]]

Accuracy:  0.5565
Per Class Metrics:
                1           2          3         4          5
Precision: [0.55214724 0.40828402 0.40714286 0.56603774 0.64377682]
Recall:    [0.6        0.37297297 0.39041096 0.55319149 0.67365269]
F1 Score:  [0.57507987 0.38983051 0.3986014  0.55954089 0.65837601]
------------------------------------------------------

In [None]:
cross_validate(5, nopunctANDnostopwords['text'], nopunctANDnostopwords['stars'], pipeline)

In [None]:
cross_validate(5, trimmedANDnostopwords['text'], trimmedANDnostopwords['stars'], pipeline)

In [None]:
cross_validate(5, all['text'], all['stars'], pipeline)

In [152]:
cross_validate(5, tagged['text'], tagged['stars'], pipeline)

Confusion Matrix: 
 [[ 85  40   8   3  13]
 [ 53  63  42  13  14]
 [ 17  45 114  76  41]
 [ 17  37  64 333 254]
 [ 14  14  18 139 483]]

Accuracy:  0.539
Per Class Metrics:
                1           2          3         4          5
Precision: [0.45698925 0.31658291 0.46341463 0.59042553 0.6       ]
Recall:    [0.5704698  0.34054054 0.3890785  0.47234043 0.72305389]
F1 Score:  [0.50746269 0.328125   0.42300557 0.5248227  0.65580448]
-------------------------------------------------------------------
Confusion Matrix: 
 [[ 87  40   9   3  11]
 [ 46  91  33  12   3]
 [ 19  64 103  85  21]
 [ 12  39  87 365 202]
 [ 19  15  36 186 412]]

Accuracy:  0.529
Per Class Metrics:
                1           2          3         4          5
Precision: [0.47540984 0.36546185 0.38432836 0.56067588 0.6348228 ]
Recall:    [0.58       0.49189189 0.35273973 0.5177305  0.61676647]
F1 Score:  [0.52252252 0.41935484 0.36785714 0.53834808 0.62566439]
------------------------------------------------------

# (OPTIONAL) Tuning Hyperparameters

In [None]:
lsvc_params = {'penalty': ['l1','l2'],
               'C':[1,10,20]}
lsvc_grid = GridSearchCV(lsvc, param_grid=lsvc_params, cv = 5)
X_train = tvec.fit_transform(nostopwords['text'])
y_train = nostopwords['stars']
lsvc_grid.fit(X_train,y_train)

In [None]:
df = pd.DataFrame(lsvc_grid.cv_results_)
df

In [None]:
lsvc_grid.best_score_

In [None]:
lsvc_grid.best_params_

In [None]:
sgd_params = {'penalty': ['l1','l2', 'elasticnet'],
                'loss' : ['hinge', 'modified_huber', 'log'],
                'class_weight': ['balanced']
              }
sgd_grid = GridSearchCV(sgd, param_grid=sgd_params, cv = 5)
tvec_undersample = make_pipeline(nm,tvec)
X_train = tvec.fit_transform(nostopwords['text'])
y_train = nostopwords['stars']
sgd_grid.fit(X_train,y_train)

In [None]:
df = pd.DataFrame(sgd_grid.cv_results_)
df

In [None]:
sgd_grid.best_score_

In [None]:
sgd_grid.best_params_

In [None]:
lr_params = {'penalty': ['l1','l2'],
            'solver':['liblinear'],
          'C': np.logspace(-10,10,21)}
lr_grid = GridSearchCV(lr, param_grid=lr_params, cv = 5)
X_train = cvec.fit_transform(nostopwords['text'])
y_train = nostopwords['stars']
lr_grid.fit(X_train,y_train)

In [None]:
df = pd.DataFrame(lr_grid.cv_results_)
df

In [None]:
lr_grid.best_score_

In [None]:
lr_grid.best_params_

In [None]:
svc_params = {'kernel' : ['linear', 'rbf'],
          'C': [0.1]}
svc_grid = GridSearchCV(svc, param_grid=svc_params, cv = 5)
X_train = tvec.fit_transform(nostopwords['text'])
y_train = nostopwords['stars']
svc_grid.fit(X_train,y_train)

In [None]:
df = pd.DataFrame(svc_grid.cv_results_)
df

In [None]:
svc_grid.best_score_

In [None]:
svc_grid.best_params_