### Model Building

In [1]:
#Importing Necessary Libraries
import pandas as pd
import  numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
ps = PorterStemmer()
ls = WordNetLemmatizer()

from sklearn.metrics import accuracy_score,confusion_matrix,auc,classification_report,recall_score,precision_score,precision_recall_curve,f1_score,roc_curve

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
#Loading Dataset
reviews=pd.read_csv('Preprocessed_Dataset.csv')

In [4]:
reviews=reviews[['body','cleaned']]
reviews

Unnamed: 0,body,cleaned
0,The media could not be loaded.\n ...,alexa cannot hear after she starts playing
1,I purchased this as a birthday gift for my 7 y...,i purchased this as a birthday gift for my yea...
2,The media could not be loaded.\n ...,here i am uploading video enjoy most idiotic ...
3,The media could not be loaded.\n ...,do not buy this product when i asked alexa th...
4,Its just one if the best deal i ever got on am...,its just one if the best deal i ever got on am...
...,...,...
4152,OK OK,ok ok
4153,I like alexa because this product is amazing. ...,i like alexa because this product is amazing m...
4154,Voice recognition problem persist,voice recognition problem persist
4155,It's a nice product,it is a nice product


In [5]:
#checking for null entries
reviews.isnull().sum()

body        0
cleaned    35
dtype: int64

In [6]:
#removing null cells
reviews=reviews.dropna(axis=0)
reviews

Unnamed: 0,body,cleaned
0,The media could not be loaded.\n ...,alexa cannot hear after she starts playing
1,I purchased this as a birthday gift for my 7 y...,i purchased this as a birthday gift for my yea...
2,The media could not be loaded.\n ...,here i am uploading video enjoy most idiotic ...
3,The media could not be loaded.\n ...,do not buy this product when i asked alexa th...
4,Its just one if the best deal i ever got on am...,its just one if the best deal i ever got on am...
...,...,...
4152,OK OK,ok ok
4153,I like alexa because this product is amazing. ...,i like alexa because this product is amazing m...
4154,Voice recognition problem persist,voice recognition problem persist
4155,It's a nice product,it is a nice product


In [7]:
#splitting the sentence
df_cleaned=reviews.cleaned.str.split()
df_cleaned

0       [alexa, cannot, hear, after, she, starts, play...
1       [i, purchased, this, as, a, birthday, gift, fo...
2       [here, i, am, uploading, video, enjoy, most, i...
3       [do, not, buy, this, product, when, i, asked, ...
4       [its, just, one, if, the, best, deal, i, ever,...
                              ...                        
4152                                             [ok, ok]
4153    [i, like, alexa, because, this, product, is, a...
4154               [voice, recognition, problem, persist]
4155                           [it, is, a, nice, product]
4156                            [awesome, i, love, alexa]
Name: cleaned, Length: 4122, dtype: object

In [8]:
#performing stemming and lemmatization after stopwords removal
stop_words = set(stopwords.words('english'))
df_stemmed = df_cleaned.apply(lambda x: [ps.stem(word) for word in x if word not in stop_words])
df_lemmatized = df_cleaned.apply(lambda x: [ls.lemmatize(word) for word in x if word not in stop_words])

In [9]:
df_stemmed = df_stemmed.apply(lambda x: ' '.join(x))
df_lemmatized = df_lemmatized.apply(lambda x: ' '.join(x))

In [10]:
reviews['Stemmed_text']=df_stemmed.to_frame()
reviews['Lemmatized_text']=df_lemmatized.to_frame()

In [11]:
reviews

Unnamed: 0,body,cleaned,Stemmed_text,Lemmatized_text
0,The media could not be loaded.\n ...,alexa cannot hear after she starts playing,alexa cannot hear start play,alexa cannot hear start playing
1,I purchased this as a birthday gift for my 7 y...,i purchased this as a birthday gift for my yea...,purchas birthday gift year old son sinc sibl t...,purchased birthday gift year old son since sib...
2,The media could not be loaded.\n ...,here i am uploading video enjoy most idiotic ...,upload video enjoy idiot devic everi bought ne...,uploading video enjoy idiotic device every bou...
3,The media could not be loaded.\n ...,do not buy this product when i asked alexa th...,buy product ask alexa kashmir part countri sai...,buy product asked alexa kashmir part country s...
4,Its just one if the best deal i ever got on am...,its just one if the best deal i ever got on am...,one best deal ever got amazon purchas watt wip...,one best deal ever got amazon purchased watt w...
...,...,...,...,...
4152,OK OK,ok ok,ok ok,ok ok
4153,I like alexa because this product is amazing. ...,i like alexa because this product is amazing m...,like alexa product amaz mani problem solv alex...,like alexa product amazing many problem solve ...
4154,Voice recognition problem persist,voice recognition problem persist,voic recognit problem persist,voice recognition problem persist
4155,It's a nice product,it is a nice product,nice product,nice product


In [12]:
df=reviews[['Stemmed_text']]

In [13]:
#finding polarity from textblob
from textblob import TextBlob
df['polarity']=(round(reviews['Stemmed_text'].apply(lambda x:TextBlob(x).sentiment.polarity),4))

In [14]:
df

Unnamed: 0,Stemmed_text,polarity
0,alexa cannot hear start play,0.0000
1,purchas birthday gift year old son sinc sibl t...,0.1881
2,upload video enjoy idiot devic everi bought ne...,0.2000
3,buy product ask alexa kashmir part countri sai...,0.0000
4,one best deal ever got amazon purchas watt wip...,0.3238
...,...,...
4152,ok ok,0.5000
4153,like alexa product amaz mani problem solv alex...,0.0000
4154,voic recognit problem persist,0.0000
4155,nice product,0.6000


In [15]:
pol=[]
for i in df.polarity:
    if i<0:
        pol.append('negative')
    elif i>0:
        pol.append('positive')
    else:
        pol.append('neutral')
df['sentiment']=pol

In [16]:
df.head(10)

Unnamed: 0,Stemmed_text,polarity,sentiment
0,alexa cannot hear start play,0.0,neutral
1,purchas birthday gift year old son sinc sibl t...,0.1881,positive
2,upload video enjoy idiot devic everi bought ne...,0.2,positive
3,buy product ask alexa kashmir part countri sai...,0.0,neutral
4,one best deal ever got amazon purchas watt wip...,0.3238,positive
5,worst experi alexa warranti get product amazon...,-0.4429,negative
6,buy product pleas keep point mind need amazon ...,0.31,positive
7,use alexa dot nd gen today gift dot rd gen dad...,0.1946,positive
8,work beauti bought link entir hous frequent gr...,0.05,positive
9,use get bulb worth make echo dot roughli dolla...,0.1121,positive


In [17]:
df.sentiment.value_counts()

positive    2773
neutral     1074
negative     275
Name: sentiment, dtype: int64

In [18]:
df['target'] = df['sentiment'].replace({'positive':2,'neutral':1,'negative':0})

In [19]:
#vectorization
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer()
X=cv.fit_transform(df['Stemmed_text'].values.astype('U'))
y=df['target']

In [20]:
vect = CountVectorizer(min_df=0., max_df=1.0)
X = vect.fit_transform(df['Stemmed_text'])
print(pd.DataFrame(X.A, columns=vect.get_feature_names()).to_string())
df = pd.DataFrame(X.toarray().transpose(), index = vect.get_feature_names())

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [21]:
#document term matrix
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4112,4113,4114,4115,4116,4117,4118,4119,4120,4121
aa,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
aap,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
aapl,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
aaya,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ab,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yur,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zero,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zomato,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zoom,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
#Splitting for train and test data
from sklearn.model_selection  import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1)

### Naive Bayes Classifier

In [23]:
from sklearn.naive_bayes import MultinomialNB
review_classifier_model=MultinomialNB()
review_classifier_model.fit(X_train,y_train)
y_pred=review_classifier_model.predict(X_test)
np.mean(y_pred==y_test)

0.7490909090909091

In [24]:
nb_acc=round(accuracy_score(y_test,y_pred),4)

### Logistic Regression

In [25]:
from sklearn.linear_model import LogisticRegression
lr_model=LogisticRegression(random_state=1)
lr_model.fit(X_train,y_train)

y_pred=lr_model.predict(X_test)
np.mean(y_pred==y_test)

0.9357575757575758

In [26]:
lr_acc=round(accuracy_score(y_test,y_pred),4)

### Decision Tree

In [27]:
from sklearn.tree import DecisionTreeClassifier
dt_model=DecisionTreeClassifier(random_state=1,max_depth=15)
dt_model.fit(X_train,y_train)

y_pred=dt_model.predict(X_test)
np.mean(y_pred==y_test)

0.8690909090909091

In [28]:
dt_acc=round(accuracy_score(y_test,y_pred),4)

### Random Forest

In [29]:
from sklearn.ensemble import RandomForestClassifier
rf_model=RandomForestClassifier(random_state=1)
rf_model.fit(X_train,y_train)

y_pred=rf_model.predict(X_test)
np.mean(y_pred==y_test)

0.9151515151515152

In [30]:
rf_acc=round(accuracy_score(y_test,y_pred),4)

### Ada Boost

In [31]:
from sklearn.ensemble import AdaBoostClassifier
ada_model=AdaBoostClassifier(random_state=1)
ada_model.fit(X_train,y_train)

y_pred=ada_model.predict(X_test)
np.mean(y_pred==y_test)

0.9018181818181819

In [32]:
ada_acc=round(accuracy_score(y_test,y_pred),4)

### SVM with GridSearch

In [33]:
from sklearn.svm import SVC
svc = SVC(random_state=1)

svc_param = ({'C':[0.02,0.2,2,8],
             'kernel':['linear', 'rbf', 'sigmoid'],
             'gamma':['scale', 'auto']
             })

In [34]:
#Grid Search CV:
from sklearn.model_selection import GridSearchCV
grid_cv = GridSearchCV(svc, param_grid=svc_param, cv=5,verbose=5)
grid_cv.fit(X_train,y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV 1/5] END .............C=0.02, gamma=scale, kernel=linear; total time=   0.3s
[CV 2/5] END .............C=0.02, gamma=scale, kernel=linear; total time=   0.3s
[CV 3/5] END .............C=0.02, gamma=scale, kernel=linear; total time=   0.3s
[CV 4/5] END .............C=0.02, gamma=scale, kernel=linear; total time=   0.3s
[CV 5/5] END .............C=0.02, gamma=scale, kernel=linear; total time=   0.3s
[CV 1/5] END ................C=0.02, gamma=scale, kernel=rbf; total time=   0.4s
[CV 2/5] END ................C=0.02, gamma=scale, kernel=rbf; total time=   0.4s
[CV 3/5] END ................C=0.02, gamma=scale, kernel=rbf; total time=   0.4s
[CV 4/5] END ................C=0.02, gamma=scale, kernel=rbf; total time=   0.4s
[CV 5/5] END ................C=0.02, gamma=scale, kernel=rbf; total time=   0.4s
[CV 1/5] END ............C=0.02, gamma=scale, kernel=sigmoid; total time=   0.3s
[CV 2/5] END ............C=0.02, gamma=scale, k

[CV 2/5] END ...............C=8, gamma=scale, kernel=sigmoid; total time=   0.1s
[CV 3/5] END ...............C=8, gamma=scale, kernel=sigmoid; total time=   0.1s
[CV 4/5] END ...............C=8, gamma=scale, kernel=sigmoid; total time=   0.1s
[CV 5/5] END ...............C=8, gamma=scale, kernel=sigmoid; total time=   0.1s
[CV 1/5] END .................C=8, gamma=auto, kernel=linear; total time=   0.2s
[CV 2/5] END .................C=8, gamma=auto, kernel=linear; total time=   0.2s
[CV 3/5] END .................C=8, gamma=auto, kernel=linear; total time=   0.2s
[CV 4/5] END .................C=8, gamma=auto, kernel=linear; total time=   0.2s
[CV 5/5] END .................C=8, gamma=auto, kernel=linear; total time=   0.2s
[CV 1/5] END ....................C=8, gamma=auto, kernel=rbf; total time=   0.3s
[CV 2/5] END ....................C=8, gamma=auto, kernel=rbf; total time=   0.3s
[CV 3/5] END ....................C=8, gamma=auto, kernel=rbf; total time=   0.3s
[CV 4/5] END ...............

GridSearchCV(cv=5, estimator=SVC(random_state=1),
             param_grid={'C': [0.02, 0.2, 2, 8], 'gamma': ['scale', 'auto'],
                         'kernel': ['linear', 'rbf', 'sigmoid']},
             verbose=5)

In [35]:
grid_cv.best_params_

{'C': 2, 'gamma': 'scale', 'kernel': 'linear'}

In [36]:
grid_svc_pred = grid_cv.predict(X_test)
np.mean(grid_svc_pred==y_test)

0.9563636363636364

In [37]:
svc=SVC(C= 2, gamma= 'scale', kernel= 'linear')
svc.fit(X_train,y_train)
y_pred=svc.predict(X_test)

In [38]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.83      0.73      0.78        55
           1       0.91      0.98      0.95       230
           2       0.99      0.97      0.98       540

    accuracy                           0.96       825
   macro avg       0.91      0.89      0.90       825
weighted avg       0.96      0.96      0.96       825



In [39]:
svc_acc=round(accuracy_score(y_test,y_pred),4)

In [40]:
data=[['Multinomial Naive Bayes',nb_acc],['Logistic Regression',lr_acc],['Decision Tree',dt_acc],['Random Forest',rf_acc],
      ['Ada Boost',ada_acc],['Support Vector Classifier',svc_acc]]

df=pd.DataFrame(data,columns=['Model','test_accuracy'])

In [41]:
df

Unnamed: 0,Model,test_accuracy
0,Multinomial Naive Bayes,0.7491
1,Logistic Regression,0.9358
2,Decision Tree,0.8691
3,Random Forest,0.9152
4,Ada Boost,0.9018
5,Support Vector Classifier,0.9564


In [42]:
print(confusion_matrix(y_test,y_pred))

[[ 40   8   7]
 [  5 225   0]
 [  3  13 524]]


In [43]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.83      0.73      0.78        55
           1       0.91      0.98      0.95       230
           2       0.99      0.97      0.98       540

    accuracy                           0.96       825
   macro avg       0.91      0.89      0.90       825
weighted avg       0.96      0.96      0.96       825



### Manually Testing the Model

In [44]:
test=input('Please enter a review for which sentiment needs to be predicted:\n')

Please enter a review for which sentiment needs to be predicted:
Amazon Echo Dot Review With more and more smart speakers popping up on the market, the consumer has more choices to choose from. The Amazon Echo Dot Fourth Generation allows the user to listen to music, complete tasks, and more by saying one word. The Amazon Echo Dot Fourth Generation is the overall best small smart speaker in the market, because of its great sound quality, accessibility, and affordable price tag. Firstly, the Amazon Echo Dot or just Echo Dot for short, provides great sound quality for when listening to music, listening to the news, or just communicating with the Echo Dot, that is incomparable to others. This is mainly due to the front firing speaker system that allows the rather small device to disperse the sound waves to all walls of a room. When placed in a room, the smart speaker can provide immersive sound that will instantaneously lighten the mood. The latest Echo Dot has much better speaker quality

In [45]:
def decontracted(phrase):
    # specific
    phrase = re.sub(r"won\'t", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)
    phrase = re.sub(r"couldn\'t", "could not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

from nltk.corpus import stopwords
def cleaning(text):
    corpus = []
    text = decontracted(text)
    text = text.lower()                              #lowering the text
    text = re.sub(r'#\S+','',text)                   #Remove the hyper link
    text = re.sub('[^a-z]',' ',text)              #Remove the character other than alphabet
    text = text.split()
    text=[ps.stem(word) for word in text if word not in stopwords.words('english')]
    text=' '.join(text)
    corpus.append(text)
    return corpus

In [46]:
review_cleaned=cleaning(test)
type(review_cleaned)

list

In [47]:
review_cleaned

['amazon echo dot review smart speaker pop market consum choic choos amazon echo dot fourth gener allow user listen music complet task say one word amazon echo dot fourth gener overal best small smart speaker market great sound qualiti access afford price tag firstli amazon echo dot echo dot short provid great sound qualiti listen music listen news commun echo dot incompar other mainli due front fire speaker system allow rather small devic dispers sound wave wall room place room smart speaker provid immers sound instantan lighten mood latest echo dot much better speaker qualiti prior gener opinion competitor speaker help even simpl task alarm sound qualiti allow user woken favorit song right amount sound sound qualiti echo dot key featur best could major break point purchas amazon echo dot addit amazon echo dot incorpor simpl yet power word allow hous surround access technolog unlik competitor wake word alexa open echo dot mani possibl simpl word subsequ phrase smart speaker may common

In [48]:
df=cv.transform(review_cleaned).toarray()
pred=svc.predict(df)
label=pred[0]

if label==2:
    print('Positive')
elif label==0:
    print('Negative')
else:
    print('Neutral')

Positive


### Saving the model

In [50]:
from pickle import dump
dump(cv,open('pickle/countvectorizer.pkl','wb'))
dump(svc,open('pickle/model.pkl','wb'))

### Predicting new review

In [56]:
def decontracted(phrase):
    # specific
    phrase = re.sub(r"won\'t", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)
    phrase = re.sub(r"couldn\'t", "could not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

from nltk.corpus import stopwords
def cleaning(text):
    corpus = []
    text = decontracted(text)
    text = text.lower()                              #lowering the text
    text = re.sub(r'#\S+','',text)                   #Remove the hyper link
    text = re.sub('[^a-z]',' ',text)              #Remove the character other than alphabet
    text = text.split()
    text=[ps.stem(word) for word in text if word not in stopwords.words('english')]
    text=' '.join(text)
    corpus.append(text)
    return corpus

In [57]:
from pickle import load

def predict(input_review):
    vectorizer=load(open('pickle/countvectorizer.pkl','rb'))
    classifier=load(open('pickle/model.pkl','rb'))
    clean_text=cleaning(input_review)
    vec_clean_text=vectorizer.transform(clean_text)
    vec_array=vec_clean_text.toarray()
    prediction=classifier.predict(vec_array)
    return prediction

In [67]:
input_review=input()

My mother has very poor eye sight and complications compounded when she had a stroke. To help with her need for communication with family and friends I installed a couple of Amazon echo dots. All she had to do was say the wake word and tell it to call _______ . This worked well until she wanted to call some one that was not on her original "10" contacts. The "10" contacts were swapped out to another "10" contacts (friends) not knowing that this would eliminate the prospects of ever calling any of her original contacts ( sons, daughters in laws, medical,....). Sense I'm one of emergency contacts and dropped from her list I'm very upset with the device, programers, the tech team from the store where I bought it......pretty much everyone involved with the problems brought on to a 86 year old with medical conditions that limit her ability to stay in touch with her world. REVIEW: 👎 VERY POOR DEVICE THAT LIMITS AND INHIBITS COMMUNICATION FOR AN ELDERLY WOMAN WITH MEDICAL CONDITIONS


In [68]:
prediction = predict(input_review)

In [69]:
if prediction==2:
    print('Positive')
elif prediction==0:
    print('Negative')
else:
    print('Neutral')

Negative
