### Load Packages

In [1]:
import numpy as np
import pandas as pd

### Collect dataset

In [2]:
# Read the tab-separated review file and preview its first ten rows.
df = pd.read_csv("Restaurant_Reviews.tsv", delimiter="\t")
df.head(10)

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
5,Now I am getting angry and I want my damn pho.,0
6,Honeslty it didn't taste THAT fresh.),0
7,The potatoes were like rubber and you could te...,0
8,The fries were great too.,1
9,A great touch.,1


In [3]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(1000, 2)

In [4]:
# Count how many reviews carry each label to check class balance.
df["Liked"].value_counts()

1    500
0    500
Name: Liked, dtype: int64

### Data Cleaning

In [5]:
import re
# Scratch checks of two substitutions that clean() uses. Only the value of the
# last expression is echoed by the notebook, so the first result is not shown.
re.sub(r"didn't","did not","I didn't like this")
re.sub(r'<.*?>',' ',"<h1>HELLO all</h1>")

' HELLO all '

In [6]:


# (pattern, replacement) pairs for the contractions that clean() expands.
# Each pattern accepts a straight (') or a typographic (U+2019) apostrophe.
_CONTRACTIONS = (
    (r"didn['\u2019]t", "did not"),
    (r"couldn['\u2019]t", "could not"),
    (r"can['\u2019]t", "can not"),
)


def clean(x):
    """Normalise one raw review into lowercase, letters-only text.

    Steps, in order:
      1. HTML-like tags are replaced by a space.
      2. "didn't", "couldn't" and "can't" are expanded to "did not",
         "could not" and "can not". Matching ignores letter case, so a
         contraction at the start of a sentence ("Didn't") or in capitals
         ("CAN'T") is expanded as well.
      3. Runs of 10-12 digits (phone-number-like) become the token
         "mobno"; every other run of digits becomes a space.
      4. Every remaining non-letter character becomes a space, and runs of
         whitespace collapse to a single space.
      5. The text is lowercased.

    Parameters
    ----------
    x : str
        Raw review text.

    Returns
    -------
    str
        The cleaned text. A single leading or trailing space may remain
        when the input started or ended with removed characters.
    """
    # Strip HTML-like tags first so their names are not kept as words.
    x = re.sub(r'<.*?>', ' ', x)

    # Expand contractions before the apostrophe is removed as punctuation.
    # IGNORECASE is needed because lowercasing only happens at the very end.
    for pattern, replacement in _CONTRACTIONS:
        x = re.sub(pattern, replacement, x, flags=re.IGNORECASE)

    # Replace phone-number-like digit runs with a placeholder token, then
    # drop whatever digits are left.
    x = re.sub(r'\d{10,12}', 'mobno', x)
    x = re.sub(r'\d+', ' ', x)

    # Remove punctuation and any other non-letter characters.
    x = re.sub(r'[^A-Za-z]', ' ', x)

    # Collapse runs of whitespace into a single space.
    x = re.sub(r'\s+', ' ', x)

    return x.lower()

# Smoke-test clean() on a string that mixes tags, digits, a 10-digit number,
# symbols and all three handled contractions.
clean("<html>hello 876 <p>78 7676767676 *&UI everyone couldn't !54^ Gy^%^<b> hii i can't to this didn't</html>")

' hello mobno ui everyone could not gy hii i can not to this did not '

In [7]:
# Run every review through clean() and store the result back in the same column.
df["Review"] = df["Review"].map(clean)

In [8]:
# Preview the first ten rows again, now that the Review column has been cleaned.
df.head(10)

Unnamed: 0,Review,Liked
0,wow loved this place,1
1,crust is not good,0
2,not tasty and the texture was just nasty,0
3,stopped by during the late may bank holiday of...,1
4,the selection on the menu was great and so wer...,1
5,now i am getting angry and i want my damn pho,0
6,honeslty it did not taste that fresh,0
7,the potatoes were like rubber and you could te...,0
8,the fries were great too,1
9,a great touch,1


### Split data

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
# Label counts once more, just before the data is split.
df.Liked.value_counts()

1    500
0    500
Name: Liked, dtype: int64

In [11]:
# Pull the review texts (features) and the liked flags (labels) out as arrays.
X, Y = df["Review"].values, df["Liked"].values

In [12]:
# Hold out 20% of the reviews for testing; the fixed seed keeps the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    X,
    Y,
    test_size=.20,
    random_state=10,
)

In [13]:
# Shapes of the training texts, test texts and training labels.
tuple(part.shape for part in (xtrain, xtest, ytrain))

((800,), (200,), (800,))

### Feature Extraction
- Apply CountVectorizer

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

In [15]:
# Bag-of-words vectoriser with its default settings.
cv = CountVectorizer()

In [16]:
# Learn the vocabulary from the training split only, so the test reviews do
# not influence which words become features.
cv.fit(xtrain)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [39]:
# get_feature_names() was removed in scikit-learn 1.2; its replacement,
# get_feature_names_out(), exists from 1.0 onwards. Use whichever one the
# installed version provides so the cell runs on both old and new releases.
feature_names = (
    cv.get_feature_names_out()
    if hasattr(cv, "get_feature_names_out")
    else cv.get_feature_names()
)
feature_names[:5]

['about', 'above', 'absolutely', 'absolutley', 'accident']

In [18]:
# Vectorise both splits with the vocabulary learnt from the training data.
# Both matrices are kept sparse: the estimators trained below accept sparse
# input, and densifying only the training matrix was inconsistent with the
# test matrix and used memory for no benefit.
cv_train = cv.transform(xtrain)
cv_test = cv.transform(xtest)

In [19]:
# (number of training reviews, vocabulary size)
cv_train.shape

(800, 1762)

In [20]:
# (number of test reviews, vocabulary size); the column count must match cv_train.
cv_test.shape

(200, 1762)

### Train Model

In [21]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

In [22]:
# alpha (the additive smoothing strength) is passed by keyword: scikit-learn
# 1.0+ makes estimator constructor parameters keyword-only, so the positional
# form MultinomialNB(0.1) raises TypeError there.
nb = MultinomialNB(alpha=0.1)
lg = LogisticRegression()

In [23]:
# Train the Naive Bayes model on the vectorised training reviews.
# fit() returns the estimator itself, which is why its repr is echoed.
nb.fit(cv_train,ytrain)

MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)

In [24]:
# Train the logistic regression model on the same training features and labels.
lg.fit(cv_train,ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

### Prediction

In [25]:
# Predicted labels for the held-out reviews from each model.
nb_pred, lg_pred = nb.predict(cv_test), lg.predict(cv_test)

In [26]:
# Per-class probabilities from the Naive Bayes model for every test review;
# display the row for the first test review.
pp = nb.predict_proba(cv_test)
pp[0]

array([0.85927756, 0.14072244])

### Evaluation

In [27]:
from sklearn.metrics import recall_score,accuracy_score,confusion_matrix

In [28]:
# Logistic Regression: accuracy on both splits, then the test confusion matrix.
lg_train_acc = lg.score(cv_train, ytrain)
lg_test_acc = lg.score(cv_test, ytest)
lg_cm = confusion_matrix(ytest, lg_pred)

print("Logistic Regression")
print("Training Score", lg_train_acc)
print("Testing Score", lg_test_acc)
print("Confusion Matrix")
print(lg_cm)

# Echo the test accuracy as the cell's output value.
accuracy_score(ytest, lg_pred)

Logistic Regression
Training Score 0.98625
Testing Score 0.82
Confusion Matrix
[[82 18]
 [18 82]]


0.82

In [29]:
# Naive Bayes: accuracy on both splits, then the test confusion matrix.
nb_train_acc = nb.score(cv_train, ytrain)
nb_test_acc = nb.score(cv_test, ytest)
nb_cm = confusion_matrix(ytest, nb_pred)

print("Naive Bayes Classifier: ")
print("Training Score", nb_train_acc)
print("Testing Score", nb_test_acc)
print("Confusion Matrix")
print(nb_cm)

# Echo the test accuracy as the cell's output value.
accuracy_score(ytest, nb_pred)

Naive Bayes Classifier: 
Training Score 0.97875
Testing Score 0.78
Confusion Matrix
[[82 18]
 [26 74]]


0.78

### Test on Real Life Data

In [30]:
# Hand-written reviews used as a quick sanity check of the trained model.
r = np.array(
    [
        "Worst Experienceever",
        "I must say it fabulus",
        "Horrible! Don't ever visit again",
        "I hate this",
        "I love this food",
        "amazing food",
        "Not Good",
    ]
)

In [31]:
# Run the new reviews through the same clean() step that the training text
# went through before vectorising. Without it, inputs such as "didn't" are
# tokenised differently from the "did not" form the vocabulary was built on.
tst = cv.transform([clean(text) for text in r])
tst.shape

(7, 1762)

In [32]:
# Naive Bayes predictions for the hand-written reviews.
p = nb.predict(X=tst)
p

array([0, 1, 0, 0, 1, 1, 0], dtype=int64)

In [33]:
# Map each numeric prediction to a readable label and print it beside its review.
target = {0: "Negative", 1: "Positive"}
for review_text, label in zip(r, p):
    print(target[label], ":", review_text)

Negative : Worst Experienceever
Positive : I must say it fabulus
Negative : Horrible! Don't ever visit again
Negative : I hate this
Positive : I love this food
Positive : amazing food
Negative : Not Good


### Export model to pkl file (Deployment)

In [34]:
import pickle

In [35]:
# Persist the trained Naive Bayes model. The context manager closes the file
# even if pickling raises part-way through, which the manual open()/close()
# pair did not guarantee.
filename = "review_model.pkl"
with open(filename, "wb") as f:
    pickle.dump(nb, f)

In [36]:
# Persist the fitted CountVectorizer so the same vocabulary can be reused at
# prediction time. The context manager closes the file even if pickling fails.
# NOTE(review): clean() is not saved with these files; whatever loads them
# presumably needs to apply the same preprocessing - confirm.
fname = "countvec.pkl"
with open(fname, "wb") as file:
    pickle.dump(cv, file)