In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [3]:
# Load the restaurant-review dataset (tab-separated text file).
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact location exists.
reviews_path = 'C:/Users/User/Desktop/DS-Projects/NLP/Restaurant_Reviews.tsv'
data = pd.read_csv(
    reviews_path,
    sep='\t',
    quoting=3,  # 3 == csv.QUOTE_NONE: double quotes in the review text are kept as ordinary characters
)


In [5]:
# Preview the first five rows of the loaded reviews.
data.head(n=5)

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [6]:
# Data cleaning: libraries used by the text-preprocessing cells.
import re  # regular expressions, used to strip non-letter characters

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Fetch the stop-word corpus (reports "already up-to-date" when it is present).
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
# Demonstrate the cleaning on a single example: the first review.
first_review = data['Review'][0]
# Replace every character that is not an ASCII letter with a space.
review = re.sub('[^a-zA-Z]', ' ', first_review)
print(review)

Wow    Loved this place 


In [15]:
# Normalise case so that e.g. "Loved" and "loved" become the same token.
review = str.lower(review)
print(review)

wow    loved this place 


In [16]:
# Removing the stop words, step 1: tokenise.
# A single review is one string, so its words are not separated yet; split it
# on whitespace to get a list of words we can go through one by one.
review = review.split(sep=None)

In [20]:
# Keep every word of the review except English stop words.
# The stop-word set is built once up front; building it inside the
# comprehension would re-read and re-hash the whole list for every word.
stop_words = set(stopwords.words('english'))
review = [word for word in review if word not in stop_words]

In [28]:
# Stemming: reduce each remaining word to its stem (e.g. "loved" -> "love").
ps = PorterStemmer()
# Build the stop-word set once rather than once per word. The filter is kept
# here so this cell gives the same result whether or not the previous
# stop-word cell has already been run.
stop_words = set(stopwords.words('english'))
review = [ps.stem(word) for word in review if word not in stop_words]
# Re-assemble the cleaned tokens into a single space-separated string.
review = ' '.join(review)

In [33]:
# Apply the text cleaning shown above to every review in the dataset.
def clean_review(text, stemmer, stop_words):
    """Return `text` cleaned for the bag-of-words model.

    Steps: replace every non-letter character with a space, lowercase,
    split on whitespace, drop words found in `stop_words`, stem the rest
    with `stemmer`, and join the stems back with single spaces.

    Parameters
    ----------
    text : str
        Raw review text.
    stemmer : object with a ``stem(word)`` method
        Here a ``PorterStemmer`` instance.
    stop_words : container of str
        Words to discard (membership is tested with ``in``).

    Returns
    -------
    str
        Space-separated stems; empty string if nothing survives the filter.
    """
    words = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    return ' '.join(stemmer.stem(word) for word in words if word not in stop_words)


# Loop invariants, created once instead of once per review / once per word.
ps = PorterStemmer()
stop_words = set(stopwords.words('english'))
# NOTE(review): NLTK's English stop-word list includes negations such as
# "not", so "Crust is not good" loses its negation here. Behaviour is left
# unchanged; consider keeping negation words for sentiment analysis.

# Iterate over the column itself so the row count is not hard-coded.
corpus = [clean_review(text, ps, stop_words) for text in data['Review']]

In [None]:
#Bag of Words model

# In the Bag of Words model we take all the words from the 1000 reviews and keep only the unique ones (no repetition).
# Each unique word then becomes one column of a table whose rows are the 1000 reviews and whose columns are the words.

In [35]:
from sklearn.feature_extraction.text import CountVectorizer

# Restrict the vocabulary to the 1500 most frequent words in the corpus.
cv = CountVectorizer(max_features=1500)
# Learn the vocabulary and count word occurrences per review, then convert
# the result to a regular (dense) array with toarray().
term_counts = cv.fit_transform(corpus)
x = term_counts.toarray()

In [38]:
# Dimensions of the feature matrix: (reviews, vocabulary words).
np.shape(x)

(1000, 1500)

In [39]:
# Target labels (1 = liked, 0 = not liked). Select the column by name, as is
# done for data['Review'], instead of by position so the selection does not
# depend on column order.
y = data['Liked'].values

In [41]:
# Split the dataset into a training set and a test set.
from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews for testing; random_state fixes the shuffle so
# the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=0,
)
x_train.shape, y_train.shape

((800, 1500), (800,))

In [42]:
# Shapes of the held-out features and labels.
tuple(held_out.shape for held_out in (x_test, y_test))

((200, 1500), (200,))

In [43]:
# Train the model (Naive Bayes).
from sklearn.naive_bayes import GaussianNB

# NOTE(review): GaussianNB models each feature as a continuous Gaussian; the
# features here are word counts, so MultinomialNB may be a better fit -
# worth comparing before relying on these scores.
classifier = GaussianNB()
classifier.fit(X=x_train, y=y_train)

In [45]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Predict the label of every review in the held-out test set.
y_pred = classifier.predict(X=x_test)

In [46]:
# Confusion matrix: rows are the true labels, columns the predicted labels.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[55, 42],
       [12, 91]], dtype=int64)

In [47]:
# Classification report: per-class precision, recall and F1 on the test set.
# Overall accuracy is 0.73, i.e. 146 of the 200 test reviews are classified
# correctly.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.82      0.57      0.67        97
           1       0.68      0.88      0.77       103

    accuracy                           0.73       200
   macro avg       0.75      0.73      0.72       200
weighted avg       0.75      0.73      0.72       200

