In [None]:
## Natural Language Processing

In [53]:
# Importing the libraries

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [54]:
# Importing the dataset

# The file is tab-separated; quoting=3 (csv.QUOTE_NONE) makes the parser treat quote characters as plain text.
dataset = pd.read_csv(
    "Restaurant_Reviews.tsv",
    sep='\t',
    quoting=3,
)
# The feature matrix x is not sliced from the raw columns here; it is built later from the cleaned review text.
y = dataset.iloc[:, -1].values   # target vector: the last column of the file

In [55]:
# Cleaning the texts
# Turns every raw review into a lower-cased, stop-word-free, stemmed string and collects them in `corpus`.

import re
import nltk

nltk.download('stopwords')   # stop words = words that carry little meaning for the model
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer   # stemming: reduce each word to its root -> fewer columns in the sparse matrix

# Loop-invariant helpers are built once. The stop-word set used to be rebuilt for
# every single word, and the stemmer re-created for every review.
english_stopwords = set(stopwords.words('english'))
# NOTE(review): this list presumably contains negations such as "not", which are
# therefore dropped from the reviews - confirm that is intended for sentiment data.
ps = PorterStemmer()
non_letters = re.compile('[^a-zA-Z]')   # any character that is not an ASCII letter

corpus = []   # list of cleaned reviews, one string per row of the dataset
no_rows = len(dataset)   # number of reviews; unlike index.stop this works for any index type

# Iterate over the column values directly, so the loop does not depend on the
# index labels being exactly 0..n-1.
for raw_review in dataset['Review']:
    review = non_letters.sub(' ', raw_review)   # replace every non-letter with a space
    review = review.lower()
    review = review.split()   # list of words, ready for filtering and stemming
    review = [ps.stem(word) for word in review if word not in english_stopwords]
    review = ' '.join(review)   # back to a single string, words separated by one space
    corpus.append(review)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/anastazijaverovic/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [65]:
# Creating the Bag of Words model: each review becomes a vector of word counts.
# Every distinct word kept from the corpus is one column of the feature matrix the model is trained on.

from sklearn.feature_extraction.text import CountVectorizer
# Keep only the 1500 most frequent words, which filters out rare tokens (e.g. personal names).
cv = CountVectorizer(max_features=1500)
x = (
    cv
    .fit_transform(corpus)   # sparse document-term matrix
    .toarray()               # dense array used as the matrix of features
)

no_words = x.shape[1]   # number of columns, i.e. words kept in the vocabulary
no_words

1500

In [66]:
# Splitting the dataset into the training set and test set
# 20% of the reviews are held out for testing; the fixed random_state keeps the split reproducible.

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [67]:
# Training the Naive Bayes model on the training set

from sklearn.naive_bayes import GaussianNB
# fit() returns the estimator itself, so construction and training can be chained.
classifier = GaussianNB().fit(x_train, y_train)
classifier

GaussianNB()

In [68]:
# Predicting the results on the test set
# y_pred holds one predicted label per row of x_test.

y_pred = classifier.predict(X=x_test)
y_pred

array([1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1,
       1, 1])

In [70]:
# Confusion Matrix
# Compares the predictions with the true test labels: cm rows are the true classes,
# cm columns the predicted classes; accuracy is the fraction of correct predictions.

from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
print(cm, accuracy, sep='\n')

[[55 42]
 [12 91]]
0.73


In [None]:
# Single review prediction

