## 1. Import data

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the tab-separated review file: one row per review with its text and a 0/1 `Liked` label.
reviews_path = 'Restaurant_Reviews.tsv'
df = pd.read_csv(reviews_path, sep='\t')
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [3]:
# (rows, columns) of the raw review table
df.shape

(1000, 2)

In [4]:
# Class balance of the target column
df.Liked.value_counts()

1    500
0    500
Name: Liked, dtype: int64

In [5]:
# Count missing values per column
df.isna().sum()

Review    0
Liked     0
dtype: int64

---

## 2. Data preprocessing

In [6]:
# Take the first review as a worked example for the preprocessing steps below
review = df.loc[0, 'Review']
review

'Wow... Loved this place.'

### Only keep alphabetic characters

In [7]:
import re

# Replace every character that is not an ASCII letter (punctuation, digits, ...) with a space
non_letters = re.compile('[^a-zA-Z]')
review = non_letters.sub(' ', review)
review

'Wow    Loved this place '

### Lowercase

In [8]:
# Normalise case so that e.g. "Wow" and "wow" end up as the same token
review = review.lower()
review

'wow    loved this place '

### Remove stopwords

In [9]:
import nltk
# Fetch NLTK's stopword corpus into the local nltk_data directory
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/myself/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
from nltk.corpus import stopwords

# English stopwords as a plain list; preview the first five entries
stopwords_eng = stopwords.words('english')
stopwords_eng[:5]

['i', 'me', 'my', 'myself', 'we']

In [11]:
# Split on whitespace and keep only the tokens that are not stopwords
words = []
for token in review.split():
    if token not in stopwords_eng:
        words.append(token)
words

['wow', 'loved', 'place']

### Stemming

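* Stemming: strip suffixes with heuristic rules to get a crude root, which may not be a real word (leaves -> leav)
* Lemmatization: map a word to its dictionary form (lemma) using vocabulary and grammar (leaves -> leaf)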

Here, we'll use stemming

In [12]:
from nltk.stem.porter import PorterStemmer

# Reduce each remaining token to its Porter stem (e.g. "loved" -> "love")
ps = PorterStemmer()
words = list(map(ps.stem, words))
words

['wow', 'love', 'place']

In [13]:
# Re-assemble the cleaned tokens into a single space-separated string
review = " ".join(words)
review

'wow love place'

### Apply to the whole dataset

In [14]:
# Clean every review with the steps demonstrated above:
# letters only -> lowercase -> drop stopwords -> Porter stem.
#
# NOTE(review): NLTK's English stopword list contains negations such as "not",
# so "Crust is not good." is reduced to "crust good" — the negation is lost
# before the classifier ever sees the text.
non_letters = re.compile('[^a-zA-Z]')  # compiled once instead of per review
stopword_set = set(stopwords_eng)      # O(1) membership test instead of scanning the list per token

# Iterate over the column directly; `iterrows()` would build a Series for every row.
corpus = [
    " ".join(ps.stem(token)
             for token in non_letters.sub(' ', text).lower().split()
             if token not in stopword_set)
    for text in df.Review
]

corpus[0:5]

['wow love place',
 'crust good',
 'tasti textur nasti',
 'stop late may bank holiday rick steve recommend love',
 'select menu great price']

### Save to CSV

In [15]:
# Pair each cleaned review with the label of the review it came from
df_corpus = pd.DataFrame({'Corpus': corpus, 'Liked': df.Liked})
df_corpus.head()

Unnamed: 0,Corpus,Liked
0,wow love place,1
1,crust good,0
2,tasti textur nasti,0
3,stop late may bank holiday rick steve recommen...,1
4,select menu great price,1


In [16]:
# Persist the cleaned corpus; index=False keeps the row index out of the file
df_corpus.to_csv('corpus.csv', index=False)

### Load data

In [17]:
# Reload the cleaned corpus. Note: from here on `df` refers to this frame
# (columns Corpus / Liked), no longer to the raw reviews loaded at the top.
df = pd.read_csv('corpus.csv')
df.head()

Unnamed: 0,Corpus,Liked
0,wow love place,1
1,crust good,0
2,tasti textur nasti,0
3,stop late may bank holiday rick steve recommen...,1
4,select menu great price,1


In [18]:
# Drop rows with missing values. Presumably these are reviews that were reduced
# to an empty string by the cleaning step: pandas reads empty CSV fields back
# as NaN by default.
# Rebinding (instead of inplace=True) keeps the cell idempotent, and resetting
# the index keeps labels contiguous so label lookups such as `df.Corpus[0]`
# cannot hit a gap left by a removed row.
df = df.dropna().reset_index(drop=True)

## 3. Bag of Words model

### Encode words into numbers

CountVectorizer turns each document into a vector of word counts, with one column per word in the learned vocabulary

In [19]:
from sklearn.feature_extraction.text import CountVectorizer

# Toy example: vectorise only the first cleaned review.
# `.iloc[0]` selects by position, so this still works if the row labelled 0
# was removed by the preceding `dropna`.
cv = CountVectorizer(max_features=3)
X = cv.fit_transform([df.Corpus.iloc[0]]).toarray()
X

array([[1, 1, 1]])

In [20]:
# Cap the vocabulary at the 1500 most frequent terms across the corpus
MAX_FEATURES = 1500

cv = CountVectorizer(max_features=MAX_FEATURES)
X = cv.fit_transform(df.Corpus).toarray()
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [21]:
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of `get_feature_names_out`; use whichever this installation provides
# so the cell runs on both old and new versions.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# list(...) keeps the display identical whether the names arrive as a list or an array
list(feature_names[:5])

['absolut', 'absolutley', 'accid', 'accommod', 'accomod']

### Split data

In [22]:
# Target vector: 1 = liked (positive review), 0 = not liked (negative review)
y = df.Liked

In [23]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

### Train model

In [24]:
from sklearn.naive_bayes import GaussianNB

# NOTE(review): GaussianNB models each feature as a continuous Gaussian;
# scikit-learn documents MultinomialNB as the variant for word-count
# features — worth comparing on this data.
classifier = GaussianNB()
classifier = classifier.fit(X_train, y_train)  # `fit` returns the estimator itself
classifier

GaussianNB(priors=None, var_smoothing=1e-09)

### Evaluate model

In [25]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Score the held-out reviews: fraction of test labels predicted correctly
y_pred = classifier.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.73

In [26]:
# Rows are the true labels, columns the predicted labels (both in the order 0, 1)
confusion_matrix(y_test, y_pred)

array([[54, 36],
       [18, 92]])

In [27]:
# Predict with our own review.
# The vectorizer and classifier were fitted on cleaned text (letters only,
# lowercased, stopwords removed, Porter-stemmed), so a new review must go
# through the same steps. Otherwise inflected words such as "waiting"
# (stem: "wait") can never match the stemmed vocabulary and are silently
# ignored by `cv.transform`.
raw_review = 'long waiting time'

letters_only = re.sub('[^a-zA-Z]', ' ', raw_review).lower()
clean_review = " ".join(ps.stem(word)
                        for word in letters_only.split()
                        if word not in stopwords_eng)

input_data = cv.transform([clean_review]).toarray()
input_pred = classifier.predict(input_data)[0]

# The predicted label (0 or 1) indexes into the tuple of human-readable names
print('Review is', ('negative', 'positive')[input_pred])

Review is negative
