### Restaurant Reviews Segmentation

#### Importing Libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set()

#### Importing Dataset

In [11]:
# The reviews file is tab-separated. quoting=3 is csv.QUOTE_NONE: quote
# characters inside the review text get no special treatment by the parser.
dataset = pd.read_csv(
    '../Dataset/Restaurant_Reviews.tsv',
    sep='\t',
    quoting=3,
)
dataset

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1.0
1,Crust is not good.,0.0
2,Not tasty and the texture was just nasty.,0.0
3,Stopped by during the late May bank holiday of...,1.0
4,The selection on the menu was great and so wer...,1.0
...,...,...
995,I think food should have flavor and texture an...,0.0
996,Appetite instantly gone.,0.0
997,Overall I was not impressed and would not go b...,0.0
998,"The whole experience was underwhelming, and I ...",0.0


#### Cleaning the Text

In [45]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to C:\Users\Amr
[nltk_data]     Ahmed\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [26]:
def tokenize_review(text):
    """Return the lowercase words of ``text``.

    Every character that is not an ASCII letter is replaced by a space first,
    so punctuation and digits act as word separators.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', text)
    return letters_only.lower().split()


# Preview the tokenised form of every review (the result is displayed, not stored).
dataset['Review'].apply(tokenize_review)

0                              [wow, loved, this, place]
1                                 [crust, is, not, good]
2      [not, tasty, and, the, texture, was, just, nasty]
3      [stopped, by, during, the, late, may, bank, ho...
4      [the, selection, on, the, menu, was, great, an...
                             ...                        
995    [i, think, food, should, have, flavor, and, te...
996                          [appetite, instantly, gone]
997    [overall, i, was, not, impressed, and, would, ...
998    [the, whole, experience, was, underwhelming, a...
999    [then, as, if, i, hadn, t, wasted, enough, of,...
Name: Review, Length: 1000, dtype: object

In [14]:
# Look at the raw text of the first review (index 0) before any cleaning.
dataset['Review'][0]

'Wow... Loved this place.'

In [28]:
# Walk through the cleaning steps on the first review only.
first_review_text = dataset['Review'][0]
letters_only = re.sub('[^a-zA-Z]', ' ', first_review_text)  # non-letters -> spaces
review = letters_only.lower().split()                       # lowercase word list
review

['wow', 'loved', 'this', 'place']

In [44]:
# Build the English stop-word set once instead of once per loop iteration.
english_stopwords = set(stopwords.words('english'))

# Print every word of the review that is not a stop word, one per line.
for word in review:
    if word not in english_stopwords:
        print(word)

# For comparison, print the full (unfiltered) review on a single line.
print('\n')
print(' '.join(review))

wow
loved
place


wow loved this place


In [38]:
# Stop-word filtering as a list comprehension. The stop-word set is built once
# up front rather than once per word.
english_stopwords = set(stopwords.words('english'))
[word for word in review if word not in english_stopwords]

['wow', 'loved', 'place']

In [50]:
# The Porter stemmer reduces each word to its stem (e.g. 'loved' -> 'love').
ps = PorterStemmer()

# Build the English stop-word set once instead of once per word.
english_stopwords = set(stopwords.words('english'))

# Drop stop words, then stem whatever remains.
final_review = [ps.stem(word) for word in review if word not in english_stopwords]
final_review

['wow', 'love', 'place']

In [54]:
# Join the stemmed tokens into one space-separated string. The result gets its
# own name so the cell is idempotent: rebinding `final_review` to the joined
# string would make a second run join that string character by character.
final_review_text = ' '.join(final_review)
final_review_text

'wow love place'

In [58]:
# Cleaned Reviews
# Each review is reduced to letters only, lowercased, stripped of English stop
# words, stemmed, and re-joined into one space-separated string.

# Build the stop-word set once; rebuilding it for every word of every review
# is loop-invariant work.
english_stopwords = set(stopwords.words('english'))
# NOTE(review): NLTK's English stop-word list presumably contains negations
# such as 'not'; removing them can flip the meaning of a review
# ("not good" -> "good"). Confirm and consider keeping negations.

corpus = []
for raw_review in dataset['Review']:
    tokens = re.sub('[^a-zA-Z]', ' ', raw_review).lower().split()
    stems = [ps.stem(token) for token in tokens if token not in english_stopwords]
    corpus.append(' '.join(stems))

#### Creating the Bag of Words Model

In [64]:
from sklearn.feature_extraction.text import CountVectorizer

# Limit the vocabulary to the 1500 most frequent terms across the corpus.
cv = CountVectorizer(max_features=1500)

# fit_transform() learns the vocabulary and returns a sparse document-term
# count matrix (one column per term); toarray() converts it to a dense array.
term_counts = cv.fit_transform(corpus)
X = term_counts.toarray()

# The target labels are the last column of the dataset.
y = dataset.iloc[:, -1].values

#### Splitting the dataset into the Training set and Test set

In [83]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed random_state makes the
# split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

#### Fitting Random Forest Classification to the Training set

In [90]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

# Random forest of 200 trees using the entropy split criterion; the fixed
# random_state keeps the fitted model repeatable.
classifier = RandomForestClassifier(
    n_estimators=200,
    criterion='entropy',
    random_state=0,
).fit(x_train, y_train)  # fit() returns the fitted estimator itself

# Predict on the held-out test set and report the confusion matrix and accuracy.
y_pred = classifier.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
print('accuracy score: ', accuracy_score(y_test, y_pred))

[[88  9]
 [47 56]]
accuracy score:  0.72
