# Libraries

In [116]:
# Numerical / tabular / plotting stack.
# NOTE(review): np and plt are not referenced in the cells visible here.
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

# for text cleaning
import nltk 
# Fetches the NLTK stop-word corpus into the local nltk_data directory;
# when it is already present only an "up-to-date" message is printed.
nltk.download('stopwords')
from nltk.corpus import stopwords
import re

# Stemming 
from nltk.stem.porter import PorterStemmer
# One shared stemmer instance, reused for every token in the cleaning loop.
ps = PorterStemmer()

# bow model
from sklearn.feature_extraction.text import CountVectorizer

# train test split
from sklearn.model_selection import train_test_split


# classifier
from sklearn.naive_bayes import GaussianNB

# metrics
from sklearn.metrics import accuracy_score

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/krieger/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# 1. Load Data

In [84]:
# The reviews live in a tab-separated file. quoting=3 is csv.QUOTE_NONE, so
# quote characters inside a review are kept as literal text instead of being
# interpreted as field delimiters.
# NOTE(review): this is an absolute, machine-specific path.
reviews_path = '/Users/krieger/Desktop/NLP/review_classification/Restaurant_Reviews.tsv'
df = pd.read_csv(reviews_path, sep='\t', quoting=3)
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [85]:
# Class-balance check: count the reviews under each `Liked` label
# (the two classes come out equally sized, i.e. the dataset is balanced).
label_counts = df['Liked'].value_counts()
label_counts

1    500
0    500
Name: Liked, dtype: int64

# 2. Data Preparation

In [86]:
stopwords = stopwords.words('english')

In [87]:
corpus = []

for i in range(len(df)):
    review = re.sub('[^a-zA-Z]', ' ', df['Review'][i])
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if word not in stopwords]
    review = ' '.join(review)
    
    corpus.append(review)

In [89]:
# Sanity check: the loop appends one cleaned string per review row.
len(corpus)

1000

# 3. Bag-of-Words Model

In [91]:
# Cap the vocabulary size: only the most frequent terms across the corpus
# are kept as feature columns.
MAX_FEATURES = 1500
cv = CountVectorizer(max_features=MAX_FEATURES)

In [104]:
# Learn the vocabulary and count term occurrences per review, then turn the
# sparse document-term matrix into a dense NumPy array.
# NOTE(review): the vocabulary is fitted on the whole corpus, i.e. before the
# train/test split performed further down.
bow_sparse = cv.fit_transform(corpus)
X = bow_sparse.toarray()

In [105]:
# (number of reviews, number of vocabulary terms kept by the vectorizer)
X.shape

(1000, 1500)

In [102]:
# Target vector: the `Liked` label of every review.
# The column is selected by name rather than by position so the cell keeps
# working if columns are added or reordered, and so it matches the
# label-based access used for the class-balance check.
y = df['Liked'].to_numpy()
y.shape

(1000,)

# 4. Apply Naive Bayes Algorithm

In [108]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle, and therefore the split, repeatable between runs.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)
X_train, X_test, y_train, y_test = split_parts

In [109]:
# Feature matrices after the split: both keep the same number of columns.
X_train.shape, X_test.shape

((800, 1500), (200, 1500))

In [110]:
# Label vectors after the split: lengths should match the row counts of
# X_train and X_test respectively.
y_train.shape, y_test.shape

((800,), (200,))

In [112]:
# Gaussian Naive Bayes with its default hyper-parameters.
# NOTE(review): the features are word counts; scikit-learn's MultinomialNB is
# the variant documented for discrete count features — worth comparing.
classifier = GaussianNB() 

In [113]:
# Fit on the training portion only; the held-out rows are not passed here.
classifier.fit(X_train, y_train)

GaussianNB()

In [115]:
# Predicted labels for the held-out reviews.
y_pred = classifier.predict(X_test)

In [118]:
# Fraction of held-out reviews whose predicted label equals the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.73