# NATURAL LANGUAGE PROCESSING

In [12]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [3]:
# The reviews file is tab-separated. quoting=3 is csv.QUOTE_NONE, so quote
# characters inside a review are read as ordinary text.
dts = pd.read_csv(
    "Restaurant_Reviews.tsv",
    sep="\t",
    quoting=3,
)

## Cleaning the text

In [6]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

nltk.download("stopwords")

# Stop words are very common words (the, is, at, which, ...) that do not
# affect whether a review is positive or negative, so they are dropped.
# "not" is kept because it is removed from the stop-word list below.
all_stopwords = stopwords.words('english')
all_stopwords.remove("not")
# Built once, outside the per-review work, so membership tests are O(1)
# instead of rebuilding a set for every single word.
stop_word_set = set(all_stopwords)

# Stemming maps inflected words to a common root (e.g. "loved" -> "love"),
# which reduces the dimensionality of the sparse bag-of-words matrix.
ps = PorterStemmer()


def clean_review(text):
    """Return a cleaned, stemmed version of one review.

    Steps: replace every character that is not a-z / A-Z with a space,
    lower-case, split on whitespace, drop stop words, stem the remaining
    words, and join them back with single spaces.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        Space-separated stemmed tokens.
    """
    words = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    return " ".join(ps.stem(word) for word in words if word not in stop_word_set)


# Iterate over the column values directly rather than dts['Review'][i], which
# is a label lookup and would fail if the frame's index were not 0..n-1.
corpus = [clean_review(text) for text in dts['Review']]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Creating the Bag of Words
This step tokenizes each cleaned review and counts how often each word occurs, giving a bag-of-words representation of the corpus.

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

# Keep at most 1500 vocabulary terms when building the count features.
cv = CountVectorizer(max_features=1500)

# Learn the vocabulary from the cleaned corpus, then densify the sparse
# count matrix into a plain NumPy array.
bow_sparse = cv.fit_transform(corpus)
x = bow_sparse.toarray()

# The target is the last column of the data frame.
y = dts.iloc[:, -1].values

In [8]:
# Inspect the bag-of-words matrix built above (one row per cleaned review).
x

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

## Splitting the data into a training set and a test set

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the samples for evaluation; the fixed random_state makes
# the split repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=42,
)

## Training the Naive Bayes model on the training set

In [10]:
from sklearn.naive_bayes import GaussianNB

# NOTE(review): the features are word counts; it may be worth comparing
# against a multinomial Naive Bayes variant — TODO evaluate.
clf = GaussianNB()

# Left as the final expression so the notebook displays the fitted estimator.
clf.fit(X=x_train, y=y_train)

GaussianNB()

## Predicting test set results

In [13]:
y_pred = clf.predict(x_test)

# Show predictions (left column) side by side with the true labels (right column).
res = np.column_stack((y_pred, y_test))
print(res)

[[0 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [1 0]
 [1 1]
 [1 0]
 [0 0]
 [1 0]
 [1 0]
 [1 0]
 [1 1]
 [1 1]
 [0 0]
 [1 0]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [0 1]
 [1 0]
 [0 0]
 [0 1]
 [0 0]
 [1 0]
 [1 0]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [0 0]
 [1 1]
 [0 1]
 [0 0]
 [1 1]
 [0 0]
 [1 0]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [1 0]
 [1 0]
 [0 0]
 [1 0]
 [1 1]
 [1 1]
 [0 1]
 [0 0]
 [1 1]
 [0 0]
 [1 0]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [0 1]
 [1 1]
 [1 0]
 [1 1]
 [0 0]
 [1 1]
 [0 0]
 [1 0]
 [0 0]
 [0 0]
 [1 1]
 [1 0]
 [1 1]
 [1 1]
 [1 0]
 [1 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [1 0]
 [0 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [1 0]
 [1 0]
 [0 0]
 [0 0]
 [1 0]
 [1 1]
 [1 1]
 [1 0]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [1 0]
 [0 1]
 [1 0]
 [1 1]
 [0 1]
 [0 1]
 [1 1]
 [0 1]
 [1 1]
 [1 0]
 [0 0]
 [0 0]
 [0 1]
 [1 0]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [1 1]

## Making the confusion matrix

In [14]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Compare the held-out labels with the model's predictions.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

# Left as the final expression so the notebook displays the accuracy.
accuracy_score(y_true=y_test, y_pred=y_pred)

[[ 66  62]
 [ 18 104]]


0.68

## Predicting if a single review is positive or negative

### Positive review
Use our model to predict if the following review:

"I love this restaurant so much"

is positive or negative.


In [16]:
raw_review = "I love this restaurant so much"

# Apply the same cleaning used for the training corpus: strip non-letters,
# lower-case, drop stop words (keeping "not"), and stem.
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
all_stopwords.remove("not")
stop_word_set = set(all_stopwords)  # built once instead of once per token

tokens = re.sub('[^a-zA-Z]', ' ', raw_review).lower().split()
new_review = " ".join(ps.stem(word) for word in tokens if word not in stop_word_set)

# Reuse the already-fitted vectorizer (transform, not fit_transform) so the
# review is encoded with the vocabulary the classifier was trained on.
new_corpus = [new_review]
new_X = cv.transform(new_corpus).toarray()
new_pred = clf.predict(new_X)
print(new_pred)

[1]


### Negative review
Use our model to predict if the following review:

"I hate this restaurant so much"

is positive or negative.


In [17]:
raw_review = 'I hate this restaurant so much'

# Apply the same cleaning used for the training corpus: strip non-letters,
# lower-case, drop stop words (keeping "not"), and stem.
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
all_stopwords.remove("not")
stop_word_set = set(all_stopwords)  # built once instead of once per token

tokens = re.sub('[^a-zA-Z]', ' ', raw_review).lower().split()
new_review = " ".join(ps.stem(word) for word in tokens if word not in stop_word_set)

# Reuse the already-fitted vectorizer (transform, not fit_transform) so the
# review is encoded with the vocabulary the classifier was trained on.
new_corpus = [new_review]
new_X = cv.transform(new_corpus).toarray()
new_pred = clf.predict(new_X)
print(new_pred)

[0]
