## Natural Language Processing

#### Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

#### Importing the dataset

In [2]:
# Load the reviews from a tab-separated file. quoting=3 is csv.QUOTE_NONE, so
# quote characters inside the review text are read as ordinary characters.
dataset = pd.read_csv(
    "Restaurant_Reviews.tsv",
    sep="\t",
    quoting=3,
)

#### Cleaning the texts

In [3]:
import re  # regular expressions, used to strip non-letter characters
import nltk  # provides the English stop-word list
nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer  # stems words to their root, e.g. "played" -> "play"

# Build the loop-invariant helpers once instead of once per review / per word.
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
# "not" is taken out of the stop-word list so it survives the filtering below
# (presumably because negation changes the meaning of a review).
all_stopwords.remove("not")
stopword_set = set(all_stopwords)  # set gives constant-time membership checks

# corpus[k] is the cleaned text of the k-th review: lower-cased, letters only,
# stop words dropped, remaining words stemmed and re-joined with single spaces.
corpus = []
# Iterate over every review actually present rather than a hard-coded 1000 rows,
# so corpus always has one entry per dataset row.
for raw_review in dataset['Review']:
    review = re.sub('[^a-zA-Z]', ' ', raw_review)  # anything that is not an ASCII letter becomes a space
    review = review.lower()
    review = review.split()  # list of words
    review = [ps.stem(word) for word in review if word not in stopword_set]
    review = " ".join(review)
    corpus.append(review)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Asit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### Creating the bag of words model

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words model: the vocabulary is capped at the 1500 most frequent terms.
cv = CountVectorizer(max_features=1500)
# fit_transform yields a sparse matrix; it is converted to a dense array because
# the Naive Bayes model trained later in this notebook is fed an array.
term_counts = cv.fit_transform(corpus)
X = term_counts.toarray()
# The target is read from the last column of the dataset.
y = dataset.iloc[:, -1].values

#### Splitting the dataset into training and test sets

In [11]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for testing; random_state pins the shuffle so
# the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

#### Training the Naive Bayes Model on the training set

In [12]:
from sklearn.naive_bayes import GaussianNB
# Fit a Gaussian Naive Bayes classifier on the training split.
# NOTE(review): the features are word counts; MultinomialNB is the variant
# usually used for count data -- worth comparing, not changed here.
classifier = GaussianNB().fit(X_train, y_train)  # fit() returns the estimator itself
classifier  # echo the fitted estimator as the cell output

GaussianNB()

#### Predicting the test set results

In [13]:
# Predicted label for every sample in the held-out test split.
y_pred = classifier.predict(X_test)

In [18]:
# Two-column view: predictions on the left, true labels on the right.
np.column_stack((y_pred, y_test))

array([[1, 0],
       [1, 0],
       [1, 0],
       [0, 0],
       [0, 0],
       [1, 0],
       [1, 1],
       [1, 0],
       [1, 0],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 0],
       [1, 1],
       [1, 1],
       [1, 1],
       [0, 0],
       [0, 0],
       [0, 0],
       [1, 1],
       [0, 0],
       [0, 1],
       [1, 1],
       [1, 0],
       [1, 0],
       [0, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [0, 0],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [0, 0],
       [1, 0],
       [0, 0],
       [1, 0],
       [1, 1],
       [1, 1],
       [1, 0],
       [1, 1],
       [0, 0],
       [0, 0],
       [0, 0],
       [1, 0],
       [1, 0],
       [0, 0],
       [0, 0],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 0],
       [0, 0],
       [1, 1],
       [1, 1],
       [0, 0],
       [1, 1],
       [1, 0],
       [0, 0],
       [1, 0],
       [1, 0],
       [1, 1],
       [0, 0],
       [1,

#### Making the Confusion Matrix

In [17]:
from sklearn.metrics import confusion_matrix, accuracy_score
# Rows of cm are the true classes, columns are the predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
# Fraction of test samples classified correctly (shown as the cell output).
accuracy_score(y_true=y_test, y_pred=y_pred)

0.73

In [16]:
# Display the confusion matrix computed with confusion_matrix(y_test, y_pred).
cm

array([[55, 42],
       [12, 91]], dtype=int64)