In [44]:
#Natural Language Processing (NLP)
#Natural Language Processing is applying Machine Learning models to text and language.
#Teaching machines to understand what is said in spoken and written word is the focus of Natural Language Processing. 
#Whenever we dictate something into our device that is then converted to text, that’s an NLP algorithm in action.


In [45]:
#We can also use NLP on a text review to predict if the review is a good one or a bad one. 
#NLP can be used on an article to predict some categories of the articles we are trying to segment. 
#We can use NLP on a book to predict the genre of the book. And it can go further, we can use NLP to build a machine 
#translator or a speech recognition system, and in that last example we use classification algorithms to classify language. 
#Speaking of classification algorithms, most NLP algorithms are classification models, and they include 
#Logistic Regression, Naive Bayes, CART (a model based on decision trees), Maximum Entropy (closely related 
#to Logistic Regression), and Hidden Markov Models, which are models based on Markov processes.

#A very well-known model in NLP is the 'BAG OF WORDS MODEL'. It is used to preprocess the texts we want
#to classify, before fitting the classification algorithms on the observations containing those texts.


In [46]:
#Steps for implementing NLP
#1 - Clean the texts to prepare them for the Machine Learning models,
#2 - Create a Bag of Words model,
#3 - Apply Machine Learning models to this Bag of Words model.

In [69]:

#Importing the libraries
import csv

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#Importing the dataset
#The file is tab-separated (TSV) rather than comma-separated (CSV): the review text itself
#can contain commas, which would otherwise be mistaken for field separators.
#delimiter='\t' tells pandas that fields are separated by tabs.
#quoting=csv.QUOTE_NONE (the named constant for the value 3) makes the parser treat double
#quotes as ordinary characters instead of as field-quoting markers.
dataset = pd.read_csv('Restaurant_Reviews.tsv', delimiter='\t', quoting=csv.QUOTE_NONE)


In [70]:
#STEP 1 - Cleaning the texts
#Every review is reduced to a string of lower-case, stemmed, non-stopword tokens:
#  - stopwords such as 'the', 'is', 'a' are dropped;
#  - stemming maps a word onto its root (e.g. 'loved' -> 'love').
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

#Requires the NLTK 'stopwords' corpus; if it is missing, run nltk.download('stopwords') once.

#Both objects are loop-invariant, so they are built once instead of once per word / per review.
#A set is used because membership tests are faster on a set than on a list.
english_stopwords = set(stopwords.words('english'))
ps = PorterStemmer()

#corpus collects the cleaned version of every review, in dataset order
corpus = []
#Iterating over the column itself (rather than a hard-coded range(0, 1000)) processes every
#row, so corpus always has as many entries as the dataset has reviews.
for raw_review in dataset['Review']:
    #replace every character that is not a letter a-z/A-Z (punctuation such as '...', digits) with a space
    review = re.sub('[^a-zA-Z]', ' ', raw_review)
    #conversion of all letters to lower case
    review = review.lower()
    #split the string into a list of words so that individual words can be filtered
    review = review.split()
    #drop the stopwords and stem the words that remain
    #(without stemming: review = [word for word in review if word not in english_stopwords])
    review = [ps.stem(word) for word in review if word not in english_stopwords]
    #join the remaining words back into a single space-separated string
    review = ' '.join(review)
    corpus.append(review)

In [75]:
#NOTE(review): this cell rebuilds corpus from scratch with the same cleaning steps as the
#previous cell - consider deleting one of the two.
#It relies on re, stopwords and PorterStemmer imported in the previous cell.

#Loop-invariant objects are created once, not once per word / per review.
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

corpus = []
#Walk the whole 'Review' column instead of a hard-coded range(0, 1000), so that every row
#of the dataset gets exactly one entry in corpus.
for text in dataset['Review']:
    #keep only the letters a-z/A-Z; everything else (e.g. '...') becomes a space
    review = re.sub('[^a-zA-Z]', ' ', text)
    #lower-case the text and split it into a list of words
    review = review.lower().split()
    #remove the stopwords, then stem each remaining word
    review = [ps.stem(word) for word in review if word not in stop_words]
    #join the words back into one string and store it
    review = ' '.join(review)
    corpus.append(review)


In [77]:
#STEP 2 - Creating the Bag of Words model
#Tokenization gives every distinct word of the corpus its own column; each row is one review
#and each cell holds how many times that word occurs in that review.
from sklearn.feature_extraction.text import CountVectorizer

#max_features=1500 keeps only the 1500 most frequent words, filtering out the rare ones
cv = CountVectorizer(max_features=1500)
#learn the vocabulary and count the words of every review (the result is a sparse matrix)
word_counts = cv.fit_transform(corpus)
#x is the matrix of independent variables, converted to a regular (dense) array
x = word_counts.toarray()
#y is the dependent variable vector, taken from the second column of the dataset
y = dataset.iloc[:, 1].values
#The number of columns (and hence the sparsity) can be reduced by dimensionality reduction
#or by setting max_features to a lower value.


In [78]:
#STEP 3 - Classification
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix

#NOTE(review): neither encoder is used in the cells shown here; kept in case a later cell relies on it
labelencoder_x = LabelEncoder()

#Hold out 20% of the reviews as the test set; random_state=0 makes the split reproducible
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=0)

#Fitting Naive Bayes to the training set
classifier = GaussianNB()
classifier.fit(x_train, y_train)

#Predicting the test set results
y_pred = classifier.predict(x_test)

#Making the confusion matrix (correct vs incorrect predictions that our model made).
#confusion_matrix is a function: rows are the true labels, columns are the predicted labels.
cm = confusion_matrix(y_test, y_pred)
#only the last expression of a cell is displayed, so cm is the cell's output
cm

array([[55, 42],
       [12, 91]])