In [1]:
# March 8th 2019 

# Import libraries 
import numpy as np 
import matplotlib.pyplot as plt 
import pandas as pd 
import os 

# Set working directory
os.chdir('/Users/amandahutter/Documents/PythonCode/Udemy/MachineLearningA-Z/Part 7 - Natural Language Processing/Section 36 - Natural Language Processing') 
dataset = pd.read_csv("Restaurant_Reviews.tsv", delimiter = '\t', quoting = 3)
print(dataset.head())

# Using CSV file will be difficult because the reviews will have commas in them 

                                              Review  Liked
0                           Wow... Loved this place.      1
1                                 Crust is not good.      0
2          Not tasty and the texture was just nasty.      0
3  Stopped by during the late May bank holiday of...      1
4  The selection on the menu was great and so wer...      1


# Cleaning the text > Remove irrelevant words that do not help ML algorithm predict if Negative or Positive 


In [2]:
# Used for regular expressions 
import re 
# Import NLTK to assist with removing the non important words 
import nltk 

from nltk.corpus import stopwords 
from nltk.stem.porter import PorterStemmer

nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/amandahutter/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
# Remove the non important words 
review = re.sub(pattern = '[^a-zA-Z]', repl = ' ', string = dataset['Review'][0]) 
print(review)

# Lowercase words 
review = review.lower()
print(review)

# Split string into words 
review = review.split()
print(review)

# Make instance of this class because it will stem our words together by removing suffixes/prefixes 
ps = PorterStemmer()

# Keep the words in the list that are not in the Stopwords list 
review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
print(review)

# Convert back to string 
review = ' '.join(review)
print(review)


Wow    Loved this place 
wow    loved this place 
['wow', 'loved', 'this', 'place']
['wow', 'love', 'place']
wow love place


In [4]:
# Loop through all reviews in your dataset and do above process 

# corpus refers to the cleaned reviews
corpus= []

for i in range (0, len(dataset)):
    review = re.sub(pattern = '[^a-zA-Z]', repl = ' ', string = dataset['Review'][i])
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)
print(corpus[1:15])

['crust good', 'tasti textur nasti', 'stop late may bank holiday rick steve recommend love', 'select menu great price', 'get angri want damn pho', 'honeslti tast fresh', 'potato like rubber could tell made ahead time kept warmer', 'fri great', 'great touch', 'servic prompt', 'would go back', 'cashier care ever say still end wayyy overpr', 'tri cape cod ravoli chicken cranberri mmmm', 'disgust pretti sure human hair']


# Pick most popular words to use 


In [5]:
from sklearn.feature_extraction.text import CountVectorizer
count_vec = CountVectorizer()

# create sparse matrix containing matrix of features, .toarray() makes this a matrix 
X = count_vec.fit_transform(corpus).toarray() 

print('This is the shape of the sparse matrix', X.shape)
print("That is 1565 unique words")

# Give an upper limit of unique words 
count_vec = CountVectorizer(max_features = 1500)
X = count_vec.fit_transform(corpus).toarray() 
print('This is the shape of the new sparse matrix', X.shape)
print("We have limited to 1500 most common words.")

print(type(X))
print("Each row is one review, each column is whether a given word from our bag of words is in that review")
print(X[1:10, 1:30])

This is the shape of the sparse matrix (1000, 1565)
That is 1565 unique words
This is the shape of the new sparse matrix (1000, 1500)
We have limited to 1500 most common words.
<class 'numpy.ndarray'>
Each row is one review, each column is whether a given word from our bag of words is in that review
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]


### Create Depedent Variable Vector. With known sentiment that are Positive or Negative 

In [6]:
Y = dataset.iloc[:, 1].values
print(type(Y))
print(Y.shape)
print("These are sentiments:1 is for positive sentiment, 0 is for negative sentiment, from known relationships with words ")
print(Y[1:25,])

<class 'numpy.ndarray'>
(1000,)
These are sentiments:1 is for positive sentiment, 0 is for negative sentiment, from known relationships with words 
[0 0 1 1 0 0 0 1 1 1 0 0 1 0 0 1 0 0 0 0 1 1 1 1]


# Pick a Classification Model to use, test for false positives

### The most common classification models to use are:
### - Naive Bayes
### - Decision Tree 
### - Random Forest 

## Naive Bayes: 

In [7]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_Train, X_Test, Y_Train, Y_Test = train_test_split(X, Y, test_size = 0.20, random_state = 0)

# Fit Naive Bayes Classifier to the training data set  
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_Train, Y_Train)

# Predict the DV Test set using the classifer 
Y_Pred = classifier.predict(X_Test)
print("Predictions based on Y Test characteristics: \n", Y_Pred)  
print("The actual Y Test: \n", Y_Test)

# Making the confusion matrix 
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_true = Y_Test, y_pred = Y_Pred) # create an instance of confusion matrix class 
print("The confusion matrix: \n", cm)

TP = cm[0, 0]
FP = cm[0, 1]
FN = cm[1, 0]
TN = cm[1, 1]

Accuracy = (TP + TN) / (TP + TN + FP + FN)

print("Accuracy is:", Accuracy)

Precision = TP / (TP + FP)

print("Precision is:", Precision)

Recall = TP / (TP + FN)

print("Recall is:", Recall)

FScore = (2*Precision*Recall/(Precision + Recall))

print("Fscore is:", FScore)

Predictions based on Y Test characteristics: 
 [1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 0 0 0 1 0 0 1 1 1 0 1 1 1 0 1 1 1 1 1 0 1
 0 1 1 1 1 1 0 0 0 1 1 0 0 1 1 1 1 1 0 1 1 0 1 1 0 1 1 1 0 1 1 1 1 1 1 1 1
 0 1 1 0 0 1 0 1 1 0 1 1 1 0 1 1 0 1 0 0 1 1 1 1 1 1 0 1 1 1 0 1 1 1 0 0 0
 1 0 1 1 0 1 1 1 1 1 0 1 1 0 0 1 1 0 1 1 1 0 0 1 1 1 1 1 1 0 1 1 0 1 0 1 1
 1 1 1 0 1 1 1 0 1 1 1 1 1 0 0 1 0 0 1 0 0 0 0 1 1 0 0 1 0 1 0 0 1 0 0 1 0
 1 0 1 0 1 1 0 1 1 1 0 1 1 1 1]
The actual Y Test: 
 [0 0 0 0 0 0 1 0 0 1 1 1 0 1 1 1 0 0 0 1 0 1 1 0 0 1 1 1 1 0 1 1 1 1 1 0 0
 0 0 1 1 0 1 0 0 0 0 0 0 0 1 1 1 1 0 0 1 1 0 1 0 0 0 0 1 0 1 1 1 0 1 1 1 1
 0 0 1 1 0 1 0 1 1 0 1 1 0 0 1 0 0 1 0 0 0 1 0 1 1 0 1 1 1 0 1 0 1 1 0 1 1
 1 0 0 1 0 1 1 1 1 1 0 1 0 0 0 1 0 0 1 0 1 0 0 1 1 1 1 1 0 1 1 1 0 0 0 0 1
 1 1 1 1 1 1 0 0 1 1 1 0 0 0 1 1 0 0 0 0 0 1 0 1 1 0 0 1 0 1 0 1 1 0 0 0 0
 1 0 1 0 1 1 0 0 0 1 0 1 1 0 1]
The confusion matrix: 
 [[55 42]
 [12 91]]
Accuracy is: 0.73
Precision is: 0.5670103092783505
Recall is: 0.82089552238

## Decision Tree

In [8]:
# Splitting the dataset into the Training set and Test set
from sklearn.cross_validation import train_test_split
X_Train, X_Test, Y_Train, Y_Test = train_test_split(X, Y, test_size = 0.20, random_state = 0)

# Fit 
from sklearn.tree import DecisionTreeClassifier 
classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifier.fit(X_Train, Y_Train)

# Predict 
Y_Pred = classifier.predict(X_Test)
print("Predictions based on Y Test characteristics: \n", Y_Pred)  
print("The actual Y Test: \n", Y_Test)


# Confusion Matrix 
from sklearn.metrics import confusion_matrix 
cm = confusion_matrix(Y_Test, Y_Pred)
print(cm)

TP = cm[0, 0]
FP = cm[0, 1]
FN = cm[1, 0]
TN = cm[1, 1]

Accuracy = (TP + TN) / (TP + TN + FP + FN)

print("Accuracy is:", Accuracy)

Precision = TP / (TP + FP)

print("Precision is:", Precision)

Recall = TP / (TP + FN)

print("Recall is:", Recall)

FScore = (2*Precision*Recall/(Precision + Recall))

print("Fscore is:", FScore)



Predictions based on Y Test characteristics: 
 [0 0 1 0 1 0 1 0 0 1 1 1 1 1 1 1 1 0 0 1 0 0 1 0 0 1 1 1 1 0 1 1 0 1 1 0 0
 0 0 1 1 1 1 0 0 0 1 1 1 0 0 0 0 1 1 1 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1 1 0 0
 0 0 0 1 0 1 1 0 1 0 1 1 0 0 1 0 0 1 0 0 0 0 0 0 1 1 0 1 1 0 0 0 0 1 0 0 0
 1 0 0 0 1 1 1 1 1 0 0 1 1 0 0 0 1 0 0 0 1 0 0 1 1 0 1 1 0 0 1 1 0 0 0 1 1
 0 1 0 0 1 1 0 0 1 0 1 1 0 1 1 0 1 0 0 1 0 0 0 1 1 0 0 1 0 1 0 1 1 1 0 0 0
 0 1 1 0 1 1 1 0 0 1 0 1 1 0 0]
The actual Y Test: 
 [0 0 0 0 0 0 1 0 0 1 1 1 0 1 1 1 0 0 0 1 0 1 1 0 0 1 1 1 1 0 1 1 1 1 1 0 0
 0 0 1 1 0 1 0 0 0 0 0 0 0 1 1 1 1 0 0 1 1 0 1 0 0 0 0 1 0 1 1 1 0 1 1 1 1
 0 0 1 1 0 1 0 1 1 0 1 1 0 0 1 0 0 1 0 0 0 1 0 1 1 0 1 1 1 0 1 0 1 1 0 1 1
 1 0 0 1 0 1 1 1 1 1 0 1 0 0 0 1 0 0 1 0 1 0 0 1 1 1 1 1 0 1 1 1 0 0 0 0 1
 1 1 1 1 1 1 0 0 1 1 1 0 0 0 1 1 0 0 0 0 0 1 0 1 1 0 0 1 0 1 0 1 1 0 0 0 0
 1 0 1 0 1 1 0 0 0 1 0 1 1 0 1]
[[74 23]
 [35 68]]
Accuracy is: 0.71
Precision is: 0.7628865979381443
Recall is: 0.6788990825688074
Fscore is: 0.71844

## Random Forest 

In [9]:
# Split the data 
from sklearn.model_selection import train_test_split 
X_Train, X_Test, Y_Train, Y_Test = train_test_split(X, Y, test_size = 0.20, random_state = 0 )

from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
classifier.fit(X_Train, Y_Train)

# Predict based on the X Test set that was witheld from the fitted data 
Y_Pred = classifier.predict(X_Test)
print("Predictions based on Y Test characteristics: \n", Y_Pred)  
print("The actual Y Test: \n", Y_Test)

from sklearn.metrics import confusion_matrix 
cm = confusion_matrix(Y_Test, Y_Pred)
print(cm)

TP = cm[0, 0]
FP = cm[0, 1]
FN = cm[1, 0]
TN = cm[1, 1]

Accuracy = (TP + TN) / (TP + TN + FP + FN)

print("Accuracy is:", Accuracy)

Precision = TP / (TP + FP)

print("Precision is:", Precision)

Recall = TP / (TP + FN)

print("Recall is:", Recall)

FScore = (2*Precision*Recall/(Precision + Recall))

print("Fscore is:", FScore)

Predictions based on Y Test characteristics: 
 [0 0 0 0 0 0 1 0 0 1 1 1 1 1 1 1 0 0 0 1 0 1 1 0 0 1 0 1 1 0 0 0 0 1 1 0 0
 0 0 1 1 0 0 0 0 0 1 1 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 1 1 0 0
 0 0 0 1 0 1 0 1 1 0 1 1 0 1 1 0 0 1 0 0 0 0 0 0 0 1 0 1 1 1 1 0 0 1 0 0 0
 0 0 0 0 0 1 0 1 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 1 1 1 1 1 0 0 1 0 0 0 0 0 0
 0 0 0 0 1 1 0 0 1 0 1 0 0 0 1 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 1 1 0 0 0 0
 0 0 0 0 1 1 0 0 0 0 0 1 1 0 0]
The actual Y Test: 
 [0 0 0 0 0 0 1 0 0 1 1 1 0 1 1 1 0 0 0 1 0 1 1 0 0 1 1 1 1 0 1 1 1 1 1 0 0
 0 0 1 1 0 1 0 0 0 0 0 0 0 1 1 1 1 0 0 1 1 0 1 0 0 0 0 1 0 1 1 1 0 1 1 1 1
 0 0 1 1 0 1 0 1 1 0 1 1 0 0 1 0 0 1 0 0 0 1 0 1 1 0 1 1 1 0 1 0 1 1 0 1 1
 1 0 0 1 0 1 1 1 1 1 0 1 0 0 0 1 0 0 1 0 1 0 0 1 1 1 1 1 0 1 1 1 0 0 0 0 1
 1 1 1 1 1 1 0 0 1 1 1 0 0 0 1 1 0 0 0 0 0 1 0 1 1 0 0 1 0 1 0 1 1 0 0 0 0
 1 0 1 0 1 1 0 0 0 1 0 1 1 0 1]
[[87 10]
 [46 57]]
Accuracy is: 0.72
Precision is: 0.8969072164948454
Recall is: 0.6541353383458647
Fscore is: 0.75652

  from numpy.core.umath_tests import inner1d


In [10]:
# Split the data 
from sklearn.model_selection import train_test_split 
X_Train, X_Test, Y_Train, Y_Test = train_test_split(X, Y, test_size = 0.20, random_state = 0 )

from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'gini', random_state = 0)
classifier.fit(X_Train, Y_Train)

# Predict based on the X Test set that was witheld from the fitted data 
Y_Pred = classifier.predict(X_Test)
print("Predictions based on Y Test characteristics: \n", Y_Pred)  
print("The actual Y Test: \n", Y_Test)

from sklearn.metrics import confusion_matrix 
cm = confusion_matrix(Y_Test, Y_Pred)
print(cm)

TP = cm[0, 0]
FP = cm[0, 1]
FN = cm[1, 0]
TN = cm[1, 1]

Accuracy = (TP + TN) / (TP + TN + FP + FN)

print("Accuracy is:", Accuracy)

Precision = TP / (TP + FP)

print("Precision is:", Precision)

Recall = TP / (TP + FN)

print("Recall is:", Recall)

FScore = (2*Precision*Recall/(Precision + Recall))

print("Fscore is:", FScore)

Predictions based on Y Test characteristics: 
 [0 0 1 0 1 0 1 0 0 1 0 1 1 0 1 1 1 0 0 0 0 1 1 0 0 1 0 1 1 0 0 0 0 1 1 0 0
 0 0 1 1 0 0 0 0 0 1 1 1 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 1 0 1 1 1 0 0
 0 0 0 1 0 0 0 1 1 0 1 1 1 0 1 0 1 1 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 1 0 0 0
 0 0 0 0 0 1 0 1 1 0 0 1 0 0 0 0 1 0 0 0 1 0 0 1 1 1 1 1 0 0 1 0 0 0 0 0 0
 0 1 0 0 1 1 0 0 1 0 1 0 0 1 1 1 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 1 1 0 0 0 0
 0 0 0 0 1 0 0 0 0 1 0 1 1 0 0]
The actual Y Test: 
 [0 0 0 0 0 0 1 0 0 1 1 1 0 1 1 1 0 0 0 1 0 1 1 0 0 1 1 1 1 0 1 1 1 1 1 0 0
 0 0 1 1 0 1 0 0 0 0 0 0 0 1 1 1 1 0 0 1 1 0 1 0 0 0 0 1 0 1 1 1 0 1 1 1 1
 0 0 1 1 0 1 0 1 1 0 1 1 0 0 1 0 0 1 0 0 0 1 0 1 1 0 1 1 1 0 1 0 1 1 0 1 1
 1 0 0 1 0 1 1 1 1 1 0 1 0 0 0 1 0 0 1 0 1 0 0 1 1 1 1 1 0 1 1 1 0 0 0 0 1
 1 1 1 1 1 1 0 0 1 1 1 0 0 0 1 1 0 0 0 0 0 1 0 1 1 0 0 1 0 1 0 1 1 0 0 0 0
 1 0 1 0 1 1 0 0 0 1 0 1 1 0 1]
[[82 15]
 [48 55]]
Accuracy is: 0.685
Precision is: 0.845360824742268
Recall is: 0.6307692307692307
Fscore is: 0.72246

In [11]:
# In future, try these: 
# CART - sklearn is an optimized version of this 
# C5.0 - extension of the ID3 algorithm, same as C4.5
# Max Entropy - depreciated 