In [12]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

%matplotlib inline

- csv: delimited by a comma
- tsv: delimited by a tab
- The bad thing about using a tab is that it'll consider all the commas and not the specific commas that delimites the dataset!
- In this case,we would rarely find a tab in a review (which is the datset that we are looking at)!

In [13]:
# Importing the dataset
# Delimitted by tabs, and ignore the double quotes (just in case)
df = pd.read_csv("../../archive/Restaurant_Reviews.tsv", delimiter='\t', quoting=3)

In [14]:
# Quick look on the dataset
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [15]:
# We will creating a bag of words to bette understand words that are signficant
# We will apply stemming (to have fewer group by grouping past, present, and future tense words)

In [16]:
import nltk

# Downloaded all the words that are considered an article!
# nltk.download('stopwords')

# Eliminating article words (the, that, this)
from nltk.corpus import stopwords

# Grouping words that are part of the same family (loved, love, will love)
# bc their 'stem' value is identical and we need to optimize the model (too much words can be inefficient)
from nltk.stem.porter import PorterStemmer

In [17]:
# Cleaning the dataframe (removing everything except letters) for one of the examples
import re

# Define: we do not want to remove characters that are in a-z & A-Z
review = re.sub('[^a-zA-Z]', ' ', df.loc[0, 'Review']) # The second parameter indicates the replacement value

# We will change the reviews to lowercase
review = review.lower()

# Separating the review into words
review = review.split()

# Comparing the stopwwords with the words in our dataset (This should be commented out 
# The next if statement is used instead!

# review = [word for word in review if not word in set(stopwords.words('english'))]

In [18]:
# Grouping similar words
ps = PorterStemmer()

review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
# loved became love! Keeping the root of the word!

In [19]:
# Joining the new set of words
review = ' '.join(review)

In [20]:
# We created the frameword (let create a for loop)
corpus = [] # A collection of texts 

for index, row in df.iterrows():
    review = row['Review']
    review = re.sub('[^a-zA-Z]', ' ', review) # The second parameter indicates the replacement value
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)


In [25]:
# NLP: Tokenazation: Using the words of each review and making it a unique column
# We will predict if a review will be negative or positive (since we have the results!)
# Independent Variables: Each of the word
# This is classification! We have indepdent variables to train a binary outcome (which is this example)
# Bc we will be using each word as indepdent variable, this is the reason we needed to clean up the data!

# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer

# Removing the less relevant words (keeping 1500)
cv = CountVectorizer(max_features=1500)
Xs = cv.fit_transform(corpus).toarray() # (1000 rows, 1565 words)

y = df.loc[:, 'Liked'].values

In [22]:
# Must decide which model will help us train the model
# We will use the Bayes Theorem

In [26]:
# Splitting the data set into training and testing

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(Xs, y, test_size=0.2, random_state=0)

In [None]:
from sklearn.metrics import confusion_matrix


### Naive Bayes

In [61]:
# Fitting the classifier to the training set

from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

GaussianNB(priors=None)

In [62]:
# Predicting the values of the testing set
y_pred = classifier.predict(X_test)

In [63]:
# Checking for the results

cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()

total = tn+fp+fn+tp
accuracy = (tn+tp)/total
precision = tp/(tp+fp)
recall = tp/(tp+fn)
f1_score = 2*precision*recall/(precision+recall)


print(
    "true pos: {0}\n"
    "false pos: {1}\n"
    "true neg: {2}\n"
    "false neg: {3}\n".format(tp, fp, tn, fn))

print("""
Out of {0} reviews, the model got {1} correct,
Accuacy is: {2:.2f}%
Precision is: {3:.2f}%
Recall is: {4:.2f}%
F1 Score is: {5:.2f}%""".format(total, tn+tp, accuracy, precision, recall, f1_score))

true pos: 91
false pos: 42
true neg: 55
false neg: 12


Out of 200 reviews, the model got 146 correct,
Accuacy is: 0.73%
Precision is: 0.68%
Recall is: 0.88%
F1 Score is: 0.77%


### Decisions Trees

In [55]:
# Fitting the classifier to the training set

from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)
classifier.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')

In [56]:
# Predicting the values of the testing set
y_pred = classifier.predict(X_test)

In [60]:
# Checking for the results

cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()

total = tn+fp+fn+tp
accuracy = (tn+tp)/total
precision = tp/(tp+fp)
recall = tp/(tp+fn)
f1_score = 2*precision*recall/(precision+recall)


print(
    "true pos: {0}\n"
    "false pos: {1}\n"
    "true neg: {2}\n"
    "false neg: {3}\n".format(tp, fp, tn, fn))

print("""
Out of {0} reviews, the model got {1} correct,
Accuacy is: {2:.2f}%
Precision is: {3:.2f}%
Recall is: {4:.2f}%
F1 Score is: {5:.2f}%""".format(total, tn+tp, accuracy, precision, recall, f1_score))

true pos: 68
false pos: 23
true neg: 74
false neg: 35


Out of 200 reviews, the model got 142 correct,
Accuacy is: 0.71%
Precision is: 0.75%
Recall is: 0.66%
F1 Score is: 0.70%


### Random Forest

In [84]:
# Fitting the classifier to the training set

from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=10, criterion='entropy', max_depth=4, random_state=0)
classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=4, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [85]:
# Predicting the values of the testing set
y_pred = classifier.predict(X_test)

In [86]:
# Checking for the results

cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()

total = tn+fp+fn+tp
accuracy = (tn+tp)/total
precision = tp/(tp+fp)
recall = tp/(tp+fn)
f1_score = 2*precision*recall/(precision+recall)


print(
    "true pos: {0}\n"
    "false pos: {1}\n"
    "true neg: {2}\n"
    "false neg: {3}\n".format(tp, fp, tn, fn))

print("""
Out of {0} reviews, the model got {1} correct,
Accuacy is: {2:.2f}%
Precision is: {3:.2f}%
Recall is: {4:.2f}%
F1 Score is: {5:.2f}%""".format(total, tn+tp, accuracy, precision, recall, f1_score))

true pos: 40
false pos: 4
true neg: 93
false neg: 63


Out of 200 reviews, the model got 133 correct,
Accuacy is: 0.67%
Precision is: 0.91%
Recall is: 0.39%
F1 Score is: 0.54%


### SVMS (Kernel)

In [67]:
# Fitting the classifier to the training set

from sklearn.svm import SVC
classifier = SVC(kernel='linear', random_state=0)
classifier.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=0, shrinking=True,
  tol=0.001, verbose=False)

In [68]:
# Predicting the values of the testing set
y_pred = classifier.predict(X_test)

In [74]:
from sklearn.metrics import confusion_matrix

# We will be looking at other kernels
kernels = ['linear', 'poly', 'rbf', 'sigmoid']

for kernel in kernels:
    classifier = SVC(kernel=kernel, random_state=0)
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    
    # Checking for the results
    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()
    
    total = tn+fp+fn+tp
    accuracy = (tn+tp)/total
    precision = tp/(tp+fp)
    recall = tp/(tp+fn)
    f1_score = 2*precision*recall/(precision+recall)
    
    print("true pos: {0}\n"
          "false pos: {1}\n"
          "true neg: {2}\n"
          "false neg: {3}\n".format(tp, fp, tn, fn))
    
    print("""
    Out of {0} reviews, the model got {1} correct,
    Accuacy is: {2:.2f}%
    Precision is: {3:.2f}%
    Recall is: {4:.2f}%
    F1 Score is: {5:.2f}%""".format(total, tn+tp, accuracy, precision, recall, f1_score))

true pos: 70
false pos: 23
true neg: 74
false neg: 33


    Out of 200 reviews, the model got 144 correct,
    Accuacy is: 0.72%
    Precision is: 0.75%
    Recall is: 0.68%
    F1 Score is: 0.71%




true pos: 0
false pos: 0
true neg: 97
false neg: 103


    Out of 200 reviews, the model got 97 correct,
    Accuacy is: 0.48%
    Precision is: nan%
    Recall is: 0.00%
    F1 Score is: nan%
true pos: 0
false pos: 0
true neg: 97
false neg: 103


    Out of 200 reviews, the model got 97 correct,
    Accuacy is: 0.48%
    Precision is: nan%
    Recall is: 0.00%
    F1 Score is: nan%
true pos: 0
false pos: 0
true neg: 97
false neg: 103


    Out of 200 reviews, the model got 97 correct,
    Accuacy is: 0.48%
    Precision is: nan%
    Recall is: 0.00%
    F1 Score is: nan%


### KNN

In [75]:
# Fitting the classifier to the training set

from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
classifier.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [76]:
# Predicting the values of the testing set
y_pred = classifier.predict(X_test)

In [77]:
# Checking for the results

cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()

total = tn+fp+fn+tp
accuracy = (tn+tp)/total
precision = tp/(tp+fp)
recall = tp/(tp+fn)
f1_score = 2*precision*recall/(precision+recall)


print(
    "true pos: {0}\n"
    "false pos: {1}\n"
    "true neg: {2}\n"
    "false neg: {3}\n".format(tp, fp, tn, fn))

print("""
Out of {0} reviews, the model got {1} correct,
Accuacy is: {2:.2f}%
Precision is: {3:.2f}%
Recall is: {4:.2f}%
F1 Score is: {5:.2f}%""".format(total, tn+tp, accuracy, precision, recall, f1_score))

true pos: 48
false pos: 23
true neg: 74
false neg: 55


Out of 200 reviews, the model got 122 correct,
Accuacy is: 0.61%
Precision is: 0.68%
Recall is: 0.47%
F1 Score is: 0.55%


### Logistic Regression

In [78]:
# Fitting the classifier to the training set

from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [79]:
# Predicting the values of the testing set
y_pred = classifier.predict(X_test)

In [80]:
# Checking for the results

cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()

total = tn+fp+fn+tp
accuracy = (tn+tp)/total
precision = tp/(tp+fp)
recall = tp/(tp+fn)
f1_score = 2*precision*recall/(precision+recall)


print(
    "true pos: {0}\n"
    "false pos: {1}\n"
    "true neg: {2}\n"
    "false neg: {3}\n".format(tp, fp, tn, fn))

print("""
Out of {0} reviews, the model got {1} correct,
Accuacy is: {2:.2f}%
Precision is: {3:.2f}%
Recall is: {4:.2f}%
F1 Score is: {5:.2f}%""".format(total, tn+tp, accuracy, precision, recall, f1_score))

true pos: 66
false pos: 21
true neg: 76
false neg: 37


Out of 200 reviews, the model got 142 correct,
Accuacy is: 0.71%
Precision is: 0.76%
Recall is: 0.64%
F1 Score is: 0.69%
