# NLP Bag of Words

The goal of this project is to build a machine learning model that can predict if a new review is positive or negative. This is an exercise taken from Super Data Science machine learning class.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
cd '/Users/sauce/Desktop/Courses/Machine Learning A-Z Template Folder/Part 7 - Natural Language Processing/Natural_Language_Processing'

/Users/sauce/Desktop/Courses/Machine Learning A-Z Template Folder/Part 7 - Natural Language Processing/Natural_Language_Processing


In [3]:
# Upload Dataset
dataset = pd.read_csv('Restaurant_Reviews.tsv', delimiter = '\t', quoting = 3)

In [4]:
dataset.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


### Data Cleaning - First Review

In [5]:
# Clean first review to begin with.
import re
dataset['Review'][0]

'Wow... Loved this place.'

In [6]:
# Remove everything besides the letters in the alphabet.
review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][0])

In [7]:
# Remove capital letters.
review = review.lower()
review

'wow    loved this place '

In [8]:
# Remove uneccessary words.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/sauce/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
review = review.split()

In [10]:
review

['wow', 'loved', 'this', 'place']

In [11]:
from nltk.corpus import stopwords
review = [word for word in review if not word in set(stopwords.words('english'))]

In [12]:
review

['wow', 'loved', 'place']

In [13]:
# Stemming
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]

In [14]:
review

['wow', 'love', 'place']

In [15]:
# Join words back together.
review = ' '.join(review)
review

'wow love place'

### Data Cleaning - All Reviews

In [30]:
corpus = []
    
for i in range(0, 1000):
    review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

In [33]:
corpus[0:5]

['wow love place',
 'crust good',
 'tasti textur nasti',
 'stop late may bank holiday rick steve recommend love',
 'select menu great price']

### Bag of Words Model

Through tokenization

In [36]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1500)
# Max featues parameter is useful for big datasets with lots of words.
X = cv.fit_transform(corpus).toarray()

In [41]:
# Create Dependent Variable
y = dataset.iloc[:, 1].values
y[0:5]

array([1, 0, 0, 1, 1])

### Try Naive Bayes Classifier

In [42]:
# Split Data into Train and Test sets
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

# Fit Model
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

# Predict Results
y_pred = classifier.predict(X_test)

# Confusion Matrix of Results
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm



array([[81, 16],
       [50, 53]])

In [43]:
(81 + 53)/200
#Trash

0.67

### Random Forest

In [46]:
from sklearn.ensemble import RandomForestClassifier as rf
rfClass = rf(n_estimators = 25)

In [47]:
rfClass.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=25, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [48]:
y_pred2 = rfClass.predict(X_test)

In [49]:
cm2 = confusion_matrix(y_pred2, y_test)
cm2

array([[87, 47],
       [10, 56]])

In [50]:
#accuracy
(87+56)/200

0.715