In [1]:
### Natural Language Processing NLP
## DATA INFO: the data we are using must be in tsv file separated the review from the weight with a tab.
## if its separated by comma it will mess up the data, because there are commas in the reviews - 1000 reviews
## dataset is taken from https://raw.githubusercontent.com/Logan213/DATA607_Week11/master/yelp_labelled.txt
## This dataset was created for the Paper 'From Group to Individual Labels using Deep Features',
## Kotzias et. al,. KDD 2015

In [2]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import nltk #text processing
from sklearn.feature_extraction.text import CountVectorizer #matrix
# 01
from sklearn.naive_bayes import GaussianNB # Naive Bayes
from sklearn.metrics import confusion_matrix
# 02
from sklearn.tree import DecisionTreeClassifier
#03
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [3]:
# Download and import stop words
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/dimitar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
#Import the dataset
dataset = pd.read_csv('00exam/Restaurant_Reviews.tsv', delimiter='\t', quoting=3) #ignoring double quatles ""

In [5]:
#Cleaning the text (removing "the", "pountuation", "all small letters", "stemming (loved => love)" and etc)
corpus = []
for i in range(0, 1000):
    review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i]) #remove pountuation
    review = review.lower() #to lower
    review = review.split() #split by space and create array from words
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))] #remove stopwords
    review = ' '.join(review)
    corpus.append(review)

In [6]:
#Creating the Bag of Words model
cv = CountVectorizer(max_features=1500) #number of words we will use from 1585
X = cv.fit_transform(corpus).toarray() # classification model
y = dataset.iloc[:, 1].values # reviews if its possitive or negative

In [7]:
# Splitting the dataset into the Training set and Test set
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)



In [8]:
###01 Naive Bayes

In [9]:
# Fitting classifier to the Training set
# Create your classifier here
classifier = GaussianNB()
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

In [10]:
# Making the Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
cm 
#01 correct preditions of negative reviews
#02 wrong of possitive reviews
#03 wrong of negative reviews
#04 correct predictions of positive reviews

array([[55, 42],
       [12, 91]])

In [11]:
(66 + 105)/200 #accuracy

0.855

In [12]:
##02 Decision Tree Classification

In [13]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)



In [14]:
# Fitting classifier to the Training set
# Create your classifier here
tree = DecisionTreeClassifier(criterion="entropy", max_depth=4)
tree.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=4,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [15]:
# Predicting the Test set results
y_pred = tree.predict(X_test)

In [16]:
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm

array([[94,  3],
       [74, 29]])

In [17]:
(114 + 39)/200 #accuracy

0.765

In [18]:
##03 Random Forest Classification

In [19]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [20]:
# Fitting classifier to the Training set
# Create your classifier here
forest = RandomForestClassifier()

In [21]:
params = {"criterion": ["gini", "entropy"], "n_estimators": [2,
5, 10, 25], "max_depth": [None, 2, 3, 5]}
search = GridSearchCV(forest, params)
search.fit(X_train, y_train)
predicted = search.best_estimator_.predict(X_test)
confusion_matrix(y_test, predicted)

array([[85, 12],
       [52, 51]])

In [22]:
(100 + 68)/200 #accuracy

0.84