In [125]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix, csc_matrix, hstack, vstack
from sklearn.utils import shuffle
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn import metrics

SAMPLE_SIZE = 500

In [55]:
reviews = pd.read_csv('dataset/review.csv', usecols=['text', 'useful'])

In [56]:
THRESHOLD = round(reviews['useful'].mean()) 

useful_reviews = reviews.loc[reviews['useful'] >= THRESHOLD].sample(SAMPLE_SIZE)
not_useful_reviews = reviews.loc[reviews['useful'] < THRESHOLD].sample(SAMPLE_SIZE)
print("useful_reviews shape: ", useful_reviews.shape)
print("not_useful_reviews shape: ", not_useful_reviews.shape)

reviews = shuffle(pd.concat([useful_reviews, not_useful_reviews]))
reviews.head()

useful_reviews shape:  (500, 2)
not_useful_reviews shape:  (500, 2)


<bound method NDFrame.head of                                                       text  useful
3611417  We weren't sure if we should try based on some...       0
650464   Our stay here was excellent. I enjoyed the siz...       0
2057244  This was a horrible experience! I booked a hot...       0
1995564  Four stars for the food, 3.25 stars for the se...       0
4989961  702Connections got us squared away with transp...       3
4235029  We had a full physical inspection of our home ...       3
3248745  I've ate here about 5 times now, it's about a ...       0
3186365  Our server was not the best.  He did not provi...       0
3351716  Had to write a review because the most recent ...       2
2679511  I go to yard house quite a bit. It is my favor...       0
3149570  NOM - tried the "Snicker" this trip...so good!...       0
1353348  Delicious. Was intimidating walking in without...       0
1562106  Love, love, love this stadium! But we had a bo...      10
1070756  I had the tonkatsu rame

# Preprocessing

For `text` a custom `analyzer` method is written which:
1. remove all punctuations and
3. removes new line characters (escape sequence)

The feature extraction (`CountVectorize` and `TfidfVectorizer`) class is then set to:
2. remove accents
2. remove all stopwords
3. lowercase all words

For `useful` votes:
2. remove df row if `useful` is `NaN`

In [57]:
reviews.dropna(inplace=True)
print("reviews shape after dropping NaN values: ", reviews.shape)

reviews shape after dropping NaN values:  (1000, 2)


In [60]:
import string
import re

RE_NEWLINE = '\n+'
PUNCTUATIONS = string.punctuation

def review_process(review):
    no_newline = re.sub(RE_NEWLINE, '', review)
    no_punc = ''.join([char for char in no_newline if char not in PUNCTUATIONS])
    
    return no_punc

In [61]:
test = "a str.. with! some @2 punctuations 6546721 and numbers \n [] \n"
print(review_process(test))

a str with some 2 punctuations 6546721 and numbers   


In [104]:
vect = TfidfVectorizer(strip_accents='ascii', preprocessor=review_process, stop_words='english' )
vect

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function review_process at 0x147a98400>,
        smooth_idf=True, stop_words='english', strip_accents='ascii',
        sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, use_idf=True, vocabulary=None)

# Generating labels

In [64]:
NOT_USEFUL = 0
USEFUL = 1

def labeler(vote):
    if math.floor(vote) < THRESHOLD:
        return NOT_USEFUL
    else:
        return USEFUL

In [105]:
reviews['label'] = reviews['useful'].apply(labeler)
    
reviews.head()

Unnamed: 0,text,useful,label
3611417,We weren't sure if we should try based on some...,0,0
650464,Our stay here was excellent. I enjoyed the siz...,0,0
2057244,This was a horrible experience! I booked a hot...,0,0
1995564,"Four stars for the food, 3.25 stars for the se...",0,0
4989961,702Connections got us squared away with transp...,3,1


# Generating test train splits

In [106]:
X_train, X_test, Y_train, Y_test = train_test_split(reviews[['text', 'useful']], reviews['label'], test_size=0.4)

# Instantiate classifiers

We will be using 3 classifiers for this analysis namely, *Multinomial Naive Bayes*, *Linear SVM* and *Random Forest*.

In [126]:
mn_clas = MultinomialNB()
print(mn_clas)
print("\n")
svc_clas = LinearSVC()
print(svc_clas)
print("\n")

rf_clas = RandomForestClassifier()
print(rf_clas)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)


LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)


# Model 1: training with only text

In [113]:
X1 = vect.fit_transform(X_train['text'])
X1_test = vect.transform(X_test['text'])
X1

<600x9206 sparse matrix of type '<class 'numpy.float64'>'
	with 32307 stored elements in Compressed Sparse Row format>

In [108]:
type(X1)

scipy.sparse.csr.csr_matrix

In [127]:
mn_clas.fit(X1, Y_train)
svc_clas.fit(X1, Y_train)
rf_clas.fit(X1, Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [128]:
print("Naive Bayes score: ", mn_clas.score(X1_test, Y_test))
print("Linear SVC score: ", svc_clas.score(X1_test, Y_test))
print("Random Forest score: ", rf_clas.score(X1_test, Y_test))

Naive Bayes score:  0.59
Linear SVC score:  0.6375
Random Forest score:  0.6525


# Model 2: training with text and useful votes

Since X now contains mixed dtypes, we need to be a bit clever. First, we convert `useful` column for train and test into sparse matrix and *I2* normalize them (I2 is the default for `TfidfVectorizer` so that's what we use here as well).

The rest is straight forward, we create our updated X by stacking the `useful` sparse matrices with X horizontally (such that the number of feature increases). 

In [115]:
useful_sparse_train = normalize(csr_matrix(X_train['useful']))
useful_sparse_test = normalize(csr_matrix(X_test['useful']))
print("type of useful_sparse_train: ", type(useful_sparse_train))
print("shape of useful_sparse_train: ", useful_sparse_train.shape)

type of useful_sparse_train:  <class 'scipy.sparse.csr.csr_matrix'>
shape of useful_sparse_train:  (1, 600)


In [116]:
# training
X2 = hstack([X1, useful_sparse_train.T])

# testing
X2_test = hstack([X1_test, useful_sparse_test.T])

In [129]:
mn_clas.fit(X2, Y_train)
svc_clas.fit(X2, Y_train)
rf_clas.fit(X2, Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [130]:
print("Naive Bayes score: ", mn_clas.score(X2_test, Y_test))
print("Linear SVC score: ", svc_clas.score(X2_test, Y_test))
print("Random Forest score: ", rf_clas.score(X2_test, Y_test))

Naive Bayes score:  0.6275
Linear SVC score:  0.755
Random Forest score:  0.8325
