# Are useful votes on a review biased? (Test run)

Do users actually read a review before voting it useful? Or is their decision biased based on the cool, funny and useful votes previously received by the review?

In [131]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix, csc_matrix, hstack, vstack
from sklearn.utils import shuffle
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn import metrics

SAMPLE_SIZE = 500

In [132]:
reviews = pd.read_csv('dataset/review.csv', usecols=['text', 'useful', 'cool'])

We need to make sure that we have equal number of data points for each label. We use the mean of `useful` column as our threshold, any review with a `useful` vote lesser than the mean is labeled *not useful* and anything equal or greater than the mean is labeled *useful*.

The following cell randomly samples for a predefined number of *useful* and *not useful* reviews which are then concatenated together and the resulting df is then shuffled.

In [133]:
THRESHOLD = round(reviews['useful'].mean()) 

useful_reviews = reviews.loc[reviews['useful'] >= THRESHOLD].sample(SAMPLE_SIZE)
not_useful_reviews = reviews.loc[reviews['useful'] < THRESHOLD].sample(SAMPLE_SIZE)
print("useful_reviews shape: ", useful_reviews.shape)
print("not_useful_reviews shape: ", not_useful_reviews.shape)

reviews = shuffle(pd.concat([useful_reviews, not_useful_reviews]))
reviews.head()

useful_reviews shape:  (500, 3)
not_useful_reviews shape:  (500, 3)


Unnamed: 0,text,useful,cool
196497,You will not find a better place not more know...,0,1
2538872,just got back from having a bite to eat over h...,0,2
3416925,This is the best Starbucks I've found in the c...,0,0
472978,Atmosphere is very relaxing and serene. Good f...,1,1
3022054,My entire experience at this little resto was ...,0,0


# Preprocessing

For `text` a custom `analyzer` method is written which:
1. remove all punctuations and
3. removes new line characters (escape sequence)

The feature extraction (`CountVectorize` and `TfidfVectorizer`) class is then set to:
2. remove accents
2. remove all stopwords
3. lowercase all words

For `useful` votes:
2. remove df row if `useful` is `NaN`

In [134]:
reviews.dropna(inplace=True)
print("reviews shape after dropping NaN values: ", reviews.shape)

reviews shape after dropping NaN values:  (1000, 3)


In [135]:
import string
import re

RE_NEWLINE = '\n+'
PUNCTUATIONS = string.punctuation

def review_process(review):
    no_newline = re.sub(RE_NEWLINE, '', review)
    no_punc = ''.join([char for char in no_newline if char not in PUNCTUATIONS])
    
    return no_punc

In [136]:
test = "a str.. with! some @2 punctuations 6546721 and numbers \n [] \n"
print(review_process(test))

a str with some 2 punctuations 6546721 and numbers   


In [137]:
vect = TfidfVectorizer(strip_accents='ascii', preprocessor=review_process, stop_words='english' )
vect

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function review_process at 0x142bb0ae8>,
        smooth_idf=True, stop_words='english', strip_accents='ascii',
        sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, use_idf=True, vocabulary=None)

# Generating labels

In [138]:
NOT_USEFUL = 0
USEFUL = 1

def labeler(vote):
    if math.floor(vote) < THRESHOLD:
        return NOT_USEFUL
    else:
        return USEFUL

In [139]:
reviews['label'] = reviews['useful'].apply(labeler)
    
reviews.head()

Unnamed: 0,text,useful,cool,label
196497,You will not find a better place not more know...,0,1,0
2538872,just got back from having a bite to eat over h...,0,2,0
3416925,This is the best Starbucks I've found in the c...,0,0,0
472978,Atmosphere is very relaxing and serene. Good f...,1,1,1
3022054,My entire experience at this little resto was ...,0,0,0


# Generating test train splits

In [151]:
X_train, X_test, Y_train, Y_test = train_test_split(reviews[['text', 'useful', 'cool']], reviews['label'], test_size=0.4)

# Instantiate classifiers

We will be using 3 classifiers for this analysis namely, *Multinomial Naive Bayes*, *Linear SVM* and *Random Forest*.

In [141]:
mn_clas = MultinomialNB()
print(mn_clas)
print("\n")

svc_clas = LinearSVC()
print(svc_clas)
print("\n")

rf_clas = RandomForestClassifier()
print(rf_clas)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)


LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)


# Model 1: training with only text

In [142]:
X1 = vect.fit_transform(X_train['text'])
X1_test = vect.transform(X_test['text'])
X1

<600x8665 sparse matrix of type '<class 'numpy.float64'>'
	with 28819 stored elements in Compressed Sparse Row format>

In [143]:
type(X1)

scipy.sparse.csr.csr_matrix

In [144]:
mn_clas.fit(X1, Y_train)
svc_clas.fit(X1, Y_train)
rf_clas.fit(X1, Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [145]:
print("Naive Bayes score: ", mn_clas.score(X1_test, Y_test))
print("Linear SVC score: ", svc_clas.score(X1_test, Y_test))
print("Random Forest score: ", rf_clas.score(X1_test, Y_test))

Naive Bayes score:  0.5375
Linear SVC score:  0.5725
Random Forest score:  0.56


# Model 2: training with text and useful votes

Since X now contains mixed dtypes, we need to be a bit clever. First, we convert `useful` column for train and test into sparse matrix and *I2* normalize them (I2 is the default for `TfidfVectorizer` so that's what we use here as well).

The rest is straight forward, we create our updated X by stacking the `useful` sparse matrices with X horizontally (such that the number of feature increases). 

In [146]:
useful_sparse_train = normalize(csr_matrix(X_train['useful']))
useful_sparse_test = normalize(csr_matrix(X_test['useful']))
print("type of useful_sparse_train: ", type(useful_sparse_train))
print("shape of useful_sparse_train: ", useful_sparse_train.shape)

type of useful_sparse_train:  <class 'scipy.sparse.csr.csr_matrix'>
shape of useful_sparse_train:  (1, 600)


In [158]:
# training
X2 = hstack([X1, useful_sparse_train.T])
print("shape of X2: ", X2.shape)

# testing
X2_test = hstack([X1_test, useful_sparse_test.T])

shape of X2:  (600, 8666)


In [148]:
mn_clas.fit(X2, Y_train)
svc_clas.fit(X2, Y_train)
rf_clas.fit(X2, Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [149]:
print("Naive Bayes score: ", mn_clas.score(X2_test, Y_test))
print("Linear SVC score: ", svc_clas.score(X2_test, Y_test))
print("Random Forest score: ", rf_clas.score(X2_test, Y_test))

Naive Bayes score:  0.5675
Linear SVC score:  0.66
Random Forest score:  0.9625


# Model 3: training with text and cool votes

Next we will carry out the analysis with text and `cool` votes. We choose `cool` votes since it had the second highest correlation with `useful` (see exploration.ipynb).

In [152]:
cool_sparse_train = normalize(csr_matrix(X_train['cool']))
cool_sparse_test = normalize(csr_matrix(X_test['cool']))

In [159]:
# training
X3 = hstack([X1, cool_sparse_train.T])
print("shape of X3: ", X3.shape)

# testing
X3_test = hstack([X1_test, cool_sparse_test.T])

shape of X3:  (600, 8666)


In [154]:
mn_clas.fit(X3, Y_train)
svc_clas.fit(X3, Y_train)
rf_clas.fit(X3, Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [155]:
print("Naive Bayes score: ", mn_clas.score(X3_test, Y_test))
print("Linear SVC score: ", svc_clas.score(X3_test, Y_test))
print("Random Forest score: ", rf_clas.score(X3_test, Y_test))

Naive Bayes score:  0.4875
Linear SVC score:  0.5175
Random Forest score:  0.6725


# Model 4: training with text, useful and cool votes

Finally we will carry out the analysis with text, `useful` & `cool` votes.

In [157]:
uc_sparse_train = normalize(csr_matrix(X_train[['cool', 'useful']]))
uc_sparse_test = normalize(csr_matrix(X_test[['cool', 'useful']]))
print("shape of uc_sparse_train: ", uc_sparse_train.shape)

shape of uc_sparse_train:  (600, 2)


In [160]:
# training
X4 = hstack([X1, uc_sparse_train])
print("shape of X4: ", X4.shape)

# testing
X4_test = hstack([X1_test, uc_sparse_test])

shape of X4:  (600, 8667)


In [161]:
mn_clas.fit(X4, Y_train)
svc_clas.fit(X4, Y_train)
rf_clas.fit(X4, Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [162]:
print("Naive Bayes score: ", mn_clas.score(X4_test, Y_test))
print("Linear SVC score: ", svc_clas.score(X4_test, Y_test))
print("Random Forest score: ", rf_clas.score(X4_test, Y_test))

Naive Bayes score:  0.92
Linear SVC score:  0.9975
Random Forest score:  0.865
