# Are useful votes on a review biased?

Do users actually read a review before voting it useful? Or is their decision biased based on the cool, funny and useful votes previously received by the review?

In [32]:
import pandas as pd
import numpy as np

from sklearn import metrics
from sklearn.utils import shuffle
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score

from scipy.sparse import csr_matrix, csc_matrix, hstack, vstack
SAMPLE_SIZE = 500

In [3]:
reviews = pd.read_csv('dataset/review.csv', usecols=['text', 'useful', 'cool'])

We need to make sure that we have equal number of data points for each label. We use the mean of `useful` column as our threshold, any review with a `useful` vote lesser than the mean is labeled *not useful* and anything equal or greater than the mean is labeled *useful*.

The following cell randomly samples for a predefined number of *useful* and *not useful* reviews which are then concatenated together and the resulting df is then shuffled.

In [4]:
THRESHOLD = round(reviews['useful'].mean()) 

useful_reviews = reviews.loc[reviews['useful'] >= THRESHOLD].sample(SAMPLE_SIZE)
not_useful_reviews = reviews.loc[reviews['useful'] < THRESHOLD].sample(SAMPLE_SIZE)
print("useful_reviews shape: ", useful_reviews.shape)
print("not_useful_reviews shape: ", not_useful_reviews.shape)

reviews = shuffle(pd.concat([useful_reviews, not_useful_reviews]))
reviews.head()

useful_reviews shape:  (500, 3)
not_useful_reviews shape:  (500, 3)


Unnamed: 0,text,useful,cool
218396,The hamburger was fresh and so were the toppin...,0,0
5218305,If you're looking for help in the Home departm...,0,0
905548,Heard about this place from friends how great ...,0,0
4914250,"Sushi in big wooden boats!\n\nI have to say, I...",4,1
4019227,We had a great server named Herschel who defin...,0,1


# Preprocessing

For `text` a custom `analyzer` method is written which:
1. remove all punctuations and
3. removes new line characters (escape sequence)

The feature extraction (`CountVectorize` and `TfidfVectorizer`) class is then set to:
2. remove accents
2. remove all stopwords
3. lowercase all words

For `useful` votes:
2. remove df row if `useful` is `NaN`

In [5]:
reviews.dropna(inplace=True)
print("reviews shape after dropping NaN values: ", reviews.shape)

reviews shape after dropping NaN values:  (1000, 3)


In [6]:
import string
import re

RE_NEWLINE = '\n+'
PUNCTUATIONS = string.punctuation

def review_process(review):
    no_newline = re.sub(RE_NEWLINE, '', review)
    no_punc = ''.join([char for char in no_newline if char not in PUNCTUATIONS])
    
    return no_punc

In [7]:
test = "a str.. with! some @2 punctuations 6546721 and numbers \n [] \n"
print(review_process(test))

a str with some 2 punctuations 6546721 and numbers   


In [8]:
vect = TfidfVectorizer(strip_accents='ascii', preprocessor=review_process, stop_words='english' )
vect

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function review_process at 0x14b1c5bf8>,
        smooth_idf=True, stop_words='english', strip_accents='ascii',
        sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, use_idf=True, vocabulary=None)

# Generating labels

In [11]:
import math

NOT_USEFUL = 0
USEFUL = 1

def labeler(vote):
    if math.floor(vote) < THRESHOLD:
        return NOT_USEFUL
    else:
        return USEFUL

In [12]:
reviews['label'] = reviews['useful'].apply(labeler)
    
reviews.head()

Unnamed: 0,text,useful,cool,label
218396,The hamburger was fresh and so were the toppin...,0,0,0
5218305,If you're looking for help in the Home departm...,0,0,0
905548,Heard about this place from friends how great ...,0,0,0
4914250,"Sushi in big wooden boats!\n\nI have to say, I...",4,1,1
4019227,We had a great server named Herschel who defin...,0,1,0


# Cross validation

We will use K-fold validation with 5 folds using `cross_val_score` from `sklearn.model_selection` which by default used K-fold validation. `cross_val_score` trains and validates the classifier with *k-1* folds and tests with the remaining fold which means we do not have to create testing and training splits manually.

In [48]:
X = reviews[['text', 'useful', 'cool']]
Y = reviews['label']

In [76]:
def print_percent(flt):
    pretty = "%.2f" % (flt*100)
    
    return "{0}%".format(pretty)

# Instantiate classifiers

We will be using 3 classifiers for this analysis namely, *Multinomial Naive Bayes*, *Linear SVM* and *Random Forest*.

In [14]:
mn_clas = MultinomialNB()
print(mn_clas)
print("\n")

svc_clas = LinearSVC()
print(svc_clas)
print("\n")

rf_clas = RandomForestClassifier()
print(rf_clas)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)


LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)


# Feature selection

As we increase the number of data points, the vocabulary of our classifiers will increase which means the number of features extracted will also increase. For a large numebr of data points, this will be memory intensive and will also result in longer training time.

To improve the performance of our program, we will remove all zero-variance features.

In [19]:
sel = VarianceThreshold()
sel

VarianceThreshold(threshold=0.0)

# Model 1: training with only text

In [38]:
# feature extraction
X1 = vect.fit_transform(X['text'])
print("shape of X1 after feature extraction: ", X1.shape)

# feature selection
X1 = sel.fit_transform(X1)
print("shape of X1_test after feature extraction: ", X1.shape)

shape of X1 after feature extraction:  (1000, 12255)
shape of X1_test after feature extraction:  (1000, 12255)


In [36]:
type(X1)

scipy.sparse.csr.csr_matrix

In [77]:
mn_scores = cross_val_score(mn_clas, X1, Y, cv=5)
svc_scores = cross_val_score(svc_clas, X1, Y, cv=5)
rf_scores = cross_val_score(rf_clas, X1, Y, cv=5)

In [78]:
print("Naive Bayes score: ", print_percent(mn_scores.mean()))
print("Linear SVC score: ", print_percent(svc_scores.mean()))
print("Random Forest score: ", print_percent(rf_scores.mean()))

Naive Bayes score:  56.30%
Linear SVC score:  55.10%
Random Forest score:  54.90%


# Model 2: training with text and useful votes

Since X now contains mixed dtypes, we need to be a bit clever. First, we convert `useful` column for train and test into sparse matrix and *I2* normalize them (I2 is the default for `TfidfVectorizer` so that's what we use here as well).

The rest is straight forward, we create our updated X by stacking the `useful` sparse matrices with X horizontally (such that the number of feature increases). 

In [52]:
useful_sparse_train = normalize(csr_matrix(X['useful']))
print("type of useful_sparse_train: ", type(useful_sparse_train))
print("shape of useful_sparse_train: ", useful_sparse_train.shape)

type of useful_sparse_train:  <class 'scipy.sparse.csr.csr_matrix'>
shape of useful_sparse_train:  (1, 1000)


In [53]:
X2 = hstack([X1, useful_sparse_train.T])
print("shape of X2: ", X2.shape)

shape of X2:  (1000, 12256)


In [79]:
mn_scores = cross_val_score(mn_clas, X2, Y, cv=5)
svc_scores = cross_val_score(svc_clas, X2, Y, cv=5)
rf_scores = cross_val_score(rf_clas, X2, Y, cv=5)

In [80]:
print("Naive Bayes score: ", print_percent(mn_scores.mean()))
print("Linear SVC score: ", print_percent(svc_scores.mean()))
print("Random Forest score: ", print_percent(rf_scores.mean()))

Naive Bayes score:  59.80%
Linear SVC score:  62.50%
Random Forest score:  85.30%


# Model 3: training with text and cool votes

Next we will carry out the analysis with text and `cool` votes. We choose `cool` votes since it had the second highest correlation with `useful` (see exploration.ipynb).

In [56]:
cool_sparse_train = normalize(csr_matrix(X['cool']))

In [57]:
X3 = hstack([X1, cool_sparse_train.T])
print("shape of X3: ", X3.shape)

shape of X3:  (1000, 12256)


In [58]:
mn_scores = cross_val_score(mn_clas, X3, Y, cv=5)
svc_scores = cross_val_score(svc_clas, X3, Y, cv=5)
rf_scores = cross_val_score(rf_clas, X3, Y, cv=5)

In [59]:
print("Naive Bayes score: ", print_percent(mn_scores.mean()))
print("Linear SVC score: ", print_percent(svc_scores.mean()))
print("Random Forest score: ", print_percent(rf_scores.mean()))

Naive Bayes score:  58.0%
Linear SVC score:  58.0%
Random Forest score:  62.0%


# Model 4: training with text, useful and cool votes

Finally we will carry out the analysis with text, `useful` & `cool` votes.

In [60]:
uc_sparse_train = normalize(csr_matrix(X[['cool', 'useful']]))
print("shape of uc_sparse_train: ", uc_sparse_train.shape)

shape of uc_sparse_train:  (1000, 2)


In [61]:
X4 = hstack([X1, uc_sparse_train])
print("shape of X4: ", X4.shape)

shape of X4:  (1000, 12257)


In [62]:
mn_scores = cross_val_score(mn_clas, X4, Y, cv=5)
svc_scores = cross_val_score(svc_clas, X4, Y, cv=5)
rf_scores = cross_val_score(rf_clas, X4, Y, cv=5)

In [63]:
print("Naive Bayes score: ", print_percent(mn_scores.mean()))
print("Linear SVC score: ", print_percent(svc_scores.mean()))
print("Random Forest score: ", print_percent(rf_scores.mean()))

Naive Bayes score:  82.0%
Linear SVC score:  100.0%
Random Forest score:  85.0%
