# Helpful score prediction

In [1]:
% matplotlib inline

import pickle
import numpy as np

from operator import itemgetter

from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.cross_validation import train_test_split

import string

from nltk.corpus import stopwords as sw
from nltk.corpus import wordnet as wn
from nltk import wordpunct_tokenize
from nltk import WordNetLemmatizer
from nltk import sent_tokenize
from nltk import pos_tag

from sklearn.base import BaseEstimator, TransformerMixin



## Load video games review dataframe

In [2]:
with open('pickle/video_games.pkl', 'rb') as input:
    video_games = pickle.load(input)

In [3]:
# Get only the reviews with a helpful rate
video_games = video_games[video_games.helpfulRate >= 0]

# Create a columns that will be use as a label for our binary classification
video_games['helpfulOrNot'] = video_games['helpfulRate'].apply(lambda x: 'helpful' if x > 0.5 else 'notHelpful')

In [4]:
video_games.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,overall,reviewText,reviewerName,summary,date,helpfulRate,helpfulCount,rank,description,price,title,brand,reviewLength,summaryLength,reviewerNameLength,helpfulOrNot
asin,reviewerID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
700099867,A2HD75EMZR8QLN,1,Installing the game was a struggle (because of...,123,Pay to unlock content? I don't think so.,2012-07-09,0.666667,12,6629,Dirt 3 is a popular rally racing game for Play...,246.63,,,779,40,3.0,helpful
700099867,A1DLMTOTHQ4AST,3,"I got this version instead of the PS3 version,...",ampgreen,"awesome game, if it did not crash frequently !!",2011-09-14,0.7,10,6629,Dirt 3 is a popular rally racing game for Play...,246.63,,,3489,47,8.0,helpful
700099867,A361M14PU2GUEG,4,I had Dirt 2 on Xbox 360 and it was an okay ga...,"Angry Ryan ""Ryan A. Forrest""",DIRT 3,2011-06-14,1.0,2,6629,Dirt 3 is a popular rally racing game for Play...,246.63,,,294,6,28.0,helpful
700099867,AN3YYDZAS3O1Y,5,Loved playing Dirt 2 and I thought the graphic...,Bob,A step up from Dirt 2 and that is terrific!,2011-08-14,0.846154,13,6629,Dirt 3 is a popular rally racing game for Play...,246.63,,,431,43,3.0,helpful
700099867,AQTC623NCESZW,1,I can't tell you what a piece of dog**** this ...,Chesty Puller,Crash 3 is correct name AKA Microsoft,2012-11-24,0.25,4,6629,Dirt 3 is a popular rally racing game for Play...,246.63,,,728,37,13.0,notHelpful


## Pipeline classication

### Text preprocessing

In [5]:
class NLTKPreprocessor(BaseEstimator, TransformerMixin):

    def __init__(self, stopwords=None, punct=None,
                 lower=True, strip=True):
        self.lower      = lower
        self.strip      = strip
        self.stopwords  = stopwords or set(sw.words('english'))
        self.punct      = punct or set(string.punctuation)
        self.lemmatizer = WordNetLemmatizer()

    def fit(self, X, y=None):
        return self

    def inverse_transform(self, X):
        return [" ".join(doc) for doc in X]

    def transform(self, X):
        return [
            list(self.tokenize(doc)) for doc in X
        ]

    def tokenize(self, document):
        # Break the document into sentences
        for sent in sent_tokenize(document):
            # Break the sentence into part of speech tagged tokens
            for token, tag in pos_tag(wordpunct_tokenize(sent)):
                # Apply preprocessing to the token
                token = token.lower() if self.lower else token
                token = token.strip() if self.strip else token
                token = token.strip('_') if self.strip else token
                token = token.strip('*') if self.strip else token

                # If stopword, ignore token and continue
                if token in self.stopwords:
                    continue

                # If punctuation, ignore token and continue
                if all(char in self.punct for char in token):
                    continue

                # Lemmatize the token and yield
                lemma = self.lemmatize(token, tag)
                yield lemma

    def lemmatize(self, token, tag):
        tag = {
            'N': wn.NOUN,
            'V': wn.VERB,
            'R': wn.ADV,
            'J': wn.ADJ
        }.get(tag[0], wn.NOUN)

        return self.lemmatizer.lemmatize(token, tag)

In [6]:
print('Original review text:')
print(video_games.iloc[19].reviewText)
print('Preprocessed text:')
print(NLTKPreprocessor().transform([video_games.iloc[19].reviewText])[0])

Original review text:
We bought this item for our Wii. It does not work at all for it. :-( So that was a disapointment. We did decide to keep it though because it works wonderfully on our x-box 360.
Preprocessed text:
['buy', 'item', 'wii', 'work', 'disapointment', 'decide', 'keep', 'though', 'work', 'wonderfully', 'x', 'box', '360']


### Create pipeline

In [7]:
from sklearn.svm import SVC

class MultipleItemSelector(BaseEstimator, TransformerMixin):
    def __init__(self, keys):
        self.keys = keys

    def fit(self, x, y=None):
        return self

    def transform(self, data_dict):
        if not isinstance(data_dict[:, self.keys][0], str):
            return np.array(list(data_dict[:, self.keys])).reshape(len(data_dict), 1)
        return data_dict[:, self.keys]
    
class TextLength(BaseEstimator, TransformerMixin):
    """Extract features from each document for DictVectorizer"""

    def fit(self, x, y=None):
        return self

    def transform(self, posts):
        return [{'length': len(text)} for text in posts]

def identity(x): 
    return x

def create_pipeline(classifier=SGDClassifier):
    if isinstance(classifier, type):
        classifier = classifier()

    model = Pipeline([
        ('union', FeatureUnion(
            transformer_list=[
                ('summary', Pipeline([
                    ('selector', MultipleItemSelector(keys=0)),
                    ('preprocessor', NLTKPreprocessor()),
                    ('vectorizer', TfidfVectorizer(
                        tokenizer=identity, preprocessor=None, lowercase=False
                    )),
                ])),
                ('reviewText', Pipeline([
                    ('selector', MultipleItemSelector(keys=1)),
                    ('preprocessor', NLTKPreprocessor()),
                    ('vectorizer', TfidfVectorizer(
                        tokenizer=identity, preprocessor=None, lowercase=False
                    )),
                ])),
#                 ('reviewText_length', Pipeline([
#                     ('selector', MultipleItemSelector(keys=1)),
#                     ('stats', TextLength()),  # returns a list of dicts
#                     ('vect', DictVectorizer()),  # list of dicts -> feature matrix
#                 ])),
                ('helpfulCount', Pipeline([
                    ('selector', MultipleItemSelector(keys=2)),
                ])),
#                  ('reviewText_length', Pipeline([
#                     ('selector', MultipleItemSelector(keys=3)),
#                 ])),
                
            ],
            transformer_weights={
                'summary': 1.0,
                'reviewText': 1.0,
                'helpfulCount': 1.0,
                'reviewText_length': 1.0,
            },
        )),

        ('classifier', classifier),
    ])
        

    return model

### Test on playstation reviews

In [8]:
ps_reviews = video_games.loc['B00BGA9WK2']
len(ps_reviews)

760

In [9]:
y = ps_reviews.helpfulOrNot.values
# y = ps_reviews.overall.apply(lambda x: str(x)).values
X = ps_reviews.summary.values
X = X.reshape(len(X), 1)
X = np.append(X, np.array(ps_reviews.reviewText.values).reshape(len(X), 1), 1)
X = np.append(X, np.array(ps_reviews.helpfulCount.values).reshape(len(X), 1), 1)
# X = np.append(X, np.array(ps_reviews.reviewLength.values).reshape(len(X), 1), 1)

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [10]:
model = create_pipeline()
model.fit(X_train, y_train)

y_pred = model.predict(X_train)
print(classification_report(y_train, y_pred, target_names=label_encoder.classes_))

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))



             precision    recall  f1-score   support

    helpful       0.78      0.68      0.73       335
 notHelpful       0.66      0.77      0.71       273

avg / total       0.73      0.72      0.72       608

             precision    recall  f1-score   support

    helpful       0.47      0.44      0.45        80
 notHelpful       0.42      0.46      0.44        72

avg / total       0.45      0.45      0.45       152



In [15]:
def show_most_informative_features(model, n=10):
    # Extract the vectorizer and the classifier from the pipeline
    vectorizer = model.named_steps['vectorizer']
    classifier = model.named_steps['classifier']

    # Check to make sure that we can perform this computation
    if not hasattr(classifier, 'coef_'):
        raise TypeError(
            "Cannot compute most informative features on {}.".format(
                classifier.__class__.__name__
            )
        )

    coefficients = classifier.coef_

    # Zip the feature names with the coefs and sort
    coefs = sorted(
        zip(coefficients[0], vectorizer.get_feature_names()),
        key=itemgetter(0), reverse=True
    )

    # Get the top n and bottom n coef, name pairs
    top_words  = zip(coefs[:n], coefs[:-(n+1):-1])

    # Create two columns with most negative and most positive features.
    for (cp, fnp), (cn, fnn) in top_words:
        print(
            "{:0.4f}{: >15}    {:0.4f}{: >15}".format(
                cp, fnp, cn, fnn
            )
        )

show_most_informative_features(model)

4.4989          light    -4.6010     generation
4.2547           even    -4.1915              p
4.1688           fifa    -4.0233      dualshock
4.0457            wii    -3.9402 starsfantastic
3.7503    starssimply    -3.8656           best
3.6751          place    -3.7839          truly
3.6724      realistic    -3.7614       navigate
3.5817         return    -3.6097          build
3.5707         medium    -3.6072      developer
3.4768         family    -3.4881           menu
