## NLP: Using Word2Vec to Predict Review Usefulness

In [1]:
import os
import codecs
import json
import numpy as np
import pandas as pd
import seaborn as sns

%matplotlib inline
import matplotlib.pyplot as plt

data_directory = os.path.join('C:/Users/andre/Documents/yelp_dataset_challenge_round9')
intermediate_directory = os.path.join(data_directory, 'intermediate')

In [2]:
df = pd.read_csv(os.path.join(intermediate_directory, 'useful.csv'))
luseful = np.log(df['useful'] + 1)

In [3]:
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence

In [4]:
from gensim.models import Word2Vec

word2vec_filepath = os.path.join(intermediate_directory, 'word2vec_model_all')
word2vec = Word2Vec.load(word2vec_filepath)

print(u'{:,} terms in the word2vec vocabulary.'.format(len(word2vec.wv.vocab)))

6,382 terms in the word2vec vocabulary.


In [5]:
# build a list of terms, index, and term counts from the word2vec model
ordered_vocab = [(term, vocab.index, vocab.count) for term, vocab in word2vec.wv.vocab.items()]
ordered_vocab = sorted(ordered_vocab, key=lambda ordered_vocab:ordered_vocab[2], reverse=True)
ordered_terms, term_indices, term_counts = zip(*ordered_vocab)

In [6]:
# Creat a dictionary mapping each word to a 100-dimensional vector
word_vectors = dict(list(zip(word2vec.wv.index2word, word2vec.wv.syn0)))

In [8]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LinearRegression, SGDRegressor, Lasso, Ridge
from sklearn.ensemble import GradientBoostingRegressor, ExtraTreesRegressor, RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, cross_val_score, ShuffleSplit
from collections import defaultdict
from tabulate import tabulate

In [9]:
trigram_reviews_filepath = os.path.join(intermediate_directory, 'trigram_transformed_reviews_all.txt')

X = []
with codecs.open(trigram_reviews_filepath, encoding='utf-8') as f:
    for review in f:
        X.append(review)

In [10]:
class MeanEmbeddingVectorizer():
    
    """ Given a word to vector mapping, vectorize texts by taking the mean of all the word vectors for each document"""
    
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.dim = len(list(word2vec.values())[0])
        
    def fit(self, X, y):
        return self
    
    def transform(self, X):
        return np.array([
            np.mean([self.word2vec[w] for w in review if w in self.word2vec] 
                   or [np.zeros(self.dim)], axis=0) 
                   for review in X])

In [11]:
class TfidfMeanVectorizer():
    
    """ Vectorize texts by taking the weighted average word vectors by their TF-IDF"""
    
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.word2weight = None
        self.dim = len(list(word2vec.values())[0])
        
    def fit(self, X, y):
        tfidf = TfidfVectorizer(analyzer=lambda x:x)
        tfidf.fit(X)
        # Let an unseem word be as infrequent as the most infreqeunt word
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf, [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])
        return self
    
    def transform(self, X):
        return np.array([np.mean([self.word2vec[w] * self.word2weight[w]
                                 for w in review if w in self.word2vec] or 
                                [np.zeros(self.dim)], axis=0) 
                         for review in X])   

In [12]:
def cv_rmse(model, X, y, cv=5, scoring='neg_mean_squared_error'):
    
    """ Compute an overall RMSE across all folds of cross validation"""
    
    return np.sqrt(np.mean(np.multiply(cross_val_score(model, X, y, cv=cv, scoring='neg_mean_squared_error'), -1)))
    
def RMSE(y_true, y_pred):
    
    """ Root Mean Squared Error"""
    
    return np.sqrt(np.mean((y_true - y_pred)**2))

def RMSLE(y_true, y_pred):
    
    """ Root Mean Squared Logarithmic Error"""
    
    return np.sqrt(np.mean(((np.log(y_true + 1) - np.log(y_pred + 1))**2)))

In [28]:
# Linear Regression and shrinkage methods: Ridge and lasso

lr_w2v = Pipeline([("w2v_vectorizer", MeanEmbeddingVectorizer(word_vectors)), 
                   ("lr", LinearRegression())])
lr_w2v_tfidf = Pipeline([("tfidf_w2v_vectorizer", TfidfMeanVectorizer(word_vectors)),
                        ("lr", LinearRegression())])
ridge_w2v = Pipeline([("w2v_vectorizer", MeanEmbeddingVectorizer(word_vectors)),
                    ("ridge", Ridge(alpha=1))])
ridge_w2v_tfidf = Pipeline([("tfidf_w2v_vectorizer", TfidfMeanVectorizer(word_vectors)),
                    ("ridge", Ridge(alpha=1))])                            
rfr_w2v = Pipeline([("w2v_vectorizer", MeanEmbeddingVectorizer(word_vectors)),
                    ("rfr", RandomForestRegressor(n_estimators=100))])                            
xgb_w2v = Pipeline([("w2v_vectorizer", MeanEmbeddingVectorizer(word_vectors)),
                    ("xgb", XGBRegressor(n_estimators=100))])


In [29]:
w2v_models = [("lr_w2v", lr_w2v), ("lr_w2v_tfidf", lr_w2v_tfidf),
              ("ridge_w2v", ridge_w2v), ("ridge_w2v_tfidf", ridge_w2v_tfidf),
              ("rfr_w2v", rfr_w2v), ("xgb_w2v", xgb_w2v)]

w2v_rmse = sorted([(name, cv_rmse(model, X, luseful, cv=5)) 
                     for name, model in w2v_models], key=lambda x:x[1])

print (tabulate(w2v_rmse, floatfmt=".4f", headers=("model", "RMSE_5cv")))

model              RMSE_5cv
---------------  ----------
xgb_w2v              0.6037
rfr_w2v              0.6130
ridge_w2v_tfidf      0.6175
ridge_w2v            0.6176
lr_w2v_tfidf       175.3874
lr_w2v             324.2977


### What are the Most Useful or Useless Words?

To find the usefulness of any single word, I first estimate a regression model and then use it to predict the number of useful votes each single word will obtain in the entire vocabulary. For both XGBoost and Ridge regressors, words that involve a dollar amount, 'price', 'co-pay' or 'cash' tend to be the most useful words, perhaps because they are actually informative and provide objective facts. Words like 'confidence', 'amazing' and 'excellent' are mere subjective feelings, so they are among the least useful of all words.

In [13]:
xgb = XGBRegressor(learning_rate=0.01, 
                    n_estimators=2000, 
                    max_depth=3,
                    min_child_weight=3,
                    subsample=0.8,
                    colsample_bytree=0.6,
                    reg_lambda=10,
                    n_jobs=4,  
                    random_state=0).fit(MeanEmbeddingVectorizer(word_vectors)
                                        .fit(X, luseful).transform(X), luseful)

In [14]:
# Find the predicted usefulness of each word
words = np.array(list(word_vectors.keys()))
predicted = xgb.predict(list(word_vectors.values()))
indices = np.argsort(predicted)

In [15]:
useful_words = words[indices[-10:]]
useful_pred = predicted[indices[-10:]]
df = pd.DataFrame({'Most Useful Words': useful_words, 
                   'Predicted Useful Votes': np.exp(useful_pred)-1})
print (tabulate(df.sort_values('Predicted Useful Votes', ascending=False), 
                headers=df.columns, showindex=False))

Most Useful Words      Predicted Useful Votes
-------------------  ------------------------
1                                    0.712784
show_up                              0.688866
even_though                          0.685041
$_70                                 0.679914
$_20                                 0.679213
third                                0.676152
$_30                                 0.667335
$                                    0.662971
20                                   0.63884
decide                               0.634052


In [16]:
useless_words = words[indices[:10]]
useless_pred = predicted[indices[:10]]
df = pd.DataFrame({'Least Useful Words': useless_words, 
                   'Predicted Useful Votes': np.exp(useless_pred)-1})
print (tabulate(df, headers=df.columns, showindex=False))

Least Useful Words      Predicted Useful Votes
--------------------  ------------------------
confidence                         -0.0401011
sincerely                          -0.0324617
constantly                         -0.0163424
funky                              -0.0150891
birth                              -0.0141224
most_importantly                   -0.00772482
also                               -0.00560182
encouragement                       0.00176609
kid                                 0.00631058
glow                                0.00653279


In [17]:
ridge = Ridge(alpha=1).fit(MeanEmbeddingVectorizer(word_vectors)
                           .fit(X, luseful).transform(X), luseful)

In [18]:
words = np.array(list(word_vectors.keys()))
predicted = ridge.predict(list(word_vectors.values()))
indices = np.argsort(predicted)

In [19]:
useful_words = words[indices[-10:]]
useful_pred = predicted[indices[-10:]]
df = pd.DataFrame({'Most Useful Words': useful_words, 
                   'Predicted Useful Votes': np.exp(useful_pred)-1})
print (tabulate(df.sort_values('Predicted Useful Votes', ascending=False), 
                headers=df.columns, showindex=False))

Most Useful Words      Predicted Useful Votes
-------------------  ------------------------
parking                           1.19824e+07
price                             5.673e+06
few_minute                        2.51517e+06
become                            1.29542e+06
co_pay                            1.2212e+06
than                         691790
amount                       507229
side                         247344
cash                         179244
pretty                       170521


In [20]:
useless_words = words[indices[:10]]
useless_pred = predicted[indices[:10]]
df = pd.DataFrame({'Least Useful Words': useless_words, 
                   'Predicted Useful Votes': np.exp(useless_pred)-1})
print (tabulate(df, headers=df.columns, showindex=False))

Least Useful Words      Predicted Useful Votes
--------------------  ------------------------
exceptional                          -1
excellent                            -1
amazing                              -1
anyone                               -1
great                                -0.999999
expert                               -0.999999
outstanding                          -0.999999
issue                                -0.999998
incredible                           -0.999998
wonderful                            -0.999998
