__Experiments__

In [265]:
import pandas as pd
import numpy as np
import nltk
from sklearn.feature_extraction import DictVectorizer
from collections import Counter
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor
import scipy.stats

In [108]:
data_path = '../data/clean_data_full.csv'
df = pd.read_csv(data_path)

In [109]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 51832 entries, 0 to 51831
Data columns (total 31 columns):
Unnamed: 0                  51832 non-null int64
user_num_places_rated       51832 non-null int64
user_num_following          51832 non-null int64
user_url                    51832 non-null object
beer_num_ratings            51832 non-null int64
user_num_friends            51832 non-null int64
user_num_breweries_rated    51832 non-null int64
review_palate_score         51832 non-null int64
review_taste_score          51832 non-null int64
user_num_ratings            51832 non-null int64
review_ratings_blob         51832 non-null object
review_aroma_score          51832 non-null int64
user_num_countries_rated    51832 non-null int64
user_id                     51832 non-null int64
review_avg_score            51832 non-null float64
beer_global_style_score     47253 non-null float64
beer_weighted_avg_score     51760 non-null float64
beer_brewer_name            51832 non-null object


In [140]:
## Num rows
len(df)

51832

In [151]:
mini_train_d = df[0:100]

100

In [156]:
## Basic test (0.2) train (0.8) split
def train_test_split(df, proportion):
    msk = np.random.rand(len(df)) < proportion
    train = df[msk]
    test = df[~msk]
    return train, test

train, test = train_test_split(mini_train_d, 0.8)
print len(train)
print len(test)

78
22


# Feature Engineering

In [282]:
def unigram_phi(review):
    return Counter(review.split())
def bigram_phi(review):
    return Counter(nltk.bigrams(review.split()))
def unigram_bigram_phi(review):
    return unigram_phi(review) + bigram_phi(review)
def trigram_phi(review):
    return Counter(nltk.trigrams(review.split()))

In [192]:
def build_data_set(data, phi, vectorizer = None):
    REVIEW_BLOB = 24
    TASTE_SCORE = 8
    labels = []
    feat_dicts = []
    raw_examples = []
    data_values = data.values
    for row in data.values:
        review, score = row[REVIEW_BLOB], row[TASTE_SCORE]
        if not isinstance(review, basestring):
            print 'weird review:\t', review
        feat_dicts.append(phi(review))
        labels.append(score)
        raw_examples.append(review)
        
    # In training, we want a new vectorizer:    
    if vectorizer == None:
        vectorizer = DictVectorizer(sparse=True)
        feat_matrix = vectorizer.fit_transform(feat_dicts)
    # In assessment, we featurize using the existing vectorizer:
    else:
        feat_matrix = vectorizer.transform(feat_dicts)

    return {'X': feat_matrix, 
            'y': labels, 
            'vectorizer': vectorizer, 
            'raw_examples': raw_examples}

In [193]:
d_train = build_data_set(train, bigram_phi)
d_test = build_data_set(test, bigram_phi)
d_test['X'].shape

(22, 564)

In [194]:
def fit_linear_regression(X, y):    
    """
    Linear Regression
    """
    mod = LinearRegression(fit_intercept=True)
    mod.fit(X, y)
    return mod

In [232]:
def fit_gbm_regression(X, y,
                       n_estimators = 100,
                       learning_rate = 0.1,
                       max_depth = 1,
                       random_state = 0,
                       loss = "ls"):
    """
    Gradient Boosting Method Regression
    """
    gbm = GradientBoostingRegressor(n_estimators = n_estimators,
                                    learning_rate = learning_rate,
                                    max_depth = max_depth,
                                    random_state = random_state,
                                    loss = "ls")
    mod = gbm.fit(X.toarray(), y)
    return mod

In [253]:
def fit_lasso(X, y, alpha = 0.1, max_iter = 1000):
    lasso = Lasso(alpha = alpha, max_iter = max_iter)
    mod = lasso.fit(X, y)
    return mod

In [254]:
def experiment(data, model, phi, train_size = 0.7):
    ## Build data set
    train = build_data_set(data, phi)
    X_train = train['X']
    y_train = train['y']

    ## Test-train split
    X_train, X_assess, y_train, y_assess = train_test_split(
            X_train, y_train, train_size = train_size)
    
    ## Model data
    mod = model(X_train, y_train)
    predictions = mod.predict(X_assess.toarray())
    
    ## Return MSE
    return mean_squared_error(y_assess, predictions)
    

In [255]:
experiment(train, fit_gbm_regression, bigram_phi)

4.9835614802792101

In [256]:
learners = [fit_linear_regression, fit_lasso, fit_gbm_regression]
phis = [unigram_phi, bigram_phi, unigram_bigram_phi]

In [257]:
for learner in learners:
    print "============================="
    print "Fitting:\t", learner.__name__
    for phi in phis:
        print "-------"
        print "phi:\t", phi.__name__
        print "mse:\t", experiment(train, learner, phi)        

Fitting:	fit_linear_regression
-------
phi:	unigram_phi
mse:	3.62147174996
-------
phi:	bigram_phi
mse:	5.85069689964
-------
phi:	unigram_bigram_phi
mse:	4.14539017663
Fitting:	fit_lasso
-------
phi:	unigram_phi
mse:	3.88304699485
-------
phi:	bigram_phi
mse:	5.04305786392
-------
phi:	unigram_bigram_phi
mse:	6.69353076001
Fitting:	fit_gbm_regression
-------
phi:	unigram_phi
mse:	6.48175391157
-------
phi:	bigram_phi
mse:	5.6615960342
-------
phi:	unigram_bigram_phi
mse:	2.83297702418


In [283]:
def run_model(data, model, phi, n_samples = 100):
    res = [experiment(data, model, phi) for _ in range(n_samples)]
    return res

In [284]:
glm_res = run_model(train, fit_linear_regression, trigram_phi)
gbm_res = run_model(train, fit_gbm_regression, trigram_phi)
lasso_res = run_model(train, fit_lasso, trigram_phi)

In [285]:
print "glm:", np.mean(glm_res), np.var(glm_res)
print "gbm:", np.mean(gbm_res), np.var(gbm_res)
print "lasso", np.mean(lasso_res), np.var(lasso_res)

glm: 5.09104430619 1.65291139452
gbm: 5.4193393923 2.1477179012
lasso 5.5710284521 1.65857267327


In [288]:
print scipy.stats.wilcoxon(glm_res, gbm_res)[1]
print scipy.stats.wilcoxon(glm_res, lasso_res)[1]
print scipy.stats.wilcoxon(gbm_res, lasso_res)[1]

0.0933661297989
0.0212407456292
0.5025560931
