# __Experiments__

In [60]:
import math
import pandas as pd
import numpy as np
import nltk
import time
from sklearn.feature_extraction import DictVectorizer
from collections import Counter
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import GradientBoostingRegressor
import scipy.stats

In [4]:
data_path = '../data/clean_data_full.csv'
df = pd.read_csv(data_path)
# df.info()

In [7]:
## Num rows
print "num observations:\t", len(df)

num observations:	52778


# Feature Engineering

In [5]:
def unigram_phi(review):
    return Counter(review.split())
def bigram_phi(review):
    return Counter(nltk.bigrams(review.split()))
def unigram_bigram_phi(review):
    return unigram_phi(review) + bigram_phi(review)
def trigram_phi(review):
    return Counter(nltk.trigrams(review.split()))

# Learners

In [8]:
def fit_linear_regression(X, y):    
    """
    Linear Regression
    """
    mod = LinearRegression(fit_intercept=True, n_jobs = -1)
    mod.fit(X, y)
    return mod

In [9]:
def fit_gbm_regression(X, y,
                       n_estimators = 100,
                       learning_rate = 0.1,
                       max_depth = 1,
                       random_state = 0,
                       loss = "ls"):
    """
    Gradient Boosting Method Regression
    """
    gbm = GradientBoostingRegressor(n_estimators = n_estimators,
                                    learning_rate = learning_rate,
                                    max_depth = max_depth,
                                    random_state = random_state,
                                    loss = "ls")
    mod = gbm.fit(X.toarray(), y)
    return mod

In [10]:
def fit_lasso(X, y, alpha = 0.1, max_iter = 1000):
    lasso = Lasso(alpha = alpha, max_iter = max_iter)
    mod = lasso.fit(X, y)
    return mod

In [11]:
def fit_ridge(X, y, alpha = 0.1, max_iter = 1000):
    lasso = Lasso(alpha = alpha, max_iter = max_iter)
    mod = lasso.fit(X, y)
    return mod

In [12]:
def build_data_set(data, vectorizer = None, aspect_str = "OVERALL"):
    """
    Aspect ratings cols:
        (7)  review_palate_score      -- score_normalizer = 1
        (8)  review_taste_score       -- score_normalizer = 2
        (11) review_aroma_score       -- score_normalizer = 2
        (14) review_avg_score         -- score_normalizer = 1
        (18) review_overall_score     -- score_normalizer = 4
        (20) review_appearance_score  -- score_normalizer = 1
        
    predict_col :: column for aspect we're predicting, current default is column 8 (TASTE_SCORE)
    
    """
    
    ## RateBeer scrape data locations
    ## ------------------------------
    REVIEW_BLOB = 24
    ASPECTS = {
        "PALATE"     : [7, 1],
        "TASTE"      : [8, 2],
        "AROMA"      : [11, 2],
        "AVERAGE"    : [14, 1],
        "OVERALL"    : [18, 4],
        "APPEARANCE" : [20, 1]
    }
    assert aspect_str in ASPECTS
        
    aspect_column = ASPECTS[aspect_str][0]      ## Get aspect rating column
    aspect_normalizer = ASPECTS[aspect_str][1]  ## Get aspect normalizer
    labels = []                                 ## Ratings
    feat_dicts = []                             ## Features
    raw_examples = []                           ## Review strings
    data_values = data.values                   ## Data from pandas df
    for row in data.values:
        review, score = row[REVIEW_BLOB], row[aspect_column]
        score = float(score) / aspect_normalizer

        ## Safety check
        if not isinstance(review, basestring):
            print 'weird review:\t', review
            
#         feat_dicts.append(phi(review))
        labels.append(score)
        raw_examples.append(review)
        
    # In training, we want a new vectorizer:
    if vectorizer == None:
        vectorizer = DictVectorizer(sparse=True)
        feat_matrix = vectorizer.fit_transform(feat_dicts)
    # In assessment, we featurize using the existing vectorizer:
    else:
        feat_matrix = vectorizer.fit_transform(raw_examples)

    return {'X'            : feat_matrix, 
            'y'            : labels, 
            'vectorizer'   : vectorizer, 
            'raw_examples' : raw_examples,
            'feature_names' : vectorizer.get_feature_names()}

In [71]:
def experiment(data,
               model = fit_lasso,
               phi = None,
               assess_data = None,
               train_size = 0.9,
               score_metrics = [mean_squared_error],
               verbose = False):
    
    start_time = time.time()
    
    ## Build data set
    # train = build_data_set(train_data, phi, vectorizer = vectorizer)
    X_train = data['X'] 
    y_train = data['y']
    vectorizer = data['vectorizer']
    feature_names = data['feature_names']

    ## Test-train split
    if assess_data == None:
        X_train, X_assess, y_train, y_assess = train_test_split(
                X_train, y_train, train_size = train_size)
    ## Only use for test-set
    else:
        assess = build_data_set(assess_data, phi, vectorizer = vectorizer)
        X_assess, y_assess = assess['X'], assess['y']
        

    ## Model data
    if model == None:
        predictions = rep(np.mean(X_assess), len(y_assess))
    mod = model(X_train, y_train)
    predictions = mod.predict(X_assess.toarray())
    
    run_time = time.time() - start_time
    if verbose:
        print "\tExperiment information"
        print '\t======================='
        print "\ttrained:\t", model.__name__
        print "\tnum training observations:\t", X_train.shape[0]
        print "\tnum training features:\t", X_train.shape[1]
        print "\tscore_metrics:\t", [score_metric.__name__ for score_metric in\
                                     score_metrics]
        print "\trun time: ", run_time
        if model.__name__ == 'fit_linear_regression' or\
            model.__name__ == 'fit_lasso' or\
            model.__name__ == 'fit_ridge':
            print feature_names[1:10]
            print mod.coef_[1]
        print "y_assess[1:10]", y_assess[1:10]
    
    ## Return MSE
    return [score_metric(y_assess, predictions) for score_metric in score_metrics]
    

In [14]:
def train_reader(train_path = data_path, n = 1000):
    df = pd.read_csv(data_path)
    return df.sample(n)

def dev_reader(test_path, n = 100):
    df = pd.read_csv(data_path)
    return df.sample(n)

# Vectorizer's encode phi...

In [None]:
## bigram_vectorizer
## -----------------
## normalization:
##  * lower
##  * remove stop words
##  * min word length == 2
##
bigram_vectorizer = CountVectorizer(analyzer='word', stop_words = 'english', ngram_range=(2, 2), min_df = 3, max_features = 5000)

In [None]:
## unigram_vectorizer
## -----------------
## normalization:
##  * lower
##  * remove stop words
##  * min word length == 2
##
unigram_vectorizer = CountVectorizer(analyzer='word', stop_words = 'english', ngram_range=(1, 1), min_df = 3, max_features = 5000)

In [None]:
## trigram_vectorizer
## ------------------
## normalization:
##  * lower
##  * remove stop words
##  * min word length == 2
##
trigram_vectorizer = CountVectorizer(analyzer='word', stop_words = 'english', ngram_range=(3, 3), min_df = 3, max_features = 5000)

In [63]:
def set_vectorizer(n_gram = "unigram", min_df = 3, sample_size = 10000, stop_words = None):

    max_features = int(sample_size * 0.75)
    if n_gram == "trigram":
        ngram_range = (3,3)
    elif n_gram == "bigram":
        ngram_range = (2,2)
    else: # default to unigram
        ngram_range = (1,1)
    
    return CountVectorizer(analyzer='word',
                           ngram_range = ngram_range,
                           min_df = min_df,
                           max_features = max_features,
                           stop_words = stop_words)
        

# Data

In [68]:
# def build_data_set(data, phi, vectorizer = None, aspect_str = "OVERALL"):

n_samples = 40000
vectorizer = set_vectorizer(n_gram = "unigram", sample_size = n_samples)
train_d = train_reader(data_path, n = n_samples)
built_data_set = build_data_set(train_d, vectorizer = vectorizer, aspect_str = 'TASTE')

# Prelim runs

### Linear regresssion

In [70]:
## Note: running time ~ 1 minute @ 40k observations
experiment(data = built_data_set,
           model = fit_linear_regression,
           verbose = True,
           score_metrics = [mean_squared_error, r2_score])

	Experiment information
	trained:	fit_linear_regression
	num training observations:	28000
	num training features:	15349
	score_metrics:	['mean_squared_error', 'r2_score']
	run time:  36.1191158295
[u'000', u'001', u'00euro', u'00us', u'01', u'02', u'03', u'04', u'05']
-0.0135669739936


[3.5241066651379738, -2.9326080769577754]

### Ridge regression

In [67]:
## Note: running time few seconds @ 40k observations
experiment(data = built_data_set,
           model = fit_ridge,
           verbose = True, 
           score_metrics = [mean_squared_error, r2_score])

	Experiment information
	trained:	fit_ridge
	num training observations:	14000
	num training features:	10244
	score_metrics:	['mean_squared_error', 'r2_score']
	run time:  0.962724924088
[u'00euro', u'01', u'02', u'03', u'04', u'05', u'06', u'07', u'08']
-0.0


[0.92520561852400296, 0.0055677467043488971]

## Lasso

In [66]:
## Note: running time few seconds @ 40k observations
experiment(data = built_data_set,
           model = fit_lasso,
           verbose = True,
           score_metrics = [mean_squared_error, r2_score])

	Experiment information
	trained:	fit_lasso
	num training observations:	14000
	num training features:	10244
	score_metrics:	['mean_squared_error', 'r2_score']
	run time:  1.13637089729
[u'00euro', u'01', u'02', u'03', u'04', u'05', u'06', u'07', u'08']
-0.0


[0.90837965400798115, 0.0063928585260937254]

### GBM regression

In [47]:
## Note: running time is currently ver long
# experiment(data = built_data_set,
#            model = fit_gbm_regression,
#            verbose = False)

# Between aspects - lasso

In [46]:
ngram = 'bigram'
ASPECTS = ['OVERALL', 'TASTE', 'AROMA', 'PALATE', 'APPEARANCE']
for aspect in ASPECTS:
    n_samples = 40000
    vectorizer = set_vectorizer(n_gram = ngram, sample_size = n_samples)
    train_d = train_reader(data_path, n = n_samples)
    built_data_set = build_data_set(train_d, vectorizer = vectorizer, aspect_str = aspect)

    print "\n-----------------"
    print "ngram:\t", ngram
    print "curr aspect:\t", aspect
    mse = experiment(data = built_data_set,
           model = fit_lasso,
           verbose = True)
    print "mse:\t", mse


-----------------
ngram:	bigram
curr aspect:	OVERALL
	Experiment information
	trained:	fit_lasso
	num training observations:	28000
	num training features:	20000
	score_metric:	mean_squared_error
	run time:  3.36163902283
[u'03 07', u'04 05', u'04 07', u'04 2016', u'05 pours', u'06 07', u'06 08', u'07 07', u'07 poured']
-0.0
mse:	0.959995435162

-----------------
ngram:	bigram
curr aspect:	TASTE
	Experiment information
	trained:	fit_lasso
	num training observations:	28000
	num training features:	20000
	score_metric:	mean_squared_error
	run time:  3.04476690292
[u'04 07', u'04 2016', u'05 pours', u'06 07', u'06 08', u'07 07', u'07 poured', u'10 2012', u'10 2014']
-0.0
mse:	0.910395687819

-----------------
ngram:	bigram
curr aspect:	AROMA
	Experiment information
	trained:	fit_lasso
	num training observations:	28000
	num training features:	20000
	score_metric:	mean_squared_error
	run time:  3.11804795265
[u'04 07', u'04 2016', u'05 pours', u'06 07', u'06 08', u'07 07', u'07 poured', u'10

# Multiple runs

In [128]:
learners = [fit_linear_regression, fit_lasso, fit_gbm_regression]
vectorizers = [unigram_vectorizer, bigram_vectorizer]
vectorizer_names = ["unigram", "bigram"]

In [132]:
for learner in learners:
    print "\n============================="
    print "Fitting:\t", learner.__name__
    for vectorizer, name in zip(vectorizers, vectorizer_names):
        print "-------"
        print "vectorizer:\t", name
        mse = experiment(train_d,
                         learner,
                         unigram_phi,
                         vectorizer = vectorizer,
                         verbose = True)
        print "mse:\t", mse


Fitting:	fit_linear_regression
-------
vectorizer:	unigram


KeyboardInterrupt: 

# Model testing

In [283]:
def run_model(d_sample, model, phi, n_samples = 100):
    res = [experiment(data, model, phi) for _ in range(n_samples)]
    return res

In [284]:
glm_res = run_model(train, fit_linear_regression, trigram_phi)
gbm_res = run_model(train, fit_gbm_regression, trigram_phi)
lasso_res = run_model(train, fit_lasso, trigram_phi)

In [285]:
print "glm:", np.mean(glm_res), np.var(glm_res)
print "gbm:", np.mean(gbm_res), np.var(gbm_res)
print "lasso", np.mean(lasso_res), np.var(lasso_res)

glm: 5.09104430619 1.65291139452
gbm: 5.4193393923 2.1477179012
lasso 5.5710284521 1.65857267327


In [288]:
print scipy.stats.wilcoxon(glm_res, gbm_res)[1]
print scipy.stats.wilcoxon(glm_res, lasso_res)[1]
print scipy.stats.wilcoxon(gbm_res, lasso_res)[1]

0.0933661297989
0.0212407456292
0.5025560931
