__Experiments__

In [190]:
import pandas as pd
import numpy as np
import nltk
from sklearn.feature_extraction import DictVectorizer
from collections import Counter
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [108]:
data_path = '../data/clean_data_full.csv'
df = pd.read_csv(data_path)

In [109]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 51832 entries, 0 to 51831
Data columns (total 31 columns):
Unnamed: 0                  51832 non-null int64
user_num_places_rated       51832 non-null int64
user_num_following          51832 non-null int64
user_url                    51832 non-null object
beer_num_ratings            51832 non-null int64
user_num_friends            51832 non-null int64
user_num_breweries_rated    51832 non-null int64
review_palate_score         51832 non-null int64
review_taste_score          51832 non-null int64
user_num_ratings            51832 non-null int64
review_ratings_blob         51832 non-null object
review_aroma_score          51832 non-null int64
user_num_countries_rated    51832 non-null int64
user_id                     51832 non-null int64
review_avg_score            51832 non-null float64
beer_global_style_score     47253 non-null float64
beer_weighted_avg_score     51760 non-null float64
beer_brewer_name            51832 non-null object


In [140]:
## Num rows
len(df)

51832

In [151]:
mini_train_d = df[0:100]

100

In [156]:
## Basic test (0.2) train (0.8) split
def train_test_split(df, proportion):
    msk = np.random.rand(len(df)) < proportion
    train = df[msk]
    test = df[~msk]
    return train, test

train, test = train_test_split(mini_train_d, 0.8)
print len(train)
print len(test)

78
22


# Feature Engineering

In [191]:
def unigram_phi(review):
    return Counter(review.split())
def bigram_phi(review):
    return Counter(nltk.bigrams(review.split()))

In [192]:
def build_data_set(data, phi, vectorizer = None):
    REVIEW_BLOB = 24
    TASTE_SCORE = 8
    labels = []
    feat_dicts = []
    raw_examples = []
    data_values = data.values
    for row in data.values:
        review, score = row[REVIEW_BLOB], row[TASTE_SCORE]
        if not isinstance(review, basestring):
            print 'weird review:\t', review
        feat_dicts.append(phi(review))
        labels.append(score)
        raw_examples.append(review)
        
    # In training, we want a new vectorizer:    
    if vectorizer == None:
        vectorizer = DictVectorizer(sparse=True)
        feat_matrix = vectorizer.fit_transform(feat_dicts)
    # In assessment, we featurize using the existing vectorizer:
    else:
        feat_matrix = vectorizer.transform(feat_dicts)

    return {'X': feat_matrix, 
            'y': labels, 
            'vectorizer': vectorizer, 
            'raw_examples': raw_examples}

In [193]:
d_train = build_data_set(train, bigram_phi)
d_test = build_data_set(test, bigram_phi)
d_test['X'].shape

(22, 564)

In [194]:
def fit_linear_regression(X, y):    
    """
    """
    mod = LinearRegression(fit_intercept=True)
    mod.fit(X, y)
    return mod

In [195]:
def experiment(data, model, phi, train_size = 0.7):
    ## Build data set
    train = build_data_set(data, phi)
    X_train = train['X']
    y_train = train['y']

    ## Test-train split
    X_train, X_assess, y_train, y_assess = train_test_split(
            X_train, y_train, train_size = train_size)
    
    ## Model data
    mod = model(X_train, y_train)
    predictions = mod.predict(X_assess)
    
    ## Return MSE
    return mean_squared_error(y_assess, predictions)
    

In [198]:
experiment(train, fit_linear_regression, unigram_phi)

5.908701293105957