# Language modeling

In [1]:
from __future__ import print_function


import random
import logging
import pandas as pd
import numpy as np
from pprint import pprint
from time import time

from collections import defaultdict

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
## Classifiers
from sklearn.metrics import f1_score, accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.cross_validation import cross_val_score
from sklearn.ensemble import RandomForestClassifier
## Regression
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, AdaBoostRegressor
from sklearn.grid_search import GridSearchCV
from sklearn.dummy import DummyRegressor
## Custom build_data_set helper
import sklearn_helpers

# Data

Read in our data frame

In [2]:
# Relative path of the cleaned data set used throughout the notebook.
data_path = '../data/clean_data_full.csv'
# Load the full file once; `df` is used again near the end of the notebook
# for summary statistics.
df = pd.read_csv(data_path)
print("Data dims:\t", df.shape)

Data dims:	 (52751, 33)


In [3]:
def data_reader(train_path = data_path, n = 1000, random_state = None):
    """
    Read a CSV file and return a random sample of its rows.

    Parameters
    ----------
    train_path : str
        Path of the CSV file to read. Defaults to the notebook-level
        `data_path`.
    n : int
        Number of rows to draw.
    random_state : int or numpy.random.RandomState, optional
        Seed forwarded to `DataFrame.sample` so the draw can be reproduced.
        The default (None) keeps the original unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        `n` randomly chosen rows of the file.
    """
    # Read from the path that was passed in; previously the argument was
    # ignored and the global `data_path` was always loaded.
    full_df = pd.read_csv(train_path)
    return full_df.sample(n, random_state = random_state)

Let's get the reviews and the number of ratings the users have made...

In [4]:
# Work on a 30,000-row random sample of the data.
curr_df = data_reader(data_path, n = 30000)

# Parallel arrays: review text (used below as model input) and the
# user-experience value for each review (used below as the label).
reviews = curr_df['review_blob'].values
user_experience_level = curr_df['user_experience'].values

In [5]:
# Spot-check one review. NOTE(review): `curr_df` is an unseeded random
# sample, so this index presumably shows a different review on each fresh run.
print(reviews[12931])

bottle from may 2010. opaque black body and a thick mocha head. roasty, dark chocolate aroma and a bit of spiciness from the booze. amazing taste of dark chocolate with a slight spiciness from the booze. Finish is an amazing combo of bitterness and slight spice or pepper as advertised. the pepper just lingers in a not overwhelming way. i love this beer 


# Helpers

In [5]:
def fit_model_cv(X, y, model, cv = 10, scoring = 'mean_squared_error'):
    """
    Cross-validate `model` on (X, y) and return the per-fold scores.

    Thin wrapper around `cross_val_score`: `cv` and `scoring` are passed
    straight through, and the scores are returned exactly as
    `cross_val_score` produced them (no sign change).
    """
    cv_options = {'X': X, 'y': y, 'scoring': scoring, 'cv': cv}
    return cross_val_score(model, **cv_options)

In [6]:
def set_vectorizer(ngram_range = (1,1),
                   features_prop = 0.2,
                   min_df=3,
                   max_df=1.0,
                   lowercase=True,
                   analyzer="word",
                   token_pattern=u"(?u)\\b\\w+\\b",
                   binary = False,
                   max_features = None,
                   sample_size = None):
    """
    Build an unfitted CountVectorizer from the given settings.

    Parameters
    ----------
    ngram_range, min_df, max_df, lowercase, analyzer, token_pattern, binary
        Passed to `CountVectorizer` unchanged.
    features_prop : float
        Fraction of `sample_size` used as the vocabulary cap when
        `max_features` is not given.
    max_features : int, optional
        Explicit vocabulary cap; takes precedence over `features_prop`.
    sample_size : int, optional
        Number of documents; only used to derive `max_features`.

    Returns
    -------
    CountVectorizer
        If neither `max_features` nor `sample_size` is given,
        `max_features` is passed as None.
    """
    # `max_features` and `sample_size` used to be read without ever being
    # defined, so every call raised; they are now optional arguments.
    if max_features is None and sample_size is not None:
        max_features = int(sample_size * features_prop)

    return CountVectorizer(ngram_range=ngram_range,
                           max_features=max_features,
                           min_df=min_df,
                           max_df=max_df,
                           lowercase=lowercase,
                           analyzer=analyzer,
                           token_pattern=token_pattern,
                           binary=binary)

# Language Model Classifier

In [8]:
from LanguageModel import Trigram_SB_LM, UnigramLM_Laplace
import LanguageModelClassifier

In [17]:
## Unigram Laplace smoothing
## -------------------------
## =========================
## Notes from earlier runs:
## CV 2 with 15K obs ~116sec :: score = 0.385, var = 0.0002
## CV 10 with 15K obs ~229sec :: score = 0.3959, var = 0.0019
## CV 10 with 30000 obs ~437sec :: score = 0.45, var = 0.000118
# 10-fold cross-validated accuracy of the unigram language-model classifier
# on the raw review text, with wall-clock timing.
Uni_lm = LanguageModelClassifier.LanguageModelClassifier(UnigramLM_Laplace)
t0 = time()
preds = fit_model_cv(reviews, user_experience_level, Uni_lm, cv = 10, scoring = 'accuracy')
print("runtime:\t", time() - t0)
# Per-fold scores, then their mean and variance.
print(preds)
print(np.mean(preds))
print(np.var(preds))

runtime:	 443.94390893
[ 0.44718427  0.44885038  0.46284572  0.46684439  0.452       0.463
  0.44166667  0.45715238  0.44248083  0.46364243]
0.454566706444
7.85702508469e-05


In [16]:
## Stupid back-off runs
## -------------------------
## =========================
## Notes from earlier runs:
## CV 2 with 10K obs ~98sec
## CV 10 with 15K obs ~248sec
## CV 10 with 25K obs ~407sec
## CV 2, with 20K obs ~487
## CV 10 with 25K obs ~375, score 0.3383, var 0.00054
## CV 10 with 40K obs ~ 596sec, 0.33score , 0.0004var
## CV 10 with 30000 obs ~455 :: score = 0.494, var = 0.0000000118
## CV 10 with 30000 obs ~455 :: score = 0.494, var = 0.00000918

# 10-fold cross-validated accuracy of the trigram stupid back-off
# language-model classifier on the raw review text, with wall-clock timing.
SB_lm = LanguageModelClassifier.LanguageModelClassifier(Trigram_SB_LM)
t0 = time()
preds = fit_model_cv(reviews, user_experience_level, SB_lm, cv = 10, scoring = 'accuracy')
print("runtime:\t", time() - t0)
# Per-fold scores, then their mean and variance.
print(preds)
print(np.mean(preds))
print(np.var(preds))

runtime:	 465.67603898
[ 0.49350217  0.47850716  0.5164945   0.4925025   0.48833333  0.49133333
  0.49633333  0.49649883  0.48149383  0.49499666]
0.492999565989
9.46820665664e-05


# Prepare data for Baseline, NB and Random Forest

In [7]:
# Token pattern matching runs of one or more word characters, so
# single-character tokens are kept.
VECTORIZER_TOKEN_PATTERN = u"(?u)\\b\\w+\\b"

# Word-level 1- to 3-gram counts; min_df=1 and max_df=1.0 apply no
# document-frequency pruning.
ngram_vectorizer = CountVectorizer(
    ngram_range=(1, 3),
    min_df=1,
    max_df=1.0,
    lowercase=True,
    analyzer="word",
    token_pattern=VECTORIZER_TOKEN_PATTERN,
)

In [8]:
# Build the modelling data with the project helper. The result is indexed
# below as built_data_set['X'] and built_data_set['y'].
# NOTE(review): presumably 'X' holds the n-gram features of the reviews and
# 'y' is derived from the "user_num_ratings" column -- confirm in
# sklearn_helpers.build_data_set.
built_data_set = sklearn_helpers.build_data_set(curr_df,
                                                vectorizer = ngram_vectorizer,
                                                aspect_str = "user_num_ratings")

# Predict user log ratings

In [9]:
def fit_model_cv(X, y, model, cv = 10, scoring = 'mean_squared_error'):
    """
    Cross-validate `model` on (X, y) in parallel and return per-fold scores,
    with error metrics reported as positive numbers.

    Parameters
    ----------
    X, y
        Features and targets, passed to `cross_val_score` unchanged.
    model
        Estimator to evaluate.
    cv : int
        Cross-validation setting passed to `cross_val_score`.
    scoring : str or callable
        Metric passed to `cross_val_score`.

    Returns
    -------
    The scores from `cross_val_score`. For error metrics that scikit-learn
    reports negated (e.g. 'mean_squared_error'), the sign is flipped back so
    that lower is better; all other metrics (e.g. 'accuracy') are returned
    unchanged.
    """
    # Error metrics that scikit-learn returns as negative values so that
    # "greater is better" holds for every scorer.
    negated_losses = ('mean_squared_error', 'mean_absolute_error',
                      'median_absolute_error', 'log_loss')

    scores = cross_val_score(model,
                             X = X,
                             y = y,
                             scoring = scoring,
                             cv = cv,
                             n_jobs = -1)

    # Only flip the sign for negated losses. Flipping unconditionally turned
    # scores such as accuracy negative when this helper was reused for the
    # classifiers further down the notebook.
    is_negated_loss = hasattr(scoring, 'startswith') and (
        scoring in negated_losses or scoring.startswith('neg_'))
    if is_negated_loss:
        return scores * -1
    return scores

## Ridge Regression

In [20]:
## Tuning Ridge
# Grid-search the regularisation strength with 10-fold cross-validation.
param_grid = {'alpha': [0.0001, 0.001, 0.01, 0.1, 1]}
# A fresh Ridge() is passed in: the `ridge` variable is only created in a
# later cell, so referring to it here fails on a clean top-to-bottom run.
# The grid supplies `alpha`, so no other settings are needed on the estimator.
ridge_search = GridSearchCV(Ridge(), param_grid, scoring="mean_squared_error", cv=10, n_jobs=-1,
                            pre_dispatch='2*n_jobs')
ridge_search.fit(built_data_set['X'], built_data_set['y'])
best_alpha = ridge_search.best_params_['alpha']

In [22]:
# Ridge regression using the alpha selected by the grid search above.
ridge = Ridge(alpha=best_alpha)

# Time a 10-fold cross-validation (fit_model_cv's default) and report the
# per-fold results.
start_time = time()
res = fit_model_cv(built_data_set['X'], built_data_set['y'], ridge)
run_time = time() - start_time
print("runtime:\t", run_time)
print("results:\t", res)

runtime:	 11.3086550236
results:	 [ 0.16618698  0.17540324  0.16622791  0.17081001  0.16771507  0.15280705
  0.1683629   0.16585258  0.15763097  0.16212557]


## Lasso

In [15]:
# Lasso regression with a fixed (untuned) regularisation strength.
# NOTE(review): the recorded fold results for this cell are identical to the
# DummyRegressor baseline's, which suggests alpha=0.1 shrinks every
# coefficient to zero -- consider tuning alpha as is done for Ridge.
lasso = Lasso(alpha=0.1)

# Time a 10-fold cross-validation (fit_model_cv's default) and report the
# per-fold results.
start_time = time()
res = fit_model_cv(built_data_set['X'], built_data_set['y'], lasso)
run_time = time() - start_time
print("runtime:\t", run_time)
print("results:\t", res)

runtime:	 34.3513929844
results:	 [ 0.79278528  0.83662492  0.81684246  0.82282741  0.80772041  0.79014939
  0.78796325  0.81053297  0.77854169  0.79399241]


## Random Forest

In [14]:
# Random forest regressor: 100 trees, sqrt(n_features) candidates per split,
# trained on all available cores.
random_forest = RandomForestRegressor(n_estimators=100,
                                      max_features="sqrt",
                                      n_jobs=-1)
# Only 2 folds here, unlike the 10 used for the other regressors.
start_time = time()
res = fit_model_cv(built_data_set['X'], built_data_set['y'], random_forest, cv=2)
run_time = time() - start_time
print("runtime:\t", run_time)
print("results:\t", res)

runtime:	 1455.33228302
results:	 [ 0.63688272  0.61744725]


## Dummy Regressor (baseline)

In [12]:
# Regression baseline: DummyRegressor with its default settings.
baseline = DummyRegressor()

# Time a 10-fold cross-validation (fit_model_cv's default) and report the
# per-fold results.
start_time = time()
res = fit_model_cv(built_data_set['X'], built_data_set['y'], baseline)
run_time = time() - start_time
print("runtime:\t", run_time)
print("results:\t", res)

runtime:	 6.90261101723
results:	 [ 0.79278528  0.83662492  0.81684246  0.82282741  0.80772041  0.79014939
  0.78796325  0.81053297  0.77854169  0.79399241]


# Naive Bayes

In [11]:
# Multinomial naive Bayes classifier with default settings.
Multinomial_NB = MultinomialNB()

In [18]:
# 10-fold cross-validated accuracy of naive Bayes on the built data set.
preds = fit_model_cv(built_data_set['X'], built_data_set['y'], Multinomial_NB, cv = 10, scoring = 'accuracy')

In [19]:
# Per-fold scores, then their mean and variance.
print(preds)
print(np.mean(preds))
print(np.var(preds))

[ 0.59813396  0.58713762  0.57514162  0.5838054   0.57333333  0.577
  0.60566667  0.56418806  0.57352451  0.58772515]
0.582565631476
0.00014158957577


# Baseline

In [14]:
# Classification baseline from the project's LanguageModelClassifier module,
# scored by 10-fold cross-validated accuracy on the raw review text.
# NOTE(review): this rebinds `baseline`, which earlier in the notebook held
# the DummyRegressor -- a distinct name would avoid confusion.
baseline = LanguageModelClassifier.BaselineLanguageModel()
preds = fit_model_cv(reviews, user_experience_level, baseline, cv = 10, scoring = 'accuracy')

In [15]:
# Mean and variance of the per-fold baseline scores.
print(np.mean(preds))
print(np.var(preds))

0.253666694857
7.15250628363e-09


# Random Forest

In [19]:
## Notes from earlier runs:
## runtime ~408sec with n_estimators=100, number-of-observations=full set
## runtime ~359sec with n_estimators=500, 20,000 observations (0.32326767)
## runtime ~7200sec with n_estimators=100, number-of-observations=30000, score = 0.638600323334

# Random forest classifier: 100 trees, log2(n_features) candidates per split,
# trained on all available cores; scored by 10-fold cross-validated accuracy.
rf_classifier = RandomForestClassifier(max_features='log2', n_estimators=100, n_jobs=-1)
t0 = time()
preds = fit_model_cv(built_data_set['X'], built_data_set['y'], rf_classifier, cv = 10, scoring = 'accuracy')
print("runtime:\t", time() - t0)
# Per-fold scores, then their mean and variance.
print(preds)
print(np.mean(preds))
print(np.var(preds))

runtime:	 7200.27582192
[ 0.63457695  0.64145285  0.63366667  0.64        0.641       0.63366667
  0.64133333  0.63754585  0.65188396  0.63087696]
0.638600323334
3.25410581082e-05


In [20]:
# One record per CV fold: (model name, hyper-parameters, sample size, score).
# The descriptive strings are hard-coded and must be kept in sync by hand with
# the rf_classifier settings and the sample size used above.
rf_data = [('random_forest', 'max_features=log2;n_estimators=100', 'sample_size=30000', p) for p in preds]

In [21]:
# Inspect the records before writing them to disk.
rf_data

[('random_forest',
  'max_features=log2;n_estimators=100',
  'sample_size=30000',
  0.63457694870086612),
 ('random_forest',
  'max_features=log2;n_estimators=100',
  'sample_size=30000',
  0.64145284905031652),
 ('random_forest',
  'max_features=log2;n_estimators=100',
  'sample_size=30000',
  0.63366666666666671),
 ('random_forest',
  'max_features=log2;n_estimators=100',
  'sample_size=30000',
  0.64000000000000001),
 ('random_forest',
  'max_features=log2;n_estimators=100',
  'sample_size=30000',
  0.64100000000000001),
 ('random_forest',
  'max_features=log2;n_estimators=100',
  'sample_size=30000',
  0.63366666666666671),
 ('random_forest',
  'max_features=log2;n_estimators=100',
  'sample_size=30000',
  0.64133333333333331),
 ('random_forest',
  'max_features=log2;n_estimators=100',
  'sample_size=30000',
  0.63754584861620545),
 ('random_forest',
  'max_features=log2;n_estimators=100',
  'sample_size=30000',
  0.65188396132044013),
 ('random_forest',
  'max_features=log2;n_esti

In [36]:
# `csv` and `date` are imported at the top of this cell: `date` was
# previously only imported in a later cell, so this cell failed on a clean
# top-to-bottom run.
import csv
from datetime import date

# Output file name carries today's date (str() of a date is YYYY-MM-DD).
path = 'analysis_data/rf_cv10_' + str(date.fromtimestamp(time())) + '.csv'

csv.register_dialect("custom", delimiter=",", skipinitialspace=True)
with open(path, "w") as f:
    writer = csv.writer(f, dialect="custom")
    # Each rf_data tuple becomes one CSV row.
    for tup in rf_data:
        writer.writerow(tup)

In [33]:
from datetime import date
# Show today's date (the value used to stamp the output file name).
# NOTE(review): the recorded output below is a string, so it was presumably
# produced by str(date.today()) rather than this exact expression.
date.today()


'2016-05-20'

# Save content

In [45]:
# Computes the 25th, 50th and 75th percentiles of user_num_ratings over the
# full data set, but only the first (the 25th percentile) is displayed.
df['user_num_ratings'].quantile(q = [0.25, 0.5, 0.75]).values[0]

18.0

In [37]:
# review[df.loc[(df['user_num_ratings'] > 75) & (df['user_num_ratings'] < 276), :]]

In [38]:
# grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

# print("Performing grid search...")
# print("pipeline:", [name for name, _ in pipeline.steps])
# print("parameters:")
# pprint(parameters)
# t0 = time()
# grid_search.fit(data.data, data.target)
# print("done in %0.3fs" % (time() - t0))
# print()

# print("Best score: %0.3f" % grid_search.best_score_)
# print("Best parameters set:")
# best_parameters = grid_search.best_estimator_.get_params()
# for param_name in sorted(parameters.keys()):
#     print("\t%s: %r" % (param_name, best_parameters[param_name]))
        