# Language modeling

In [1]:
from __future__ import print_function


import random
import logging
import pandas as pd
import numpy as np
from pprint import pprint
from time import time

from collections import defaultdict

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.cross_validation import cross_val_score
from sklearn.ensemble import RandomForestClassifier

import sklearn_helpers

# Data

Read in our data frame

In [2]:
# Path to the cleaned review data, relative to the notebook's working directory.
data_path = '../data/clean_data_full.csv'
# Load the whole CSV into a single data frame used by the rest of the notebook.
df = pd.read_csv(data_path)
# Sanity check: report (rows, columns) of the loaded frame.
print("Data dims:\t", df.shape)

Data dims:	 (52751, 33)


Let's get the reviews and the users' experience levels (the `user_experience` column, which appears to be derived from the number of ratings each user has made)...

In [3]:
# Full-corpus arrays: the review text and the matching user-experience label.
reviews = df['review_blob'].values
user_experience_level = df['user_experience'].values

# Working subset: the first 5000 rows, sliced from the arrays above so that
# texts and labels stay aligned with each other.
curr_reviews, curr_user_experience_level = reviews[:5000], user_experience_level[:5000]

In [4]:
# Spot-check a single raw review from the full corpus.
print(reviews[12931])

This beer is iconic because people like me in their 30s remember it from the Uncles drinking it when you were a kid. This is not a beer for beer snobs. Certainly America has come of age with beer - microbrews and the popularity of foreign beers. But...appreciate it for what it is - an old time classic thats making a bit of a comeback with the college set. Colder is better, draft bottles over cans. I have a pub close by that has it for $3.00 a 20 oz glass.   It has a decent flavor, very drinkable. Much more character than the generic beers from Budweiser. 


# Language Model Classifier

In [5]:
from LanguageModel import Trigram_SB_LM
import LanguageModelClassifier

In [6]:
# Language-model class handed to the classifier's constructor (the class
# itself is passed, not an instance).
lm = Trigram_SB_LM
lang_model_classifier = LanguageModelClassifier.LanguageModelClassifier(lm)

In [7]:
def fit_model_cv(X, y, model, cv = 10, scoring = 'mean_squared_error'):
    """
    Cross-validate ``model`` on ``(X, y)`` and return the resulting scores.

    Thin wrapper that forwards its arguments unchanged to ``cross_val_score``.

    Parameters
    ----------
    X : the samples, passed through as ``X``.
    y : the targets, passed through as ``y``.
    model : the estimator to evaluate.
    cv : cross-validation setting forwarded as ``cv`` (default 10).
    scoring : scorer name forwarded as ``scoring``
        (default 'mean_squared_error').

    Returns
    -------
    Whatever ``cross_val_score`` returns for these arguments.
    """
    cv_kwargs = dict(X = X, y = y, scoring = scoring, cv = cv)
    return cross_val_score(model, **cv_kwargs)

In [8]:
def set_vectorizer(ngram_range = (1,1),
                   min_df = 3,
                   sample_size = 10000,
                   stop_words = None,
                   lowercase = True,
                   binary = False,
                   max_features = None,
                   features_prop = None):
    """
    Build a word-level ``CountVectorizer`` with the given settings.

    Parameters
    ----------
    ngram_range, min_df, stop_words, lowercase, binary :
        Forwarded unchanged to ``CountVectorizer``.
    sample_size : int
        Number of samples the vectorizer is sized for; only used together
        with ``features_prop`` to derive a vocabulary cap.
    max_features : int or None
        Explicit vocabulary cap. When given, it always takes precedence.
    features_prop : float or None
        When ``max_features`` is None and this is given, the cap is
        ``int(sample_size * features_prop)``. When both are None, no cap is
        applied (``max_features=None`` is forwarded).

    Returns
    -------
    An unfitted ``CountVectorizer``.
    """
    # The original body read `max_features` and `features_prop` without ever
    # defining them, so every call raised; both are now explicit parameters.
    if max_features is None and features_prop is not None:
        max_features = int(sample_size * features_prop)

    return CountVectorizer(analyzer='word',
                           ngram_range = ngram_range,
                           min_df = min_df,
                           max_features = max_features,
                           stop_words = stop_words,
                           binary = binary,
                           lowercase = lowercase)

In [22]:
## Wall-clock timings observed on earlier runs of this cell:
## CV 2 with 10K obs ~98sec
## CV 10 with 15K obs ~248sec
## CV 10 with 25K obs ~407sec
# 10-fold cross-validation of the language-model classifier on the current
# review subset, scored with 'f1_micro'.
# NOTE: despite its name, `preds` receives the value returned by
# fit_model_cv (cross-validation scores), not model predictions.
t0 = time()
preds = fit_model_cv(curr_reviews, curr_user_experience_level, lang_model_classifier, cv = 10, scoring = 'f1_micro')
print("runtime:\t", time() - t0)

runtime:	 84.6095929146


In [23]:
# Show the cross-validation scores, then their mean and variance.
print(preds)
print(np.mean(preds))
print(np.var(preds))

[ 0.36526946  0.39121756  0.45908184  0.416       0.498       0.516       0.33
  0.36673347  0.4488978   0.42685371]
0.421805383222
0.00324540357479


# Prepare data for NB and Random Forest

In [9]:
# Token pattern: any run of one or more word characters between word
# boundaries, so single-character tokens are kept as well.
VECTORIZER_TOKEN_PATTERN = u"(?u)\\b\\w+\\b"

# Unigram bag-of-words vectorizer with no document-frequency pruning
# (min_df=1, max_df=1.0). Line continuations are implicit inside the
# parentheses, so no trailing backslashes are needed.
ngram_vectorizer = CountVectorizer(
    analyzer="word",
    token_pattern=VECTORIZER_TOKEN_PATTERN,
    lowercase=True,
    ngram_range=(1,1),
    min_df=1,
    max_df=1.0,
)

# Labels and count matrix for the current review subset.
y = curr_user_experience_level
X = ngram_vectorizer.fit_transform(curr_reviews)

In [10]:
# Build the data set for the scikit-learn models from the full data frame
# `df` (not the 5000-row subset), passing the vectorizer and the
# "user_experience" aspect. The result is later indexed with 'X' and 'y'.
# NOTE(review): ngram_vectorizer has already been fit on curr_reviews;
# whether build_data_set re-fits it is not visible here — confirm.
built_data_set = sklearn_helpers.build_data_set(df, vectorizer = ngram_vectorizer, aspect_str = "user_experience")

# Naive Bayes

In [19]:
# Multinomial Naive Bayes classifier with its default constructor arguments.
Multinomial_NB = MultinomialNB()

In [20]:
# 20-fold cross-validation of Naive Bayes on the built data set, scored with
# 'f1_micro'. This overwrites the `preds` scores from the language-model run.
preds = fit_model_cv(built_data_set['X'], built_data_set['y'], Multinomial_NB, cv = 20, scoring = 'f1_micro')

In [21]:
# Mean and variance of the Naive Bayes cross-validation scores.
print(np.mean(preds))
print(np.var(preds))

0.348710925192
0.000444556067403


# Baseline

In [28]:
# Baseline model, cross-validated with the same data, fold count and scorer
# as the language-model classifier so the scores are directly comparable.
baseline = LanguageModelClassifier.BaselineLanguageModel()
preds = fit_model_cv(curr_reviews, curr_user_experience_level, baseline, cv = 10, scoring = 'f1_micro')

In [29]:
# Display the baseline cross-validation scores.
preds

array([ 0.21956088,  0.21956088,  0.21956088,  0.218     ,  0.218     ,
        0.218     ,  0.218     ,  0.21843687,  0.21843687,  0.21843687])

In [14]:
# Fit the baseline directly on the current subset; as the last expression in
# the cell, whatever fit() returns is displayed as the cell output.
baseline.fit(curr_reviews, curr_user_experience_level)

['Q2', 'Q2', 'Q2', 'Q2', 'Q2', 'Q2', 'Q2', 'Q2', 'Q2']

# Random Forest

# Save content

In [45]:
# Compute the 0.25 / 0.5 / 0.75 quantiles of `user_num_ratings` and display
# only the first one (the 0.25 quantile).
df['user_num_ratings'].quantile(q = [0.25, 0.5, 0.75]).values[0]

18.0

In [37]:
# review[df.loc[(df['user_num_ratings'] > 75) & (df['user_num_ratings'] < 276), :]]

In [38]:
# grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

# print("Performing grid search...")
# print("pipeline:", [name for name, _ in pipeline.steps])
# print("parameters:")
# pprint(parameters)
# t0 = time()
# grid_search.fit(data.data, data.target)
# print("done in %0.3fs" % (time() - t0))
# print()

# print("Best score: %0.3f" % grid_search.best_score_)
# print("Best parameters set:")
# best_parameters = grid_search.best_estimator_.get_params()
# for param_name in sorted(parameters.keys()):
#     print("\t%s: %r" % (param_name, best_parameters[param_name]))
        