# Language modeling

In [1]:
from __future__ import print_function

from pprint import pprint
from time import time
import logging

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline

from collections import defaultdict
import pandas as pd
import numpy as np

# Data

In [2]:
data_path = '../data/clean_data_full.csv'
df = pd.read_csv(data_path)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 52778 entries, 0 to 52777
Data columns (total 31 columns):
Unnamed: 0                  52778 non-null int64
user_num_places_rated       52778 non-null int64
user_num_following          52778 non-null int64
user_url                    52778 non-null object
beer_num_ratings            52778 non-null int64
user_num_friends            52778 non-null int64
user_num_breweries_rated    52778 non-null int64
review_palate_score         52778 non-null int64
review_taste_score          52778 non-null int64
user_num_ratings            52778 non-null int64
review_ratings_blob         52778 non-null object
review_aroma_score          52778 non-null int64
user_num_countries_rated    52778 non-null int64
user_id                     52778 non-null int64
review_avg_score            52778 non-null float64
beer_global_style_score     48129 non-null float64
beer_weighted_avg_score     52705 non-null float64
beer_brewer_name            52778 non-null object


In [4]:
reviews = df['review_blob'].values

In [5]:
num_reviews = df['user_num_ratings'].values

## Under 30 reviews

In [31]:
under_30_df = df[df.user_num_ratings < 30]
under_30_reviews = under_30_df['review_blob'].values

## Over 1000 reviews

In [32]:
over_10K_df = df[df.user_num_ratings >= 1000]
over_10K_reviews = over_10K_df['review_blob'].values

In [42]:
import LanguageModel
under30_unigram = LanguageModel.UnigramLM_Laplace()
under30_unigram.train(under_30_reviews)
under30_bigram = LanguageModel.BigramLM_Laplace()
under30_bigram.train(under_30_reviews)

In [45]:
over10K_unigram = LanguageModel.UnigramLM_Laplace()
over10K_unigram.train(over_10K_reviews)
over10K_bigram = LanguageModel.BigramLM_Laplace()
over10K_bigram.train(over_10K_reviews)

In [44]:
print('under30_unigram.score(under_30_reviews[1])', under30_unigram.score(under_30_reviews[1]))
print('over10K_unigram.score(under_30_reviews[1])', over10K_unigram.score(under_30_reviews[1]))
print('under30_unigram.score(over_10K_reviews[1])', under30_unigram.score(over_10K_reviews[1]))
print('over10K_unigram.score(over_10K_reviews[1])', over10K_unigram.score(over_10K_reviews[1]))
print('---------------------------')
print('under30_unigram.score(under_30_reviews[1])', under30_bigram.score(under_30_reviews[1]))
print('over10K_unigram.score(under_30_reviews[1])', over10K_bigram.score(under_30_reviews[1]))
print('under30_unigram.score(over_10K_reviews[1])', under30_bigram.score(over_10K_reviews[1]))
print('over10K_unigram.score(over_10K_reviews[1])', over10K_bigram.score(over_10K_reviews[1]))


under30_unigram.score(under_30_reviews[1]) [556.3317705894434, -398.24598403695342]
over10K_unigram.score(under_30_reviews[1]) [823.2693486058333, -422.93685569247492]
under30_unigram.score(over_10K_reviews[1]) [699.9730412407429, -334.10313291121628]
over10K_unigram.score(over_10K_reviews[1]) [446.36997891295346, -311.15855602059315]
---------------------------
under30_unigram.score(under_30_reviews[1]) [1291.3775138728486, -451.29828037337978]
over10K_unigram.score(under_30_reviews[1]) [1291.3775138728486, -451.29828037337978]
under30_unigram.score(over_10K_reviews[1]) [2615.482402077266, -401.32939555783497]
over10K_unigram.score(over_10K_reviews[1]) [2615.482402077266, -401.32939555783497]


# Testing

In [14]:
import LanguageModel

In [15]:
unigram_lm = LanguageModel.UnigramLM_Laplace()
unigram_lm.train(reviews_shortened)

In [16]:
bigram_lm = LanguageModel.BigramLM_Laplace()
bigram_lm.train(reviews_shortened)

In [17]:
trigram_lm = LanguageModel.TrigramLM_Laplace()
trigram_lm.train(reviews_shortened)

In [11]:
for ide in [100, 1000, 1100, 1200, 3]:
    print("Review\n--------------")
    print(reviews[ide])
    print('unigram', unigram_lm.score(reviews[ide]))
    print('bigram', bigram_lm.score(reviews[ide]))
    print('trigram', trigram_lm.score(reviews[ide]))

Review
--------------
Pours a dark amber with aroma and taste of malt and hops.  Smooth, slightly dry finish, closest thing I have had to the style over in the states.  Frequently see Ron and Bill drinking it. 
unigram [350.0144629333482, -216.74505562296719]
bigram [497.716784288449, -229.77115473306742]
trigram [2172.734909871731, -284.29845326439482]
Review
--------------
Bottle. Pours a viscous black body producing a small tan head. Malts, roast and coffee on the nose. Hints of liquorice and chocolate as well. Taste is well balanced, light sweet, roasty and rounded off with a moderate bitterness. Lingering coffee flavors. Its full bodied with a thick texture and avg to soft carbonation. Quite nice.  
unigram [285.42409585411696, -327.93061553068918]
bigram [472.71173244317924, -357.19217393806861]
trigram [2575.4293697770445, -455.51874959502595]
Review
--------------
Bottle from Belmont Station. Pours a lightly hazy amber with off-white foam. Smells sweet, mild of barley and caram

In [16]:
print(reviews[1])
unigram_lm.score(reviews[1])

Best example of a harvest ale that I have tried. Drink this beer as soon as possible. Do not store it for any amount of time. This is a fresh beer So DRINK IT! Its so fresh its almost oily. You can almost taste the freshy released alpha acid. MMM Citrus, grapefruit, hints of lime. Great beer again. Long live founders 


[301.7165760199403, -359.69774946318415]

In [13]:
sent = 'hello there friend, beer and crackers and other foods too'
print(unigram_lm.score(sent))
print(bigram_lm.score(sent))
print(trigram_lm.score(sent))

[1347.6028476360461, -72.060826246468508]
[3077.2268274968133, -80.31784089698607]
[2823.536856931504, -79.457455826602427]


## Fit unigram, bigrams, trigrams

In [14]:
unigram_vocab = CountVectorizer(ngram_range=(1,3))
bigram_vocab = CountVectorizer(ngram_range=(2,2))
trigram_vocab = CountVectorizer(ngram_range=(3,3))

In [15]:
t0 = time()
unigram_vocab.fit_transform(reviews_shortened)
print("unigrams", time() - t0)

t0 = time()
bigram_vocab.fit_transform(reviews_shortened)
print("bigrams", time() - t0)

t0 = time()
trigram_vocab.fit_transform(reviews_shortened)
print("trigram", time() - t0)

unigrams 0.00477004051208
bigrams 0.00294303894043
trigram 0.00247001647949


In [16]:
# vectorizer.get_feature_names()[-100:]

In [17]:
t0 = time()
unigram_counts = unigram_vocab.transform(reviews_shortened)
print("unigram counts", time() - t0)


t0 = time()
bigram_counts = bigram_vocab.transform(reviews_shortened)
print("bigram counts", time() - t0)

t0 = time()
trigram_counts = trigram_vocab.transform(reviews_shortened)
print("trigram counts", time() - t0)

unigram counts 0.000868082046509
bigram counts 0.000509977340698
trigram counts 0.000485897064209


In [18]:
unigram_dict = defaultdict(int)
t0 = time()
for token, count in zip(unigram_vocab.get_feature_names(), np.asarray(unigram_counts.sum(axis=0)).ravel()):
    unigram_dict[token] = count
print(time() - t0)

0.00130891799927


In [19]:
zip(unigram_vocab.get_feature_names(), np.asarray(unigram_counts.sum(axis=0)).ravel())

[(u'acid', 1),
 (u'acid mmm', 1),
 (u'acid mmm citrus', 1),
 (u'again', 1),
 (u'again long', 1),
 (u'again long live', 1),
 (u'ale', 2),
 (u'ale good', 1),
 (u'ale good beer', 1),
 (u'ale that', 1),
 (u'ale that have', 1),
 (u'almost', 2),
 (u'almost oily', 1),
 (u'almost oily you', 1),
 (u'almost taste', 1),
 (u'almost taste the', 1),
 (u'alpha', 1),
 (u'alpha acid', 1),
 (u'alpha acid mmm', 1),
 (u'amount', 1),
 (u'amount of', 1),
 (u'amount of time', 1),
 (u'any', 1),
 (u'any amount', 1),
 (u'any amount of', 1),
 (u'as', 2),
 (u'as possible', 1),
 (u'as possible do', 1),
 (u'as soon', 1),
 (u'as soon as', 1),
 (u'beer', 5),
 (u'beer again', 1),
 (u'beer again long', 1),
 (u'beer as', 1),
 (u'beer as soon', 1),
 (u'beer equals', 1),
 (u'beer equals better', 1),
 (u'beer or', 1),
 (u'beer or dales', 1),
 (u'beer so', 1),
 (u'beer so drink', 1),
 (u'best', 1),
 (u'best example', 1),
 (u'best example of', 1),
 (u'better', 1),
 (u'better times', 1),
 (u'camping', 1),
 (u'camping trip', 1

In [49]:
bigram_dict = defaultdict(int)
t0 = time()
for token, count in zip(bigram_vocab.get_feature_names(), np.asarray(bigram_counts.sum(axis=0)).ravel()):
    bigram_dict[token] = count
print(time() - t0)


1.65763497353


In [36]:
len(unigram_dict.keys())

135

In [37]:
unigram_dict['beer']

11

In [39]:
reviews_shortened

array([ 'I tried this beer back in 2007. I was working at village bottle shop and my manager broke the neck off one of the 4 bottles we had on accident. The break was perfect. None of the beer spilled and it didnt get into the beer. I poured the remaining into a nalgene bottle and saved it until I got home for a more proper tasting. I like this beer because they are pushing the envelope and just the idea of what beer is and can be. No head at all completely no carbonation. Lots of molasses and caramelized sugars. Alcohol presence heavy, very very warming. It would make for a nice beer to enjoy during the heart of winter with good friends.  ',
       'Best example of a harvest ale that I have tried. Drink this beer as soon as possible. Do not store it for any amount of time. This is a fresh beer So DRINK IT! Its so fresh its almost oily. You can almost taste the freshy released alpha acid. MMM Citrus, grapefruit, hints of lime. Great beer again. Long live founders ',
       '2 finger he

In [19]:
def get_value(vocab, counts, word):
    if word not in vocab.vocabulary_:
        return None
    else:
        return counts.toarray().sum(axis=0)[vocab.vocabulary_[word]]

In [20]:
def get_leading_bigram_str(words):
    tokens = words.split()
    return (' ').join(tokens[:2])

In [21]:
def get_leading_unigram_str(words):
    tokens = words.split()
    return (' ').join(tokens[:1])

In [None]:
print(get_value(bigram_vocab, bigram_counts, get_leading_bigram_str(target)))

In [17]:
target = 'alpha acid mmm'

In [243]:
get_value(trigram_vocab, trigram_counts, 'alpha acid mmm')

1

In [214]:
get_value(bigram_cv, bigram_counts, new_target)

1

In [203]:
get_value(unigram_cv, unigram_counts, 'drink')

2

In [219]:
total_tokens = np.sum(unigram_counts.sum(axis=1))
total_tokens

85

In [223]:
vocab_size = len(unigram_cv.vocabulary_.keys())
vocab_size

65

In [204]:
zip(trigram_cv.get_feature_names(),
    np.asarray(trigram_counts.sum(axis=0)).ravel())

[(u'acid mmm citrus', 1),
 (u'again long live', 1),
 (u'ale good beer', 1),
 (u'ale that have', 1),
 (u'almost oily you', 1),
 (u'almost taste the', 1),
 (u'alpha acid mmm', 1),
 (u'amount of time', 1),
 (u'any amount of', 1),
 (u'as possible do', 1),
 (u'as soon as', 1),
 (u'beer again long', 1),
 (u'beer as soon', 1),
 (u'beer equals better', 1),
 (u'beer or dales', 1),
 (u'beer so drink', 1),
 (u'best example of', 1),
 (u'camping trip complete', 1),
 (u'can almost taste', 1),
 (u'can couldnt see', 1),
 (u'citrus grapefruit hints', 1),
 (u'comes in can', 1),
 (u'complete without this', 1),
 (u'couldnt see fishing', 1),
 (u'dales pale ale', 1),
 (u'do not store', 1),
 (u'drink it its', 1),
 (u'drink this beer', 1),
 (u'equals better times', 1),
 (u'example of harvest', 1),
 (u'finger head love', 1),
 (u'fishing or camping', 1),
 (u'for any amount', 1),
 (u'fresh beer so', 1),
 (u'fresh its almost', 1),
 (u'freshy released alpha', 1),
 (u'good beer equals', 1),
 (u'grapefruit hints of'

In [129]:
reviews_shortened

array([ 'Best example of a harvest ale that I have tried. Drink this beer as soon as possible. Do not store it for any amount of time. This is a fresh beer So DRINK IT! Its so fresh its almost oily. You can almost taste the freshy released alpha acid. MMM Citrus, grapefruit, hints of lime. Great beer again. Long live founders ',
       '2 finger head. I love that it comes in a can. I couldnt see a fishing or camping trip complete without this beer or dales pale ale. Good beer equals better times! '], dtype=object)

In [20]:
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
])

# uncommenting more parameters will give better exploring power but will
# increase processing time in a combinatorial way
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    #'vect__max_features': (None, 5000, 10000, 50000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    #'tfidf__use_idf': (True, False),
    #'tfidf__norm': ('l1', 'l2'),
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
    #'clf__n_iter': (10, 50, 80),
}

In [21]:
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
t0 = time()
grid_search.fit(data.data, data.target)
print("done in %0.3fs" % (time() - t0))
print()

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))
        

Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'clf__alpha': (1e-05, 1e-06),
 'clf__penalty': ('l2', 'elasticnet'),
 'vect__max_df': (0.5, 0.75, 1.0),
 'vect__ngram_range': ((1, 1), (1, 2))}
Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done  50 jobs       | elapsed:   17.2s
[Parallel(n_jobs=-1)]: Done  66 out of  72 | elapsed:   24.6s remaining:    2.2s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:   26.3s finished


done in 27.834s

Best score: 0.936
Best parameters set:
	clf__alpha: 1e-05
	clf__penalty: 'l2'
	vect__max_df: 0.5
	vect__ngram_range: (1, 2)
