# Language modeling

In [1]:
from __future__ import print_function

import random
import logging
import pandas as pd
import numpy as np
from pprint import pprint
from time import time

from collections import defaultdict

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline

# Data

In [2]:
data_path = '../data/clean_data_full.csv'
df = pd.read_csv(data_path)

In [3]:
#df.info()

In [4]:
reviews = df['review_blob'].values

# LM Classifier

In [5]:
import LanguageModel

## Q1 ( < 18)

In [7]:
quant1_df = df[df.user_num_ratings < 18]
quant1_reviews = quant1_df['review_blob'].values
print("num reviews:\t", len(quant1_reviews))
## Initialize
q1Un_lm = LanguageModel.UnigramLM_Laplace()
q1Bi_lm = LanguageModel.BigramLM_Laplace()
q1Tr_lm = LanguageModel.TrigramLM_Laplace()
q1SB_lm = LanguageModel.Trigram_SB_LM()
## Train
# q1Un_lm.train(quant1_reviews)
# q1Bi_lm.train(quant1_reviews)
# q1Tr_lm.train(quant1_reviews)
q1SB_lm.train(quant1_reviews)

num reviews:	 12822


## Q2 ( < 72)

In [8]:
quant2_df = df[(df.user_num_ratings < 72) & (df.user_num_ratings >= 18)]
quant2_reviews = quant2_df['review_blob'].values
print("num reviews:\t", len(quant2_reviews))
## Initialize
q2Un_lm = LanguageModel.UnigramLM_Laplace()
q2Bi_lm = LanguageModel.BigramLM_Laplace()
q2Tr_lm = LanguageModel.TrigramLM_Laplace()
q2SB_lm = LanguageModel.Trigram_SB_LM()
## Train
# q2Un_lm.train(quant2_reviews)
# q2Bi_lm.train(quant2_reviews)
# q2Tr_lm.train(quant2_reviews)
q2SB_lm.train(quant2_reviews)

num reviews:	 13384


## Q3 ( < 278)

In [11]:
quant3_df = df[(df.user_num_ratings < 278) & (df.user_num_ratings >= 72)]
quant3_reviews = quant3_df['review_blob'].values
print("num reviews:\t", len(quant3_reviews))
## Initialize
q3Un_lm = LanguageModel.UnigramLM_Laplace()
q3Bi_lm = LanguageModel.BigramLM_Laplace()
q3Tr_lm = LanguageModel.TrigramLM_Laplace()
q3SB_lm = LanguageModel.Trigram_SB_LM()
## Train
# q3Un_lm.train(quant3_reviews)
# q3Bi_lm.train(quant3_reviews)
# q3Tr_lm.train(quant3_reviews)
q3SB_lm.train(quant3_reviews)

num reviews:	 13330


## Q4 ( >= 278)

In [12]:
quant4_df = df[df.user_num_ratings >= 278]
quant4_reviews = quant4_df['review_blob'].values
print("num reviews:\t", len(quant4_reviews))
## Initialize
q4Un_lm = LanguageModel.UnigramLM_Laplace()
q4Bi_lm = LanguageModel.BigramLM_Laplace()
q4Tr_lm = LanguageModel.TrigramLM_Laplace()
q4SB_lm = LanguageModel.Trigram_SB_LM()
## Train
# q4Un_lm.train(quant4_reviews)
# q4Bi_lm.train(quant4_reviews)
# q4Tr_lm.train(quant4_reviews)
q4SB_lm.train(quant4_reviews)

num reviews:	 13242


## Within class testing

### Class 1

In [11]:
## Q1
for review in random.sample(quant1_reviews, 3):
    print("review:\t", review)
    print("-------------------")
    print("Unigram", q1Un_lm.score(review))
    print("Bigram", q1Bi_lm.score(review))
    print("Trigram", q1Tr_lm.score(review))
    print("Trigram_SB", q1SB_lm.score(review))

review:	 Burnt orange color, good carbonation when pouring, unfortunately doesnt last. Almost no head, past due date? Rather dull in the mouth, alcohol comes out. Expected pucker inducing tartness not present. A bit watery. I would like to taste a fresh one, it might influence my rating, maybe not by much, though. 
-------------------
score:	-360.338922668
Unigram [896.8132061541502, -360.33892266817639]
score:	-407.557123312
Bigram [2185.8434330931086, -407.55712331163244]
score:	-462.138571093
Trigram [6121.703466699875, -462.13857109341387]
score:	-77.228189252
Trigram_SB [4.546138633279932, -77.228189252001158]
review:	 Incredible beer. Very crisp and front heavy on the taste. The notes are of fruit and spice.  
-------------------
score:	-104.42966699
Unigram [243.78782141840088, -104.42966699005711]
score:	-125.069396794
Bigram [722.4150471237413, -125.06939679351534]
score:	-147.598775408
Trigram [2364.582140223316, -147.59877540798789]
score:	-35.2765013701
Trigram_SB [7.965249

### Class 2

In [12]:
## Q2
for review in random.sample(quant2_reviews, 3):
    print("review:\t", review)
    print("-------------------")
    print("Unigram", q2Un_lm.score(review))
    print("Bigram", q2Bi_lm.score(review))
    print("Trigram", q2Tr_lm.score(review))
    print("Trigram_SB", q2SB_lm.score(review))

review:	 Tap at Lincoln Ave; nice balance between the bitterness and malt; they use Galena and Williamette hops in this one; pours black with tones of red; tasty. (#28) 
-------------------
score:	-203.392217816
Unigram [879.840458498596, -203.39221781625747]
score:	-212.279254854
Bigram [1183.1967466997812, -212.27925485445624]
score:	-241.446208202
Trigram [3128.1809238020646, -241.44620820248107]
score:	-46.8816931283
Trigram_SB [5.335305695724755, -46.881693128319291]
review:	 This beer pours a light hazy color with little head. I has a pleasent aroma good mix of hppy and citrus. Good IPA

 
-------------------
score:	-139.717269455
Unigram [267.38532376202176, -139.71726945507066]
score:	-154.074854758
Bigram [474.8477328869545, -154.07485475750741]
score:	-188.680641592
Trigram [1895.4767399694558, -188.68064159241067]
score:	-46.8352034203
Trigram_SB [7.662307604128955, -46.835203420310663]
review:	 Copper amber appearance, toast, citrus, vanilla, nice lasting head with good lac

### Class 3

In [14]:
## Q3
for review in random.sample(quant3_reviews, 3):
    print("review:\t", review)
    print("-------------------")
    print("Unigram", q3Un_lm.score(review))
    print("Bigram", q3Bi_lm.score(review))
    print("Trigram", q3Tr_lm.score(review))
    print("Trigram_SB", q3SB_lm.score(review))

review:	 Pours amber minum head. Bitter hops but a nice grain to coincide the hops.  
-------------------
score:	-95.1394093643
Unigram [382.30284606934055, -95.139409364342626]
score:	-104.167783178
Bigram [672.1533061193065, -104.16778317777012]
score:	-121.337297819
Trigram [1965.6556125409159, -121.33729781944346]


AttributeError: Trigram_SB_LM instance has no attribute 'ngram_dict'

### Class 4

In [41]:
## Q4
for review in random.sample(quant4_reviews, 3):
    print("review:\t", review)
    print("-------------------")
    print("Unigram", q4Un_lm.score(review))
    print("Bigram", q4Bi_lm.score(review))
    print("Trigram", q4Tr_lm.score(review))

review:	 750 ml into a tulip. Nose is pine and citrus. Bitter and juicy. Taste is much better with the pine and citrus mingling with some bubble gum and tropical fruit in the finish, Really tasty DIPA. 
-------------------
Unigram [327.0200125615176, -220.02081204170366]
Bigram [415.2046624384755, -229.09331932439875]
Trigram [2091.1170121872688, -290.52723899565251]
review:	 Bottle: Poured alight reddish color ale with a medium size foamy head with OK retention and light lacing. Aroma of light cranberry notes with some tart notes. Taste is a mix of light lactic notes with some tart notes and light cranberry notes. Body is about average with good carbonation. Good refreshing example of the style though it might be lacking a little bit of character.  
-------------------
Unigram [286.61055543472975, -384.75245543041262]
Bigram [433.73068920371827, -412.92481905974978]
Trigram [1899.6350761438555, -513.3603615433741]
review:	 I like the low alcohol on this IIPA. It had a nice hop combo w

## Between group testing


In [13]:
review = quant1_reviews[1]
print(review)
print("q1 review predicted by q1 model", q1SB_lm.score(review))
print("q1 review predicted by q2 model", q2SB_lm.score(review))
print("q1 review predicted by q3 model", q3SB_lm.score(review))
print("q1 review predicted by q4 model", q4SB_lm.score(review))

Best example of a harvest ale that I have tried. Drink this beer as soon as possible. Do not store it for any amount of time. This is a fresh beer So DRINK IT! Its so fresh its almost oily. You can almost taste the freshy released alpha acid. MMM Citrus, grapefruit, hints of lime. Great beer again. Long live founders 
score:	-117.322648372
q1 review predicted by q1 model [6.843656086860421, -117.32264837173894]
score:	-405.469664273
q1 review predicted by q2 model [770.5030988347435, -405.46966427267307]
score:	-390.476756866
q1 review predicted by q3 model [602.6028217261323, -390.47675686552611]
score:	-421.464296715
q1 review predicted by q4 model [1001.4966056427281, -421.46429671543279]


In [14]:
review = quant2_reviews[1]
print(review)
print("q2 review predicted by q1 model", q1SB_lm.score(review))
print("q2 review predicted by q2 model", q2SB_lm.score(review))
print("q2 review predicted by q3 model", q3SB_lm.score(review))
print("q2 review predicted by q4 model", q4SB_lm.score(review))

Poured a very nice color.  Probably the best brown ale Ive ever tried.       
score:	-46.9220641455
q2 review predicted by q1 model [36.943493761380886, -46.92206414550126]
score:	-28.160219934
q2 review predicted by q2 model [8.724810635208474, -28.160219933975057]
score:	-53.1543896991
q2 review predicted by q3 model [59.668199550072885, -53.15438969913783]
score:	-47.6122178154
q2 review predicted by q4 model [38.95777212506173, -47.612217815382664]


In [15]:
review = quant3_reviews[1]
print(review)
print("q3 review predicted by q1 model", q1SB_lm.score(review))
print("q3 review predicted by q2 model", q2SB_lm.score(review))
print("q3 review predicted by q3 model", q3SB_lm.score(review))
print("q3 review predicted by q4 model", q4SB_lm.score(review))

12 pk from woodmans in Appleton. Great Winter beer, now a keeper. Heavy on the malt, coffee and cocoa. Right amount of alcohol to balance it all out!  
score:	-151.411976122
q3 review predicted by q1 model [223.0889511039937, -151.41197612244218]
score:	-153.687063862
q3 review predicted by q2 model [241.9724095094265, -153.68706386223101]
score:	-41.0764126534
q3 review predicted by q3 model [4.336270893488051, -41.076412653395636]
score:	-166.232394228
q3 review predicted by q4 model [378.7480553574962, -166.23239422834104]


In [16]:
review = quant4_reviews[1]
print(review)
print("q4 review predicted by q1 model", q1SB_lm.score(review))
print("q4 review predicted by q2 model", q2SB_lm.score(review))
print("q4 review predicted by q3 model", q3SB_lm.score(review))
print("q4 review predicted by q4 model", q4SB_lm.score(review))

Mahogany pour with a light beige head that lasts for days. Lots of cooked fruit and spice on the nose Theres also a bready character in there. A little warm on the palate but full of spice and bread notes with some cooked fruit. Solid but not especially exciting. 
score:	-250.301199285
q4 review predicted by q1 model [165.37038909145454, -250.30119928515015]
score:	-247.272487022
q4 review predicted by q2 model [155.45826233325062, -247.27248702176936]
score:	-213.269351095
q4 review predicted by q3 model [77.66741007423927, -213.26935109453206]
score:	-112.162344609
q4 review predicted by q4 model [9.865338390306665, -112.16234460877533]


## Over 1000 reviews

In [32]:
over_10K_df = df[df.user_num_ratings >= 1000]
over_10K_reviews = over_10K_df['review_blob'].values

In [42]:
import LanguageModel
under30_unigram = LanguageModel.UnigramLM_Laplace()
under30_unigram.train(under_30_reviews)
under30_bigram = LanguageModel.BigramLM_Laplace()
under30_bigram.train(under_30_reviews)

In [45]:
over10K_unigram = LanguageModel.UnigramLM_Laplace()
over10K_unigram.train(over_10K_reviews)
over10K_bigram = LanguageModel.BigramLM_Laplace()
over10K_bigram.train(over_10K_reviews)

In [46]:
print('under30_unigram.score(under_30_reviews[1])', under30_unigram.score(under_30_reviews[1]))
print('over10K_unigram.score(under_30_reviews[1])', over10K_unigram.score(under_30_reviews[1]))
print('under30_unigram.score(over_10K_reviews[1])', under30_unigram.score(over_10K_reviews[1]))
print('over10K_unigram.score(over_10K_reviews[1])', over10K_unigram.score(over_10K_reviews[1]))
print('---------------------------')
print('under30_unigram.score(under_30_reviews[1])', under30_bigram.score(under_30_reviews[1]))
print('over10K_unigram.score(under_30_reviews[1])', over10K_bigram.score(under_30_reviews[1]))
print('under30_unigram.score(over_10K_reviews[1])', under30_bigram.score(over_10K_reviews[1]))
print('over10K_unigram.score(over_10K_reviews[1])', over10K_bigram.score(over_10K_reviews[1]))


under30_unigram.score(under_30_reviews[1]) [556.3317705894434, -398.24598403695342]
over10K_unigram.score(under_30_reviews[1]) [823.2693486058333, -422.93685569247492]
under30_unigram.score(over_10K_reviews[1]) [699.9730412407429, -334.10313291121628]
over10K_unigram.score(over_10K_reviews[1]) [446.36997891295346, -311.15855602059315]
---------------------------
under30_unigram.score(under_30_reviews[1]) [1291.3775138728486, -451.29828037337978]
over10K_unigram.score(under_30_reviews[1]) [1780.3652062085273, -471.52814903891499]
under30_unigram.score(over_10K_reviews[1]) [2615.482402077266, -401.32939555783497]
over10K_unigram.score(over_10K_reviews[1]) [685.1573800543237, -333.01207675264845]


# Testing

In [14]:
import LanguageModel

In [15]:
unigram_lm = LanguageModel.UnigramLM_Laplace()
unigram_lm.train(reviews_shortened)

In [16]:
bigram_lm = LanguageModel.BigramLM_Laplace()
bigram_lm.train(reviews_shortened)

In [17]:
trigram_lm = LanguageModel.TrigramLM_Laplace()
trigram_lm.train(reviews_shortened)

In [11]:
for ide in [100, 1000, 1100, 1200, 3]:
    print("Review\n--------------")
    print(reviews[ide])
    print('unigram', unigram_lm.score(reviews[ide]))
    print('bigram', bigram_lm.score(reviews[ide]))
    print('trigram', trigram_lm.score(reviews[ide]))

Review
--------------
Pours a dark amber with aroma and taste of malt and hops.  Smooth, slightly dry finish, closest thing I have had to the style over in the states.  Frequently see Ron and Bill drinking it. 
unigram [350.0144629333482, -216.74505562296719]
bigram [497.716784288449, -229.77115473306742]
trigram [2172.734909871731, -284.29845326439482]
Review
--------------
Bottle. Pours a viscous black body producing a small tan head. Malts, roast and coffee on the nose. Hints of liquorice and chocolate as well. Taste is well balanced, light sweet, roasty and rounded off with a moderate bitterness. Lingering coffee flavors. Its full bodied with a thick texture and avg to soft carbonation. Quite nice.  
unigram [285.42409585411696, -327.93061553068918]
bigram [472.71173244317924, -357.19217393806861]
trigram [2575.4293697770445, -455.51874959502595]
Review
--------------
Bottle from Belmont Station. Pours a lightly hazy amber with off-white foam. Smells sweet, mild of barley and caram

In [16]:
print(reviews[1])
unigram_lm.score(reviews[1])

Best example of a harvest ale that I have tried. Drink this beer as soon as possible. Do not store it for any amount of time. This is a fresh beer So DRINK IT! Its so fresh its almost oily. You can almost taste the freshy released alpha acid. MMM Citrus, grapefruit, hints of lime. Great beer again. Long live founders 


[301.7165760199403, -359.69774946318415]

In [13]:
sent = 'hello there friend, beer and crackers and other foods too'
print(unigram_lm.score(sent))
print(bigram_lm.score(sent))
print(trigram_lm.score(sent))

[1347.6028476360461, -72.060826246468508]
[3077.2268274968133, -80.31784089698607]
[2823.536856931504, -79.457455826602427]


## Fit unigram, bigrams, trigrams

In [14]:
unigram_vocab = CountVectorizer(ngram_range=(1,3))
bigram_vocab = CountVectorizer(ngram_range=(2,2))
trigram_vocab = CountVectorizer(ngram_range=(3,3))

In [15]:
t0 = time()
unigram_vocab.fit_transform(reviews_shortened)
print("unigrams", time() - t0)

t0 = time()
bigram_vocab.fit_transform(reviews_shortened)
print("bigrams", time() - t0)

t0 = time()
trigram_vocab.fit_transform(reviews_shortened)
print("trigram", time() - t0)

unigrams 0.00477004051208
bigrams 0.00294303894043
trigram 0.00247001647949


In [16]:
# vectorizer.get_feature_names()[-100:]

In [17]:
t0 = time()
unigram_counts = unigram_vocab.transform(reviews_shortened)
print("unigram counts", time() - t0)


t0 = time()
bigram_counts = bigram_vocab.transform(reviews_shortened)
print("bigram counts", time() - t0)

t0 = time()
trigram_counts = trigram_vocab.transform(reviews_shortened)
print("trigram counts", time() - t0)

unigram counts 0.000868082046509
bigram counts 0.000509977340698
trigram counts 0.000485897064209


In [18]:
unigram_dict = defaultdict(int)
t0 = time()
for token, count in zip(unigram_vocab.get_feature_names(), np.asarray(unigram_counts.sum(axis=0)).ravel()):
    unigram_dict[token] = count
print(time() - t0)

0.00130891799927


In [19]:
zip(unigram_vocab.get_feature_names(), np.asarray(unigram_counts.sum(axis=0)).ravel())

[(u'acid', 1),
 (u'acid mmm', 1),
 (u'acid mmm citrus', 1),
 (u'again', 1),
 (u'again long', 1),
 (u'again long live', 1),
 (u'ale', 2),
 (u'ale good', 1),
 (u'ale good beer', 1),
 (u'ale that', 1),
 (u'ale that have', 1),
 (u'almost', 2),
 (u'almost oily', 1),
 (u'almost oily you', 1),
 (u'almost taste', 1),
 (u'almost taste the', 1),
 (u'alpha', 1),
 (u'alpha acid', 1),
 (u'alpha acid mmm', 1),
 (u'amount', 1),
 (u'amount of', 1),
 (u'amount of time', 1),
 (u'any', 1),
 (u'any amount', 1),
 (u'any amount of', 1),
 (u'as', 2),
 (u'as possible', 1),
 (u'as possible do', 1),
 (u'as soon', 1),
 (u'as soon as', 1),
 (u'beer', 5),
 (u'beer again', 1),
 (u'beer again long', 1),
 (u'beer as', 1),
 (u'beer as soon', 1),
 (u'beer equals', 1),
 (u'beer equals better', 1),
 (u'beer or', 1),
 (u'beer or dales', 1),
 (u'beer so', 1),
 (u'beer so drink', 1),
 (u'best', 1),
 (u'best example', 1),
 (u'best example of', 1),
 (u'better', 1),
 (u'better times', 1),
 (u'camping', 1),
 (u'camping trip', 1

In [49]:
bigram_dict = defaultdict(int)
t0 = time()
for token, count in zip(bigram_vocab.get_feature_names(), np.asarray(bigram_counts.sum(axis=0)).ravel()):
    bigram_dict[token] = count
print(time() - t0)


1.65763497353


In [36]:
len(unigram_dict.keys())

135

In [37]:
unigram_dict['beer']

11

In [39]:
reviews_shortened

array([ 'I tried this beer back in 2007. I was working at village bottle shop and my manager broke the neck off one of the 4 bottles we had on accident. The break was perfect. None of the beer spilled and it didnt get into the beer. I poured the remaining into a nalgene bottle and saved it until I got home for a more proper tasting. I like this beer because they are pushing the envelope and just the idea of what beer is and can be. No head at all completely no carbonation. Lots of molasses and caramelized sugars. Alcohol presence heavy, very very warming. It would make for a nice beer to enjoy during the heart of winter with good friends.  ',
       'Best example of a harvest ale that I have tried. Drink this beer as soon as possible. Do not store it for any amount of time. This is a fresh beer So DRINK IT! Its so fresh its almost oily. You can almost taste the freshy released alpha acid. MMM Citrus, grapefruit, hints of lime. Great beer again. Long live founders ',
       '2 finger he

In [19]:
def get_value(vocab, counts, word):
    if word not in vocab.vocabulary_:
        return None
    else:
        return counts.toarray().sum(axis=0)[vocab.vocabulary_[word]]

In [20]:
def get_leading_bigram_str(words):
    tokens = words.split()
    return (' ').join(tokens[:2])

In [21]:
def get_leading_unigram_str(words):
    tokens = words.split()
    return (' ').join(tokens[:1])

In [None]:
print(get_value(bigram_vocab, bigram_counts, get_leading_bigram_str(target)))

In [17]:
target = 'alpha acid mmm'

In [243]:
get_value(trigram_vocab, trigram_counts, 'alpha acid mmm')

1

In [214]:
get_value(bigram_cv, bigram_counts, new_target)

1

In [203]:
get_value(unigram_cv, unigram_counts, 'drink')

2

In [219]:
total_tokens = np.sum(unigram_counts.sum(axis=1))
total_tokens

85

In [223]:
vocab_size = len(unigram_cv.vocabulary_.keys())
vocab_size

65

In [204]:
zip(trigram_cv.get_feature_names(),
    np.asarray(trigram_counts.sum(axis=0)).ravel())

[(u'acid mmm citrus', 1),
 (u'again long live', 1),
 (u'ale good beer', 1),
 (u'ale that have', 1),
 (u'almost oily you', 1),
 (u'almost taste the', 1),
 (u'alpha acid mmm', 1),
 (u'amount of time', 1),
 (u'any amount of', 1),
 (u'as possible do', 1),
 (u'as soon as', 1),
 (u'beer again long', 1),
 (u'beer as soon', 1),
 (u'beer equals better', 1),
 (u'beer or dales', 1),
 (u'beer so drink', 1),
 (u'best example of', 1),
 (u'camping trip complete', 1),
 (u'can almost taste', 1),
 (u'can couldnt see', 1),
 (u'citrus grapefruit hints', 1),
 (u'comes in can', 1),
 (u'complete without this', 1),
 (u'couldnt see fishing', 1),
 (u'dales pale ale', 1),
 (u'do not store', 1),
 (u'drink it its', 1),
 (u'drink this beer', 1),
 (u'equals better times', 1),
 (u'example of harvest', 1),
 (u'finger head love', 1),
 (u'fishing or camping', 1),
 (u'for any amount', 1),
 (u'fresh beer so', 1),
 (u'fresh its almost', 1),
 (u'freshy released alpha', 1),
 (u'good beer equals', 1),
 (u'grapefruit hints of'

In [129]:
reviews_shortened

array([ 'Best example of a harvest ale that I have tried. Drink this beer as soon as possible. Do not store it for any amount of time. This is a fresh beer So DRINK IT! Its so fresh its almost oily. You can almost taste the freshy released alpha acid. MMM Citrus, grapefruit, hints of lime. Great beer again. Long live founders ',
       '2 finger head. I love that it comes in a can. I couldnt see a fishing or camping trip complete without this beer or dales pale ale. Good beer equals better times! '], dtype=object)

In [20]:
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
])

# uncommenting more parameters will give better exploring power but will
# increase processing time in a combinatorial way
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    #'vect__max_features': (None, 5000, 10000, 50000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    #'tfidf__use_idf': (True, False),
    #'tfidf__norm': ('l1', 'l2'),
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
    #'clf__n_iter': (10, 50, 80),
}

In [21]:
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
t0 = time()
grid_search.fit(data.data, data.target)
print("done in %0.3fs" % (time() - t0))
print()

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))
        

Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'clf__alpha': (1e-05, 1e-06),
 'clf__penalty': ('l2', 'elasticnet'),
 'vect__max_df': (0.5, 0.75, 1.0),
 'vect__ngram_range': ((1, 1), (1, 2))}
Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done  50 jobs       | elapsed:   17.2s
[Parallel(n_jobs=-1)]: Done  66 out of  72 | elapsed:   24.6s remaining:    2.2s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:   26.3s finished


done in 27.834s

Best score: 0.936
Best parameters set:
	clf__alpha: 1e-05
	clf__penalty: 'l2'
	vect__max_df: 0.5
	vect__ngram_range: (1, 2)
