In [1]:
import pandas as pd

In [30]:
test = pd.read_csv('data/test.csv')
trainval = pd.read_csv('data/train.csv')

In [31]:
print(trainval.shape)
trainval.head()

(2874, 7)


Unnamed: 0,id,author,description,price,ratingValue,pert_alcohol,category
0,1,John Hansell,A marriage of 13 and 18 year old bourbons. A m...,85.0,97,51.5,2.0
1,2,Dave Broom,There have been some legendary Bowmores from t...,13500.0,97,42.9,1.0
2,3,John Hansell,This bottling celebrates master distiller Park...,150.0,97,50.0,2.0
3,4,John Hansell,What impresses me most is how this whisky evol...,4500.0,97,40.5,1.0
4,6,Davin de Kergommeaux,"After 40 years in barrels, the trademark Canad...",199.0,96,45.0,


In [5]:
trainval.isnull().sum()

id                0
author            0
description       0
price            63
ratingValue       0
pert_alcohol     60
category        288
dtype: int64

In [36]:
# Drop NaN rows
trainval = train.dropna()

In [7]:
# Import Statements
from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

In [8]:
# Instances for pipeline
rfc = RandomForestClassifier()
vect = TfidfVectorizer(stop_words='english')

In [10]:
# Specify the pipeline
pipe = Pipeline([('vect', vect), ('rfc', rfc)])
pipe.fit(trainval['description'], trainval['category'])



Pipeline(memory=None,
         steps=[('vect',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words='english', strip_accents=None,
                                 sublinear_tf=False,
                                 token_patte...
                 RandomForestClassifier(bootstrap=True, class_weight=None,
                                        criterion='gini', max_depth=None,
                                        max_features='auto',
                                        max_leaf_nodes=None,
                               

## Follow Along 

What you should be doing now:
1. Join the Kaggle Competition
2. Download the data
3. Train a model (try using the pipe method I just demoed)
4. Make a submission to Kaggle

Additional steps you should take -
* Train.test split on `train.csv`
* Use `test.csv` to make predictions
* Kaggle ID & categroy are integers

In [20]:
from sklearn.model_selection import train_test_split

# Split train into train & val
train, val = train_test_split(trainval, train_size=0.80, test_size=0.20, random_state=42)

In [21]:
print(train.shape)
train.head()

(1980, 7)


Unnamed: 0,id,author,description,price,ratingValue,pert_alcohol,category
2393,3409,John Hansell,"Fragrant aromas of ripe barley and vanilla, wi...",65.0,82,46.0,1.0
1963,2828,Gavin Smith,Another recent addition to Aberdeenshire bottl...,66.0,84,46.0,1.0
980,1466,John Hansell,Many of the Birthday Bourbon releases are wood...,75.0,88,50.0,2.0
1522,2213,Gavin Smith,Melon notes to the fore early on the nose; flo...,125.0,86,51.1,1.0
2415,3456,Lew Bryson,The second craft rye I’m revisiting. Mosby’s i...,38.0,82,40.0,3.0


In [22]:
print(val.shape)
val.head()

(496, 7)


Unnamed: 0,id,author,description,price,ratingValue,pert_alcohol,category
2049,2914,John Hansell,A tamed Talisker. The classic Talisker persona...,100.0,84,45.8,1.0
293,441,John Hansell,"Similar to the standard Ardbeg 10 year old, ex...",95.0,92,51.2,1.0
1847,2630,John Hansell,Not overly challenging. Stylish and very easy ...,15.0,85,43.0,2.0
514,770,John Hansell,Older expressions of this unpeated Islay dram ...,160.0,90,40.2,1.0
66,106,John Hansell,"Bottled in 2001, but still in circulation and ...",400.0,95,46.0,1.0


In [23]:
# Arrange data into X features matrix and y target vector 
X_train = train['description']
y_train = train['category']
X_val = val['description']
y_val = val['category']
X_test = test['description']

# Fit on train, score on val, predict on test
pipe.fit(X_train, y_train)
print('Validation Accuracy', pipe.score(X_val, y_val))
y_pred = pipe.predict(X_test)

Validation Accuracy 0.8165322580645161


## Tuning a pipeline object with GridSearch

In [120]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.decomposition import TruncatedSVD

In [121]:
svd = TruncatedSVD(n_components=100, 
                   algorithm='randomized',
                   n_iter=10)
sgdc = SGDClassifier()
vect = TfidfVectorizer(stop_words='english')

In [122]:
params = { 
    'lsi__svd__n_components': [10, 100, 250]
}

In [123]:
# LSI
lsi = Pipeline([('vect', vect), ('svd', svd)])
# Pipe
pipe = Pipeline([('lsi', lsi), ('clf', sgdc)])

In [124]:
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__min_df': (.02, .05),
    'vect__max_features': (100, 500, 1000),
    'clf__max_iter':(20, 10, 100)
}

In [125]:
# Fit
pipe.fit(trainval['description'], trainval['category'])

Pipeline(memory=None,
         steps=[('lsi',
                 Pipeline(memory=None,
                          steps=[('vect',
                                  TfidfVectorizer(analyzer='word', binary=False,
                                                  decode_error='strict',
                                                  dtype=<class 'numpy.float64'>,
                                                  encoding='utf-8',
                                                  input='content',
                                                  lowercase=True, max_df=1.0,
                                                  max_features=None, min_df=1,
                                                  ngram_range=(1, 1), norm='l2',
                                                  preprocessor=None,
                                                  smooth_idf=True,
                                                  stop_words='english',
                                                  strip_a

In [126]:
pipe.score(trainval['description'], trainval['category'])

0.9474747474747475

In [129]:
grid_search = GridSearchCV(pipe, parameters, cv=5, n_jobs=-1, verbose=1)

In [130]:
grid_search.fit(trainval['description'], trainval['category'])

Fitting 5 folds for each of 54 candidates, totalling 270 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


ValueError: Invalid parameter vect for estimator Pipeline(memory=None,
         steps=[('lsi',
                 Pipeline(memory=None,
                          steps=[('vect',
                                  TfidfVectorizer(analyzer='word', binary=False,
                                                  decode_error='strict',
                                                  dtype=<class 'numpy.float64'>,
                                                  encoding='utf-8',
                                                  input='content',
                                                  lowercase=True, max_df=1.0,
                                                  max_features=None, min_df=1,
                                                  ngram_range=(1, 1), norm='l2',
                                                  preprocessor=None,
                                                  smooth_idf=True,
                                                  stop_words='english',
                                                  strip_accents=...
                ('clf',
                 SGDClassifier(alpha=0.0001, average=False, class_weight=None,
                               early_stopping=False, epsilon=0.1, eta0=0.0,
                               fit_intercept=True, l1_ratio=0.15,
                               learning_rate='optimal', loss='hinge',
                               max_iter=1000, n_iter_no_change=5, n_jobs=None,
                               penalty='l2', power_t=0.5, random_state=42,
                               shuffle=True, tol=0.001, validation_fraction=0.1,
                               verbose=0, warm_start=False))],
         verbose=False). Check the list of available parameters with `estimator.get_params().keys()`.

In [None]:
y_pred = pipe.predict(test['description'])

In [113]:
sgdc = SGDClassifier(random_state=42)
vect = TfidfVectorizer(stop_words='english')

In [114]:
# Specify the pipeline
pipe = Pipeline([('vect', vect), ('sgdc', sgdc)])
pipe.fit(trainval['description'], trainval['category'])

Pipeline(memory=None,
         steps=[('vect',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words='english', strip_accents=None,
                                 sublinear_tf=False,
                                 token_patte...
                ('sgdc',
                 SGDClassifier(alpha=0.0001, average=False, class_weight=None,
                               early_stopping=False, epsilon=0.1, eta0=0.0,
                               fit_intercept=True, l1_ratio=0.15,
                               learning_rate='optimal', 

In [115]:
pipe.score(trainval['description'], trainval['category'])

1.0

In [116]:
gs = GridSearchCV(pipe, parameters, cv=5, n_jobs=4)

In [117]:
gs.fit(trainval['description'], trainval['category'])

ValueError: Invalid parameter rfc for estimator Pipeline(memory=None,
         steps=[('vect',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words='english', strip_accents=None,
                                 sublinear_tf=False,
                                 token_patte...
                ('sgdc',
                 SGDClassifier(alpha=0.0001, average=False, class_weight=None,
                               early_stopping=False, epsilon=0.1, eta0=0.0,
                               fit_intercept=True, l1_ratio=0.15,
                               learning_rate='optimal', loss='hinge',
                               max_iter=1000, n_iter_no_change=5, n_jobs=None,
                               penalty='l2', power_t=0.5, random_state=42,
                               shuffle=True, tol=0.001, validation_fraction=0.1,
                               verbose=0, warm_start=False))],
         verbose=False). Check the list of available parameters with `estimator.get_params().keys()`.

In [None]:
gs.best_score_

In [88]:
trainval.head()

Unnamed: 0,id,author,description,price,ratingValue,pert_alcohol,category
2393,3409,John Hansell,"Fragrant aromas of ripe barley and vanilla, wi...",65.0,82,46.0,1.0
1963,2828,Gavin Smith,Another recent addition to Aberdeenshire bottl...,66.0,84,46.0,1.0
980,1466,John Hansell,Many of the Birthday Bourbon releases are wood...,75.0,88,50.0,2.0
1522,2213,Gavin Smith,Melon notes to the fore early on the nose; flo...,125.0,86,51.1,1.0
2415,3456,Lew Bryson,The second craft rye I’m revisiting. Mosby’s i...,38.0,82,40.0,3.0


In [97]:
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer(stop_words='english', min_df = 0.05, max_df= 0.90)

# Specify the pipeline
pipe = Pipeline([('vect', vect), ('rfc', rfc)])
pipe.fit(trainval['description'], trainval['category'])
gs = GridSearchCV(pipe, parameters, cv=5, n_jobs=4)
gs.fit(trainval['description'], trainval['category'])

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('vect',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=0.9,
                                                        max_features=None,
                                                        min_df=0.05,
                                                        ngram_range=(1, 1),
                                         

In [98]:
gs.best_score_

0.8954545454545455

In [90]:
target = 'category'
features = trainval.drop(columns=[target, 'id']).columns.tolist()
X_trainval = trainval[features]
y_trainval = trainval[target]
X_test = test[features]
gs.fit(X_trainval, y_trainval)

ValueError: Number of labels=1583 does not match number of samples=5

In [None]:
gs.best_score_

## Challenge

1. Join the Kaggle Competition
2. Download the data
3. Train a model & try: 
    - Creating a Text Extraction & Classification Pipeline
    - Tune the pipeline with a `GridSearchCV` or `RandomizedSearchCV`
4. Make a submission to Kaggle

In [42]:
from sklearn.model_selection import RandomizedSearchCV

In [43]:
rs = RandomizedSearchCV(pipe, parameters, cv=5, n_jobs=4)

In [44]:
rs.fit(X_trainval, y_trainval)

RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=Pipeline(memory=None,
                                      steps=[('vect',
                                              TfidfVectorizer(analyzer='word',
                                                              binary=False,
                                                              decode_error='strict',
                                                              dtype=<class 'numpy.float64'>,
                                                              encoding='utf-8',
                                                              input='content',
                                                              lowercase=True,
                                                              max_df=1.0,
                                                              max_features=None,
                                                              min_df=1,
                                        

In [45]:
rs.best_score_

0.8934343434343435

In [39]:
y_pred = rs.predict(X_test)

# Word Embeddings with Spacy (Learn)
<a id="p3"></a>

In [46]:
# Get embedding for our descriptions

import spacy
nlp = spacy.load('en_core_web_lg')

In [48]:
embeddings = [nlp(doc).vector for doc in train['description']]

In [49]:
rfc.fit(embeddings, y_trainval)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [50]:
rfc.score(embeddings, y_trainval)

0.9893939393939394

In [53]:
test_embeddings = [nlp(doc).vector for doc in test['description']]

In [54]:
y_pred = rfc.predict(test_embeddings)

In [59]:
trainval_embeddings = [nlp(doc).vector for doc in trainval['description']]

In [60]:
gs.fit(trainval_embeddings.tolist(), y_trainval)

AttributeError: 'numpy.ndarray' object has no attribute 'lower'

In [None]:
gs.best_score_

In [None]:
y_pred = gs.predict(test_embeddings)

In [67]:
doc = trainval['description'][0]
doc = nlp(doc)

In [68]:
for chunk in doc.noun_chunks:
    print(chunk.lemma_)

a marriage
13 and 18 year old bourbon
a mature yet very elegant whiskey
a silky texture
a splash
water
balanced note
honeyed vanilla , soft caramel
a basket
complex orchard fruit
blackberry
papaya
a dusting
cocoa
nutmeg
smooth finish
well - define flavor
a classic


In [72]:
def tokenize(doc):
    
    d = nlp(doc)
    tokens = []
    
    # you can extract anything you want from spacy to use as a feature
    # noun chunks arent the end all be all
    # try extracting just tokens, or just adjectives, or just pronouns
    for token
    
    for chunk in d.noun_chunks:
        tokens.append(chunk.lemma_)
        
    return tokens

In [81]:
dir(nlp)

['Defaults',
 '__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_meta',
 '_optimizer',
 '_path',
 'add_pipe',
 'begin_training',
 'create_pipe',
 'disable_pipes',
 'entity',
 'evaluate',
 'factories',
 'from_bytes',
 'from_disk',
 'get_pipe',
 'has_pipe',
 'lang',
 'linker',
 'make_doc',
 'matcher',
 'max_length',
 'meta',
 'parser',
 'path',
 'pipe',
 'pipe_names',
 'pipeline',
 'preprocess_gold',
 'rehearse',
 'remove_pipe',
 'rename_pipe',
 'replace_pipe',
 'resume_training',
 'tagger',
 'tensorizer',
 'to_bytes',
 'to_disk',
 'tokenizer',
 'update',
 'use_params',
 'vocab']

In [78]:
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer(analyzer=tokenize, max_df=.9, min_df=.4)
vect.fit(trainval['description'])

CountVectorizer(analyzer=<function tokenize at 0x0000004015A678C8>,
                binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=0.9, max_features=None, min_df=0.4,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [79]:
len(vect.get_feature_names())

2

## Write submission csv file

In [127]:
y_pred = pipe.predict(test['description'])

In [128]:
sample_submission = pd.read_csv('data/sample_submission.csv')
submission = sample_submission.copy()
submission['category'] = y_pred.astype(int)
submission.to_csv('submission-05.csv', index=False)

In [56]:
# submission 4 - Word Embeddings with Spacy
# 73.6%
submission.head()

Unnamed: 0,id,category
0,955,2
1,3532,2
2,1390,4
3,1024,1
4,1902,1


In [41]:
# submission 3 - GridSearchCV
# 90.3%
submission.head()

Unnamed: 0,id,category
0,955,2
1,3532,2
2,1390,1
3,1024,1
4,1902,1


In [25]:
# submission 2 - RandomForestClassifier
# 82.6%
submission.head()

Unnamed: 0,id,category
0,955,2
1,3532,2
2,1390,1
3,1024,1
4,1902,1
