In [4]:
!pip install scikit-learn



In [8]:
import pandas as pd

from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

In [60]:
train = pd.read_csv('DS-Unit-4-Sprint-1-NLP/module3-document-classification/data/train.csv')
test = pd.read_csv('DS-Unit-4-Sprint-1-NLP/module3-document-classification/data/test.csv')

In [61]:
train = train.dropna()

In [62]:
print(train.shape)
train.head()

(2476, 7)


Unnamed: 0,id,author,description,price,ratingValue,pert_alcohol,category
0,1,John Hansell,A marriage of 13 and 18 year old bourbons. A m...,85.0,97,51.5,2.0
1,2,Dave Broom,There have been some legendary Bowmores from t...,13500.0,97,42.9,1.0
2,3,John Hansell,This bottling celebrates master distiller Park...,150.0,97,50.0,2.0
3,4,John Hansell,What impresses me most is how this whisky evol...,4500.0,97,40.5,1.0
5,9,Fred Minnick,"A caramel-laden fruit bouquet, followed by une...",150.0,96,54.49,2.0


In [63]:
print(test.shape)
test.head()

(288, 6)


Unnamed: 0,id,author,description,price,ratingValue,pert_alcohol
0,955,Fred Minnick,"Think carnival aromas—the good ones, anyway—me...",36.0,90,50.0
1,3532,Lew Bryson,"A blend of three bourbons, between 6 and 12 ye...",90.0,82,49.3
2,1390,Davin de Kergommeaux,"The nose is focused on cereal, hints of fresh ...",48.0,89,45.0
3,1024,Gavin Smith,Swiss-based Chapter 7 released this 19 year ol...,180.0,90,55.8
4,1902,Gavin Smith,Valkyrie replaces the current Dark Origins exp...,71.0,87,45.9


In [64]:
# Create a Classification Piopeline for our Whiskey Data

# Instances for Pipe
rfc = RandomForestClassifier()
vect = TfidfVectorizer(stop_words='english')

In [65]:
# Specify the pipeline
pipe = Pipeline([('vect', vect), ('rfc', rfc)])

In [66]:
pipe.fit(train['description'], train['category'])



Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [67]:
# make predictions

y_preds = pipe.predict(test['description'])
y_preds = [int(i) for i in y_preds]
assert len(y_preds) == len(test)

In [68]:
# create submission csv

sample_submission = pd.read_csv('DS-Unit-4-Sprint-1-NLP/module3-document-classification/data/sample_submission.csv')
sample_submission['category'] = y_preds
sample_submission.head()

Unnamed: 0,id,category
0,955,2
1,3532,2
2,1390,1
3,1024,1
4,1902,1


In [69]:
sample_submission.to_csv('whiskey_submission1.csv', index=False)

In [70]:
from sklearn.model_selection import GridSearchCV

In [80]:
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__min_df': (.02, .05),
    'vect__max_features': (100, 500,1000),
    'rfc__n_estimators':(20, 100, 400)
}

In [81]:
gs = GridSearchCV(pipe, parameters, cv=5, n_jobs=-1)

In [82]:
gs.fit(train['description'], train['category'])

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=4,
       param_grid={'vect__max_df': (0.5, 0.75, 1.0), 'vect__min_df': (0.02, 0.05), 'vect__max_features': (100, 500, 1000), 'rfc__n_estimators': (20, 100, 400)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [83]:
gs.best_score_

0.8957996768982229

In [84]:
# make predictions

y_preds = pipe.predict(test['description'])
y_preds = [int(i) for i in y_preds]
assert len(y_preds) == len(test)

# create submission csv

sample_submission = pd.read_csv('DS-Unit-4-Sprint-1-NLP/module3-document-classification/data/sample_submission.csv')
sample_submission['category'] = y_preds
sample_submission.to_csv('whiskey_submission2.csv', index=False)

# Word Embeddings With Spacy

# Overview

In [139]:
# Get Embedding for our Descriptions

import spacy

nlp = spacy.load('en_core_web_md')

In [140]:
embeddings = [nlp(doc).vector for doc in train['description']]

In [141]:
rfc

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [142]:
rfc.fit(embeddings,train['category'])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [143]:
rfc.score(embeddings, train['category'])

0.9890953150242326

In [148]:
#embedding

test_embeddings = [nlp(doc).vector for doc in test['description']]

# make predictions

y_preds = rfc.predict(test_embeddings)
y_preds = [int(i) for i in y_preds]
assert len(y_preds) == len(test)

# create submission csv

sample_submission = pd.read_csv('DS-Unit-4-Sprint-1-NLP/module3-document-classification/data/sample_submission.csv')
sample_submission['category'] = y_preds
sample_submission.to_csv('whiskey_submission3.csv', index=False)

# Custom Tokenization Embedding with Spacy (learn)

In [149]:
doc = train['description'][10]

In [150]:
doc

"The complete package: uncut, unfiltered, full-flavored, richly textured (almost chewy), and very complex. Notes of toffee-coated nuts, vanilla fudge, polished leather, cedar-tinged tobacco, barrel char, cocoa powder, and a hint of fig, wrapped up with a firm oak grip on the finish. Worth every penny of the premium price being charged for this commemorative release. Editor's Choice."

In [151]:
doc = nlp(doc)

In [152]:
for chunk in doc.noun_chunks:
    print(chunk.lemma_)

the complete package
note
toffee - coat nut
vanilla fudge
polished leather
cedar - ting tobacco
barrel char
cocoa powder
a hint
fig
a firm oak grip
the finish
the premium price
this commemorative release
editor 's choice


In [153]:
def tokenize(doc):
    d = nlp(doc)
    tokens = []
    
    # You can extract anything you want from Spacy to use as a feature
    # Noun Chunks aren't the end all be all.
    # Try Extracting just tokens, or just adjectives, or just pronouns
    for chunk in d.noun_chunks:
        tokens.append(chunk.lemma_)
        
    return tokens

In [167]:
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer(analyzer=tokenize, max_df = 8, min_df=3)

In [168]:
vect.fit(train['description'])

CountVectorizer(analyzer=<function tokenize at 0x1a2c5ff488>, binary=False,
        decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=8,
        max_features=None, min_df=3, ngram_range=(1, 1), preprocessor=None,
        stop_words=None, strip_accents=None,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, vocabulary=None)

In [158]:
vect.get_feature_names()

['$ 60 cad',
 '( 375 ml',
 '( 500 bottle',
 '( 6,000 bottle',
 '( a binny ’s beverage depot exclusive',
 '( a u.s. exclusive',
 '( australia',
 '( cask',
 "( editor 's choice",
 '( julio ’s liquors',
 '( new hampshire',
 '( ontario',
 '( source whiskey',
 '( taiwan',
 '( travel retail',
 '-PRON- - face',
 '-PRON- balance',
 '-PRON- bourbon',
 '-PRON- choice',
 '-PRON- eye',
 '-PRON- face',
 '-PRON- finish',
 '-PRON- heart',
 '-PRON- malt',
 '-PRON- mouth',
 '-PRON- nose',
 '-PRON- opinion',
 '-PRON- palate',
 '-PRON- predecessor',
 '-PRON- presence',
 '-PRON- prime',
 '-PRON- proof',
 '-PRON- quality',
 '-PRON- reputation',
 '-PRON- strength',
 '-PRON- sweetness',
 '-PRON- teen',
 '-PRON- tongue',
 '-PRON- welcome',
 '-PRON- whisky',
 '-PRON- year',
 '-PRON- youth',
 '10 year',
 '100 % rye',
 '100 proof',
 '11 year',
 '12 year',
 '12,000 bottle',
 '14 year',
 '15 year',
 '18 year',
 '2 year',
 '22 year',
 '3 month',
 '3 year',
 '30 year',
 '35 year',
 '4 year',
 '40 %',
 '43 %',
 '45 %

In [169]:
X = vect.transform(train['description'])

In [170]:
rfc.fit(X, train['category'])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [171]:
rfc.score(X, train['category'])

0.9378029079159935

In [172]:
# vectorize

X_test = vect.transform(test['description'])


# make predictions

y_preds = rfc.predict(X_test)
y_preds = [int(i) for i in y_preds]
assert len(y_preds) == len(test)

# create submission csv

sample_submission = pd.read_csv('DS-Unit-4-Sprint-1-NLP/module3-document-classification/data/sample_submission.csv')
sample_submission['category'] = y_preds
sample_submission.to_csv('whiskey_submission4.csv', index=False)

In [None]:
sample_submission.head()