In [1]:
# !kaggle competitions download -c ds3-which-whisky

In [2]:
import pandas as pd
train_features = pd.read_csv('train.csv.zip')

train_features.head()

Unnamed: 0,id,author,description,price,ratingValue,pert_alcohol,category
0,1,John Hansell,A marriage of 13 and 18 year old bourbons. A m...,85.0,97,51.5,2.0
1,2,Dave Broom,There have been some legendary Bowmores from t...,13500.0,97,42.9,1.0
2,3,John Hansell,This bottling celebrates master distiller Park...,150.0,97,50.0,2.0
3,4,John Hansell,What impresses me most is how this whisky evol...,4500.0,97,40.5,1.0
4,6,Davin de Kergommeaux,"After 40 years in barrels, the trademark Canad...",199.0,96,45.0,


In [3]:
train_features.shape

(2874, 7)

In [4]:
train_features = train_features.dropna()

In [5]:
train_labels = train_features['category'].astype(int)

train_labels.head()

0    2
1    1
2    2
3    1
5    2
Name: category, dtype: int64

In [6]:


train_features = train_features.drop(columns=['id', 'author','price', 'ratingValue', 'pert_alcohol', 'category'])

In [7]:
train_features.head()

Unnamed: 0,description
0,A marriage of 13 and 18 year old bourbons. A m...
1,There have been some legendary Bowmores from t...
2,This bottling celebrates master distiller Park...
3,What impresses me most is how this whisky evol...
5,"A caramel-laden fruit bouquet, followed by une..."


In [8]:
train_labels.shape

(2476,)

In [9]:
test_features = pd.read_csv('test.csv')

test_features.head()

Unnamed: 0,id,author,description,price,ratingValue,pert_alcohol
0,955,Fred Minnick,"Think carnival aromas—the good ones, anyway—me...",36.0,90,50.0
1,3532,Lew Bryson,"A blend of three bourbons, between 6 and 12 ye...",90.0,82,49.3
2,1390,Davin de Kergommeaux,"The nose is focused on cereal, hints of fresh ...",48.0,89,45.0
3,1024,Gavin Smith,Swiss-based Chapter 7 released this 19 year ol...,180.0,90,55.8
4,1902,Gavin Smith,Valkyrie replaces the current Dark Origins exp...,71.0,87,45.9


In [10]:
test_features.shape

(288, 6)

In [57]:
# Import Statements
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

In [58]:
# Create Pipeline

vect = TfidfVectorizer(stop_words='english')
sgdc = SGDClassifier()

pipe = Pipeline([('vect', vect), ('clf', sgdc)])

In [44]:
X = train_features['description']
y = train_labels

In [68]:
# initail Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=len(, random_state=42)

In [69]:
X_val, X_test, y_val, y_test = train_test_split(
    X_test, y_test, test_size=.3, random_state=42)

In [70]:
# Fit Pipeline
pipe.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
...m_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False))])

In [71]:
score = pipe.score(X_val, y_val)

score

0.9365384615384615

In [81]:
y_pred = pipe.predict(test_features['description'])

In [82]:
len(y_pred), submission.shape

(288, (288, 6))

In [83]:
submission

Unnamed: 0,id,author,description,price,ratingValue,pert_alcohol
0,955,Fred Minnick,"Think carnival aromas—the good ones, anyway—me...",36.0,90,50.00
1,3532,Lew Bryson,"A blend of three bourbons, between 6 and 12 ye...",90.0,82,49.30
2,1390,Davin de Kergommeaux,"The nose is focused on cereal, hints of fresh ...",48.0,89,45.00
3,1024,Gavin Smith,Swiss-based Chapter 7 released this 19 year ol...,180.0,90,55.80
4,1902,Gavin Smith,Valkyrie replaces the current Dark Origins exp...,71.0,87,45.90
5,1156,Gavin Smith,The oldest of Bablair’s current core bottlings...,150.0,89,46.00
6,2205,Dave Broom,"A sherry butt this time, which has allowed the...",803.0,86,48.50
7,885,Dave Broom,"This is one of a trio from Chivas Bros., who e...",59.0,90,55.30
8,1116,Fred Minnick,"Toasted oak, coffee, cola, molasses, and campf...",,89,45.20
9,3739,Dave Broom,"One of a trio from Chivas Bros., who every yea...",54.0,80,54.50


In [86]:
submission = pd.read_csv('./sample_submission.csv')
submission = submission.copy()
submission['category'] = y_pred
submission.to_csv('submission.csv', index=False)

In [87]:
len(y_pred), submission.shape

(288, (288, 2))

In [88]:
!kaggle  competitions  submit -c ds3-which-whisky -f submission.csv -m "SVD Prediction submission"

100%|██████████████████████████████████████| 1.91k/1.91k [00:00<00:00, 2.12kB/s]
Successfully submitted to DS3 Which Whisky

In [21]:
# Experiment Management
from sklearn.model_selection import GridSearchCV

In [22]:
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'clf__max_iter':(20, 10, 100)
}

In [23]:
grid_search = GridSearchCV(pipe,parameters, cv=5, n_jobs=-1, verbose=1)

In [24]:
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:    2.4s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
...m_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'vect__max_df': (0.5, 0.75, 1.0), 'clf__max_iter': (20, 10, 100)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [25]:
grid_search.best_score_

0.9419561243144424

In [26]:
y_pred = grid_search.predict(X_test)

In [27]:
submission = pd.read_csv('./sample_submission.csv')
submission = submission.copy()
submission['category'] = y_pred
submission.to_csv('submission.csv', index=False)

In [28]:
!kaggle  competitions  submit -c ds3-which-whisky -f submission.csv -m "GridSearch Prediction submission"

100%|██████████████████████████████████████| 1.91k/1.91k [00:00<00:00, 2.60kB/s]
Successfully submitted to DS3 Which Whisky

In [29]:
# Import

from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=100, 
                   algorithm='randomized',
                   n_iter=10)

In [30]:
# LSI

lsi = Pipeline([('vect', vect), ('svd', svd)])

In [31]:
# Pipe

pipe = Pipeline([('lsi', lsi), ('clf', sgdc)])

params = {
    'lsi__vect__max_df': (0.5, 0.75, 1)
}

In [33]:
# Fit
pipe.fit(X_train, y_train)



Pipeline(memory=None,
     steps=[('lsi', Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=...m_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False))])

In [35]:
pipe.score(X_train, y_train)

0.9510968921389397

In [37]:
pipe.predict(X_test)

array([1, 2, 2, 1, 1, 1, 2, 4, 1, 3, 1, 1, 3, 1, 1, 1, 1, 1, 1, 2, 1, 1,
       1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 2, 3, 1, 1, 1, 1, 1, 3, 4, 1, 1, 1,
       1, 4, 1, 3, 1, 4, 1, 2, 1, 1, 1, 1, 2, 1, 3, 3, 1, 2, 1, 1, 1, 1,
       3, 3, 4, 3, 1, 1, 1, 1, 3, 1, 2, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 3,
       4, 1, 3, 1, 1, 1, 3, 1, 2, 1, 1, 1, 1, 2, 2, 1, 2, 1, 1, 2, 1, 2,
       1, 3, 2, 1, 1, 1, 2, 1, 1, 1, 3, 3, 1, 1, 2, 3, 2, 1, 1, 2, 1, 1,
       1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       4, 3, 1, 2, 2, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 1, 1,
       3, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 4, 4, 2, 1, 1, 1, 1, 4,
       1, 2, 1, 3, 1, 3, 3, 1, 1, 1, 1, 4, 1, 1, 1, 2, 3, 1, 1, 1, 3, 3,
       2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 4, 2, 1, 1, 1, 1, 2, 1, 1,
       1, 3, 3, 1, 3, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 2, 2,
       1, 1, 1, 2, 1, 3, 1, 1, 1, 1, 1, 4, 1, 1, 1, 3, 2, 3, 3, 3, 1, 1,
       3, 3])

In [38]:
y_pred = pipe.predict(X_test)

In [39]:
y_pred.shape

(288,)

In [40]:
submission = pd.read_csv('./sample_submission.csv')
submission = submission.copy()
submission['category'] = y_pred
submission.to_csv('submission.csv', index=False)

In [41]:
!kaggle  competitions  submit -c ds3-which-whisky -f submission.csv -m "SVD Prediction submission"

100%|██████████████████████████████████████| 1.91k/1.91k [00:01<00:00, 1.92kB/s]
Successfully submitted to DS3 Which Whisky

In [None]:
import spacy
nlp = spacy.load("en_core_web_lg")

In [None]:
# doc = nlp("Two bananas in pyjamas")