## Imports

In [20]:
import pandas as pd
import numpy as np

In [21]:
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')

In [22]:
from pprint import pprint
from time import time
import logging

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

## Data Exploration

In [23]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2874 entries, 0 to 2873
Data columns (total 7 columns):
id              2874 non-null int64
author          2874 non-null object
description     2874 non-null object
price           2811 non-null float64
ratingValue     2874 non-null int64
pert_alcohol    2814 non-null float64
category        2586 non-null float64
dtypes: float64(3), int64(2), object(2)
memory usage: 157.2+ KB


In [38]:
train.head(10)

Unnamed: 0,id,author,description,price,ratingValue,pert_alcohol,category
0,1,John Hansell,A marriage of 13 and 18 year old bourbons. A m...,85.0,97,51.5,2.0
1,2,Dave Broom,There have been some legendary Bowmores from t...,13500.0,97,42.9,1.0
2,3,John Hansell,This bottling celebrates master distiller Park...,150.0,97,50.0,2.0
3,4,John Hansell,What impresses me most is how this whisky evol...,4500.0,97,40.5,1.0
5,9,Fred Minnick,"A caramel-laden fruit bouquet, followed by une...",150.0,96,54.49,2.0
6,12,John Hansell,Distilled at the now legendary Stitzel-Weller ...,300.0,96,45.0,2.0
7,14,John Hansell,"No age statement, but distilled in 1998. A bea...",80.0,96,69.05,2.0
8,15,Davin de Kergommeaux,"Monarch, the 75th anniversary limited edition ...",75.0,96,40.0,4.0
10,17,John Hansell,"The complete package: uncut, unfiltered, full-...",100.0,96,65.4,2.0
11,21,John Hansell,"A marriage of four different bourbons, ranging...",90.0,96,55.7,2.0


In [39]:
train = train.dropna()

In [40]:
train.head()

Unnamed: 0,id,author,description,price,ratingValue,pert_alcohol,category
0,1,John Hansell,A marriage of 13 and 18 year old bourbons. A m...,85.0,97,51.5,2.0
1,2,Dave Broom,There have been some legendary Bowmores from t...,13500.0,97,42.9,1.0
2,3,John Hansell,This bottling celebrates master distiller Park...,150.0,97,50.0,2.0
3,4,John Hansell,What impresses me most is how this whisky evol...,4500.0,97,40.5,1.0
5,9,Fred Minnick,"A caramel-laden fruit bouquet, followed by une...",150.0,96,54.49,2.0


In [176]:
train['author'].value_counts()

John Hansell              834
Gavin Smith               543
Dave Broom                391
Lew Bryson                246
Davin de Kergommeaux      164
Fred Minnick              122
Jeffery Lindenmuth         77
Geoffrey Kleinman          58
Dominic Roskrow            18
Jonny McCormick            16
Susannah Skiver Barton      6
Adam Polonski               1
Name: author, dtype: int64

In [187]:
grouped = train.groupby('author')['category'].count()

In [188]:
grouped

author
Adam Polonski               1
Dave Broom                391
Davin de Kergommeaux      164
Dominic Roskrow            18
Fred Minnick              122
Gavin Smith               543
Geoffrey Kleinman          58
Jeffery Lindenmuth         77
John Hansell              834
Jonny McCormick            16
Lew Bryson                246
Susannah Skiver Barton      6
Name: category, dtype: int64

In [123]:
vect=TfidfVectorizer(stop_words='english')
sgdc = SGDClassifier()
pipeline = Pipeline([
    ('vect',vect),
    ('clf', sgdc)
])

In [134]:
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__max_features': (None, 5000, 10000, 50000),
#    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
#    'tfidf__use_idf': (True, False),
#    'tfidf__norm': ('l1', 'l2'),
    'clf__max_iter': (20,),
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
#    'clf__max_iter': (10, 50, 80),
}

In [135]:
features = ['id', 'author', 'description', 'price', 'ratingValue', 'pert_alcohol']
target = 'category'

In [136]:
grid_search = GridSearchCV(pipeline, parameters, cv=5,
                          n_jobs=-1,verbose=1)


grid_search.fit(train['description'],train[target])
print('best score: %0.3f'% grid_search.best_score_)
print('Best parameters set:')
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:    9.8s
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:   12.7s finished


best score: 0.932
Best parameters set:
	clf__alpha: 1e-05
	clf__max_iter: 20
	clf__penalty: 'l2'
	vect__max_df: 0.5
	vect__max_features: 50000


In [133]:
predict=grid_search.predict(test['description'])

In [46]:
test['category']=predict
test.head()

Unnamed: 0,id,author,description,price,ratingValue,pert_alcohol,category
0,955,Fred Minnick,"Think carnival aromas—the good ones, anyway—me...",36.0,90,50.0,2.0
1,3532,Lew Bryson,"A blend of three bourbons, between 6 and 12 ye...",90.0,82,49.3,2.0
2,1390,Davin de Kergommeaux,"The nose is focused on cereal, hints of fresh ...",48.0,89,45.0,4.0
3,1024,Gavin Smith,Swiss-based Chapter 7 released this 19 year ol...,180.0,90,55.8,1.0
4,1902,Gavin Smith,Valkyrie replaces the current Dark Origins exp...,71.0,87,45.9,1.0


In [47]:
submission=test.drop(columns=['author','description','price','ratingValue','pert_alcohol'])
submission['category']=submission['category'].astype('int')
submission.head()

Unnamed: 0,id,category
0,955,2
1,3532,2
2,1390,4
3,1024,1
4,1902,1


## Initial Submission

In [48]:
submission.to_csv('submission.csv',index=False)

## Latent Semantic Indexing

In [166]:
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=100, 
                   algorithm='arpack',
                   n_iter=10)

In [167]:
# LSI

lsi = Pipeline([('vect', vect), ('svd', svd)])

In [168]:
# Pipe

pipe = Pipeline([('lsi', lsi), ('clf', sgdc)])

params = {
    'lsi__vect__max_df': (0.5, 0.75, 1),
}

In [169]:
# Fit

pipe.fit(train['description'], train[target])




Pipeline(memory=None,
     steps=[('lsi', Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=...m_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False))])

In [170]:
grid_search = GridSearchCV(pipeline, parameters, cv=5,
                          n_jobs=-1,verbose=1)



grid_search.fit(train['description'],train[target])
print('best score: %0.3f'% grid_search.best_score_)
print('Best parameters set:')
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.


Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:    7.3s
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:   10.2s finished


best score: 0.933
Best parameters set:
	clf__alpha: 1e-05
	clf__max_iter: 20
	clf__penalty: 'l2'
	vect__max_df: 1.0
	vect__max_features: None


In [172]:
predict=grid_search.predict(test['description'])

In [173]:
test['category']=predict
test.head()

Unnamed: 0,id,author,description,price,ratingValue,pert_alcohol,category
0,955,Fred Minnick,"Think carnival aromas—the good ones, anyway—me...",36.0,90,50.0,2.0
1,3532,Lew Bryson,"A blend of three bourbons, between 6 and 12 ye...",90.0,82,49.3,2.0
2,1390,Davin de Kergommeaux,"The nose is focused on cereal, hints of fresh ...",48.0,89,45.0,4.0
3,1024,Gavin Smith,Swiss-based Chapter 7 released this 19 year ol...,180.0,90,55.8,1.0
4,1902,Gavin Smith,Valkyrie replaces the current Dark Origins exp...,71.0,87,45.9,1.0


In [174]:
submission=test.drop(columns=['author','description','price','ratingValue','pert_alcohol'])
submission['category']=submission['category'].astype('int')
submission.head()

Unnamed: 0,id,category
0,955,2
1,3532,2
2,1390,4
3,1024,1
4,1902,1


In [175]:
submission.to_csv('submission4.csv',index=False)

## Word Embeddings with Spacy

In [65]:
!pip install spacy

Collecting spacy
[?25l  Downloading https://files.pythonhosted.org/packages/a1/5b/0fab3fa533229436533fb504bb62f4cf7ea29541a487a9d1a0749876fc23/spacy-2.1.4-cp36-cp36m-manylinux1_x86_64.whl (29.8MB)
[K    100% |████████████████████████████████| 29.8MB 1.6MB/s eta 0:00:01
[?25hCollecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Downloading https://files.pythonhosted.org/packages/a6/e6/63f160a4fdf0e875d16b28f972083606d8d54f56cd30cb8929f9a1ee700e/murmurhash-1.0.2-cp36-cp36m-manylinux1_x86_64.whl
Collecting wasabi<1.1.0,>=0.2.0 (from spacy)
  Downloading https://files.pythonhosted.org/packages/f4/c1/d76ccdd12c716be79162d934fe7de4ac8a318b9302864716dde940641a79/wasabi-0.2.2-py3-none-any.whl
Collecting blis<0.3.0,>=0.2.2 (from spacy)
[?25l  Downloading https://files.pythonhosted.org/packages/34/46/b1d0bb71d308e820ed30316c5f0a017cb5ef5f4324bcbc7da3cf9d3b075c/blis-0.2.4-cp36-cp36m-manylinux1_x86_64.whl (3.2MB)
[K    100% |████████████████████████████████| 3.2MB 18.9MB/s ta 0:00:01
Collecting

In [69]:
!python -m spacy download en_core_web_lg

[33mYou are using pip version 10.0.1, however version 19.1.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [75]:
!python -m spacy download en

Collecting en_core_web_sm==2.1.0 from https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz#egg=en_core_web_sm==2.1.0
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz (11.1MB)
[K    100% |████████████████████████████████| 11.1MB 91.5MB/s ta 0:00:01
[?25hInstalling collected packages: en-core-web-sm
  Running setup.py install for en-core-web-sm ... [?25ldone
[?25hSuccessfully installed en-core-web-sm-2.1.0
[33mYou are using pip version 10.0.1, however version 19.1.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/en_core_web_sm
-->
/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/

In [77]:
!python -m spacy download en_core_web_md

Collecting en_core_web_md==2.1.0 from https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.1.0/en_core_web_md-2.1.0.tar.gz#egg=en_core_web_md==2.1.0
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.1.0/en_core_web_md-2.1.0.tar.gz (95.4MB)
[K    100% |████████████████████████████████| 95.4MB 91.7MB/s ta 0:00:01    93% |█████████████████████████████▊  | 88.8MB 76.4MB/s eta 0:00:01
[?25hInstalling collected packages: en-core-web-md
  Running setup.py install for en-core-web-md ... [?25ldone
[?25hSuccessfully installed en-core-web-md-2.1.0
[33mYou are using pip version 10.0.1, however version 19.1.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')


In [76]:
import spacy
nlp = spacy.load("en")

In [113]:
def get_word_vectors(docs):
    return [nlp(doc).vector for doc in docs]

In [114]:
X = get_word_vectors(train['description'])

In [122]:
vect.fit(X, train[target])

AttributeError: 'numpy.ndarray' object has no attribute 'lower'

In [118]:
sgdc.get_params().keys()

dict_keys(['alpha', 'average', 'class_weight', 'early_stopping', 'epsilon', 'eta0', 'fit_intercept', 'l1_ratio', 'learning_rate', 'loss', 'max_iter', 'n_iter', 'n_iter_no_change', 'n_jobs', 'penalty', 'power_t', 'random_state', 'shuffle', 'tol', 'validation_fraction', 'verbose', 'warm_start'])

In [119]:
parameters = {
#    'vect__max_df': (0.5, 0.75, 1.0),
#    'vect__max_features': (None, 5000, 10000, 50000),
#    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
#    'tfidf__use_idf': (True, False),
#    'tfidf__norm': ('l1', 'l2'),
    'max_iter': (20,),
    'alpha': (0.00001, 0.000001),
    'penalty': ('l2', 'elasticnet'),
#    'clf__max_iter': (10, 50, 80),
}

In [121]:
grid_search = GridSearchCV(sgdc, parameters, cv=5,
                          n_jobs=-1,verbose=1)


grid_search.fit(X,train[target])
print('best score: %0.3f'% grid_search.best_score_)
print('Best parameters set:')
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  20 | elapsed:    3.6s remaining:    3.6s


best score: 0.686
Best parameters set:
	alpha: 1e-06
	max_iter: 20
	penalty: 'l2'


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    4.9s finished
