### Import Modules and Read Data

In [1]:
import pandas as pd

import spacy
nlp = spacy.load("en_core_web_sm")

from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load the labelled training set and preview its first rows.
train_csv = './train.csv'
df = pd.read_csv(train_csv)
df.head()

Unnamed: 0,id,author,description,price,ratingValue,pert_alcohol,category
0,1,John Hansell,A marriage of 13 and 18 year old bourbons. A m...,85.0,97,51.5,2.0
1,2,Dave Broom,There have been some legendary Bowmores from t...,13500.0,97,42.9,1.0
2,3,John Hansell,This bottling celebrates master distiller Park...,150.0,97,50.0,2.0
3,4,John Hansell,What impresses me most is how this whisky evol...,4500.0,97,40.5,1.0
4,6,Davin de Kergommeaux,"After 40 years in barrels, the trademark Canad...",199.0,96,45.0,


### Describe and Clean Data

In [3]:
# Number of missing values in each column of the training frame.
nan_counts = df.isna().sum()
nan_counts

id                0
author            0
description       0
price            63
ratingValue       0
pert_alcohol     60
category        288
dtype: int64

In [4]:
# Keep only the rows where 'pert_alcohol' is missing, then count the NaNs
# each column has within that subset.
alcohol_missing = df['pert_alcohol'].isna()
df.loc[alcohol_missing].isna().sum()

id               0
author           0
description      0
price            0
ratingValue      0
pert_alcohol    60
category         4
dtype: int64

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,id,price,ratingValue,pert_alcohol,category
count,2874.0,2811.0,2874.0,2814.0,2586.0
mean,2075.814544,225.297937,86.361517,48.043019,1.637664
std,1177.805945,990.619608,4.511554,6.298527,0.963049
min,1.0,7.5,60.0,33.0,1.0
25%,1087.25,50.0,84.0,43.3,1.0
50%,2109.5,80.0,87.0,46.0,1.0
75%,3102.0,140.0,90.0,51.375,2.0
max,4157.0,26650.0,97.0,98.6,4.0


In [6]:
# Fill gaps in the two numeric columns by carrying the previous row's value
# forward, then drop every row that still has a NaN (chiefly rows without a
# 'category' label, plus any leading rows that had nothing to carry forward).
# `.ffill()` replaces the deprecated `fillna(method='pad')`; the result is the same.
df = (
    df.assign(
        pert_alcohol=df['pert_alcohol'].ffill(),
        price=df['price'].ffill(),
    )
    .dropna(axis=0)
)

# Sanity check: every column should now report zero missing values.
df.isna().sum()

id              0
author          0
description     0
price           0
ratingValue     0
pert_alcohol    0
category        0
dtype: int64

### Split DataFrame into Target/Feature Frames

In [9]:
# Sequential hold-out split: the first 1500 cleaned rows (by position) train
# the model, the remaining rows are kept for evaluation.
n_train = 1500
descriptions, categories = df['description'], df['category']
X_train, X_test = descriptions.iloc[:n_train], descriptions.iloc[n_train:]
y_train, y_test = categories.iloc[:n_train], categories.iloc[n_train:]

### Sklearn Pipeline Objects

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

In [11]:
# Create pipeline
vect = TfidfVectorizer(stop_words='english')
sgdc = SGDClassifier()

pipe = Pipeline([('vect', vect), ('clf', sgdc)])

### Tuning a Pipeline Object with GridSearch

In [13]:
from sklearn.model_selection import GridSearchCV

# Search space, keyed as <pipeline step>__<parameter>:
#   vect__max_df  - TfidfVectorizer `max_df` values to try
#   clf__max_iter - SGDClassifier `max_iter` values to try
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'clf__max_iter': (10, 20, 100)
}

# Exhaustive 5-fold cross-validated search over the grid, using all CPU cores.
grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=parameters,
    cv=5,
    n_jobs=-1,
    verbose=1,
)

In [14]:
# Fit grid search
grid_search.fit(X_train, y_train)
best = grid_search.best_estimator_
best.score(X_test, y_test)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:    5.0s finished


0.850828729281768

### [TODO] Latent Semantic Indexing

* A topic modelling technique
* The index is a topic distribution
* Topics are not interpretable, used by search engines
* Could be thought of as dimensionality reduction

In [15]:
# from sklearn.decomposition import TruncatedSVD

# svd = TruncatedSVD(n_components=100,
#                    algorithm='randomized',
#                    n_iter=10)

In [16]:
# LSI
# lsi = Pipeline([('vect', vect), ('svd', svd)])

In [17]:
# Whole pipeline: a pipeline can accept another pipeline object
# pipe = Pipeline([('lsi', lsi), ('clf', sgdc)])

In [18]:
# Fit
# pipe.fit(X_train, y_train)

### Generate Predictions

In [19]:
# Read in test.csv
test_df = pd.read_csv('./test.csv')
test_df.head()

Unnamed: 0,id,author,description,price,ratingValue,pert_alcohol
0,955,Fred Minnick,"Think carnival aromas—the good ones, anyway—me...",36.0,90,50.0
1,3532,Lew Bryson,"A blend of three bourbons, between 6 and 12 ye...",90.0,82,49.3
2,1390,Davin de Kergommeaux,"The nose is focused on cereal, hints of fresh ...",48.0,89,45.0
3,1024,Gavin Smith,Swiss-based Chapter 7 released this 19 year ol...,180.0,90,55.8
4,1902,Gavin Smith,Valkyrie replaces the current Dark Origins exp...,71.0,87,45.9


In [20]:
# Predict a category for every Kaggle test description with the tuned pipeline.
# A dedicated name is used rather than reassigning X_test: X_test is the
# hold-out split paired with y_test, and overwriting it would make any re-run
# of the scoring cell compare mismatched features and labels.
X_submission = test_df['description']
test_predictions = best.predict(X_submission)
print(test_predictions)

[2. 2. 4. 1. 1. 1. 1. 1. 2. 1. 4. 4. 1. 1. 1. 1. 1. 1. 2. 1. 1. 1. 1. 1.
 4. 1. 1. 1. 3. 1. 4. 2. 1. 1. 1. 1. 1. 3. 4. 1. 2. 1. 1. 2. 1. 1. 1. 2.
 1. 1. 3. 1. 3. 1. 1. 1. 1. 1. 1. 1. 3. 1. 1. 1. 1. 4. 2. 1. 1. 1. 1. 3.
 1. 1. 4. 1. 2. 2. 1. 1. 4. 2. 2. 1. 1. 3. 2. 4. 1. 3. 1. 1. 1. 1. 1. 4.
 1. 1. 4. 3. 1. 1. 1. 2. 1. 1. 1. 2. 1. 2. 3. 1. 1. 1. 1. 3. 1. 1. 1. 1.
 3. 1. 2. 1. 1. 1. 1. 2. 2. 4. 1. 1. 1. 1. 3. 2. 1. 1. 1. 1. 1. 2. 2. 1.
 1. 3. 4. 1. 1. 1. 3. 1. 1. 1. 1. 1. 3. 1. 1. 1. 1. 1. 4. 1. 1. 1. 3. 1.
 2. 2. 1. 3. 2. 1. 1. 1. 1. 1. 1. 1. 2. 1. 1. 1. 1. 2. 1. 4. 1. 3. 1. 4.
 1. 1. 2. 2. 1. 1. 2. 1. 1. 1. 1. 2. 2. 1. 1. 1. 1. 4. 1. 1. 3. 1. 2. 1.
 1. 1. 1. 1. 1. 4. 2. 2. 2. 2. 1. 2. 2. 2. 1. 1. 1. 3. 1. 1. 2. 1. 1. 1.
 1. 3. 2. 2. 1. 3. 1. 4. 3. 3. 1. 1. 2. 1. 2. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 2. 1. 3. 2. 1. 1. 1. 1. 1. 1. 1. 1. 2. 2. 2. 4. 1. 1. 1. 1. 2. 1.]


### Kaggle Submission

Use INT values

Use index=False to remove index col

In [30]:
# Build the two-column Kaggle submission: the predicted labels come back as
# floats, so each one is converted to a plain int first.
int_categories = [int(label) for label in test_predictions]
submission = pd.DataFrame({'id': test_df['id'], 'category': int_categories})

# index=False keeps the DataFrame index out of the CSV.
submission.to_csv('submission4.csv', index=False)