In [37]:
# Import Statements
import pandas as pd
import spacy

from scipy.stats import uniform
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
train = pd.read_csv('train.csv')

In [3]:
train.head()

Unnamed: 0,id,description,category
0,1,A marriage of 13 and 18 year old bourbons. A m...,2
1,2,There have been some legendary Bowmores from t...,1
2,3,This bottling celebrates master distiller Park...,2
3,4,What impresses me most is how this whisky evol...,1
4,9,"A caramel-laden fruit bouquet, followed by une...",2


In [4]:
train['category'].value_counts(normalize=True)

1    0.633024
2    0.173627
3    0.116009
4    0.077340
Name: category, dtype: float64

In [5]:
train[train['description'].str.contains('Scotch')]['category'].value_counts(normalize=True)

1    0.772727
3    0.136364
4    0.045455
2    0.045455
Name: category, dtype: float64

In [6]:
train[train['description'].str.contains('bourbon')]['category'].value_counts(normalize=True)

1    0.491363
2    0.366603
3    0.117083
4    0.024952
Name: category, dtype: float64

In [7]:
train[train['description'].str.contains('Tennessee')]['category'].value_counts(normalize=True)

2    0.636364
3    0.363636
Name: category, dtype: float64

In [8]:
train[train['description'].str.contains('Tennessee bourbon')]['category'].value_counts(normalize=True)

2    1.0
Name: category, dtype: float64

In [9]:
train[train['description'].str.contains('craft')]['category'].value_counts()

3    25
4     6
2     1
1     1
Name: category, dtype: int64

In [10]:
train[train['description'].str.contains('craft whiskey')]['category'].value_counts()

3    4
Name: category, dtype: int64

In [11]:
train[train['description'].str.contains('Canadian')]['category'].value_counts()

4    60
3     2
1     1
Name: category, dtype: int64

## Clean & pre-process data

In [12]:
def clean(df):
    df = df.copy()
    df['description'] = df['description'].str.strip()
    df['description'] = df['description'].str.lower()
    df['description'] = df['description'].str.replace(r'\s+', ' ')
    return df

In [13]:
def tokenize(df):
    nlp = spacy.load('en_core_web_lg')
    docs = df['description'].apply(nlp)
    data = []
    for doc in docs:
        data.append(' '.join([token.lemma_ for token in doc if (token.is_stop != True) \
                              and (token.is_punct != True)]))
    df['tokens'] = data
    return df

## Build model on training dataset

In [14]:
train = clean(train)

In [15]:
train = tokenize(train)

In [16]:
# Create Pipeline Components
vect = TfidfVectorizer(stop_words='english')
rfc = RandomForestClassifier()

In [17]:
# Define the Pipeline
pipe = Pipeline([
                 #Vectorizer
                 ('vect', vect), 
                 # Classifier
                 ('clf', rfc)
                ])

In [34]:
parameters = {
    'vect__max_df': (uniform(0.75, 1.0)),
    'vect__min_df': (uniform(.02, .05)),
    'vect__max_features': (range(500, 1000, 100)),
    'clf__n_estimators':(5, 10),
    'clf__max_depth':(15,20)
}

search = RandomizedSearchCV(pipe,parameters, cv=5, n_jobs=-1, verbose=1)
search.fit(train.tokens, train.category)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    1.8s finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=Pipeline(memory=None,
                                      steps=[('vect',
                                              TfidfVectorizer(analyzer='word',
                                                              binary=False,
                                                              decode_error='strict',
                                                              dtype=<class 'numpy.float64'>,
                                                              encoding='utf-8',
                                                              input='content',
                                                              lowercase=True,
                                                              max_df=1.0,
                                                              max_features=None,
                                                              min_df=1,
                                        

In [33]:
search.best_score_

0.8441608662026295

In [20]:
search.predict(['delicious canadian whiskey'])

array([3], dtype=int64)

In [35]:
search.predict(['my favorite tennessee bourbon'])

array([1], dtype=int64)

In [22]:
search.predict(['classic scotch'])

array([1], dtype=int64)

### Latent Semantic Indexing (Learn)

In [38]:
svd = TruncatedSVD(n_components=100, 
                   algorithm='randomized',
                   n_iter=10)

In [41]:
params = { 
    'lsi__svd__n_components': [10, 100, 250],
    'lsi__vect__max_df':[.9, .95, 1.0]
}

In [40]:
# LSI
lsi = Pipeline([('vect', vect), ('svd', svd)])


# Pipe
pipe = Pipeline([('lsi', lsi), ('clf', rfc)])

In [43]:
# Fit
search = RandomizedSearchCV(pipe, params, cv=5, n_jobs=4, verbose=1)
search.fit(train.tokens, train.category)



Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  45 out of  45 | elapsed:   15.5s finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=Pipeline(memory=None,
                                      steps=[('lsi',
                                              Pipeline(memory=None,
                                                       steps=[('vect',
                                                               TfidfVectorizer(analyzer='word',
                                                                               binary=False,
                                                                               decode_error='strict',
                                                                               dtype=<class 'numpy.float64'>,
                                                                               encoding='utf-8',
                                                                               input='content',
                                                                               lowercase=True,
          

In [44]:
search.best_score_

0.8894044856921887

### Word Embeddings with Spacy (Learn)

In [45]:
def get_word_vectors(docs):
    return [nlp(doc).vector for doc in docs]

In [47]:
nlp = spacy.load('en_core_web_lg')

In [48]:
word_vectors = get_word_vectors(train.tokens)

In [50]:
rfc.fit(word_vectors, train.category)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [52]:
rfc.score(word_vectors, train.category) # No max depth, almost certain overfitting.

0.9845320959010054

## Run model on test dataset

In [23]:
test = pd.read_csv('test.csv')

In [24]:
test.head()

Unnamed: 0,id,description
0,955,"Think carnival aromas—the good ones, anyway—me..."
1,3532,"A blend of three bourbons, between 6 and 12 ye..."
2,1390,"The nose is focused on cereal, hints of fresh ..."
3,1024,Swiss-based Chapter 7 released this 19 year ol...
4,1902,Valkyrie replaces the current Dark Origins exp...


## Generate submission file

In [25]:
sub = pd.DataFrame(test['id'])

In [26]:
sub['category'] = 1

In [27]:
sub.to_csv('sub.csv', index=False)