In [119]:
import pandas as pd
import numpy as np

import requests
import bs4
from bs4 import BeautifulSoup

import re
import string

from xgboost import XGBClassifier

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import SGDClassifier

import category_encoders as ce

import spacy
nlp = spacy.load('en_core_web_lg')

pd.set_option('display.max_rows', 500)
pd.options.mode.chained_assignment = None 

In [120]:
#!kaggle competitions download -c ds3-which-whisky

In [121]:
sample_submission = pd.read_csv('sample_submission.csv')
test = pd.read_csv('test.csv')
df = pd.read_csv('train.csv')

In [122]:
df.head()

Unnamed: 0,id,author,description,price,ratingValue,pert_alcohol,category
0,1,John Hansell,A marriage of 13 and 18 year old bourbons. A m...,85.0,97,51.5,2.0
1,2,Dave Broom,There have been some legendary Bowmores from t...,13500.0,97,42.9,1.0
2,3,John Hansell,This bottling celebrates master distiller Park...,150.0,97,50.0,2.0
3,4,John Hansell,What impresses me most is how this whisky evol...,4500.0,97,40.5,1.0
4,6,Davin de Kergommeaux,"After 40 years in barrels, the trademark Canad...",199.0,96,45.0,


## Category Identities

It is important to note which categories refer to which. I want to use this to extract extra meaning from the description column in order to boost model performance.]

> 1 = Scotch  
> 2 = Bourbon  
> 3 = Craft  
> 4 = Canadian  


In [123]:
# Checking for class imbalance

df['category'].value_counts()

1.0    1637
2.0     449
3.0     300
4.0     200
Name: category, dtype: int64

In [124]:
df.isnull().sum()

id                0
author            0
description       0
price            63
ratingValue       0
pert_alcohol     60
category        288
dtype: int64

Before digging into the primary features, namely the description. I'd like to look at whether there is any noteable difference between ratings, alcohol percentage and price. If the primary statistical measurements show any significant variation, it would be worth it to include them in the modeling. 

In [125]:
def price_info(data):
    print('###################')
    print('mean:   ', data.mean())
    print('median: ', data.median())
    print('mode :  ', data.mode()[0])
    print('StDev:  ', data.std())
    print()

In [126]:
print('###### Prices ########')
for i in np.arange(1,5):
    print('Categeory  = ', i)
    price_info(df[df['category'] == i]['price'])

print('\n\n###### Ratings ########')
for i in np.arange(1,5):
    print('Categeory  = ', i)
    price_info(df[df['category'] == i]['ratingValue'])
    
print('\n\n###### % Alcohol ########')
for i in np.arange(1,5):
    print('Categeory  = ', i)
    price_info(df[df['category'] == i]['pert_alcohol'])

###### Prices ########
Categeory  =  1
###################
mean:    327.81806696146555
median:  110.0
mode :   100.0
StDev:   1288.8296456971598

Categeory  =  2
###################
mean:    70.10467706013362
median:  50.0
mode :   47.0
StDev:   69.47337105763614

Categeory  =  3
###################
mean:    56.873333333333335
median:  47.0
mode :   50.0
StDev:   52.47280508161027

Categeory  =  4
###################
mean:    58.3725
median:  38.5
mode :   70.0
StDev:   74.12604702795065



###### Ratings ########
Categeory  =  1
###################
mean:    86.58399511301161
median:  87.0
mode :   86
StDev:   4.039210375758337

Categeory  =  2
###################
mean:    87.89755011135857
median:  89.0
mode :   90
StDev:   5.170591485127886

Categeory  =  3
###################
mean:    83.23
median:  84.0
mode :   84
StDev:   4.0873121242863

Categeory  =  4
###################
mean:    85.465
median:  86.0
mode :   88
StDev:   5.462413827615096



###### % Alcohol ########
Categeory

It appears that price and percent alcohol don't vary much. That makes sense as most whisk(e)y's generally sit between 40% and 61%. Canadian whiskey tends to be 40% whereas the upper limit for most scotches is 52. However, many overproof american whiskeys like Bottled in Bond and Cask Strengths may also sit around the scotch level. Only the really insane stuff like Booker's reaches the vaunted 61%. 

> Given the lack of variety and existence of null values in the validation set, I will remove the percent_alcohol and rating feature

Price is staying despite it's null values because it shows a marked difference between classes, specifically identifying Scotch. 

This can then be imputed based on the next engineered features. 

In [127]:
df = df.drop(columns = ['ratingValue','pert_alcohol'])

## Identifying Key Terms

Given these are reviews, there is a pretty significant amount of leakage in the desciption text. They often provide keywords that are associated with the specified field. 

#### Scotch

The first row that is classified as a Scotch is this one. It lists the name Bowmore which is a distiller from the isle of Islay that utilizes sherry finished on their characteristically smokey scotches. 

In [128]:
df[df['category'] == 1]['description'][1]

"There have been some legendary Bowmores from the mid-60s and this is every bit their equal. All of them share a remarkable aroma of tropical fruit, which here moves into hallucinatory intensity: guava, mango, peach, pineapple, grapefruit. There’s a very light touch of peat smoke, more a memory of Islay than the reality. Concentrated; even at low strength the palate is silky, heady, and haunting, and lasts forever in the dry glass. A legend is born. (Eight bottles only for the U.S.) Editor's Choice."

#### Bourbon

The Bourbon is harder to identify. It relied on name recongnition. Either the distiller or the distillery will be mentioned. 

In [129]:
df[df['category'] == 2]['description'][2]

"This bottling celebrates master distiller Parker Beam's 50 years of service by including whiskey from each of the past five decades. This is a fabulous whiskey: seamless and incredibly complex, with an impeccable marriage of youth and maturity. It’s also very even-keeled throughout -- quite different than last year’s equally impressive PHC, a 27 year old, whose personality was more like an exhilarating old wooden rollercoaster ride (and also brandished more oak).\xa0Look for candied citrus, nectarine, blueberry, and sultana anchored by a nougat center, laced with honeyed vanilla and orange creamsicle. There’s a dusting of cocoa powder, brittle mint, and cinnamon, too. Tobacco leaves, polished leather, and teasing bourbon barrel char round out the palate, emerging more prominently towards a warming finish. A classic!"

#### Craft Whiskey

Craft distilleries are harder to identify. Often times they will source from all over the place. This will lead to a hodgepodge of distilleries. If anything this is the wildcard. It may be best to ignore this feature alltogether in case any feature engineering confuses the model. 

In [130]:
df[df['category'] == 3]['description'][391]

'A sourced whiskey of 95% corn, finished in wine barrels from winemaker Dave Phinney. This makes a super first impression, with Bit-O-Honey candy, eucalyptus, black cherry, cinnamon hearts, violet candies, and sandalwood. The flavors pour layered and complex, with clove-studded orange, flickers of rye spice, and pure, crystalline sweetness balanced with lemony lift. Laser-like spice, sweet caramel corn, and more floral notes dance across the long finish.'

#### Canadian Whiskey

Canada sources a lot of whiskey to the United States craft distilleries. I have to be careful about classifying Rye or Canada as it is often mentioned in the craft classes. On top of that, almost all Canadian whiskeys are blends of multiple distilleries. it might be good to identify Canadian Brands rather than regions or distilleries. 

In [131]:
df[df['category'] == 4]['description'][93]

'Corby’s latest Lot 40, this one undated, comes from the same distillation batch as the 2012 release, but with a couple of extra years in wood. The familiar flavors are all there: dustiness, sour rye, hard wet slate, floral notes, exotic fruits, sweet spices, and biting white pepper. Over these, time has sprinkled licorice root, dried dates, oatmeal porridge, vanilla, hints of bike tires, and mango peels. Flavors remain fully integrated with faint tannins underscoring a long sour-rye finish. Value Pick.'

### Keyword Identification

There are a lot of Scotch keywords so I am going to scrape those. The Canadian and Bourbon keywords I will recite by memory. 

In [132]:
#res = requests.get(URL).text
URL = requests.get('https://en.wikipedia.org/wiki/List_of_whisky_distilleries_in_Scotland').text

soup = BeautifulSoup(URL,'html.parser')

scotch_references = []

soup = BeautifulSoup(URL,'html.parser')
for items in soup.find('table', class_='wikitable').find_all('tr')[1::1]:
    data = items.find_all(['th','td'])
    try:
        distillery = data[0].a.text
    
    except AttributeError:pass
    except IndexError:pass
    scotch_references.append(distillery)
    
other_scotch_references = ['loch','speyside','campbeltown',
                           'highland','islay','orkney','skye']

for keyword in other_scotch_references:
    scotch_references.append(keyword)
    
bourbon_references = ['Kentucky','Bourbons','Lexington','Sazerac','Bardstown',
                      'Buffalo Trace','1792','Colonel','Barton','Dixie','Stagg',
                      'Winkle','Bowman','Blanton','Eagle Rare','E.H. Taylor',
                      'Elmer T. Lee','Weller','Heaven Hill','Brown Forman', 'Beam',
                      'Elijah Craig','Evan Williams','Old Fitzgerald','Basil Hayden']

canadian_references = ['Alberta', 'Canadian Mist', 'Crown Royal','Forty Creek','Highwood'
                       'Pendleton','Pemberton','Hiram Walker','Lot 40','Wiser','Seagram']

bourbon_references = [x.lower() for x in bourbon_references]
canadian_references = [x.lower() for x in canadian_references]
scotch_references = [x.lower() for x in scotch_references]

references = [bourbon_references, 
              canadian_references, 
              scotch_references]

In [133]:
scotch = []
bourbon = []
canadian = []

for i in np.arange(len(df)):
    for k in references:
        ref_count = 0
        for j in k:
            if j in df['description'][i].lower():
                ref_count += 1
        if k == bourbon_references:
            bourbon.append(ref_count)
        elif k == scotch_references:
            scotch.append(ref_count)
        else:
            canadian.append(ref_count)

df['scotch_reference'] = scotch
df['bourbon_reference'] = bourbon
df['canadian_reference'] = canadian

#### Baseline Models

Now that references to the data have been engineered into features, it is time to run some cross-validation and baseline models. 

In [134]:
def cleaner():
    
    cleaned_entry = []
    
    for i in np.arange(len(df)):
        
        sample = df['description'][i]
        soup = BeautifulSoup(sample, "html.parser")
        
        sample = (re.sub(r'[^a-zA-Z ^0-9]', '', 
                         soup.text))
            
        cleaned_entry.append(sample)

    return cleaned_entry



In [135]:
b_line = df[['description','category']].dropna()
X = b_line['description']
y = b_line['category']

In [136]:
vect = TfidfVectorizer(stop_words='english', 
                        max_features = 10000)

sgdc  = SGDClassifier()

pipe = Pipeline([('vect', vect), ('clf', sgdc)])

pipe.fit(X, y)

Pipeline(memory=None,
         steps=[('vect',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=10000,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words='english', strip_accents=None,
                                 sublinear_tf=False,
                                 token_patt...
                 SGDClassifier(alpha=0.0001, average=False, class_weight=None,
                               early_stopping=False, epsilon=0.1, eta0=0.0,
                               fit_intercept=True, l1_ratio=0.15,
                               learning_rate='optimal', loss='hinge',
           

In [137]:
score_1 = (cross_val_score(pipe, X, y, 
                          cv = 10, 
                          scoring = 'accuracy',
                          n_jobs = -1,
                          verbose = 10)).mean()

print('cross validation baseline: ', score_1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:    0.5s remaining:    1.2s
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    0.5s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    0.6s remaining:    0.2s


cross validation baseline:  0.9412017547751871


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.8s finished


In [138]:
b_2 = df[['author', 'price', 'scotch_reference', 
          'bourbon_reference', 'canadian_reference', 
          'category']].dropna()

X = b_2.drop(columns = 'category')
y = b_2['category']

xgbc = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                        colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
                        max_depth=25, min_child_weight=1, missing=None, n_estimators=200,
                        n_jobs=-1, nthread=None, objective='multi:softmax',num_class=4,
                        random_state=5, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
                        seed=42, silent=True, subsample=1, eval_metric='merror')


encoder = ce.OrdinalEncoder()

pipe_2 = Pipeline([('ord', encoder), ('clf', xgbc)])

pipe_2.fit(X, y)

Pipeline(memory=None,
         steps=[('ord',
                 OrdinalEncoder(cols=['author'], drop_invariant=False,
                                handle_missing='value', handle_unknown='value',
                                mapping=[{'col': 'author',
                                          'data_type': dtype('O'),
                                          'mapping': John Hansell               1
Dave Broom                 2
Fred Minnick               3
Davin de Kergommeaux       4
Gavin Smith                5
Dominic Roskrow            6
Geoffrey Kleinman          7
Jonny McCormick            8
Susannah Skiver Barton     9
Lew Bryson                10
Jeffery Lindenmut...
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, eval_metric='merror',
                               gamma=0, learning_rate=0.1, max_delta_step=0,
                               max_depth=25, min_child_weight=1, missing=None,
            

In [139]:
score_2 = (cross_val_score(pipe_2, X, y, 
                          cv = 10, 
                          scoring = 'accuracy',
                          n_jobs = -1,
                          verbose = 10)).mean()

print('second validation baseline: ', score_2)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:    3.5s remaining:    8.1s
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    3.5s remaining:    3.5s
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    3.6s remaining:    1.5s


second validation baseline:  0.9237384482078079


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    5.4s finished


It appears that the vectorized description model performed slightly better by utilizing a stochastic gradient descent classifier. The other features when seem to peform almost as good as the nlp vectorization when run through a gradient boosting tree ensemble. Now I am going to try to engineer a new dataframe that combines both. I can either use a feature union or concatentate the dataframe without a pipeline. 

#### Impute null values for Price

Since the validation dataset has a few null values for price, I want to impute the values of these bottles. Since these items are usually rare or collector's bottles, it'll they don't have a price, but tend to be expensive. Similarly, they appear to mostly refer to scotch. 

In [140]:
df[df['price'].isnull() == True]['category'].value_counts()

1.0    54
Name: category, dtype: int64

In [141]:
# Scotch references
df[df['price'].isnull() == True]['scotch_reference'].value_counts()

0    33
1    29
2     1
Name: scotch_reference, dtype: int64

In [142]:
# Bourbon references
df[df['price'].isnull() == True]['bourbon_reference'].value_counts()

0    62
1     1
Name: bourbon_reference, dtype: int64

In [143]:
# Canadian references
df[df['price'].isnull() == True]['canadian_reference'].value_counts()

0    63
Name: canadian_reference, dtype: int64

I think it's safe to assume that these bottles are references to more obscure scotches. I am going to use the mean scotch price to impute the value there. I am also going to impute a few styles that reference certain categories in order to have a more robust training dataset. 

In [144]:
df['price'] = df['price'].fillna(df[df['category'] == 1]['price'].mean())

In [145]:
for i in df[df['category'].isnull() == True].index:
    if df['scotch_reference'][i] > 0:
        df['category'][i] = 1
    elif df['bourbon_reference'][i] > 0:
        df['category'][i] = 2
    elif df['canadian_reference'][i] > 0:
        df['category'][i] = 4
    else:
        pass

Now to drop any remaining rows that are not easily classifiable. 

In [146]:
df = df.dropna()

Now to build the final training dataset

In [187]:
tfidf = TfidfVectorizer(stop_words='english', 
                        max_features = 5000, 
                        min_df=5)

dtm = tfidf.fit_transform(df['description'])

docs = (pd.DataFrame(dtm.todense(), 
                    columns=tfidf.get_feature_names())
       .drop(columns = ['category','price']))

print(docs.shape,'\n')
docs.head()

(2713, 2718) 



Unnamed: 0,000,10,100,105,11,110,115,12,120,125,...,younger,youngest,youth,youthful,youthfulness,zest,zesty,zinfandel,zing,zingy
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.136695,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [188]:
combined = pd.concat([df[['author', 'price', 
                          'category', 'scotch_reference', 
                          'bourbon_reference', 'canadian_reference']]
                      .reset_index(), 
                      docs], 
                     axis = 1).drop(columns = 'index')

In [189]:
X = combined.drop(columns = 'category')
y = combined['category']

In [196]:
pipe_3 = Pipeline([('ord', encoder), ('clf', xgbc)])

pipe_3.fit(X, y)

Pipeline(memory=None,
         steps=[('ord',
                 OrdinalEncoder(cols=['author'], drop_invariant=False,
                                handle_missing='value', handle_unknown='value',
                                mapping=[{'col': 'author',
                                          'data_type': dtype('O'),
                                          'mapping': John Hansell               1
Dave Broom                 2
Fred Minnick               3
Davin de Kergommeaux       4
Gavin Smith                5
Dominic Roskrow            6
Geoffrey Kleinman          7
Jonny McCormick            8
Susannah Skiver Barton     9
Lew Bryson                10
Jeffery Lindenmut...
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, eval_metric='merror',
                               gamma=0, learning_rate=0.1, max_delta_step=0,
                               max_depth=25, min_child_weight=1, missing=None,
            

In [197]:
score_3 = (cross_val_score(pipe_3, X, y, 
                          cv = 10, 
                          scoring = 'accuracy',
                          n_jobs = -1,
                          verbose = 10)).mean()

print('third validation baseline: ', score_3)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed: 12.1min remaining: 28.2min
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed: 12.1min remaining: 12.1min
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed: 12.2min remaining:  5.2min


third validation baseline:  0.9575708881496272


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 17.9min finished


In [198]:
pipe_4 = Pipeline([('ord', encoder), ('clf', sgdc)])

pipe_4.fit(X, y)

Pipeline(memory=None,
         steps=[('ord',
                 OrdinalEncoder(cols=['author'], drop_invariant=False,
                                handle_missing='value', handle_unknown='value',
                                mapping=[{'col': 'author',
                                          'data_type': dtype('O'),
                                          'mapping': John Hansell               1
Dave Broom                 2
Fred Minnick               3
Davin de Kergommeaux       4
Gavin Smith                5
Dominic Roskrow            6
Geoffrey Kleinman          7
Jonny McCormick            8
Susannah Skiver Barton     9
Lew Bryson                10
Jeffery Lindenmut...
                 SGDClassifier(alpha=0.0001, average=False, class_weight=None,
                               early_stopping=False, epsilon=0.1, eta0=0.0,
                               fit_intercept=True, l1_ratio=0.15,
                               learning_rate='optimal', loss='hinge',
                      

In [199]:
score_4 = (cross_val_score(pipe_4, X, y, 
                          cv = 10, 
                          scoring = 'accuracy',
                          n_jobs = -1,
                          verbose = 10)).mean()

print('fourth validation baseline: ', score_4)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:    4.2s remaining:    9.8s
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    4.4s remaining:    4.4s
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    4.7s remaining:    2.0s


fourth validation baseline:  0.627681442274453


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    5.5s finished


In this case, it appears the stochastic gradient descent classifier does markedly worse when provided with the non-nlp related features. However, the gradient boosting tree ensemble appears to have benefitted from the combination of the two features. The next step is to apply the feature engineering the the validation dataset.

In [202]:
test = test.drop(columns = ['id','pert_alcohol','ratingValue'])

scotch = []
bourbon = []
canadian = []

for i in np.arange(len(test)):
    for k in references:
        ref_count = 0
        for j in k:
            if j in test['description'][i].lower():
                ref_count += 1
        if k == bourbon_references:
            bourbon.append(ref_count)
        elif k == scotch_references:
            scotch.append(ref_count)
        else:
            canadian.append(ref_count)

test['scotch_reference'] = scotch
test['bourbon_reference'] = bourbon
test['canadian_reference'] = canadian

test['price'] = test['price'].fillna(df[df['category'] == 1]['price'].mean())

In [210]:
tfidf = TfidfVectorizer(stop_words='english', 
                        max_features = 5000, 
                        min_df=5)

dtm = tfidf.fit_transform(test['description'])

docs_2 = (pd.DataFrame(dtm.todense(), 
                    columns=tfidf.get_feature_names())
       .drop(columns = ['price']))

print(docs_2.shape,'\n')
docs_2.head()

(288, 507) 



Unnamed: 0,000,10,100,12,17,20,2016,21,30,375,...,white,wine,wood,year,years,yields,young,youth,youthful,zesty
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.228406,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.154227,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.156227,0.140604,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.251943,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Now for some creative data wrangling to get the validation dataset matching the training dataset's columns. 

In [244]:
combined_2 = pd.concat([test[['author', 'price', 
                              'scotch_reference', 
                              'bourbon_reference', 'canadian_reference']]
                        .reset_index(), 
                        docs_2],
                       axis = 1).drop(columns = 'index')

col_diff = list(set(X) - set(combined_2.columns))

final = pd.concat([combined_2, 
                   pd.DataFrame(data=np.zeros(shape=(len(combined_2) ,
                                                     len(col_diff))), 
                                columns=col_diff)],
                 axis=1)

final = final[X.columns]

final.shape

In [251]:
y_pred = pipe_3.predict(final)

In [254]:
sample_submission['category'] = y_pred.astype(int)
sample_submission.to_csv('hopefully_this_works.csv', index=False)
!kaggle competitions submit -c ds3-which-whisky -f hopefully_this_works.csv -m "Once more with feeling!"

100%|██████████████████████████████████████| 1.91k/1.91k [00:00<00:00, 4.14kB/s]
Successfully submitted to DS3 Which Whisky

This method netted a 0.96511 Accuracy score on the public leaderboard. 