In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
whiskey_df = pd.read_csv('train.csv')

In [3]:
whiskey_df = whiskey_df.dropna()

In [4]:
whiskey_df = whiskey_df.reset_index()

In [5]:
X = whiskey_df[['id', 'author', 'description', 'price', 'ratingValue', 'pert_alcohol']]
y = whiskey_df['category']

In [75]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [76]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1658, 6), (818, 6), (1658,), (818,))

In [77]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(stop_words='english')

vectorizer.fit(X_train['description'])

dtm = vectorizer.transform(X_train['description'])

In [78]:
dtm.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

In [79]:
dtm = pd.DataFrame(dtm.todense(), columns = vectorizer.get_feature_names())

In [80]:
dtm.head()

Unnamed: 0,000,035,070,076,080,10,100,10042,101,1013,...,zigzag,zin,zinfandel,zing,zinginess,zings,zingy,zip,zippy,ìle
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [81]:
def get_top_n_words(corpus, n=None):
    """
    List the top n words in a vocabulary according to occurrence in a text corpus.
    """
    vec = CountVectorizer(stop_words='english').fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in     vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]

In [82]:
get_top_n_words(whiskey_df['description'], 20)

[('finish', 1599),
 ('palate', 1340),
 ('oak', 1294),
 ('notes', 1186),
 ('sweet', 1137),
 ('nose', 1127),
 ('vanilla', 1065),
 ('fruit', 953),
 ('whisky', 895),
 ('old', 715),
 ('spice', 619),
 ('sherry', 615),
 ('caramel', 599),
 ('malt', 588),
 ('bourbon', 583),
 ('year', 581),
 ('chocolate', 576),
 ('smoke', 572),
 ('cinnamon', 518),
 ('toffee', 513)]

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words='english', max_features=5000, min_df=.05)

dtm = tfidf.fit_transform(whiskey_df['description'])

docs = pd.DataFrame(dtm.todense(), columns=tfidf.get_feature_names())
docs.head()

Unnamed: 0,age,aged,apple,apples,aroma,aromas,balance,balanced,barley,barrel,...,tongue,vanilla,water,whiskey,whiskies,whisky,white,wood,year,years
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.257235,0.0,0.0,...,0.0,0.149668,0.243576,0.235487,0.0,0.0,0.0,0.0,0.209777,0.0
1,0.0,0.0,0.0,0.0,0.366813,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.209811,...,0.0,0.115499,0.0,0.363452,0.0,0.0,0.0,0.0,0.32377,0.182848
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.481486,0.0,0.0,0.0,0.0
4,0.0,0.0,0.262379,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.139624,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
from sklearn.neighbors import NearestNeighbors

nn = NearestNeighbors(n_neighbors=5, algorithm='ball_tree')

nn.fit(dtm.todense())

NearestNeighbors(algorithm='ball_tree', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=None, n_neighbors=5, p=2, radius=1.0)

In [206]:
X_test[['description']][:1]

Unnamed: 0,description
2049,A tamed Talisker. The classic Talisker persona...


In [207]:
new = tfidf.transform(test['description'][6:7])

In [208]:
nn.kneighbors(new.todense())

(array([[1.        , 1.06957581, 1.07036448, 1.07302478, 1.07601191]]),
 array([[1471, 2042,  966,  342, 2086]]))

In [200]:
whiskey_df.shape

(2476, 7)

In [95]:
X_train.head()

Unnamed: 0,id,author,description,price,ratingValue,pert_alcohol
1930,2775,Fred Minnick,An 8 year old Kentucky straight bourbon finish...,50.0,85,45.0
22,36,John Hansell,"Powerful, muscular, well-textured, and invigor...",85.0,96,57.1
1624,2336,Geoffrey Kleinman,Made from malted Bavarian hard wheat and aged ...,75.0,86,50.0
522,782,Gavin Smith,"Released with no age statement, Fettercairn Fi...",59.0,90,42.0
1343,1932,John Hansell,"Fuller in body, deeper and more complex than t...",60.0,87,43.0


In [96]:
y_train.head()

1930    2.0
22      1.0
1624    3.0
522     1.0
1343    1.0
Name: category, dtype: float64

In [33]:
pd.read_csv('sample_submission.csv').shape

(288, 2)

In [98]:
whiskey_df['description'][2049]

'A tamed Talisker. The classic Talisker personality shows-seaweed, brine, peat smoke, and freshly ground pepper on the finish. There are even other interesting flavors I enjoy, notably vanilla, licorice root, charcoal, and bitter chocolate. But this whisky needs to be a little livelier to be a truly great whisky. While there’s no age statement, there’s older whisky in here for sure, and maybe that’s part of its “reserved” nature. It certainly is a nice whisky, but I would prefer the more vibrant Talisker 18 year old -at two-thirds the cost.'

In [114]:
whiskey_df[whiskey_df['description'] == 'When compared to the 10 year old, this one’s richer, with darker fruit and more caramelized sweet notes, paired with accentuated dried wood spice, while the 10 year old is more vibrant and floral. Like the 10 year old, there’s good balance here.']['category']

830    1.0
Name: category, dtype: float64

In [9]:
test = pd.read_csv('test.csv')

In [110]:
test.head()

Unnamed: 0,id,author,description,price,ratingValue,pert_alcohol
0,955,Fred Minnick,"Think carnival aromas—the good ones, anyway—me...",36.0,90,50.0
1,3532,Lew Bryson,"A blend of three bourbons, between 6 and 12 ye...",90.0,82,49.3
2,1390,Davin de Kergommeaux,"The nose is focused on cereal, hints of fresh ...",48.0,89,45.0
3,1024,Gavin Smith,Swiss-based Chapter 7 released this 19 year ol...,180.0,90,55.8
4,1902,Gavin Smith,Valkyrie replaces the current Dark Origins exp...,71.0,87,45.9


In [116]:
new = tfidf.transform(test['description'][:1])
nn.kneighbors(new.todense())[1][0][0]

1327

In [129]:
new = tfidf.transform(test['description'][1:2])
nn.kneighbors(new.todense())[1][0][0]

597

In [35]:
count = 0

new = tfidf.transform(test['description'][count:count+1])
closest = nn.kneighbors(new.todense())[1][0][0]
closest

1017

In [314]:
whiskey_df[whiskey_df['description'] == whiskey_df['description'][0]]['category'][0]

2.0

In [237]:
whiskey_df.head()

Unnamed: 0,level_0,index,id,author,description,price,ratingValue,pert_alcohol,category
0,0,0,1,John Hansell,A marriage of 13 and 18 year old bourbons. A m...,85.0,97,51.5,2.0
1,1,1,2,Dave Broom,There have been some legendary Bowmores from t...,13500.0,97,42.9,1.0
2,2,2,3,John Hansell,This bottling celebrates master distiller Park...,150.0,97,50.0,2.0
3,3,3,4,John Hansell,What impresses me most is how this whisky evol...,4500.0,97,40.5,1.0
4,4,5,9,Fred Minnick,"A caramel-laden fruit bouquet, followed by une...",150.0,96,54.49,2.0


In [36]:
ids = []
predictions = []

for i in range(288):
    ids.append((test['id'][count:count+1][i]))
    new = tfidf.transform(test['description'][count:count+1])
    closest = nn.kneighbors(new.todense())[1][0][0]
    predictions.append(whiskey_df[whiskey_df['description'] == whiskey_df['description'][closest]]['category'][closest])
    count += 1
    

In [37]:
predictions = pd.DataFrame(data={'id': ids, 'category': predictions})

In [38]:
predictions['category'] = predictions['category'].astype('int')

In [42]:
predictions.head()

Unnamed: 0,id,category
0,955,2
1,3532,3
2,1390,4
3,1024,1
4,1902,1


In [46]:
predictions.to_csv('first_submission.csv', index=False)

In [41]:
## notes 

from sklearn.pipeline import Pipeline

from sklearn.linear_model import SGDClassifier


In [42]:
# create a pipeline

vect = TfidfVectorizer(stop_words='english')
sgdc = SGDClassifier()

pipe = Pipeline([('vect', vect), ('clf', sgdc)])


In [None]:
# fit pipeline

pipe.fit(data.data, data.target)

In [None]:
pipe.predict(test)

In [43]:
# gridsearch
from sklearn.model_selection import GridSearchCV

In [44]:
parameters = {
    'vect_max_df' : (.5, .75, 1.0),
    'clf_max_iter' : (20, 10, 100)
}

In [45]:
gridsearch = GridSearchCV(pipe, parameters, cv=5, n_jobs=-1, verbose=1)

In [None]:
gridsearch.fit(data.data, data.target)