In [6]:
import pandas as pd

In [7]:
test = pd.read_csv('./data/test.csv')
train = pd.read_csv('./data/train.csv')
sub = pd.read_csv('./data/sample_submission.csv')

In [8]:
train.head()

Unnamed: 0,id,author,description,price,ratingValue,pert_alcohol,category
0,1,John Hansell,A marriage of 13 and 18 year old bourbons. A m...,85.0,97,51.5,2.0
1,2,Dave Broom,There have been some legendary Bowmores from t...,13500.0,97,42.9,1.0
2,3,John Hansell,This bottling celebrates master distiller Park...,150.0,97,50.0,2.0
3,4,John Hansell,What impresses me most is how this whisky evol...,4500.0,97,40.5,1.0
4,6,Davin de Kergommeaux,"After 40 years in barrels, the trademark Canad...",199.0,96,45.0,


In [9]:
train = train.dropna()
#test = test.dropna()

In [10]:
from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

In [11]:
# Instances for Pipe
rfc = RandomForestClassifier()
vect = TfidfVectorizer(stop_words='english')

In [12]:
pipe = Pipeline([('vect', vect), ('rfc', rfc)])

In [13]:
pipe.fit(train['description'], train['category'])



Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [52]:
sub.head()

Unnamed: 0,id,category
0,955,1
1,3532,3
2,1390,2
3,1024,4
4,1902,2


In [53]:
test.head()

Unnamed: 0,id,author,description,price,ratingValue,pert_alcohol
0,955,Fred Minnick,"Think carnival aromas—the good ones, anyway—me...",36.0,90,50.0
1,3532,Lew Bryson,"A blend of three bourbons, between 6 and 12 ye...",90.0,82,49.3
2,1390,Davin de Kergommeaux,"The nose is focused on cereal, hints of fresh ...",48.0,89,45.0
3,1024,Gavin Smith,Swiss-based Chapter 7 released this 19 year ol...,180.0,90,55.8
4,1902,Gavin Smith,Valkyrie replaces the current Dark Origins exp...,71.0,87,45.9


In [54]:
pred = pipe.predict(test['description'])
submission = pd.DataFrame({'id': test['id'], 'category':pred})
submission['category'] = submission['category'].astype('int64')

In [55]:
submission.head()

Unnamed: 0,id,category
0,955,2
1,3532,2
2,1390,1
3,1024,1
4,1902,1


In [56]:
submission.to_csv('./data/submission1.csv', index=False)

In [14]:
from sklearn.model_selection import GridSearchCV

In [15]:
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__min_df': (.02, .05),
    'vect__max_features': (100, 500, 1000),
    'rfc__n_estimators': (20, 100, 400)
}

In [16]:
gs = GridSearchCV(pipe, parameters, cv=5, n_jobs=4)

In [68]:
gs.fit(train['description'], train['category'])

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=4,
       param_grid={'vect__max_df': (0.5, 0.75, 1.0), 'vect__min_df': (0.02, 0.05), 'vect__max_features': (100, 500, 1000), 'rfc__n_estimators': (20, 100, 400)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [69]:
gs.best_score_

0.8937802907915994

In [70]:
pred = gs.predict(test['description'])
submission = pd.DataFrame({'id': test['id'], 'category':pred})
submission['category'] = submission['category'].astype('int64')

In [71]:
submission.head()

Unnamed: 0,id,category
0,955,2
1,3532,3
2,1390,1
3,1024,1
4,1902,1


In [72]:
submission.to_csv('./data/submission1.csv', index=False)

In [2]:
import spacy 

In [4]:
nlp = spacy.load("en_core_web_lg")

In [18]:
embeddings = [nlp(doc).vector for doc in train['description']]

In [20]:
rfc.fit(embeddings, train['category'])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [22]:
rfc.score(embeddings, train['category'])

0.9894991922455574

In [None]:
pred = rfc.predict(test['description'])
submission = pd.DataFrame({'id': test['id'], 'category':pred})
submission['category'] = submission['category'].astype('int64')

In [None]:
submission.head()

In [None]:
submission.to_csv('./data/submission2.csv', index=False)

In [None]:
def tokenize(doc):
    
    d = nlp(doc)
    tokens = []
    
    
    # You can extract anything you want from Spacy to use as a feature
    # Noun Chunks aren't the end all be all. 
    # Try Extracting just tokens, or just adjectives, or just pronouns
    
    if token in d:
        if (token.is_stop == False) and (token.pos_ == 'ADJ'):
            tokens.append(token.lemma_)
        
    return tokens

In [None]:
from sklearn.feature_extraction.text from