In [6]:
import pandas as pd
wh_train = pd.read_csv('train.csv')
wh_test = pd.read_csv('test.csv')
wh_train.head()

Unnamed: 0,id,description,category
0,1,A marriage of 13 and 18 year old bourbons. A m...,2
1,2,There have been some legendary Bowmores from t...,1
2,3,This bottling celebrates master distiller Park...,2
3,4,What impresses me most is how this whisky evol...,1
4,9,"A caramel-laden fruit bouquet, followed by une...",2


In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
rfc = RandomForestClassifier()
vec = TfidfVectorizer(stop_words='english', ngram_range=(1,2))

In [5]:
from sklearn.pipeline import Pipeline
pipe = Pipeline([
    ('vec', vec),
    ('clf', rfc)
])

In [8]:
from sklearn.model_selection import GridSearchCV
parameters = {
    'vec__max_df': ( 0.75, 1.0),
    'vec__min_df': (.02, .05),
    'vec__max_features': (500,1000),
    'clf__n_estimators':(5, 10,),
    'clf__max_depth':(15,20)
}
gsearch = GridSearchCV(pipe, parameters, cv=5, n_jobs=-1)
gsearch.fit(wh_train['description'].values, wh_train['category'].values)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
 ...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'vec__max_df': (0.75, 1.0), 'vec__min_df': (0.02, 0.05), 'vec__max_features': (500, 1000), 'clf__n_estimators': (5, 10), 'clf__max_depth': (15, 20)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [10]:
preds = gsearch.predict(wh_test['description'].values)
subm = pd.DataFrame({'id':wh_test['id'], 'category':preds})
subm.head()

Unnamed: 0,id,category
0,955,2
1,3532,2
2,1390,1
3,1024,1
4,1902,1


In [14]:
subm.to_csv('actual_submission.csv', index=False)

## LSI

In [15]:
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(
    n_components=100,
    algorithm='randomized',
    n_iter=10
)

In [16]:
lsi = Pipeline([
    ('vec', vec),
    ('svd', svd)
])
pipe2 = Pipeline([
    ('lsi', lsi),
    ('clf', rfc)
])

In [17]:
params2 = {
    'lsi__svd__n_components': [10,100,250],
    'lsi__vec__max_df':[.9, .95, 1.0],
    'clf__n_estimators':[5,10,20]
}
gsearch2 = GridSearchCV(pipe2, params2, cv=5, n_jobs=-1)
gsearch2.fit(wh_train['description'].values, wh_train['category'].values)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('lsi', Pipeline(memory=None,
     steps=[('vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'lsi__svd__n_components': [10, 100, 250], 'lsi__vec__max_df': [0.9, 0.95, 1.0], 'clf__n_estimators': [5, 10, 20]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [18]:
preds2 = gsearch2.predict(wh_test['description'].values)
subm2 = pd.DataFrame({'id':wh_test['id'], 'category':preds2})
subm2.to_csv('lsi_submission.csv', index=False)

## Embeddings

In [19]:
import spacy
nlp = spacy.load('en_core_web_lg')

In [20]:
X = [nlp(d).vector for d in wh_train['description'].values]
rfc.fit(X, wh_train['category'].values)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [22]:
Y = [nlp(d).vector for d in wh_test['description'].values]
preds3 = rfc.predict(Y)
subm3 = pd.DataFrame({'id':wh_test['id'].values, 'category':preds3})
subm3.to_csv('wordemb_submission.csv', index=False)