In [2]:
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer

import xgboost as xgb

import spacy
nlp = spacy.load("en_core_web_lg")

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Training and test tables, both read from the local ./data directory.
train = pd.read_csv("./data/train.csv")
test = pd.read_csv("./data/test.csv")

In [4]:
train = train.dropna()

# TF-IDF + RF

In [5]:
# Seed the forest so repeated fits (and every cross-validation score built on
# this pipeline) give the same result from run to run.
rfc = RandomForestClassifier(random_state=9999)
vect = TfidfVectorizer()

# Raw description text -> TF-IDF features -> random forest.
pipe = Pipeline([('vect', vect), ('rfc', rfc)])

In [8]:
pipe.fit(train['description'], train['category'])



Pipeline(memory=None,
         steps=[('vect',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(...
                 RandomForestClassifier(bootstrap=True, class_weight=None,
                                        criterion='gini', max_depth=None,
                                        max_features='auto',
                                        max_leaf_nodes=None,
                               

In [9]:
# Search space for the TF-IDF + random forest pipeline. Each key is
# "<pipeline step name>__<parameter name>", so the grid search can route the
# value to the matching step of `pipe`.
parameters = dict(
    vect__max_df=(0.5, 0.75, 1.0),
    vect__min_df=(.02, .05),
    vect__max_features=(100, 500, 1000),
    rfc__n_estimators=(20, 100, 400),
)

# Exhaustive 5-fold cross-validated search; n_jobs=-1 runs it on all cores.
clf = GridSearchCV(estimator=pipe, param_grid=parameters, cv=5, n_jobs=-1)

In [11]:
clf.fit(train['description'], train['category'])

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('vect',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                          

In [12]:
clf.best_score_

0.8869143780290791

# TF-IDF + XGBC

In [19]:
# TF-IDF features fed into an XGBoost classifier.
xgbc = xgb.XGBClassifier()
pipe2 = Pipeline(steps=[('vect', vect), ('xgbc', xgbc)])

# The vectorizer settings are held at a single value each, so only the number
# of XGBoost estimators is actually searched. Wider ranges that were tried or
# considered here: max_df (0.5, 0.75, 1.0), min_df (.02, .05),
# max_features (100, 500, 1000).
parameters2 = dict(
    vect__max_df=(0.7,),
    vect__min_df=(.02,),
    xgbc__n_estimators=(100, 400, 800),
)

clf2 = GridSearchCV(estimator=pipe2, param_grid=parameters2, cv=5, n_jobs=7)

In [21]:
clf2.fit(train['description'], train['category'])

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('vect',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                          

In [22]:
clf2.best_score_

0.9006462035541195

In [26]:
clf2.predict(train['description'])

array([2., 1., 2., ..., 3., 1., 2.])

In [31]:
y_pred = clf2.predict(test['description'])

# TF-IDF + SGDC

In [35]:
from sklearn.linear_model import SGDClassifier

In [41]:
# Linear classifier trained with SGD on TF-IDF features. SGD shuffles the
# training data, so a fixed seed is needed for the cross-validation scores to
# be repeatable between runs.
sgdc = SGDClassifier(random_state=9999)
pipe3 = Pipeline([('vect', vect),
                  ('sgdc', sgdc)])

# Keys are "<pipeline step name>__<parameter name>".
parameters3 = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__min_df': (.02, .05),
    'vect__max_features': ( 100, 500, 1000),
    'sgdc__max_iter': (300, 1000, 3000),
}
# 5-fold cross-validated grid search over the pipeline, 7 parallel workers.
clf3 = GridSearchCV(pipe3, parameters3, cv=5, n_jobs=7)

In [42]:
clf3.fit(train['description'], train['category'])

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('vect',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                          

In [43]:
clf3.best_score_

0.9022617124394184

In [46]:
clf3.best_params_

{'sgdc__max_iter': 300, 'vect__max_df': 0.7, 'vect__min_df': 0.02}

In [44]:
y_pred = clf3.predict(test['description'])

# TF-IDF + SVD + SGDC

In [47]:
from sklearn.decomposition import TruncatedSVD

In [59]:
# Reduce the TF-IDF features with truncated SVD before the SGD classifier.
# Both steps are seeded so the search is repeatable.
svd = TruncatedSVD(n_iter=10, algorithm='randomized', random_state=9999)
sgdc = SGDClassifier(random_state=9999)

pipe4 = Pipeline(steps=[('vect', vect), ('svd', svd), ('sgdc', sgdc)])

# Only the SVD rank and the SGD iteration cap are searched; the vectorizer
# parameters (max_df, min_df, max_features) are not part of this grid.
parameters4 = dict(
    svd__n_components=(100, 300, 1000),
    sgdc__max_iter=(300, 1000, 3000),
)
clf4 = GridSearchCV(estimator=pipe4, param_grid=parameters4, cv=5, n_jobs=7)

In [60]:
clf4.fit(train['description'], train['category'])

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('vect',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                          

In [62]:
clf4.best_score_

0.9309369951534734

In [66]:
# 0.9293214862681745 first attempt.  0.97222 Public
# 0.9309369951534734 second attempt. 0.96527 Public

In [56]:
clf4.best_score_

0.9293214862681745

In [63]:
y_pred = clf4.predict(test.description)

# TF-IDF + SVD + XGBC

In [67]:
xgbc = xgb.XGBClassifier()

# The randomized SVD solver is stochastic; seed it so the reduced features
# (and therefore the cross-validation scores) are the same on every run.
svd = TruncatedSVD(algorithm='randomized',
                   n_iter=10, random_state=9999)

# Raw text -> TF-IDF -> truncated SVD -> XGBoost classifier.
pipe5 = Pipeline([('vect', vect),
                  ('svd', svd),
                  ('xgbc', xgbc)])

# Only the SVD rank and the number of XGBoost estimators are searched; the
# vectorizer parameters are not part of this grid.
parameters5 = {
    #'vect__max_df': (0.5, 0.75, 1.0),
    #'vect__min_df': (.02, .05),
    #'vect__max_features': ( 100, 500, 1000),
    'svd__n_components': (100, 300, 1000),
    'xgbc__n_estimators': (100, 400, 800),
}
clf5 = GridSearchCV(pipe5, parameters5, cv=5, n_jobs=7)

In [68]:
clf5.fit(train['description'], train['category'])

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('vect',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                          

In [69]:
clf5.best_score_

0.9220516962843296

In [70]:
y_pred = clf5.predict(test.description)

# Submission Script

In [71]:
# Start from the sample submission file, then overwrite its label column with
# the current predictions held in `y_pred`.
y_submission = pd.read_csv('./data/sample_submission.csv')
assert y_submission.shape[0] == y_pred.shape[0], \
    "number of predictions must match the number of sample submission rows"

# Bracket assignment always targets the DataFrame column; attribute assignment
# would only set a plain Python attribute if 'category' were missing.
y_submission['category'] = y_pred.astype(int)
# index=False keeps the row index out of the written file.
y_submission.to_csv('svd_xgbc2_sample_submission.csv', index=False)

In [30]:
y_submission.shape, y_pred.shape

((288, 2), (2476,))

In [13]:
test_pred = clf.predict(test['description'])

In [None]:
# One document vector per training description. nlp.pipe streams the texts
# through the spaCy pipeline in batches, which is faster than calling nlp()
# once per document and yields the same Doc objects in the same order.
embeddings = [doc.vector for doc in nlp.pipe(train['description'])]

In [None]:
rfc.fit(embeddings, train['category'])


In [None]:
rfc.score(embeddings, train['category'])

In [None]:
# One document vector per test description. nlp.pipe streams the texts
# through the spaCy pipeline in batches, which is faster than calling nlp()
# once per document and yields the same Doc objects in the same order.
y_embeddings = [doc.vector for doc in nlp.pipe(test['description'])]

In [None]:
rfc_embeddings_out = rfc.predict(y_embeddings)

In [None]:
# Take the first remaining row by position. `train` has had rows dropped with
# dropna(), so the index label 0 is not guaranteed to exist any more and a
# label lookup (`[0]`) could raise KeyError.
doc = train['description'].iloc[0]

In [None]:
doc

In [None]:
doc = nlp(doc)

In [None]:
# Print the lemma of every noun chunk in the parsed example document.
# NOTE(review): the original note here read "stop word" - presumably a
# reminder to filter stop words; confirm before relying on this output.
for noun_chunk in doc.noun_chunks:
    print(noun_chunk.lemma_)

In [None]:
def tokenize(doc):
    """Turn raw text into a list of noun-chunk lemmas.

    The text is run through the module-level spaCy pipeline ``nlp`` and the
    ``lemma_`` of each noun chunk is collected in the order the chunks are
    yielded.

    Noun chunks are only one possible feature: any other annotation spaCy
    exposes (plain tokens, adjectives, pronouns, ...) could be extracted
    here instead.

    Parameters
    ----------
    doc : the text to analyse; passed straight to ``nlp``.

    Returns
    -------
    list
        One lemma string per noun chunk (empty when there are no chunks).
    """
    parsed = nlp(doc)
    return [noun_chunk.lemma_ for noun_chunk in parsed.noun_chunks]

In [None]:
# Count-based vectorizer driven by the custom `tokenize` analyzer.
# NOTE: this rebinds `vect`, the name the earlier pipelines were built with
# for their TfidfVectorizer step.
vect = CountVectorizer(analyzer=tokenize, max_df=0.9, min_df=0.1)

In [None]:
vect.fit(train['description'])

In [None]:
# Number of features the fitted vectorizer kept. `vocabulary_` maps each term
# to its column index, so its size equals the feature count; unlike
# get_feature_names(), it is available in both old and new scikit-learn.
len(vect.vocabulary_)

In [None]:
X = vect.transform(train['description'])

In [None]:
rfc.fit(X, train['category'])

In [None]:
rfc.score(X, train['category'])

In [None]:
# Start from the sample submission file, then overwrite its label column with
# the predictions made from the spaCy document vectors.
out_csv = pd.read_csv('./data/sample_submission.csv')
# The shapes were previously only evaluated in a bare expression that had no
# effect; check explicitly that every prediction array has one entry per row.
assert out_csv.shape[0] == test_pred.shape[0] == rfc_embeddings_out.shape[0], \
    "number of predictions must match the number of sample submission rows"
# Bracket assignment always targets the DataFrame column; attribute assignment
# would only set a plain Python attribute if 'category' were missing.
out_csv['category'] = rfc_embeddings_out.astype(int)
# index=False keeps the row index out of the written file.
out_csv.to_csv('sample_submission.csv', index=False)