## Kaggle Competition

### Guide to whoop Josh's ass in Kaggle Competition

In [40]:
import pandas as pd
import numpy as np
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
# ---------------- Pandas settings --------------- #
# Removes rows and columns truncation of '...'
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)


from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.decomposition import TruncatedSVD

import xgboost as xgb
import spacy


nlp = spacy.load("en_core_web_lg")

## Load data

In [None]:
test = pd.read_csv('./data/test.csv')
train = pd.read_csv('./data/train.csv')

## Quick EDA

In [None]:
train.info()

In [None]:
train[train.duplicated()]

In [5]:
train.isnull().sum()

id             0
description    0
category       0
dtype: int64

In [6]:
train.head()

Unnamed: 0,id,description,category
0,1,A marriage of 13 and 18 year old bourbons. A m...,2
1,2,There have been some legendary Bowmores from t...,1
2,3,This bottling celebrates master distiller Park...,2
3,4,What impresses me most is how this whisky evol...,1
4,9,"A caramel-laden fruit bouquet, followed by une...",2


In [7]:
def wrangle(df):
    df = df.copy()
    df['description'] = df['description'].str.lower().str.strip().str.replace(r"’", "'")
    return df

In [8]:
train = wrangle(train)
test = wrangle(test)

In [9]:
train.loc[0, 'description']

'a marriage of 13 and 18 year old bourbons. a mature yet very elegant whiskey, with a silky texture and so easy to embrace with a splash of water. balanced notes of honeyed vanilla, soft caramel, a basket of complex orchard fruit, blackberry, papaya, and a dusting of cocoa and nutmeg; smooth finish. sophisticated, stylish, with well-defined flavors. a classic!'

In [10]:
train.loc[10, 'description']

"another excellent stagg, and considering its alcohol level, it's also a good value if you can get it at this price. notes of toffee, pot still rum, nougat, dates, tobacco, roasted nuts, polished oak, and leather. great depth and nicely balanced. a masculine bourbon of character and structure."

In [11]:
import string
from spacy.lang.en.stop_words import STOP_WORDS

def tokenize(text):
    tokens = []
    doc = nlp(text)
    for token in doc:
        if (token.is_stop == False) and (token.is_punct == False or token.text not in string.punctuation):
            tokens.append(token.text)
    return tokens

In [12]:
text = train.loc[10, 'description']
tokenize(text)

['excellent',
 'stagg',
 'considering',
 'alcohol',
 'level',
 'good',
 'value',
 'price',
 'notes',
 'toffee',
 'pot',
 'rum',
 'nougat',
 'dates',
 'tobacco',
 'roasted',
 'nuts',
 'polished',
 'oak',
 'leather',
 'great',
 'depth',
 'nicely',
 'balanced',
 'masculine',
 'bourbon',
 'character',
 'structure']

## Submission

In [13]:
def submission(model, file_name_suffix):
    # Predictions on test sample
    preds = model.predict(test['description'])
    
    # Convert predictions to dataframe
    submission = pd.DataFrame({'id': test['id'], 'category':preds})
    submission['category'] = submission['category'].astype(int)
    
    # Save your Submission File
    file_path = f'./data/submission_{file_name_suffix}.csv'
    submission.to_csv(file_path, index=False)
    print(f'File saved at: {file_path}')
    print(submission.head())

## Timer

In [14]:
def timer(start_time=None):
    if not start_time:
        start_time = datetime.now()
        return start_time
    elif start_time:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))

### TFID + RF

In [15]:
rfc = RandomForestClassifier()
vect = TfidfVectorizer(stop_words='english')

pipe1 = Pipeline([('vect', vect), ('rfc', rfc)])

parameters1 = {
    'vect__max_df': (0.5, 0.75, 0.9, 0.95, 0.99),
    'vect__min_df': (0.02, 0.05, 0.1, 0.15),
    'vect__max_features': (100, 500, 1000),
    'rfc__n_estimators': (100, 200, 300, 400, 500),
}

start_time = timer(None) # timing starts from this point for "start_time" variable
grid_search1 = GridSearchCV(pipe1, parameters1, cv=5, n_jobs=-1, verbose=10)
grid_search1.fit(train['description'], train['category'])
timer(start_time) # timing ends here for "start_time" variable

Fitting 5 folds for each of 300 candidates, totalling 1500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    5.2s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    6.6s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    8.1s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   10.0s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   12.3s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   14.9s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   18.0s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:   21.8s
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:   25.8s
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:   29.0s
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:   33.4s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   38.4s
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:   44.0s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   50.1s
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed:   


 Time taken: 0 hours 15 minutes and 52.87 seconds.


In [16]:
grid_search1.best_estimator_

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.5, max_features=1000, min_df=0.02,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=Tru...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [17]:
grid_search1.best_params_

{'rfc__n_estimators': 400,
 'vect__max_df': 0.5,
 'vect__max_features': 1000,
 'vect__min_df': 0.02}

In [18]:
grid_search1.best_score_

0.8994586233565351

In [19]:
submission(grid_search1, 'TFIDF_RF')

File saved at: ./data/submission_TFIDF_RF.csv
     id  category
0   955         2
1  3532         3
2  1390         1
3  1024         1
4  1902         1


### TFIDF + SGDC

In [20]:
sgdc = SGDClassifier()
vect = TfidfVectorizer(stop_words='english')

pipe2 = Pipeline([('vect', vect), 
                  ('sgdc', sgdc)])

parameters2 = {
    'vect__max_df': (0.5, 0.75, 0.9, 0.95, 0.99),
    'vect__min_df': (0.02, 0.05, 0.1, 0.15),
    'vect__max_features': (100, 500, 1000),
    'sgdc__max_iter': (300, 1000, 3000),
}
start_time = timer(None)
grid_search2 = GridSearchCV(pipe2, parameters2, cv=5, n_jobs=-1, verbose=10)
grid_search2.fit(train['description'], train['category'])
timer(start_time) # timing ends here for "start_time" variable

Fitting 5 folds for each of 180 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    4.6s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:    9.5s
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:   11.1s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   12.5s
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:   14.4s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   16.2s
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed:   


 Time taken: 0 hours 3 minutes and 45.4 seconds.


In [21]:
grid_search2.best_estimator_

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.5, max_features=500, min_df=0.02,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True...m_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False))])

In [22]:
grid_search2.best_params_

{'sgdc__max_iter': 1000,
 'vect__max_df': 0.5,
 'vect__max_features': 500,
 'vect__min_df': 0.02}

In [23]:
grid_search2.best_score_

0.9025522041763341

In [24]:
submission(grid_search2, 'TFIDF_SGDC')

File saved at: ./data/submission_TFIDF_SGDC.csv
     id  category
0   955         2
1  3532         2
2  1390         1
3  1024         1
4  1902         1


### TFID + XGBC

In [26]:
xgbc = xgb.XGBClassifier()
vect = TfidfVectorizer(stop_words='english', max_features=500, min_df=0.02)
pipe3 = Pipeline([('vect', vect), 
                  ('xgbc', xgbc)])

# parameters3 = {
#         'vect__max_df': (0.5, 0.75, 0.95, 0.99, 1),
#         'vect__min_df': (.02,),
#         'xgbc__learning_rate': (0.01, 0.05, 0.1),
#         'xgbc__n_estimators': (100, 500, 800, 1000),
#         'xgbc__min_child_weight': [1, 5, 10],
#         'xgbc__gamma': [0.5, 1, 1.5, 2, 5],
#         'xgbc__subsample': [0.6, 0.8, 1.0],
#         'xgbc__colsample_bytree': [0.6, 0.8, 1.0],
#         'xgbc__max_depth': [3, 5, 10, 15, 20],
#         'xgbc__booster':['booster', 'gblinear', 'gbtree']
# }

# parameters3 = {
#         'vect__max_df': (0.5, 0.75, 0.95, 0.99, 1), 
#         'vect__min_df': (.02,), 
#         'xgbc__learning_rate': (0.01, 0.05, 0.1),
#         'xgbc__n_estimators': (100, 500, 800, 1000),
#         'xgbc__max_depth': [3, 5, 10, 15, 20]
# }

parameters3 = {
        'vect__max_df': (0.99,), 
        'vect__min_df': (.02,), 
        'xgbc__learning_rate': (0.01,),
        'xgbc__n_estimators': (1000,),
        'xgbc__max_depth': (15,),
}
start_time = timer(None)
grid_search3 = GridSearchCV(pipe3, parameters3, cv=5, n_jobs=-1, verbose=10)
grid_search3.fit(train['description'], train['category'])
timer(start_time) # timing ends here for "start_time" variable

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  1.3min remaining:  1.9min
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:  1.3min remaining:   51.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  2.3min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  2.3min finished



 Time taken: 0 hours 3 minutes and 31.32 seconds.


In [27]:
grid_search3.best_estimator_

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.99, max_features=500, min_df=0.02,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=Tru...
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1))])

In [28]:
grid_search3.best_params_

{'vect__max_df': 0.99,
 'vect__min_df': 0.02,
 'xgbc__learning_rate': 0.01,
 'xgbc__max_depth': 15,
 'xgbc__n_estimators': 1000}

In [29]:
grid_search3.best_score_

0.88553750966744

In [30]:
submission(grid_search3, 'TFIDF_XGBC')

File saved at: ./data/submission_TFIDF_XGBC.csv
     id  category
0   955         2
1  3532         2
2  1390         1
3  1024         1
4  1902         1


## TFID + SVD + SGDC

In [48]:
# vect = TfidfVectorizer(stop_words='english', ngram_range=(1, 3), min_df=0.01)

# sparse = vect.fit_transform(train['description'])

# dtm = pd.DataFrame(sparse.todense(), columns=vect.get_feature_names())
# print(dtm.shape)
# dtm.head()

(2586, 885)


Unnamed: 0,000,000 bottles,10,10 year,10 year old,100,12,12 year,12 year old,15,15 year,15 year old,16,17,18,18 year,18 year old,20,2015,2016,21,25,30,30 year,30 year old,375,375 ml,40,45,46,50,500,60,70,80,add,added,adding,addition,additional,adds,age,age statement,aged,aged bourbon,aged years,aggressive,aging,ago,alcohol,allowing,allspice,almond,almonds,alongside,amber,amber color,american,american oak,anise,aniseed,antique,appears,appetizing,apple,apples,apricot,apricots,aroma,aromas,aromatic,arran,available,background,baked,baking,baking spices,balance,balanced,banana,barley,barrel,barrels,base,batch,beautiful,beautifully,bed,begins,berries,berry,best,better,big,bit,bitter,bitterness,black,black pepper,blackberry,...,sweeter,sweetness,syrup,taffy,takes,tangerine,tannic,tannins,tar,tarry,tart,taste,tasted,tastes,tea,teasing,texture,textured,thing,things,think,time,tinged,tinned,toasted,tobacco,toffee,tongue,touch,trace,traditional,travel,travel retail,travel retail exclusive,treacle,tropical,tropical fruit,tropical fruits,true,turn,turns,typical,ultimately,underlying,used,using,value,vanilla,vanilla caramel,vanilla cream,vanilla fudge,vanilla honey,variant,ve,version,vibrant,vintage,viscous,walnuts,want,warehouse,warm,warming,water,water brings,waxy,way,weight,wet,wheat,whiff,whiskey,whiskeys,whiskies,whisky,white,white chocolate,white pepper,wine,wine casks,wisp,wood,wood smoke,woody,work,world,worth,year,year old,year old expression,years,years old,yes,yields,young,younger,youth,youthful,zest,zesty
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.212281,0.233001,0.234217,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.147286,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.243818,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.173912,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.085817,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.140816,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.136108,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.119665,0.128763,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.186185,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.170648,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.199707,0.0,0.0,0.0,0.0,0.0,0.0,0.20473,0.226971,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.145004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.106727,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.158906,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.114213,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.058826,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.133002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.186599,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.164057,0.088265,0.0,0.09185,0.0,0.0,0.0,0.0,0.0,0.157345,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.137397,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.154506,0.10927,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.201269,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.29023,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.156394,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.159277,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.161313,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083086,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [56]:
vect = TfidfVectorizer(stop_words='english', ngram_range=(1, 3))

svd = TruncatedSVD(algorithm='randomized', n_components=100)

sgdc = SGDClassifier(early_stopping=True)

pipe4 = Pipeline([('vect', vect), ('svd', svd), ('sgdc', sgdc)])

parameters4 = { 
    'svd__n_iter': (5, 10, 15),
    'svd__n_components': (100, 300, 500, 1000),
    'sgdc__max_iter': (300, 1000, 3000)
}

start_time = timer(None)
grid_search4 = GridSearchCV(pipe4, parameters4, cv=5, n_jobs=-1, verbose=10)
grid_search4.fit(train['description'], train['category'])
timer(start_time) # timing ends here for "start_time" variable

Fitting 5 folds for each of 54 candidates, totalling 270 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   10.9s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   14.2s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   20.4s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   24.4s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   32.9s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   39.7s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   49.0s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:   56.9s
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed:  3


 Time taken: 0 hours 11 minutes and 52.04 seconds.


In [57]:
grid_search4.best_estimator_

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.95, max_features=None, min_df=0.01,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=Tr...m_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False))])

In [58]:
grid_search4.best_params_

{'sgdc__max_iter': 3000,
 'svd__n_iter': 15,
 'vect__max_df': 0.95,
 'vect__min_df': 0.01}

In [59]:
grid_search4.best_score_

0.9191802010827533

In [60]:
submission(grid_search4, 'TFIDF_SVD_SGDC')

File saved at: ./data/submission_TFIDF_SVD_SGDC.csv
     id  category
0   955         2
1  3532         2
2  1390         1
3  1024         1
4  1902         1


## Custom Tokenizer + TFIDF + SVD + SGDC

In [69]:
vect = TfidfVectorizer(tokenizer=tokenize, ngram_range=(1, 3))

svd = TruncatedSVD(algorithm='randomized')

sgdc = SGDClassifier(early_stopping=True)

pipe5 = Pipeline([('vect', vect), ('svd', svd), ('sgdc', sgdc)])

parameters5 = { 
    'svd__n_iter': (5, 10, 15),
    'sgdc__max_iter': (300, 1000, 3000, 4000, 5000),
    'svd__n_components': (300, 1000, 3000),
}

start_time = timer(None)
grid_search5 = GridSearchCV(pipe5, parameters5, cv=5, n_jobs=-1, verbose=10)
grid_search5.fit(train['description'], train['category'])
timer(start_time) # timing ends here for "start_time" variable

Fitting 5 folds for each of 45 candidates, totalling 225 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1654s.) Setting batch_size=2.


PicklingError: Could not pickle the task to send it to the workers.

In [None]:
grid_search5.best_estimator_

In [None]:
grid_search5.best_params_

In [None]:
grid_search5.best_score_

In [None]:
submission(grid_search5, 'CUSTOM_TOKENIZER_TFIDF_SVD_SGDC')

## Spacy Embeddings + SGDC

In [None]:
def get_word_vectors(docs):
    return [nlp(doc).vector for doc in docs]



In [None]:
# import pandas as pd

# # Filenames of your submissions you want to ensemble
# files = ['submission-01.csv', 'submission-02.csv', 'submission-03.csv']

# target = 'status_group'
# submissions = (pd.read_csv(file)[[target]] for file in files)
# ensemble = pd.concat(submissions, axis='columns')
# majority_vote = ensemble.mode(axis='columns')[0]

# sample_submission = pd.read_csv('sample_submission.csv')
# submission = sample_submission.copy()
# submission[target] = majority_vote
# submission.to_csv('my-ultimate-ensemble-submission.csv', index=False)