In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, GroupShuffleSplit, GridSearchCV
from scipy import sparse

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this blanket filter also hides any warnings emitted while the
# grid searches below are fitting -- consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

In [2]:
# Directory (relative to the working directory) that holds the preprocessed
# feature files and label CSVs read by the loading cell below.
PATH_TO_PROCESSED_DATA = './../data/kaggle_receipts/processed/'

## load data

In [3]:
# Feature blocks to stack into the design matrix, given as
# (file stem, file format) pairs. Commented-out entries are alternative
# blocks that are currently disabled.
features_to_load = [
    ('name_cvect', 'npz'),
    # ('name_cvect_tw', 'npz'),
    # ('name_tfidf', 'npz'),
    ('shop_name', 'npz'),
    ('price', 'csv'),
    # ('dayofweek', 'npz'),
    # ('hour', 'npz'),
]

# Accumulators for the loaded blocks; the loading loop appends one entry
# per feature to each of them.
loaded_train, loaded_test = [], []

In [4]:
# Start from empty accumulators so that re-running this cell does not append
# every feature block a second time (which would silently duplicate columns
# once the blocks are stacked).
loaded_train = []
loaded_test = []

# Load the train/test file pair of each requested feature block.
for (name, tp) in features_to_load:
    if tp == 'npz':
        # Sparse matrices saved in scipy's .npz format.
        loaded_train.append(sparse.load_npz(PATH_TO_PROCESSED_DATA + name + '_train.npz'))
        loaded_test.append(sparse.load_npz(PATH_TO_PROCESSED_DATA + name + '_test.npz'))
    elif tp == 'csv':
        # Dense columns stored as CSV; the first CSV column is used as the index.
        loaded_train.append(pd.read_csv(PATH_TO_PROCESSED_DATA + name + '_train.csv', index_col=0))
        loaded_test.append(pd.read_csv(PATH_TO_PROCESSED_DATA + name + '_test.csv', index_col=0))
    else:
        # Fail loudly instead of silently dropping a requested feature.
        raise ValueError('unsupported feature file type %r for feature %r' % (tp, name))

# Training labels and the per-row receipt ids (used later as CV groups).
categories_train = pd.read_csv(PATH_TO_PROCESSED_DATA + 'categories_train.csv', index_col=0)
check_id_train = pd.read_csv(PATH_TO_PROCESSED_DATA + 'check_id_train.csv', index_col=0)

In [5]:
# Column-stack the loaded feature blocks and convert the result to CSR,
# once for the training rows and once for the test rows.
X_train_full, X_test_full = (
    sparse.hstack(blocks).tocsr() for blocks in (loaded_train, loaded_test)
)

# Keep only the column of interest from each single-purpose frame.
y_train = categories_train['category']
# NOTE: this rebinds the name from the loaded frame to one of its columns,
# so the cell expects `check_id_train` to still be the frame when it runs.
check_id_train = check_id_train['check_id']

In [6]:
# Sanity check: print the shape of each object, one per line, so that row
# counts (and train/test column counts) can be compared by eye.
for _obj in (X_train_full, X_test_full, y_train, check_id_train):
    print(_obj.shape)

(13682, 14432)
(3000, 14432)
(13682,)
(13682,)


## split data for cross-validation (GroupShuffleSplit)

In [7]:
# Splitter producing five shuffled train/validation partitions with a fixed
# seed; the grouping key is supplied when `split` is called.
gss = GroupShuffleSplit(
    n_splits=5,
    random_state=0,
)

In [8]:
# Materialise the folds once, grouped by receipt id, so that both grid
# searches below are evaluated on exactly the same partitions.
gss_splits = list(
    gss.split(X_train_full, y_train, groups=check_id_train)
)

## LogisticRegression CV param search

In [9]:
# Full Cartesian grid searched for LogisticRegression.
# NOTE(review): presumably not every solver supports every penalty /
# multi_class setting; such candidates would fail to fit and receive the
# search's `error_score` -- confirm against the installed scikit-learn.
lr_gs_params = dict(
    penalty=['l1', 'l2'],
    class_weight=[None, 'balanced'],
    multi_class=['multinomial', 'ovr'],
    solver=['sag', 'saga', 'newton-cg', 'lbfgs', 'liblinear'],
    # 30 log-spaced regularisation strengths from 1e-2 to 1e3.
    C=np.logspace(-2, 3, num=30),
)

In [10]:
# Base classifier for the search; only the seed is fixed here, the remaining
# hyper-parameters come from the parameter grid.
lr_estimator = LogisticRegression(
    random_state=14,
)

In [11]:
# Exhaustive search over `lr_gs_params`, scored by negative log loss on the
# precomputed group-aware folds.
lr_gs = GridSearchCV(
    estimator=lr_estimator,
    param_grid=lr_gs_params,
    scoring='neg_log_loss',
    cv=gss_splits,
    # Only the CV results are needed, so no final model is refit.
    refit=False,
    return_train_score=True,
    # Candidates that raise during fitting get this score instead of
    # aborting the whole search.
    error_score=-100.0,
    n_jobs=-1,
    verbose=10,
)

In [12]:
%%time

gs.fit(X_train_full, y_train)

Fitting 5 folds for each of 1200 candidates, totalling 6000 fits


[Parallel(n_jobs=-1)]: Done 719 tasks      | elapsed:  6.0min
[Parallel(n_jobs=-1)]: Done 4763 tasks      | elapsed: 109.2min
[Parallel(n_jobs=-1)]: Done 5224 tasks      | elapsed: 134.4min
[Parallel(n_jobs=-1)]: Done 5589 tasks      | elapsed: 165.0min


CPU times: user 23.9 s, sys: 4.02 s, total: 27.9 s
Wall time: 3h 33min 17s


[Parallel(n_jobs=-1)]: Done 6000 out of 6000 | elapsed: 213.3min finished


GridSearchCV(cv=[(array([    0,     1, ..., 13678, 13679]), array([    9,    10, ..., 13680, 13681])), (array([    0,     1, ..., 13680, 13681]), array([    9,    10, ..., 13604, 13606])), (array([    0,     1, ..., 13680, 13681]), array([   40,    41, ..., 13667, 13668])), (array([    0,     1, ..., 13680, 13681]), array([    9,    10, ..., 13603, 13604])), (array([    0,     1, ..., 13680, 13681]), array([   14,    15, ..., 13665, 13666]))],
       error_score=-100.0,
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=14, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'penalty': ['l1', 'l2'], 'multi_class': ['multinomial', 'ovr'], 'C': array([  1.00000e-02,   1.48735e-02,   2.21222e-02,   3.29034e-02,
         4.89390e-02,   7.27895e-02,   1.082

In [13]:
# The search maximises 'neg_log_loss', so flip the sign to report the
# log loss itself (lower is better).
print('best score:', -lr_gs.best_score_)
print('best params:', lr_gs.best_params_)

('best score:', 0.48803143994009479)
('best params:', {'penalty': 'l2', 'multi_class': 'ovr', 'C': 18.873918221350976, 'solver': 'liblinear', 'class_weight': 'balanced'})


## SVC (linear kernel) CV param search

In [9]:
# Grid searched for the linear-kernel SVC.
svc_gs_params = dict(
    # 30 log-spaced regularisation strengths from 1e-3 to 1e3.
    C=np.logspace(-3, 3, num=30),
    class_weight=[None, 'balanced'],
)

In [10]:
# svc_estimator = LinearSVC(random_state=14)
# NOTE(review): the LinearSVC variant is left disabled (it is also not
# imported in this notebook); presumably SVC with probability=True is used
# because the search is scored with a probability-based metric -- confirm.
svc_estimator = SVC(
    kernel='linear',
    probability=True,
    decision_function_shape='ovr',
    random_state=14,
)

In [11]:
# Exhaustive search over `svc_gs_params`, scored by negative log loss on the
# same precomputed folds as the logistic-regression search.
svc_gs = GridSearchCV(
    estimator=svc_estimator,
    param_grid=svc_gs_params,
    scoring='neg_log_loss',
    cv=gss_splits,
    # Only the CV results are needed, so no final model is refit.
    refit=False,
    return_train_score=True,
    # Candidates that raise during fitting get this score instead of
    # aborting the whole search.
    error_score=-100.0,
    n_jobs=-1,
    verbose=True,
)

In [12]:
%%time

svc_gs.fit(X_train_full, y_train)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed: 18.9min


CPU times: user 2.47 s, sys: 1.63 s, total: 4.1 s
Wall time: 30min 2s


[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 30.0min finished


GridSearchCV(cv=[(array([    0,     1, ..., 13678, 13679]), array([    9,    10, ..., 13680, 13681])), (array([    0,     1, ..., 13680, 13681]), array([    9,    10, ..., 13604, 13606])), (array([    0,     1, ..., 13680, 13681]), array([   40,    41, ..., 13667, 13668])), (array([    0,     1, ..., 13680, 13681]), array([    9,    10, ..., 13603, 13604])), (array([    0,     1, ..., 13680, 13681]), array([   14,    15, ..., 13665, 13666]))],
       error_score=-100.0,
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=True, random_state=14, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'C': array([1.00000e-03, 1.61026e-03, 2.59294e-03, 4.17532e-03, 6.72336e-03,
       1.08264e-02, 1.74333e-02, 2.80722e-02, 4.52035e-02, 7.27895e-02,
       1.17210e-01, 1.88739e-01, 3.03920e-01, 4.89390e-01, 7.88046e-0

In [13]:
# The search maximises 'neg_log_loss', so flip the sign to report the
# log loss itself (lower is better).
print('best score:', -svc_gs.best_score_)
print('best params:', svc_gs.best_params_)

best score: 0.5418458743935469
best params: {'C': 0.4893900918477494, 'class_weight': None}
