In [1]:
import os
import pickle
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import sparse
from tqdm import tqdm_notebook

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, GroupShuffleSplit
from sklearn.metrics import confusion_matrix, make_scorer, log_loss
from sklearn.preprocessing import Normalizer

%matplotlib inline

In [2]:
# Directory prefix for the preprocessed feature/label files read below.
# Paths are built by plain string concatenation, so the trailing slash is required.
PATH_TO_PROCESSED_DATA = './../data/kaggle_receipts/processed/'

## load data

In [3]:
# Accumulators for the per-feature blocks of each split; the loading cell
# appends one entry per feature listed below.
loaded_train, loaded_test = [], []

# (feature name, file format) pairs. The order here is the order in which the
# blocks are appended and later stacked, so it determines the column layout.
# Commented-out entries are feature files that are not loaded in this run.
features_to_load = [
    ('name_cvect', 'npz'),
    # ('name_cvect_mms', 'npz'),
    # ('name_cvect_norm', 'npz'),
    # ('name_cvect_tw', 'npz'),
    # ('name_tfidf', 'npz'),
    ('shop_name', 'npz'),
    ('price', 'csv'),
    # ('price_mms', 'csv'),
    # ('count', 'csv'),
    # ('dayofweek', 'npz'),
    # ('hour', 'npz'),
    # ('hour_dayofweek_poly', 'csv'),
]

In [4]:
# Load every feature block listed in `features_to_load` for both splits.
# The accumulators are (re)created here so that re-running this cell on its own
# does not append the same blocks a second time and duplicate feature columns.
loaded_train = []
loaded_test = []

for name, tp in features_to_load:
    if tp == 'npz':
        # Sparse matrices stored in SciPy's .npz format.
        loaded_train.append(sparse.load_npz(PATH_TO_PROCESSED_DATA + name + '_train.npz'))
        loaded_test.append(sparse.load_npz(PATH_TO_PROCESSED_DATA + name + '_test.npz'))
    elif tp == 'csv':
        # Dense features; the first CSV column is used as the frame index.
        loaded_train.append(pd.read_csv(PATH_TO_PROCESSED_DATA + name + '_train.csv', index_col=0))
        loaded_test.append(pd.read_csv(PATH_TO_PROCESSED_DATA + name + '_test.csv', index_col=0))
    else:
        # Fail loudly instead of silently dropping a requested feature.
        raise ValueError('Unsupported feature file type {!r} for feature {!r}'.format(tp, name))

# Targets, receipt ids for the training rows, and ids for the test rows.
# As above, the first CSV column of each file becomes the index.
categories_train = pd.read_csv(PATH_TO_PROCESSED_DATA + 'categories_train.csv', index_col=0)
check_id_train = pd.read_csv(PATH_TO_PROCESSED_DATA + 'check_id_train.csv', index_col=0)
id_test = pd.read_csv(PATH_TO_PROCESSED_DATA + 'id_test.csv', index_col=0)

In [5]:
# Stack the loaded feature blocks column-wise into one CSR matrix per split.
X_train_full = sparse.csr_matrix(sparse.hstack(loaded_train))
X_test_full = sparse.csr_matrix(sparse.hstack(loaded_test))
y_train = categories_train['category']

# `check_id_train` is rebound from a DataFrame to one of its columns. Guard the
# narrowing so that re-running this cell (when it is already a Series) does not
# fail with a KeyError on 'check_id'.
if isinstance(check_id_train, pd.DataFrame):
    check_id_train = check_id_train['check_id']
# Double brackets keep `id_test` a one-column DataFrame; this is safe to repeat.
id_test = id_test[['id']]

# Sanity checks: both splits must share the same feature columns, and every
# row needs a matching label / receipt id / test id.
assert X_train_full.shape[1] == X_test_full.shape[1], \
    'train and test feature matrices have different numbers of columns'
assert X_train_full.shape[0] == len(y_train) == len(check_id_train), \
    'train features, labels and check ids have different numbers of rows'
assert X_test_full.shape[0] == len(id_test), \
    'test features and test ids have different numbers of rows'

In [6]:
# Report the dimensions of each assembled object, one per line.
for obj in (X_train_full, X_test_full, y_train, check_id_train, id_test):
    print(obj.shape)

(13682, 14432)
(3000, 14432)
(13682,)
(13682,)
(3000, 1)


In [7]:
# Restore the pickled category labeler; its `classes_` attribute supplies the
# column names of the submission table further down.
labeler_path = PATH_TO_PROCESSED_DATA + 'category_labeler.pkl'
with open(labeler_path, 'rb') as labeler_file:
    category_labeler = pickle.load(labeler_file)

## make prediction

In [14]:
# Predict on the full test matrix; the result is later indexed as
# predictions[:, idx], i.e. one column per category.
# NOTE(review): `best_estimator` is not defined in any cell visible here
# (execution counts jump from 7 to 14) - confirm the cells that fit it are
# present and run before this one on a fresh kernel.
predictions = best_estimator.predict_proba(X_test_full)

In [15]:
# Build the submission table in one step: the id column followed by one
# probability column per category.
#
# The probability frame is created on `id_test.index` on purpose:
# pd.concat(axis=1) aligns rows by index label, and `id_test` keeps whatever
# index it was read with, so a default 0..n-1 index on the probabilities could
# yield NaN-padded, misaligned rows.
# Assumes the columns of `predictions` follow the order of
# `category_labeler.classes_` (the per-column loop this replaces relied on the
# same assumption) - TODO confirm against the fitted estimator's classes_.
n_categories = len(category_labeler.classes_)
if predictions.shape[1] != n_categories:
    raise ValueError(
        'predictions has {} columns but the labeler defines {} categories'.format(
            predictions.shape[1], n_categories
        )
    )

probabilities_df = pd.DataFrame(
    predictions,
    index=id_test.index,
    columns=category_labeler.classes_,
)
predictions_df = pd.concat([id_test, probabilities_df], axis=1)

In [16]:
# Write the submission CSV; the file name carries a month-day_hour-minute stamp.
# `to_csv` does not create missing directories, so make sure the target folder
# exists before writing.
submissions_dir = './submissions'
os.makedirs(submissions_dir, exist_ok=True)

timestamp = datetime.now().strftime('%m%d_%H%M')
predictions_df.to_csv('{}/submission_{}.csv'.format(submissions_dir, timestamp), encoding='utf-8', index=False)