In [1]:
%%HTML
<style>
   div#notebook-container    { width: 95%; }
   div#menubar-container     { width: 65%; }
   div#maintoolbar-container { width: 99%; }
</style>

In [81]:
import numpy as np
import pandas as pd
from catboost import Pool as catPool
import time

In [75]:
import os
import sys
sys.path.insert(0, os.path.abspath('../'))
from module.prepare_data import *

In [76]:
TEMP_FEATURE_PKL = '../cache/ver10/grid_features.pkl'
END_TRAIN = 1913

In [37]:
# grid_df = pd.read_pickle(TEMP_FEATURE_PKL)
# for col in ['item_id','dept_id','cat_id','event_name_1','event_type_1','event_name_2','event_type_2','snap_CA','snap_TX','snap_WI']:
#     n_nan = grid_df[col].isna().sum()
#     if n_nan > 0:
#         grid_df[col] = grid_df[col].astype(str).fillna('NONE').astype('category')
#         print(col, grid_df[col].dtype, grid_df[col].isna().sum())
# grid_df.to_pickle(TEMP_FEATURE_PKL)
# del grid_df

In [11]:
# grid_df = pd.read_pickle(TEMP_FEATURE_PKL)
# grid_df['groups'] = grid_df['tm_y'].astype(str) + '_' + (grid_df['tm_m']//3).astype(str)
# grid_df.to_pickle(TEMP_FEATURE_PKL)
# del grid_df

In [62]:
grid_df = pd.read_pickle(TEMP_FEATURE_PKL)

In [63]:
grid_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 46881677 entries, 0 to 46881676
Data columns (total 79 columns):
 #   Column                  Dtype   
---  ------                  -----   
 0   id                      object  
 1   item_id                 category
 2   dept_id                 category
 3   cat_id                  category
 4   store_id                object  
 5   state_id                object  
 6   d                       int16   
 7   sales                   float16 
 8   release                 int16   
 9   wm_yr_wk                int16   
 10  sell_price              float16 
 11  price_max               float16 
 12  price_min               float16 
 13  price_std               float16 
 14  price_mean              float16 
 15  price_norm              float16 
 16  price_nunique           float16 
 17  item_nunique            int16   
 18  price_momentum          float16 
 19  price_momentum_m        float16 
 20  price_momentum_y        float16 
 21  event_

In [51]:
grid_df.to_pickle(TEMP_FEATURE_PKL)
del grid_df

In [83]:
from catboost import CatBoostRegressor

cat_params = {
    'n_estimators':1400,
    'loss_function':'Tweedie',
    # 'tweedie_variance_power': 1.1,
    'eval_metric':'RMSE',
    'subsample': 0.5,
    'sampling_frequency':1,
    'learning_rate':0.03,
    'max_leaves': 2 ** 11 - 1,
    'min_data_in_leaf': 2 ** 12 - 1,
#     'feature_fraction': 0.5,
    'max_bin': 100,
    'verbose': 1,
    'random_seed': SEED,
}


def train_evaluate_model(feature_columns, target, base_path, stores_ids=STORES_IDS, permutation=False):

    his = []
    for store_id in stores_ids:
        print('Train', store_id)

        grid_df = get_data_by_store(store_id)
        train_mask = grid_df['d'] <= END_TRAIN
        # valid_mask = (grid_df['d'] > END_TRAIN-28 -100) & (grid_df['d'] <= END_TRAIN)
        preds_mask = grid_df['d'] > (END_TRAIN - 100)

        ## Initiating our GroupKFold
        folds = GroupKFold(n_splits=3)
        # grid_df['groups'] = grid_df['tm_y'].astype(str) + '_' + grid_df['tm_m'].astype(str)
        split_groups = grid_df[train_mask]['groups']

        # Saving part of the dataset for later predictions
        # Removing features that we need to calculate recursively
        keep_cols = [col for col in list(grid_df) if '_tmp_' not in col]
        grid_df[preds_mask].reset_index(drop=True)[keep_cols].to_pickle(f'{base_path}/test_{store_id}_ver{VER}.pkl')
        # grid_df[valid_mask].reset_index(drop=True)[keep_cols].to_pickle(f'{base_path}/valid_{store_id}_ver{VER}.pkl')

        feature_columns_i = feature_columns[store_id]
        # Main Data
        X, y = grid_df[train_mask][feature_columns_i], grid_df[train_mask][target]
        del grid_df
        
        categorical_features_indices = np.where(X.dtypes == 'category')[0]


        # Launch seeder again to make lgb training 100% deterministic
        # with each "code line" np.random "evolves"
        # so we need (may want) to "reset" it

        for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y, groups=split_groups)):

            print('Fold:', fold_)
            trn_X, trn_y = X.iloc[trn_idx, :], y[trn_idx]
            val_X, val_y = X.iloc[val_idx, :], y[val_idx]
            
            train_pool = catPool(trn_X, trn_y, cat_features=categorical_features_indices)
            validate_pool = catPool(val_X, val_y, cat_features=categorical_features_indices)
            estimator = CatBoostRegressor(**cat_params)
            estimator = estimator.fit(train_pool, eval_set=validate_pool, cat_features = categorical_features_indices, silent=True)

            if permutation:
                importance_df = permutation_importance(estimator, pd.concat([val_X,val_y], axis=1), feature_columns_i, target, metric=root_mean_sqared_error,verbose=0)
            else:
                importance_df = None

            prediction_val = estimator.predict(val_X.values)
            rmse_val = rmse(val_y.values, prediction_val)
            prediction_trn = estimator.predict(trn_X.values)
            rmse_trn = rmse(trn_y.values, prediction_trn)

            # Save model - it's not real '.bin' but a pickle file
            # estimator = lgb.Booster(model_file='model.txt')
            # can only predict with the best iteration (or the saving iteration)
            # pickle.dump gives us more flexibility
            # like estimator.predict(TEST, num_iteration=100)
            # num_iteration - number of iteration want to predict with,
            # NULL or <= 0 means use best iteration
            model_name = f'{base_path}/cat_model_{store_id}_fold{fold_}_ver{VER}.bin'
            pickle.dump(estimator, open(model_name, 'wb'))

            # Remove temporary files and objects
            # to free some hdd space and ram memory
            del estimator, trn_X, val_X, trn_y, val_y
            gc.collect()

            his.append({'rmse_val': rmse_val, 'rmse_trn':rmse_trn, 'rmse_diff':rmse_val-rmse_trn, 'fold_': fold_, 'store_id': store_id, 'prediction_val':prediction_val, 'permutation_importance':importance_df})

    return pd.DataFrame(his)

def get_data_by_store(store_id, grid_df_path = TEMP_FEATURE_PKL):
    grid_df = pd.read_pickle(grid_df_path)
    return grid_df[grid_df['store_id']==store_id].reset_index(drop=False)

In [84]:
useful_cols = dict(zip(STORES_IDS, [M5_FEATURES]*len(STORES_IDS)))

In [85]:
history_df = train_evaluate_model(useful_cols, TARGET, BASE_PATH)#stores_ids=['CA_1']
print(history_df.rmse_trn.mean(), history_df.rmse_val.mean(), history_df.rmse_diff.mean())

Train CA_1
Fold: 0


TypeError: No matching signature found