In [1]:
%%HTML
<style>
   div#notebook-container    { width: 95%; }
   div#menubar-container     { width: 65%; }
   div#maintoolbar-container { width: 99%; }
</style>

In [2]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import time

In [3]:
import os
import sys
sys.path.insert(0, os.path.abspath('../'))
from module.prepare_data import *

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [5]:
TEMP_FEATURE_PKL = '../cache/ver10/grid_features.pkl'

In [9]:
END_TRAIN = 1913

In [6]:
grid_df = pd.read_pickle(TEMP_FEATURE_PKL)

In [11]:
# grid_df = pd.read_pickle(TEMP_FEATURE_PKL)
grid_df['groups'] = grid_df['tm_y'].astype(str) + '_' + (grid_df['tm_m']//3).astype(str)
grid_df.to_pickle(TEMP_FEATURE_PKL)
del grid_df

In [7]:
grid_df.tail()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,release,wm_yr_wk,...,rolling_mean_tmp_1_30,rolling_mean_tmp_1_60,rolling_mean_tmp_7_7,rolling_mean_tmp_7_14,rolling_mean_tmp_7_30,rolling_mean_tmp_7_60,rolling_mean_tmp_14_7,rolling_mean_tmp_14_14,rolling_mean_tmp_14_30,rolling_mean_tmp_14_60
46881672,FOODS_3_823_WI_3_validation,FOODS_3_823,FOODS_3,FOODS,WI_3,WI,1941,,0,11617,...,,,,,,,,,,
46881673,FOODS_3_824_WI_3_validation,FOODS_3_824,FOODS_3,FOODS,WI_3,WI,1941,,0,11617,...,,,,,,,,,,
46881674,FOODS_3_825_WI_3_validation,FOODS_3_825,FOODS_3,FOODS,WI_3,WI,1941,,0,11617,...,,,,,,,,,,
46881675,FOODS_3_826_WI_3_validation,FOODS_3_826,FOODS_3,FOODS,WI_3,WI,1941,,230,11617,...,,,,,,,,,,
46881676,FOODS_3_827_WI_3_validation,FOODS_3_827,FOODS_3,FOODS,WI_3,WI,1941,,304,11617,...,,,,,,,,,,


In [17]:
from catboost import CatBoostRegressor

cat_params = {
    'n_estimators':1400,
    'loss_function':'Tweedie',
    # 'tweedie_variance_power': 1.1,
    'eval_metric':'RMSE',
    'subsample': 0.5,
    'sampling_frequency':1,
    'learning_rate':0.03,
    'max_leaves': 2 ** 11 - 1,
    'min_data_in_leaf': 2 ** 12 - 1,
#     'feature_fraction': 0.5,
    'max_bin': 100,
    'verbose': 1,
    'random_seed': SEED,
}


def train_evaluate_model(feature_columns, target, base_path, stores_ids=STORES_IDS, permutation=False):

    his = []
    for store_id in stores_ids:
        print('Train', store_id)

        grid_df = get_data_by_store(store_id)
        train_mask = grid_df['d'] <= END_TRAIN
        # valid_mask = (grid_df['d'] > END_TRAIN-28 -100) & (grid_df['d'] <= END_TRAIN)
        preds_mask = grid_df['d'] > (END_TRAIN - 100)

        ## Initiating our GroupKFold
        folds = GroupKFold(n_splits=3)
        # grid_df['groups'] = grid_df['tm_y'].astype(str) + '_' + grid_df['tm_m'].astype(str)
        split_groups = grid_df[train_mask]['groups']

        # Saving part of the dataset for later predictions
        # Removing features that we need to calculate recursively
        keep_cols = [col for col in list(grid_df) if '_tmp_' not in col]
        grid_df[preds_mask].reset_index(drop=True)[keep_cols].to_pickle(f'{base_path}/test_{store_id}_ver{VER}.pkl')
        # grid_df[valid_mask].reset_index(drop=True)[keep_cols].to_pickle(f'{base_path}/valid_{store_id}_ver{VER}.pkl')

        feature_columns_i = feature_columns[store_id]
        # Main Data
        X, y = grid_df[train_mask][feature_columns_i], grid_df[train_mask][target]
        del grid_df


        # Launch seeder again to make lgb training 100% deterministic
        # with each "code line" np.random "evolves"
        # so we need (may want) to "reset" it

        for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y, groups=split_groups)):

            print('Fold:', fold_)
            trn_X, trn_y = X.iloc[trn_idx, :], y[trn_idx]
            val_X, val_y = X.iloc[val_idx, :], y[val_idx]
            # train_data = lgb.Dataset(trn_X, label=trn_y)
            # valid_data = lgb.Dataset(val_X, label=val_y)
            estimator = CatBoostRegressor(**cat_params)
            estimator = estimator.fit(trn_X, trn_y, eval_set=(trn_y, val_y), silent=True)

            if permutation:
                importance_df = permutation_importance(estimator, pd.concat([val_X,val_y], axis=1), feature_columns_i, target, metric=root_mean_sqared_error,verbose=0)
            else:
                importance_df = None

            prediction_val = estimator.predict(val_X)
            rmse_val = rmse(val_y, prediction_val)
            prediction_trn = estimator.predict(trn_X)
            rmse_trn = rmse(trn_y, prediction_trn)

            # Save model - it's not real '.bin' but a pickle file
            # estimator = lgb.Booster(model_file='model.txt')
            # can only predict with the best iteration (or the saving iteration)
            # pickle.dump gives us more flexibility
            # like estimator.predict(TEST, num_iteration=100)
            # num_iteration - number of iteration want to predict with,
            # NULL or <= 0 means use best iteration
            model_name = f'{base_path}/lgb_model_{store_id}_fold{fold_}_ver{VER}.bin'
            pickle.dump(estimator, open(model_name, 'wb'))

            # Remove temporary files and objects
            # to free some hdd space and ram memory
            del estimator, trn_X, val_X, trn_y, val_y
            gc.collect()

            his.append({'rmse_val': rmse_val, 'rmse_trn':rmse_trn, 'rmse_diff':rmse_val-rmse_trn, 'fold_': fold_, 'store_id': store_id, 'prediction_val':prediction_val, 'permutation_importance':importance_df})

    return pd.DataFrame(his)

def get_data_by_store(store_id, grid_df_path = TEMP_FEATURE_PKL):
    grid_df = pd.read_pickle(grid_df_path)
    return grid_df[grid_df['store_id']==store_id].reset_index(drop=False)

In [18]:
useful_cols = dict(zip(STORES_IDS, [M5_FEATURES]*len(STORES_IDS)))

In [19]:
history_df = train_evaluate_model(useful_cols, TARGET, BASE_PATH)#stores_ids=['CA_1']
print(history_df.rmse_trn.mean(), history_df.rmse_val.mean(), history_df.rmse_diff.mean())

Train CA_1
Fold: 0


CatBoostError: features data: pandas.DataFrame column 'item_id' has dtype 'category' but is not in  cat_features list