In [1]:
# General imports
import numpy as np
import pandas as pd
import os, sys, gc, time, warnings, pickle, psutil, random

# custom imports
from multiprocessing import Pool        # Multiprocess Runs

warnings.filterwarnings('ignore')

In [2]:
########################### Helpers
#################################################################################
## Seeder
# :seed to make all processes deterministic     # type: int
def seed_everything(seed=0):
    """Reseed the global random number generators used by this notebook.

    Both Python's built-in ``random`` module and NumPy's legacy global
    generator are seeded with the same value.

    :param seed: value passed to both generators (default 0).  # type: int
    :return: None
    """
    # Same seeding order as before: stdlib first, then NumPy.
    for reseed in (random.seed, np.random.seed):
        reseed(seed)

In [3]:
# PATHS for features
ORIGINAL = '../input/m5-forecasting-accuracy/'   # directory with the raw competition CSVs
BASE = 'grid_df.pkl'                             # base grid pickle, read by get_data_by_store
FEATURE1 = 'feature1.pkl'                        # feature pickles merged onto the base grid
FEATURE2 = 'feature2.pkl'

########################### Model params
#################################################################################
# NOTE(review): imports normally belong in the first (imports) cell; kept here
# so this cell still runs on its own.
import lightgbm as lgb

# Parameters handed unchanged to lgb.train() for every store model.
lgb_params = {
    'boosting_type': 'gbdt',
    'objective': 'tweedie',
    'tweedie_variance_power': 1.1,
    'metric': 'rmse',
    'subsample': 0.5,
    'subsample_freq': 1,
    'learning_rate': 0.03,
    'num_leaves': 2**11-1,
    'min_data_in_leaf': 2**12-1,
    'feature_fraction': 0.5,
    'max_bin': 100,
    'n_estimators': 1400,
    'boost_from_average': False,
    # 'verbose': -1,
}

########################### Vars
#################################################################################
VER = 1                          # Our model version
SEED = 42                        # We want all things
seed_everything(SEED)            # to be as deterministic
lgb_params['seed'] = SEED        # as possible
N_CORES = psutil.cpu_count()     # Available CPU cores

# LIMITS and const
TARGET      = 'sales'            # Our target
START_TRAIN = 0                  # We can skip some rows (Nans/faster training)
END_TRAIN   = 1913               # End day of our train set
P_HORIZON   = 28                 # Prediction horizon

# STORES ids
# Only the 'store_id' column is needed, so restrict parsing to it with
# `usecols` instead of loading every column of the sales file.
STORES_IDS = pd.read_csv(ORIGINAL + 'sales_train_validation.csv',
                         usecols=['store_id'])['store_id']
STORES_IDS = list(STORES_IDS.unique())

# Columns that are never used as model features (identifiers, raw date keys
# and the target itself).
remove_features = ['id', 'state_id', 'store_id',
                   'date', 'wm_yr_wk', 'd', 'item_id', 'dept_id', 'cat_id', TARGET]

In [4]:
########################### Helper to load data by store ID
#################################################################################
# Read data
def get_data_by_store(store):
    """Load the grid rows of one store and split them into train/valid/test.

    The base grid (``BASE``) is filtered to ``store`` and every feature file
    (``FEATURE1``, ``FEATURE2``) is left-merged onto it on ``['id', 'd']``.

    :param store: value matched against the ``store_id`` column.
    :return: tuple ``(train_df, valid_df, test_df, features)`` where
        * ``train_df`` holds rows with ``START_TRAIN <= d <= END_TRAIN - P_HORIZON``,
        * ``valid_df`` holds rows with ``END_TRAIN - P_HORIZON < d <= END_TRAIN``,
        * ``test_df``  holds rows with ``d > END_TRAIN``,
        * ``features`` lists the merged columns not named in ``remove_features``.
        All three frames have a fresh 0..n-1 index.
    """
    df = pd.read_pickle(BASE)
    df = df[df['store_id'] == store]

    for path in [FEATURE1, FEATURE2]:
        df_feat = pd.read_pickle(path)
        # The base grid already carries the target; drop the feature file's
        # copy (when it has one) so the merge does not create _x/_y duplicates.
        df_feat = df_feat.drop(columns=[TARGET], errors='ignore')
        df = pd.merge(df, df_feat, on=['id', 'd'], how='left')

    # Create features list
    features = [col for col in list(df) if col not in remove_features]

    last_train_day = END_TRAIN - P_HORIZON
    # START_TRAIN lets early days be skipped for training; with the default of
    # 0 every day up to last_train_day is kept.
    train_mask = (df['d'] >= START_TRAIN) & (df['d'] <= last_train_day)
    valid_mask = (df['d'] > last_train_day) & (df['d'] <= END_TRAIN)
    test_mask = df['d'] > END_TRAIN

    train_df = df[train_mask].reset_index(drop=True)
    valid_df = df[valid_mask].reset_index(drop=True)
    test_df = df[test_mask].reset_index(drop=True)

    return train_df, valid_df, test_df, features

In [None]:
# Test-set predictions of every store; concatenated once after the loop so the
# accumulated frame is not re-copied on each iteration.
store_predictions = []

########################### Train Models
#################################################################################
for store_id in STORES_IDS:

    start_time = time.time()

    # Get grid for current store
    tran_df, valid_df, test_df, features_columns = get_data_by_store(store_id)

    print('Train', store_id, tran_df.shape[0], valid_df.shape[0], test_df.shape[0])
    train_data = lgb.Dataset(tran_df[features_columns], label=tran_df[TARGET])
    valid_data = lgb.Dataset(valid_df[features_columns], label=valid_df[TARGET])

    # Launch seeder again to make lgb training 100% deterministic
    # with each "code line" np.random "evolves"
    # so we need (may want) to "reset" it
    seed_everything(SEED)
    # NOTE(review): `verbose_eval` is deprecated in newer LightGBM releases in
    # favour of the `lgb.log_evaluation` callback; confirm against the
    # installed version before upgrading.
    estimator = lgb.train(lgb_params, train_data, valid_sets=[valid_data], verbose_eval=100)

    test_df[TARGET] = estimator.predict(test_df[features_columns])
    # Keep only the columns the submission needs; the feature columns of the
    # test frame are not used after prediction.
    store_predictions.append(test_df[['id', 'date', TARGET]])

    print('%0.2f min: train' % ((time.time() - start_time) / 60))

submission = pd.read_csv(ORIGINAL+'sample_submission.csv')

# Long -> wide: one row per id, one column per forecast date (pivot sorts the dates).
predictions = pd.concat(store_predictions, axis=0)
predictions = pd.pivot(predictions, index='id', columns='date', values=TARGET).reset_index()
# Rename the date columns to F1..F<P_HORIZON>; raises on a length mismatch if the
# test window does not hold exactly P_HORIZON distinct dates.
predictions.columns = ['id'] + ['F' + str(day) for day in range(1, P_HORIZON + 1)]

# Rows whose id contains 'evaluation' are taken from the sample submission as-is.
evaluation_rows = [row for row in submission['id'] if 'evaluation' in row]
evaluation = submission[submission['id'].isin(evaluation_rows)]
# Inner merge keeps the sample-submission order for the ids we predicted.
validation = submission[['id']].merge(predictions, on='id')
final = pd.concat([validation, evaluation])
    
    


Train CA_1 4617523 85372 85372
[100]	valid_0's rmse: 2.03753
[200]	valid_0's rmse: 2.03082
[300]	valid_0's rmse: 2.02636
[400]	valid_0's rmse: 2.02506
[500]	valid_0's rmse: 2.02244
[600]	valid_0's rmse: 2.02163
[700]	valid_0's rmse: 2.02151
[800]	valid_0's rmse: 2.02101
[900]	valid_0's rmse: 2.02064
[1000]	valid_0's rmse: 2.02077
[1100]	valid_0's rmse: 2.01978
[1200]	valid_0's rmse: 2.02009
[1300]	valid_0's rmse: 2.01967
[1400]	valid_0's rmse: 2.01918
11.32 min: encode
Train CA_2 4190404 85372 85372
[100]	valid_0's rmse: 1.91443
[200]	valid_0's rmse: 1.87778
[300]	valid_0's rmse: 1.86928
[400]	valid_0's rmse: 1.8652
[500]	valid_0's rmse: 1.86345
[600]	valid_0's rmse: 1.86277
[700]	valid_0's rmse: 1.86181
[800]	valid_0's rmse: 1.86058
[900]	valid_0's rmse: 1.85999
[1000]	valid_0's rmse: 1.8592
[1100]	valid_0's rmse: 1.85882
[1200]	valid_0's rmse: 1.85815
[1300]	valid_0's rmse: 1.85789
[1400]	valid_0's rmse: 1.85813
10.37 min: encode
Train CA_3 4586569 85372 85372
[100]	valid_0's rmse: 2

In [None]:
# Display the assembled submission frame: merged validation predictions
# followed by the evaluation rows taken from the sample submission.
final