In [1]:
# General imports
import numpy as np
import pandas as pd
import os, sys, gc, time, warnings, pickle, psutil, random

# custom imports
from multiprocessing import Pool        # Multiprocess Runs

warnings.filterwarnings('ignore')

In [2]:
########################### Helpers
#################################################################################
## Seeder
# :seed to make all processes deterministic     # type: int
def seed_everything(seed=0):
    random.seed(seed)
    np.random.seed(seed)

In [3]:
#PATHS for Features
ORIGINAL = '../input/m5-forecasting-accuracy/'
BASE = 'grid_df.pkl'
FEATURE1     = 'feature1.pkl'
FEATURE2     = 'feature2.pkl'

########################### Model params
#################################################################################
import xgboost as xgb
xgb_params = {
                    'boosting_type': 'gbdt',
                    'objective': 'tweedie',
                    'tweedie_variance_power': 1.1,
                    'metric': 'rmse',
                    'subsample': 0.5,
                    'subsample_freq': 1,
                    'learning_rate': 0.03,
                    'num_leaves': 2**11-1,
                    'min_data_in_leaf': 2**12-1,
                    'feature_fraction': 0.5,
                    'max_bin': 100,
                    'n_estimators': 1400,
                    'boost_from_average': False,
#                     'verbose': -1,
                } 

########################### Vars
#################################################################################
VER = 1                          # Our model version
SEED = 42                        # We want all things
seed_everything(SEED)            # to be as deterministic 
xgb_params['seed'] = SEED        # as possible
N_CORES = psutil.cpu_count()     # Available CPU cores

#LIMITS and const
TARGET      = 'sales'            # Our target
START_TRAIN = 1000                  # We can skip some rows (Nans/faster training)
END_TRAIN   = 1913               # End day of our train set
P_HORIZON   = 28                 # Prediction horizon

#STORES ids
STORES_IDS = pd.read_csv(ORIGINAL+'sales_train_validation.csv')['store_id']
STORES_IDS = list(STORES_IDS.unique())

remove_features = ['id','state_id','store_id',
                   'date','wm_yr_wk','d',TARGET]

In [4]:
########################### Helper to load data by store ID
#################################################################################
# Read data
def get_data_by_store(store):
    
    df = pd.read_pickle(BASE)
    df = df[df['store_id']==store]
    for p in [FEATURE1, FEATURE2]:
        df_feat = pd.read_pickle(p)
        df = pd.merge(df, df_feat.drop(columns=['sales']), on=['id', 'd'], how='left')
        
    # Create features list
    features = [col for col in list(df) if col not in remove_features]
#     df = df[['id','d',TARGET]+features]
    
    train_mask = (df['d']>START_TRAIN)&(df['d']<=END_TRAIN-P_HORIZON)
    valid_mask = (df['d']>(END_TRAIN-P_HORIZON))&(df['d']<=END_TRAIN)
    test_mask = df['d']>END_TRAIN
        
    tran_df = df[train_mask].reset_index(drop=True)
    valid_df = df[valid_mask].reset_index(drop=True)
    test_df = df[test_mask].reset_index(drop=True)
    
    return tran_df, valid_df, test_df, features

In [5]:
predictions = pd.DataFrame()

########################### Train Models
#################################################################################
for store_id in STORES_IDS:
    
    start_time = time.time()
    
    # Get grid for current store
    tran_df, valid_df, test_df, features_columns = get_data_by_store(store_id)
    
    print('Train', store_id, tran_df.shape[0], valid_df.shape[0], test_df.shape[0])
    
    train_data = xgb.DMatrix(tran_df[features_columns].values, label=tran_df[TARGET].values)
    valid_data = xgb.DMatrix(valid_df[features_columns].values, label=valid_df[TARGET].values)
    
    # Launch seeder again to make lgb training 100% deterministic
    # with each "code line" np.random "evolves" 
    # so we need (may want) to "reset" it
    seed_everything(SEED)
    estimator = xgb.train(lgb_params,train_data,evals = [(valid_data,'eval')],verbose_eval = 100)
    
    test_df[TARGET] = estimator.predict(test_df[features_columns])
    predictions = pd.concat([predictions, test_df], axis=0)
    
    print('%0.2f min: train' % ((time.time() - start_time) / 60))
    
submission = pd.read_csv(ORIGINAL+'sample_submission.csv')
predictions = predictions[['id', 'date', 'sales']]
predictions = pd.pivot(predictions, index = 'id', columns = 'date', values = 'sales').reset_index()
predictions.columns = ['id'] + ['F' + str(i + 1) for i in range(28)]
evaluation_rows = [row for row in submission['id'] if 'evaluation' in row] 
evaluation = submission[submission['id'].isin(evaluation_rows)]
validation = submission[['id']].merge(predictions, on = 'id')
final = pd.concat([validation, evaluation])
    

Train CA_1 2593043 85372 85372


ValueError: could not convert string to float: 'HOBBIES_1_001'

In [6]:
tran_df.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,release,wm_yr_wk,...,enc_store_id_cat_id_mean,enc_store_id_cat_id_std,enc_store_id_dept_id_mean,enc_store_id_dept_id_std,enc_item_id_mean,enc_item_id_std,enc_item_id_state_id_mean,enc_item_id_state_id_std,enc_item_id_store_id_mean,enc_item_id_store_id_std
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,1001,2.0,224,11339,...,1.004883,3.128906,1.261719,3.548828,0.402588,0.739258,0.619629,0.896973,0.57959,0.837402
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,1001,0.0,20,11339,...,1.004883,3.128906,1.261719,3.548828,0.275146,0.604004,0.221069,0.512695,0.281494,0.589844
2,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,1001,0.0,5,11339,...,1.004883,3.128906,1.261719,3.548828,2.072266,2.675781,3.0,3.324219,1.75,1.996094
3,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,1001,1.0,16,11339,...,1.004883,3.128906,1.261719,3.548828,0.810059,1.249023,1.027344,1.363281,1.021484,1.310547
4,HOBBIES_1_006_CA_1_validation,HOBBIES_1_006,HOBBIES_1,HOBBIES,CA_1,CA,1001,1.0,109,11339,...,1.004883,3.128906,1.261719,3.548828,0.876953,1.648438,0.95166,1.700195,1.114258,1.713867
