## Simple benchmark using LightGBM (LGB)

In [1]:
import os
import sys
import pandas as pd

In [2]:
# Put the project's source folder at the front of the import path so the
# project packages imported below can be resolved from this notebook.
src_dir = os.path.abspath('/home/jupyter/kaggle/predict_future_sales/src/')
sys.path.insert(0, src_dir)

In [3]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules before
# each cell runs, so edits to imported code are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [4]:
import common.com_util as util
import munging.process_data as process_data
import config.constants as constants
import cv.cv_util as cv
import modeling.train_util as model

In [5]:
# Logger handed to the project helpers below; 'sub_2' is the name that shows up in their log lines.
logger = util.get_logger('sub_2')

In [11]:
# Columns kept from each merged table: train keeps the target column,
# test keeps the row identifier.
train_features = ['shop_id', 'item_id', 'date_block_num', 'item_category_id', 'item_cnt_month']
test_features = ['ID', 'shop_id', 'item_id', 'date_block_num', 'item_category_id']

# Read the pre-merged feather files and keep only the selected columns.
train_df = pd.read_feather(
    '/home/jupyter/kaggle/predict_future_sales/data/processed/train_all_merged.feather'
)[train_features]
test_df = pd.read_feather(
    '/home/jupyter/kaggle/predict_future_sales/data/processed/test_all_merged.feather'
)[test_features]

sample_submission = pd.read_feather(
    '/home/jupyter/kaggle/predict_future_sales/data/processed/submission_processed.feather'
)

# Model input for the test rows: the selected test columns without the identifier.
test = test_df.drop(columns=['ID'])

In [29]:
TARGET = 'item_cnt_month'
ID = 'ID'
SEED = 42

# Month indices 0-32 are used for fitting; month 33 is held out for validation.
training_months = list(range(33))
holdout_months = [33]

training, validation = cv.get_data_splits_by_date_block(
    logger,
    train_df,
    train_months=training_months,
    validation_months=holdout_months,
)

# Clip the monthly item count into [0, 20].
# The clipped column is assigned back explicitly instead of calling
# `frame.column.clip(..., inplace=True)`: that chained in-place form operates on
# a Series pulled out of the frame, triggers chained-assignment warnings on
# recent pandas, and leaves the frame unchanged under Copy-on-Write.
training = training.assign(**{TARGET: training[TARGET].clip(lower=0, upper=20)})
validation = validation.assign(**{TARGET: validation[TARGET].clip(lower=0, upper=20)})

[INFO]2020-09-03 04:55:58,463:sub_2:Splitting the data into train and holdout based on months...
[INFO]2020-09-03 04:55:58,464:sub_2:Training months [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32]
[INFO]2020-09-03 04:55:58,465:sub_2:Validation months [33]
[INFO]2020-09-03 04:55:59,528:sub_2:Shape of the training data (10675678, 5) 
[INFO]2020-09-03 04:55:59,529:sub_2:Shape of the validation data (238172, 5)


In [15]:
# LightGBM settings for the baseline regressor.
# NOTE(review): n_estimators and early_stopping_rounds are both 100, so early
# stopping cannot trigger before the round limit is hit — confirm this is intended.
lgb_params = dict(
    objective='root_mean_squared_error',
    boosting_type='gbdt',
    n_estimators=100,
    learning_rate=0.1,
    num_leaves=31,
    tree_learner='serial',
    n_jobs=-1,
    seed=SEED,
    max_depth=-1,
    early_stopping_rounds=100,
    max_bin=255,
    metric='rmse',
    verbose=100,
)

In [16]:
# Feature columns fed to the model: the selected train columns minus the target.
predictors = [
    'shop_id',
    'item_id',
    'date_block_num',
    'item_category_id',
]

In [36]:
# Fit the model on the training months and score it on the held-out month.
# No test frame is handed to the helper here (test_X=None).
bst, validation_score = model.lgb_train_validate_on_holdout(
    logger=logger,
    training=training,
    validation=validation,
    predictors=predictors,
    target=TARGET,
    params=lgb_params,
    test_X=None,
    model_type='lgb',
)

[INFO]2020-09-03 05:03:09,792:sub_2:Shape of train_X : (10675678, 4)
[INFO]2020-09-03 05:03:09,793:sub_2:Shape of train_Y : (10675678,)
[INFO]2020-09-03 05:03:09,794:sub_2:Shape of validation_X : (238172, 4)
[INFO]2020-09-03 05:03:09,796:sub_2:Shape of validation_Y : (238172,)
[INFO]2020-09-03 05:03:09,797:sub_2:Training model with [lgb]




Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 1.03643
Did not meet early stopping. Best iteration is:
[100]	valid_0's rmse: 1.03643
[INFO]2020-09-03 05:03:46,941:sub_2:Validation Score 1.0364347687295354
[INFO]2020-09-03 05:03:46,946:sub_2:Best Iteration 100


In [37]:
# Inspect the best boosting round and the validation metric(s) recorded for it.
bst.best_iteration, bst.best_score

(100,
 defaultdict(collections.OrderedDict,
             {'valid_0': OrderedDict([('rmse', 1.0364347687294806)])}))

In [51]:
# Pull the scalar validation RMSE out of the nested best_score mapping.
bst.best_score['valid_0']['rmse']

1.0364347687294806

In [43]:
# Score the test rows using the trees up to the best validation round.
# num_iteration is passed by keyword: on LightGBM >= 3.0 the second positional
# parameter of Booster.predict is start_iteration, so a positional
# best_iteration would skip those trees instead of selecting them.
# NOTE(review): assumes bst is a lightgbm Booster (it exposes best_iteration / best_score).
# Selecting `predictors` explicitly keeps the feature columns and order used for training.
prediction = bst.predict(test[predictors], num_iteration=bst.best_iteration)

# The training and validation targets were clipped to [0, 20]; keep the
# predictions inside the same range.
prediction = prediction.clip(0, 20)

In [44]:
# Sanity check: number of predictions produced (to compare with the test row count).
len(prediction)

214200

In [45]:
# Sanity check: the test frame's row count should match the number of predictions.
test_df.shape

(214200, 5)

In [46]:
# Peek at the expected submission layout (ID and item_cnt_month columns).
sample_submission.head()

Unnamed: 0,ID,item_cnt_month
0,0,0.5
1,1,0.5
2,2,0.5
3,3,0.5
4,4,0.5


In [28]:
# Pair each test row identifier with its predicted monthly item count.
submission = pd.DataFrame(
    {
        'ID': test_df['ID'],
        'item_cnt_month': prediction,
    }
)

In [36]:
# Write the submission file. An empty path is not a valid destination, so save
# to an explicit location, creating the folder if it does not exist yet.
# NOTE(review): the target folder and file name are a suggested default — adjust as needed.
# index=False keeps the file to the two submission columns (ID, item_cnt_month).
submission_dir = '/home/jupyter/kaggle/predict_future_sales/submissions'
os.makedirs(submission_dir, exist_ok=True)
submission.to_csv(os.path.join(submission_dir, 'sub_2.csv'), index=False)

ID                0
item_cnt_month    0
dtype: int64