# lgb model

In [1]:
import numpy as np
import pandas as pd
import os, sys, gc, time, warnings, pickle, psutil, random, time

from math import ceil

from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
warnings.filterwarnings('ignore')

import lightgbm as lgb

In [2]:
# # monitor 
# def get_memory_usage():
#     return np.round(psutil.Process(os.getpid()).memory_info()[0]/2.**30, 2) 
        
# def sizeof_fmt(num, suffix='B'):
#     for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']:
#         if abs(num) < 1024.0:
#             return "%3.1f%s%s" % (num, unit, suffix)
#         num /= 1024.0
#     return "%.1f%s%s" % (num, 'Yi', suffix)

## Load data

In [5]:
# change the file path if run on different machines
FilePath = "MainData/"

In [4]:
# define a function to reduce the memory
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2 #bytes to MB
    
    # the for loop converts int16 --> int8, int32 --> int 16, etc
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics: 
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[0:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    
    end_mem = df.memory_usage().sum() / 1024**2
    
    if verbose:
        print('Memory usage decreased from {:5.2f} Mb to {:5.2f} Mb ({:.1f}% reduction)'.format(start_mem, end_mem, 100*(start_mem-end_mem)/start_mem))
            
    return df

In [5]:
# %% time
# df_train0 = pd.read_csv(FilePath+'sales_train_validation.csv')
# df_train0 = reduce_mem_usage(df_train0)
# print('The shape of training data is {}'.format(df_train0.shape))

In [6]:
df_grid = pd.concat([pd.read_pickle(FilePath+'grid_part_1.pkl'), 
                     pd.read_pickle(FilePath+'grid_part_2.pkl').iloc[:,2:],
                     pd.read_pickle(FilePath+'grid_part_3.pkl').iloc[:,2:]], axis=1)
print("the shape of the grid is {}".format(df_grid.shape))

the shape of the grid is (47735397, 38)


**change d column to int16**

In [7]:
df_grid[['col1','col2']] = df_grid['d'].str.split('_', expand=True)
df_grid['d'] = df_grid['col2']
df_grid['d'] = df_grid['d'].astype('int16')
df_grid.drop(['col1','col2'], axis=1, inplace=True)

In [8]:
# save the combined file
df_grid.to_pickle(FilePath+'grid_part123.pkl')

In [9]:
#print(df_grid.info())
df_grid.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,release,wm_yr_wk,...,snap_CA,snap_TX,snap_WI,tm_d,tm_w,tm_m,tm_y,tm_dw,tm_wm,tm_w_end
0,HOBBIES_1_008_CA_1_evaluation,HOBBIES_1_008,HOBBIES_1,HOBBIES,CA_1,CA,1,12.0,11101,11101,...,0,0,0,29,4,1,0,5,5,1
1,HOBBIES_1_009_CA_1_evaluation,HOBBIES_1_009,HOBBIES_1,HOBBIES,CA_1,CA,1,2.0,11101,11101,...,0,0,0,29,4,1,0,5,5,1
2,HOBBIES_1_010_CA_1_evaluation,HOBBIES_1_010,HOBBIES_1,HOBBIES,CA_1,CA,1,0.0,11101,11101,...,0,0,0,29,4,1,0,5,5,1
3,HOBBIES_1_012_CA_1_evaluation,HOBBIES_1_012,HOBBIES_1,HOBBIES,CA_1,CA,1,0.0,11101,11101,...,0,0,0,29,4,1,0,5,5,1
4,HOBBIES_1_015_CA_1_evaluation,HOBBIES_1_015,HOBBIES_1,HOBBIES,CA_1,CA,1,4.0,11101,11101,...,0,0,0,29,4,1,0,5,5,1


## Encoding mean

In [7]:
# to be sure that our grids are aligned by index
df_grid = pd.read_pickle(FilePath+'grid_part_1.pkl')
#df_grid[TARGET][df_grid['d']>(1913-28)] = np.nan
base_cols = list(df_grid)

In [8]:
icols =  [
            ['state_id'],
            ['store_id'],
            ['cat_id'],
            ['dept_id'],
            ['state_id', 'cat_id'],
            ['state_id', 'dept_id'],
            ['store_id', 'cat_id'],
            ['store_id', 'dept_id'],
            ['item_id'],
            ['item_id', 'state_id'],
            ['item_id', 'store_id']
            ]

In [9]:
df_grid

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,release,wm_yr_wk
0,HOBBIES_1_008_CA_1_evaluation,HOBBIES_1_008,HOBBIES_1,HOBBIES,CA_1,CA,d_1,12.0,11101,11101
1,HOBBIES_1_009_CA_1_evaluation,HOBBIES_1_009,HOBBIES_1,HOBBIES,CA_1,CA,d_1,2.0,11101,11101
2,HOBBIES_1_010_CA_1_evaluation,HOBBIES_1_010,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0.0,11101,11101
3,HOBBIES_1_012_CA_1_evaluation,HOBBIES_1_012,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0.0,11101,11101
4,HOBBIES_1_015_CA_1_evaluation,HOBBIES_1_015,HOBBIES_1,HOBBIES,CA_1,CA,d_1,4.0,11101,11101
...,...,...,...,...,...,...,...,...,...,...
47735392,FOODS_3_823_WI_3_evaluation,FOODS_3_823,FOODS_3,FOODS,WI_3,WI,d_1969,,11101,11621
47735393,FOODS_3_824_WI_3_evaluation,FOODS_3_824,FOODS_3,FOODS,WI_3,WI,d_1969,,11101,11621
47735394,FOODS_3_825_WI_3_evaluation,FOODS_3_825,FOODS_3,FOODS,WI_3,WI,d_1969,,11101,11621
47735395,FOODS_3_826_WI_3_evaluation,FOODS_3_826,FOODS_3,FOODS,WI_3,WI,d_1969,,11331,11621


In [10]:
for col in icols:
    col_name = '_'+'_'.join(col)+'_'
    df_grid['enc'+col_name+'mean'] = df_grid.groupby(col)['sales'].transform('mean').astype(np.float16)
    df_grid['enc'+col_name+'std'] = df_grid.groupby(col)['sales'].transform('std').astype(np.float16)

In [11]:
keep_cols = [col for col in list(df_grid) if col not in base_cols]
df_encoding = df_grid[['id','d']+keep_cols]

## Save customized features

In [12]:
print("The shape of df_enconding is {}".format(df_encoding.shape))
df_encoding.head()

The shape of df_enconding is (47735397, 24)


Unnamed: 0,id,d,enc_state_id_mean,enc_state_id_std,enc_store_id_mean,enc_store_id_std,enc_cat_id_mean,enc_cat_id_std,enc_dept_id_mean,enc_dept_id_std,...,enc_store_id_cat_id_mean,enc_store_id_cat_id_std,enc_store_id_dept_id_mean,enc_store_id_dept_id_std,enc_item_id_mean,enc_item_id_std,enc_item_id_state_id_mean,enc_item_id_state_id_std,enc_item_id_store_id_mean,enc_item_id_store_id_std
0,HOBBIES_1_008_CA_1_evaluation,d_1,1.573242,4.570312,1.635742,4.449219,0.708984,2.251953,0.865234,2.537109,...,1.003906,3.115234,1.259766,3.533203,4.683594,7.148438,6.582031,8.75,7.285156,9.179688
1,HOBBIES_1_009_CA_1_evaluation,d_1,1.573242,4.570312,1.635742,4.449219,0.708984,2.251953,0.865234,2.537109,...,1.003906,3.115234,1.259766,3.533203,0.849609,1.754883,1.137695,2.107422,1.178711,2.013672
2,HOBBIES_1_010_CA_1_evaluation,d_1,1.573242,4.570312,1.635742,4.449219,0.708984,2.251953,0.865234,2.537109,...,1.003906,3.115234,1.259766,3.533203,0.610352,0.861816,0.562988,0.829102,0.716797,0.919434
3,HOBBIES_1_012_CA_1_evaluation,d_1,1.573242,4.570312,1.635742,4.449219,0.708984,2.251953,0.865234,2.537109,...,1.003906,3.115234,1.259766,3.533203,0.381104,0.688965,0.423584,0.723633,0.39209,0.646973
4,HOBBIES_1_015_CA_1_evaluation,d_1,1.573242,4.570312,1.635742,4.449219,0.708984,2.251953,0.865234,2.537109,...,1.003906,3.115234,1.259766,3.533203,4.417969,6.679688,6.910156,8.359375,6.015625,7.324219


In [13]:
df_grid.to_pickle(FilePath+'encoding.pkl')

In [14]:
2+2

4

In [6]:
cus_features = pd.read_pickle(FilePath+'encoding.pkl')
lag_features = pd.read_pickle(FilePath+'lag_rolling.pkl')
basic_features = pd.read_pickle(FilePath+'grid_part123.pkl')

In [7]:
cus_features.columns

Index(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'd',
       'sales', 'release', 'wm_yr_wk', 'enc_state_id_mean', 'enc_state_id_std',
       'enc_store_id_mean', 'enc_store_id_std', 'enc_cat_id_mean',
       'enc_cat_id_std', 'enc_dept_id_mean', 'enc_dept_id_std',
       'enc_state_id_cat_id_mean', 'enc_state_id_cat_id_std',
       'enc_state_id_dept_id_mean', 'enc_state_id_dept_id_std',
       'enc_store_id_cat_id_mean', 'enc_store_id_cat_id_std',
       'enc_store_id_dept_id_mean', 'enc_store_id_dept_id_std',
       'enc_item_id_mean', 'enc_item_id_std', 'enc_item_id_state_id_mean',
       'enc_item_id_state_id_std', 'enc_item_id_store_id_mean',
       'enc_item_id_store_id_std'],
      dtype='object')

In [8]:
basic_features.columns

Index(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'd',
       'sales', 'release', 'wm_yr_wk', 'sell_price', 'price_max', 'price_min',
       'price_std', 'price_mean', 'price_norm', 'price_nunique',
       'item_nunique', 'month', 'year', 'price_momentum_m', 'price_momentum_y',
       'price_momentum', 'date', 'event_name_1', 'event_type_1',
       'event_name_2', 'event_type_2', 'snap_CA', 'snap_TX', 'snap_WI', 'tm_d',
       'tm_w', 'tm_m', 'tm_y', 'tm_dw', 'tm_wm', 'tm_w_end'],
      dtype='object')