# lgb model

In [1]:
import numpy as np
import pandas as pd
import os, sys, gc, time, warnings, pickle, psutil, random, time

from math import ceil

from sklearn.preprocessing import LabelEncoder

warnings.filterwarnings('ignore')

In [2]:
# # monitor 
# def get_memory_usage():
#     return np.round(psutil.Process(os.getpid()).memory_info()[0]/2.**30, 2) 
        
# def sizeof_fmt(num, suffix='B'):
#     for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']:
#         if abs(num) < 1024.0:
#             return "%3.1f%s%s" % (num, unit, suffix)
#         num /= 1024.0
#     return "%.1f%s%s" % (num, 'Yi', suffix)

## Load data

In [3]:
# change the file path if run on different machines
FilePath = "MainData/"

In [4]:
# define a function to reduce the memory
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2 #bytes to MB
    
    # the for loop converts int16 --> int8, int32 --> int 16, etc
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics: 
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[0:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    
    end_mem = df.memory_usage().sum() / 1024**2
    
    if verbose:
        print('Memory usage decreased from {:5.2f} Mb to {:5.2f} Mb ({:.1f}% reduction)'.format(start_mem, end_mem, 100*(start_mem-end_mem)/start_mem))
            
    return df

In [5]:
#%% time
df_train0 = pd.read_csv(FilePath+'sales_train_validation.csv')
df_train0 = reduce_mem_usage(df_train0)
print('The shape of training data is {}'.format(df_train0.shape))

Memory usage decreased from 446.40 Mb to 95.00 Mb (78.7% reduction)
The shape of training data is (30490, 1919)


In [6]:
# df_train0.head()

## lag and Rolling creation

In [7]:
TARGET = 'sales'         # Our main target
END_TRAIN = 1941 #1913         # Last day in train set
MAIN_INDEX = ['id','d']  # We can identify item by these columns

In [8]:
df_grid = pd.read_pickle(FilePath+'grid_part_1.pkl')

In [9]:
df_grid = df_grid[['id','d', TARGET]]

In [10]:
%%time
# create lag features
shifts = [1, 7, 28]
for shift in shifts:
    df_grid['lag'+str(shift)] = df_grid.groupby('id')[TARGET].transform(lambda x: x.shift(shift))
print("The shape of df_grid is {}".format(df_grid.shape))

The shape of df_grid is (47735397, 6)
CPU times: user 38.1 s, sys: 1.89 s, total: 40 s
Wall time: 40 s


In [11]:
%%time
# create rolling features
windows = [7, 28]

# rolling based on original sales
for window in windows:
    df_grid['rolling_mean'+str(window)] = df_grid.groupby('id')[TARGET].transform(lambda x: x.rolling(window).mean()).astype(np.float16)
    df_grid['rolling_std'+str(window)] = df_grid.groupby('id')[TARGET].transform(lambda x: x.rolling(window).std()).astype(np.float16)

CPU times: user 1min 16s, sys: 2.6 s, total: 1min 18s
Wall time: 1min 18s


In [12]:
%%time
# create rolling mean for lags
lags = [7,28]
lag_windows = [7,28]

# rolling based on lags
for lag in lags:
    for lag_window in lag_windows:
        df_grid['lag'+str(lag)+'rolling'+str(lag_window)]=df_grid.groupby('id')[TARGET].transform(lambda x: x.shift(lag).rolling(lag_window).mean()).astype(np.float16)

CPU times: user 1min 27s, sys: 2.62 s, total: 1min 29s
Wall time: 1min 29s


## Save lag and rolling features

In [13]:
print("The shape of df_grid is {}".format(df_grid.shape))
df_grid.head()

The shape of df_grid is (47735397, 14)


Unnamed: 0,id,d,sales,lag1,lag7,lag28,rolling_mean7,rolling_std7,rolling_mean28,rolling_std28,lag7rolling7,lag7rolling28,lag28rolling7,lag28rolling28
0,HOBBIES_1_008_CA_1_evaluation,d_1,12.0,,,,,,,,,,,
1,HOBBIES_1_009_CA_1_evaluation,d_1,2.0,,,,,,,,,,,
2,HOBBIES_1_010_CA_1_evaluation,d_1,0.0,,,,,,,,,,,
3,HOBBIES_1_012_CA_1_evaluation,d_1,0.0,,,,,,,,,,,
4,HOBBIES_1_015_CA_1_evaluation,d_1,4.0,,,,,,,,,,,


In [14]:
df_grid.to_pickle(FilePath+'lag_rolling.pkl')