In [2]:
## In this kernel I would like to show:
## 1. Feature engineering (FE) creation approaches
## 2. Sequential FE validation
## 3. Dimension reduction
## 4. FE validation by permutation importance
## 5. Mean encodings
## 6. Parallelization for FE

In [3]:
import numpy as np 
import pandas as pd 
import os, sys, gc, warnings, psutil, random
# from sklearn.metrics import rm

warnings.filterwarnings('ignore')

In [10]:
########################### Global configuration
#################################################################################

# Global variables shared by the cells below

SEED = 42             # Single random seed used for everything
random.seed(SEED)     # Seed Python's RNG to make all tests "deterministic"
np.random.seed(SEED)  # Seed NumPy's global RNG (np.random.permutation relies on it)

# psutil.cpu_count() returns None when the number of CPUs cannot be
# determined; fall back to a single core so N_CORES is always an integer.
N_CORES = psutil.cpu_count() or 1

TARGET = 'sales'      # Our Target
END_TRAIN = 1913      # Last training day; its last 28 days are used as validation

In [4]:
## RMSE
def rmse(y, y_pred):
    """Return the root-mean-squared error between `y` and `y_pred`."""
    residuals = y - y_pred
    mean_squared_error = np.mean(np.square(residuals))
    return np.sqrt(mean_squared_error)

def permutation_importance(model, validation_df, features_columns, target, metric=rmse, verbose=0):
    """Score every non-categorical feature by how much shuffling it hurts the metric.

    The model first predicts on the untouched validation data to obtain a
    base score. Then, one feature at a time, the feature's values are
    randomly permuted (np.random.permutation), the model predicts again and
    the change `permuted_score - base_score` is recorded, rounded to 4 digits.

    model            -- fitted estimator exposing `.predict(frame)`
    validation_df    -- frame holding `features_columns` and `target`;
                        it is NOT modified (a private copy is scored)
    features_columns -- feature names, in the order the model expects
    target           -- name of the target column
    metric           -- callable `metric(y_true, y_pred)` returning a score
    verbose          -- values > 0 print the base score and every difference

    Returns a DataFrame with columns 'feature' and 'permutation_importance',
    sorted by importance in descending order. Columns of dtype 'category'
    are skipped; if nothing is left the returned frame is empty.
    """
    # Score a private copy: the caller's frame (often a slice of a bigger
    # frame) must not silently gain a 'preds' column.
    scored_df = validation_df.copy()
    scored_df['preds'] = model.predict(scored_df[features_columns])
    base_score = metric(scored_df[target], scored_df['preds'])
    if verbose>0:
        print('Standart RMSE', base_score)

    rows = []
    for col in features_columns:

        # Permuting a "categorical" column would disrupt its categories,
        # so only non-categorical features are tested. Checking first also
        # avoids copying the frame for columns that are skipped anyway.
        if scored_df[col].dtypes.name == 'category':
            continue

        # Fresh copy on each run so every feature starts from the original state
        shuffled_df = scored_df.copy()
        shuffled_df[col] = np.random.permutation(shuffled_df[col].values)
        shuffled_df['preds'] = model.predict(shuffled_df[features_columns])
        cur_score = metric(shuffled_df[target], shuffled_df['preds'])

        # If the permuted score is lower than the base score
        # the feature is most probably a bad one
        # and the model is learning on noise
        score_change = np.round(cur_score - base_score, 4)
        rows.append({'feature':col, 'permutation_importance':score_change})
        if verbose>0:
            print(col, score_change)

    # Explicit column names keep sort_values working when `rows` is empty
    importance_df = pd.DataFrame(rows, columns=['feature', 'permutation_importance'])
    return importance_df.sort_values(by=['permutation_importance'], ascending=False)


# permutation_importance_df = permutation_importance(estimator, valid_df, features_columns, TARGET, metric=rmse, verbose=1)

In [10]:
########################### Load data
########################### Basic features were created here:
########################### https://www.kaggle.com/kyakovlev/m5-simple-fe
#################################################################################

# Read the three pickled grid parts and join them column-wise.
# Parts 2 and 3 contribute everything except their first two columns.
grid_df = pd.concat(
    [
        pd.read_pickle(part_path).iloc[:, first_col:]
        for part_path, first_col in (
            ('./grid_part_1.pkl', 0),
            ('./grid_part_2.pkl', 2),
            ('./grid_part_3.pkl', 2),
        )
    ],
    axis=1,
)

# Subsampling
# to make all calculations faster:
# split the unique ids into 20 chunks and keep the first one (5% of ids).
id_chunks = np.array_split(list(grid_df['id'].unique()), 20)
keep_id = id_chunks[0]
grid_df = grid_df[grid_df['id'].isin(keep_id)].reset_index(drop=True)

# Let's "inspect" our grid DataFrame
grid_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2960025 entries, 0 to 2960024
Data columns (total 39 columns):
id                  category
item_id             category
dept_id             category
cat_id              category
store_id            category
state_id            category
d                   int16
sales               float64
release             int16
sell_price          float16
price_max           float16
price_min           float16
price_std           float16
price_mean          float16
price_norm          float16
price_nunique       float16
item_nunique        int16
price_momentum      float16
price_momentum_m    float16
price_momentum_y    float16
event_name_1        category
event_type_1        category
event_name_2        category
event_type_2        category
snap_CA             category
snap_TX             category
snap_WI             category
is_Halloween        category
is_ValentinesDay    category
is_Thanksgiving     category
is_Christmas        category
is_NewYe

In [11]:
########################### Baseline model
#################################################################################

# Global variables used by the rest of the notebook

SEED = 42             # Our random seed for everything
random.seed(SEED)     # to make all tests "deterministic"
np.random.seed(SEED)

# psutil.cpu_count() returns None when the number of CPUs cannot be
# determined; fall back to a single core so N_CORES is always an integer.
N_CORES = psutil.cpu_count() or 1

TARGET = 'sales'      # Our Target
END_TRAIN = 1913      # And we will use last 28 days as validation

# Drop the "TEST" part of the grid (days 1914 and later)
grid_df = grid_df[grid_df['d']<=END_TRAIN].reset_index(drop=True)

# Features that we want to exclude from training
remove_features = ['id','d',TARGET]

# Our baseline model serves
# to do fast checks of
# new features performance

# We will use LightGBM for our tests
import lightgbm as lgb
lgb_params = {
                    'boosting_type': 'gbdt',         # Standard boosting type
                    'objective': 'regression',       # Standard loss for RMSE
                    'metric': ['rmse'],              # as we will use rmse as metric "proxy"
                    'subsample': 0.8,
                    'subsample_freq': 1,
                    'learning_rate': 0.05,           # 0.05 is "fast enough" for us
                    'num_leaves': 2**7-1,            # We will need model only for fast check
                    'min_data_in_leaf': 2**8-1,      # So we want it to train faster even with drop in generalization
                    'feature_fraction': 0.8,
                    'n_estimators': 5000,            # We don't want to limit training (you can change 5000 to any big enough number)
                    'early_stopping_rounds': 30,     # We will stop training almost immediately (if it stops improving)
                    'seed': SEED,
                    'verbose': -1,
                }

## RMSE
def rmse(y, y_pred):
    """Root-mean-squared error of predictions `y_pred` against targets `y`."""
    squared_error = np.square(y - y_pred)
    return np.sqrt(np.mean(squared_error))

# Small function to make fast feature tests
# estimator, permutation_importance_df = make_fast_test(grid_df)
# it returns the lgb booster (and optionally permutation importances) for future analysis
def make_fast_test(df, permutate=False):
    """Train the quick LightGBM baseline on `df` and optionally score its features.

    Every column of `df` not listed in `remove_features` is used as a
    feature. Rows with d <= END_TRAIN-28 form the training set, rows with
    d > END_TRAIN-28 the validation set.

    df        -- feature grid containing 'd', TARGET and the feature columns
    permutate -- when True, also run permutation_importance on the
                 validation rows

    Returns a tuple `(estimator, permutation_importance_df)`; the second
    element is None when `permutate` is False.
    """
    features_columns = [col for col in list(df) if col not in remove_features]

    # Build each split mask once instead of re-evaluating it for every slice
    is_train = df['d']<=(END_TRAIN-28)
    is_valid = df['d']>(END_TRAIN-28)

    tr_x, tr_y = df.loc[is_train, features_columns], df.loc[is_train, TARGET]
    vl_x, vl_y = df.loc[is_valid, features_columns], df.loc[is_valid, TARGET]

    train_data = lgb.Dataset(tr_x, label=tr_y)
    valid_data = lgb.Dataset(vl_x, label=vl_y)

    estimator = lgb.train(
                            lgb_params,
                            train_data,
                            valid_sets = [train_data,valid_data],
                            verbose_eval = 500,
                        )
    if permutate:
        # Hand over an independent copy: the helper may add columns to the
        # frame it receives and must not write into a slice of `df`.
        permutation_importance_df = permutation_importance(estimator, df[is_valid].copy(), features_columns, TARGET, metric=rmse, verbose=0)
    else:
        permutation_importance_df = None

    return estimator, permutation_importance_df

# Make baseline model
# make_fast_test returns (booster, permutation importances); with the
# default permutate=False the second element is None.
baseline_model,permutation_importance_df = make_fast_test(grid_df)

Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[302]	training's rmse: 2.83927	valid_1's rmse: 2.39352


In [None]:
########################### Lets test our normal Lags (7 days)
########################### Some more info about lags here:
########################### https://www.kaggle.com/kyakovlev/m5-lags-features
#################################################################################

# Small helper to make lags creation faster
from multiprocessing import Pool                # Multiprocess Runs

## Multiprocessing run.
# This function is NOT 'bulletproof', be careful and pass only correct types of variables.
def df_parallelize_run(func, t_split):
    """Run `func` over `t_split` in a process pool and join the results column-wise.

    func    -- function applied to each element of `t_split`; it is sent to
               worker processes via Pool.map and must return a pandas object
               that pd.concat can combine along axis=1
    t_split -- sequence of work items (e.g. a list of lag days), one call of
               `func` per item

    Returns the column-wise concatenation of all results, in the order of
    `t_split`.

    Raises ValueError when `t_split` is empty.
    """
    if len(t_split) == 0:
        # A pool cannot be created with zero workers; fail with a clear message
        raise ValueError('t_split must contain at least one item')

    # Never start more workers than there are work items.
    # `N_CORES or 1` guards against an undetermined (None) CPU count.
    num_cores = min(N_CORES or 1, len(t_split))
    pool = Pool(num_cores)
    try:
        frames = pool.map(func, t_split)
    finally:
        # Always release the worker processes, even if `func` raised
        pool.close()
        pool.join()
    return pd.concat(frames, axis=1)

def make_normal_lag(lag_day):
    """Return a one-column frame with TARGET shifted by `lag_day` rows within each 'id'.

    lag_day -- number of rows to shift by inside every 'id' group

    The column is named 'sales_lag_<lag_day>', stored as float16 and keeps
    the index of the module-level `grid_df`, so it can be concatenated back
    column-wise.

    NOTE: reads `grid_df` from the global namespace (not ideal, but it keeps
    the signature usable with Pool.map). The shift is positional, so it
    presumably relies on rows being ordered by day within each id - confirm.
    """
    col_name = 'sales_lag_'+str(lag_day)
    # The built-in group-wise shift gives the same result as
    # transform(lambda x: x.shift(lag_day)) without calling Python code per
    # group, and avoids assigning into a slice of the global frame.
    lagged = grid_df.groupby(['id'])[TARGET].shift(lag_day)
    return lagged.astype(np.float16).to_frame(col_name)

# Build the lags for days 1..7 in parallel
# and attach them to the grid as new columns
LAGS_SPLIT = list(range(1, 1+7))
grid_df = pd.concat(
    [grid_df, df_parallelize_run(make_normal_lag, LAGS_SPLIT)],
    axis=1,
)

# Train the quick model on the extended grid
test_model, permutation_importance_df = make_fast_test(grid_df)

In [13]:
########################### Permutation importance Test
########################### https://www.kaggle.com/dansbecker/permutation-importance @dansbecker
#################################################################################

# Build the validation dataset and the list of model features
features_columns = [col for col in grid_df.columns if col not in remove_features]
validation_df = grid_df[grid_df['d']>(END_TRAIN-28)].reset_index(drop=True)

# Reference prediction with the unmodified features; its score is the baseline
validation_df['preds'] = test_model.predict(validation_df[features_columns])
base_score = rmse(validation_df[TARGET], validation_df['preds'])
print('Standart RMSE', base_score)


# Shuffle one feature at a time and measure how the score moves
for col in features_columns:

    # Fresh copy of the validation set on each run,
    # so every feature starts from the original state
    temp_df = validation_df.copy()

    # "Categorical" features cannot go through np.random.permutation
    # without disrupting their categories, so they are skipped
    if temp_df[col].dtypes.name == 'category':
        continue

    temp_df[col] = np.random.permutation(temp_df[col].values)
    temp_df['preds'] = test_model.predict(temp_df[features_columns])
    cur_score = rmse(temp_df[TARGET], temp_df['preds'])

    # If the permuted rmse is lower than the base score,
    # the feature is most probably a bad one
    # and the model is learning on noise
    print(col, np.round(cur_score - base_score, 4))

# Remove Temp data
del temp_df, validation_df

# Remove test features
# As we will compare performance with baseline model for now
keep_cols = [col for col in grid_df.columns if 'sales_lag_' not in col]
grid_df = grid_df[keep_cols]


# Results:
## Lags with 1 days shift (nearest past) are important
## Some other features are not important and probably just noise
## Better make several Permutation runs to confirm useless of the feature
## link again https://www.kaggle.com/dansbecker/permutation-importance @dansbecker

## price_nunique -0.002 : strong negative values are most probably noise
## price_max -0.0002 : values close to 0 need deeper investigation

Standart RMSE 2.2671115304002565
release 0.0
sell_price 0.003
price_max -0.0005
price_min 0.0002
price_std 0.0063
price_mean 0.003
price_norm 0.0083
price_nunique -0.0022
item_nunique 0.0012
price_momentum 0.0002
price_momentum_m 0.0085
price_momentum_y 0.001
tm_d 0.0034
tm_w -0.0
tm_m 0.0
tm_y 0.0
tm_wm 0.0001
tm_dw 0.0951
tm_w_end 0.0139
sales_lag_1 0.4595
sales_lag_2 0.021
sales_lag_3 0.0037
sales_lag_4 0.0121
sales_lag_5 0.0119
sales_lag_6 0.0122
sales_lag_7 0.0278


In [None]:
# Display the permutation-importance table assigned by the make_fast_test
# calls above (it is None when make_fast_test ran with permutate=False)
permutation_importance_df

In [14]:
########################### Lets test far away Lags (7 days with 56 days shift)
########################### and check permutation importance
#################################################################################

LAGS_SPLIT = list(range(56, 56+7))
grid_df = pd.concat([grid_df, df_parallelize_run(make_normal_lag,LAGS_SPLIT)], axis=1)

# make_fast_test returns a (booster, permutation_importance_df) tuple.
# Unpack it - otherwise test_model would be the tuple itself and
# test_model.predict(...) would fail.
test_model, permutation_importance_df = make_fast_test(grid_df)

features_columns = [col for col in list(grid_df) if col not in remove_features]
validation_df = grid_df[grid_df['d']>(END_TRAIN-28)].reset_index(drop=True)

# Use the shared helper instead of a copy of its loop; verbose=1 prints the
# base score and the per-feature score change in the same format as before.
permutation_importance_df = permutation_importance(test_model, validation_df, features_columns, TARGET, metric=rmse, verbose=1)

del validation_df

# Remove test features
# As we will compare performance with baseline model for now
keep_cols = [col for col in list(grid_df) if 'sales_lag_' not in col]
grid_df = grid_df[keep_cols]


# Results:
## Lags with 56 days shift (far away past) are not as important
## as nearest past lags
## and at some point will be just noise for our model

Training until validation scores don't improve for 30 rounds
[500]	training's rmse: 2.74855	valid_1's rmse: 2.37852
Early stopping, best iteration is:
[517]	training's rmse: 2.74172	valid_1's rmse: 2.37664
Standart RMSE 2.376639096408089
release 0.0
sell_price 0.0064
price_max 0.0075
price_min 0.0043
price_std 0.0021
price_mean 0.0018
price_norm 0.0111
price_nunique 0.0188
item_nunique 0.0076
price_momentum 0.0006
price_momentum_m 0.0364
price_momentum_y 0.0088
tm_d 0.0042
tm_w 0.0042
tm_m 0.0015
tm_y 0.0
tm_wm -0.0007
tm_dw 0.1074
tm_w_end 0.0153
sales_lag_56 0.0148
sales_lag_57 -0.0022
sales_lag_58 0.0182
sales_lag_59 0.0033
sales_lag_60 0.0036
sales_lag_61 -0.0031
sales_lag_62 0.0019


In [16]:
########################### PCA
#################################################################################

# The main question here - can we have 
# almost same rmse boost with less features
# less dimensionality?

# Lets try PCA and make 7->3 dimensionality reduction

# PCA is "unsupervised" learning
# and with shifted target we can be sure
# that we have no Target leakage
from sklearn.decomposition import PCA

def make_pca(df, pca_col, n_days, fill_value=0):
    """Project `n_days` lagged copies of the scaled target onto principal components.

    Steps:
      1. keep only `pca_col`, 'd' and TARGET (summing TARGET per
         (pca_col, d) when `pca_col` is not 'id');
      2. divide TARGET by its maximum;
      3. build lags 1..n_days of TARGET within each `pca_col` group;
      4. replace missing lags with `fill_value` and fit/apply a PCA.

    df         -- frame containing `pca_col`, 'd' and TARGET
    pca_col    -- column that identifies one series
    n_days     -- number of lag columns fed into the PCA
    fill_value -- replacement for lags that do not exist (default 0)

    Returns a frame with the first three transformed columns. They hold
    principal components, although they keep the names of the first three
    lag columns ('<TARGET>_pca_<pca_col><n_days>_1' ...).

    NOTE: for `pca_col` != 'id' the result is on the aggregated level and is
    not merged back to the rows of `df` (that step is skipped here).
    The lags are positional, so rows are presumably expected to be ordered
    by day within each series - confirm against the caller.
    """
    print('PCA:', pca_col, n_days)

    # Explicit copy: the columns added below must never touch `df`
    pca_df = df[[pca_col,'d',TARGET]].copy()

    # If we are doing pca for other series "levels"
    # we need to agg first
    if pca_col != 'id':
        pca_df = pca_df.groupby([pca_col,'d'])[TARGET].agg(['sum']).reset_index()
        pca_df = pca_df.rename(columns={'sum': TARGET})

    # Scale by the maximum
    pca_df[TARGET] = pca_df[TARGET]/pca_df[TARGET].max()

    # Lags via the built-in group-wise shift (same result as
    # transform(lambda x: x.shift(l)), without a Python call per group)
    format_s = '{}_pca_'+pca_col+str(n_days)+'_{}'
    target_by_series = pca_df.groupby([pca_col])[TARGET]
    lag_columns = {
        format_s.format(TARGET, lag): target_by_series.shift(lag)
        for lag in range(1, n_days+1)
    }
    pca_columns = list(lag_columns)
    pca_df = pca_df.assign(**lag_columns)

    pca_df[pca_columns] = pca_df[pca_columns].fillna(fill_value)
    pca = PCA(random_state=SEED)

    # fit + transform kept as two steps (fit_transform would also work)
    pca.fit(pca_df[pca_columns])
    pca_df[pca_columns] = pca.transform(pca_df[pca_columns])

    print(pca.explained_variance_ratio_)

    # we will keep only 3 most "valuable" columns/dimensions
    keep_cols = pca_columns[:3]
    print('Columns to keep:', keep_cols)

    return pca_df[keep_cols]


# Make PCA
grid_df = pd.concat([grid_df, make_pca(grid_df,'id',7)], axis=1)

# Make features test
# make_fast_test returns a (booster, permutation_importance_df) tuple.
# Unpack it - otherwise test_model would be the tuple itself and
# test_model.predict(...) would fail.
test_model, permutation_importance_df = make_fast_test(grid_df)

features_columns = [col for col in list(grid_df) if col not in remove_features]
validation_df = grid_df[grid_df['d']>(END_TRAIN-28)].reset_index(drop=True)

# Use the shared helper instead of a copy of its loop; verbose=1 prints the
# base score and the per-feature score change in the same format as before.
permutation_importance_df = permutation_importance(test_model, validation_df, features_columns, TARGET, metric=rmse, verbose=1)

# Remove test features
# As we will compare performance with baseline model for now
keep_cols = [col for col in list(grid_df) if '_pca_' not in col]
grid_df = grid_df[keep_cols]

PCA: id 7
[0.72243389 0.06622603 0.05933126 0.04200092 0.0388851  0.03610057
 0.03502223]
Columns to keep: ['sales_pca_id7_1', 'sales_pca_id7_2', 'sales_pca_id7_3']
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[424]	training's rmse: 2.60036	valid_1's rmse: 2.27278
Standart RMSE 2.2727815880775624
release 0.0
sell_price 0.0097
price_max 0.0008
price_min -0.0005
price_std 0.002
price_mean 0.0021
price_norm 0.0049
price_nunique -0.0063
item_nunique -0.0011
price_momentum 0.0
price_momentum_m 0.0165
price_momentum_y 0.0009
tm_d 0.0103
tm_w -0.0001
tm_m 0.0016
tm_y 0.0
tm_wm 0.0005
tm_dw 0.208
tm_w_end 0.0073
sales_pca_id7_1 1.4509
sales_pca_id7_2 0.016
sales_pca_id7_3 0.007


In [19]:
########################### Mean/std target encoding
#################################################################################

# We will use these three columns for test
# (in combination with store_id)
icols = ['item_id','cat_id','dept_id']

# But we can use any other column or even multiple groups
# like these ones
#            'state_id',
#            'store_id',
#            'cat_id',
#            'dept_id',
#            ['state_id', 'cat_id'],
#            ['state_id', 'dept_id'],
#            ['store_id', 'cat_id'],
#            ['store_id', 'dept_id'],
#            'item_id',
#            ['item_id', 'state_id'],
#            ['item_id', 'store_id']

# There are several ways to do "mean" encoding
## K-fold scheme
## LOO (leave one out)
## Smoothed/regularized 
## Expanding mean
## etc 

# You can test as many options as you want
# and decide what to use
# Because of memory issues you can't 
# use many features.

# We will use simple target encoding
# by std and mean agg
for col in icols:
    print('Encoding', col)
    # Aggregate on the training part only (same END_TRAIN-28 cut-off as the
    # validation split below), so validation targets never reach the encoding
    temp_df = grid_df[grid_df['d']<=(END_TRAIN-28)]

    temp_df = temp_df.groupby([col,'store_id']).agg({TARGET: ['std','mean']})
    joiner = '_'+col+'_encoding_'
    # Flatten the (TARGET, stat) column pairs into '<TARGET>_<col>_encoding_<stat>'
    temp_df.columns = [joiner.join(name_parts).strip() for name_parts in temp_df.columns.values]
    temp_df = temp_df.reset_index()
    grid_df = grid_df.merge(temp_df, on=[col,'store_id'], how='left')
    del temp_df

# Make features test
# make_fast_test returns a (booster, permutation_importance_df) tuple.
# Unpack it - otherwise test_model would be the tuple itself and
# test_model.predict(...) would fail.
test_model, permutation_importance_df = make_fast_test(grid_df)

features_columns = [col for col in list(grid_df) if col not in remove_features]
validation_df = grid_df[grid_df['d']>(END_TRAIN-28)].reset_index(drop=True)

# Use the shared helper instead of a copy of its loop; verbose=1 prints the
# base score and the per-feature score change in the same format as before.
permutation_importance_df = permutation_importance(test_model, validation_df, features_columns, TARGET, metric=rmse, verbose=1)


# Remove test features
keep_cols = [col for col in list(grid_df) if '_encoding_' not in col]
grid_df = grid_df[keep_cols]

# Caveat: for some items the encoding is built from
# both past and future values.
# But we are looking for "categorical" similarity
# in the "long run", so using the future here is not a big problem.

Encoding item_id
Encoding cat_id
Encoding dept_id
Training until validation scores don't improve for 30 rounds
[500]	training's rmse: 2.74303	valid_1's rmse: 2.37972
Early stopping, best iteration is:
[490]	training's rmse: 2.74583	valid_1's rmse: 2.37905
Standart RMSE 2.3790545391991853
release 0.0
sell_price 0.0254
price_max 0.0095
price_min 0.0001
price_std 0.0081
price_mean 0.0004
price_norm 0.0117
price_nunique -0.0045
item_nunique -0.0026
price_momentum -0.0
price_momentum_m 0.0345
price_momentum_y 0.0109
tm_d 0.0068
tm_w 0.007
tm_m 0.0011
tm_y 0.0
tm_wm 0.0011
tm_dw 0.1773
tm_w_end 0.0124
sales_item_id_encoding_std 0.0116
sales_item_id_encoding_mean 1.9479
sales_cat_id_encoding_std 0.0008
sales_cat_id_encoding_mean 0.0012
sales_dept_id_encoding_std 0.0029
sales_dept_id_encoding_mean -0.0


In [20]:
########################### Last non O sale
#################################################################################

def find_last_sale(df,n_day):
    """Return a one-column frame 'last_sale' computed per row of `df`.

    df    -- frame containing 'id', 'd' and TARGET
    n_day -- shift applied to the non-zero flag before it is summed

    How it works:
      * 'non_zero' flags rows whose TARGET is greater than 0;
      * 'non_zero_lag' is, per 'id', the rolling sum (window 2000,
        min_periods 1) of that flag shifted by `n_day` rows; missing
        values become -1;
      * for every ('id', 'non_zero_lag') pair the 'd' of its first row is
        taken as 'd_min' (drop_duplicates keeps the first occurrence);
      * 'last_sale' = 'd' - 'd_min'.

    NOTE(review): presumably rows are ordered by 'd' within each 'id', so
    that the first occurrence is the earliest day - confirm. The window of
    2000 looks like it is meant to exceed the series length, which would
    make the rolling sum a running total - confirm.
    """

    # Limit initial df
    ls_df = df[['id','d',TARGET]]

    # Convert target to binary
    ls_df['non_zero'] = (ls_df[TARGET]>0).astype(np.int8)

    # Make lags to prevent any leakage
    ls_df['non_zero_lag'] = ls_df.groupby(['id'])['non_zero'].transform(lambda x: x.shift(n_day).rolling(2000,1).sum()).fillna(-1)

    # First row of every (id, running count) pair: its 'd' becomes 'd_min'
    temp_df = ls_df[['id','d','non_zero_lag']].drop_duplicates(subset=['id','non_zero_lag'])
    temp_df.columns = ['id','d_min','non_zero_lag']

    # Attach 'd_min' to every row that shares the same id and running count
    ls_df = ls_df.merge(temp_df, on=['id','non_zero_lag'], how='left')
    ls_df['last_sale'] = ls_df['d'] - ls_df['d_min']

    return ls_df[['last_sale']]


# Find last non zero
# Need some "dances" to fit in memory limit with groupers
grid_df = pd.concat([grid_df, find_last_sale(grid_df,1)], axis=1)

# Make features test
# make_fast_test returns a (booster, permutation_importance_df) tuple.
# Unpack it - otherwise test_model would be the tuple itself and
# test_model.predict(...) would fail.
test_model, permutation_importance_df = make_fast_test(grid_df)

features_columns = [col for col in list(grid_df) if col not in remove_features]
validation_df = grid_df[grid_df['d']>(END_TRAIN-28)].reset_index(drop=True)

# Use the shared helper instead of a copy of its loop; verbose=1 prints the
# base score and the per-feature score change in the same format as before.
permutation_importance_df = permutation_importance(test_model, validation_df, features_columns, TARGET, metric=rmse, verbose=1)

# Remove test features
keep_cols = [col for col in list(grid_df) if 'last_sale' not in col]
grid_df = grid_df[keep_cols]

Training until validation scores don't improve for 30 rounds
[500]	training's rmse: 2.63222	valid_1's rmse: 2.28391
Early stopping, best iteration is:
[840]	training's rmse: 2.56826	valid_1's rmse: 2.27466
Standart RMSE 2.2746555036589906
release 0.0
sell_price 0.0314
price_max 0.0207
price_min 0.0127
price_std 0.0384
price_mean 0.0073
price_norm 0.0223
price_nunique 0.0169
item_nunique 0.006
price_momentum 0.0001
price_momentum_m 0.0174
price_momentum_y -0.0001
tm_d 0.0072
tm_w 0.0016
tm_m 0.0009
tm_y 0.0
tm_wm 0.0005
tm_dw 0.2029
tm_w_end 0.0061
last_sale 0.6512


In [7]:
# Read the three cached grid parts and join them column-wise.
# Parts 2 and 3 contribute everything except their first two columns.
grid_df = pd.concat(
    [
        pd.read_pickle(part_path).iloc[:, first_col:]
        for part_path, first_col in (
            ('../cache/grid_part_1.pkl', 0),
            ('../cache/grid_part_2.pkl', 2),
            ('../cache/grid_part_3.pkl', 2),
        )
    ],
    axis=1,
)

In [9]:
# Use the fully qualified option name: the short pattern 'max_columns'
# can match more than one pandas option and is then rejected as ambiguous.
pd.set_option('display.max_columns', 200)

In [13]:
# Distinct values of the 'tm_d' column
grid_df['tm_d'].unique()

array([29, 30, 31,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
       15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28], dtype=int8)

In [10]:
# Preview the first rows of the full grid
grid_df.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,release,sell_price,price_max,price_min,price_std,price_mean,price_norm,price_nunique,item_nunique,price_momentum,price_momentum_m,price_momentum_y,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,is_Halloween,is_ValentinesDay,is_Thanksgiving,is_Christmas,is_NewYear,tm_d,tm_w,tm_m,tm_y,tm_wm,tm_dw,tm_w_end
0,HOBBIES_1_008_CA_1_validation,HOBBIES_1_008,HOBBIES_1,HOBBIES,CA_1,CA,1,12.0,0,0.459961,0.5,0.419922,0.01976,0.476318,0.919922,4.0,16,,0.96875,0.949219,,,,,0,0,0,0,0,0,0,0,29,4,1,0,5,5,1
1,HOBBIES_1_009_CA_1_validation,HOBBIES_1_009,HOBBIES_1,HOBBIES,CA_1,CA,1,2.0,0,1.55957,1.769531,1.55957,0.032745,1.764648,0.881348,2.0,9,,0.885742,0.896484,,,,,0,0,0,0,0,0,0,0,29,4,1,0,5,5,1
2,HOBBIES_1_010_CA_1_validation,HOBBIES_1_010,HOBBIES_1,HOBBIES,CA_1,CA,1,0.0,0,3.169922,3.169922,2.970703,0.046356,2.980469,1.0,2.0,20,,1.064453,1.043945,,,,,0,0,0,0,0,0,0,0,29,4,1,0,5,5,1
3,HOBBIES_1_012_CA_1_validation,HOBBIES_1_012,HOBBIES_1,HOBBIES,CA_1,CA,1,0.0,0,5.980469,6.519531,5.980469,0.115967,6.46875,0.916992,3.0,71,,0.921875,0.958984,,,,,0,0,0,0,0,0,0,0,29,4,1,0,5,5,1
4,HOBBIES_1_015_CA_1_validation,HOBBIES_1_015,HOBBIES_1,HOBBIES,CA_1,CA,1,4.0,0,0.700195,0.720215,0.680176,0.011337,0.706543,0.972168,3.0,16,,0.990234,1.001953,,,,,0,0,0,0,0,0,0,0,29,4,1,0,5,5,1


In [28]:
########################### Apply on grid_df
#################################################################################
# lets read grid from
# https://www.kaggle.com/kyakovlev/m5-simple-fe
# to be sure that our grids are aligned by index
grid_df = pd.concat([pd.read_pickle('../cache/grid_part_1.pkl'),
                     pd.read_pickle('../cache/grid_part_2.pkl').iloc[:,2:],
                     pd.read_pickle('../cache/grid_part_3.pkl').iloc[:,2:]],
                     axis=1)
TARGET = 'sales'

# Blank out the target for every day after 1913-28 so those values cannot
# enter the encodings. A single .loc assignment is required here: the chained
# form grid_df[TARGET][mask] = ... may write to a temporary object and leave
# grid_df unchanged, silently leaking the hidden targets.
grid_df.loc[grid_df['d']>(1913-28), TARGET] = np.nan
base_cols = list(grid_df)

icols =  [
            ['state_id'],
            ['store_id'],
            ['cat_id'],
            ['dept_id'],
            ['state_id', 'cat_id'],
            ['state_id', 'dept_id'],
            ['store_id', 'cat_id'],
            ['store_id', 'dept_id'],
            ['item_id'],
            ['item_id', 'state_id'],
            ['item_id', 'store_id'],
            ['tm_dw','item_id'],
            ['tm_dw'],
#             ['tm_m'],
            ]

for col in icols:
    print('Encoding', col)
    col_name = '_'+'_'.join(col)+'_'
    # Group once per key set and reuse the grouping for both statistics
    target_by_group = grid_df.groupby(col)[TARGET]
    grid_df['enc'+col_name+'mean'] = target_by_group.transform('mean').astype(np.float16)
    grid_df['enc'+col_name+'std'] = target_by_group.transform('std').astype(np.float16)

# Keep only the keys, the target and the freshly created encoding columns
keep_cols = [col for col in list(grid_df) if col not in base_cols]
grid_df = grid_df[['id','d',TARGET]+keep_cols]

Encoding ['state_id']
Encoding ['store_id']
Encoding ['cat_id']
Encoding ['dept_id']
Encoding ['state_id', 'cat_id']
Encoding ['state_id', 'dept_id']
Encoding ['store_id', 'cat_id']
Encoding ['store_id', 'dept_id']
Encoding ['item_id']
Encoding ['item_id', 'state_id']
Encoding ['item_id', 'store_id']
Encoding ['tm_dw', 'item_id']
Encoding ['tm_dw']


In [29]:
from sklearn.decomposition import PCA

def make_pca(df, pca_col, n_days, fill_value=-999999):
    """Project `n_days` lagged copies of the scaled target onto principal components.

    Steps:
      1. keep only `pca_col`, 'd' and TARGET (summing TARGET per
         (pca_col, d) when `pca_col` is not 'id');
      2. divide TARGET by its maximum;
      3. build lags 1..n_days of TARGET within each `pca_col` group;
      4. replace missing lags with `fill_value` and fit/apply a PCA.

    df         -- frame containing `pca_col`, 'd' and TARGET
    pca_col    -- column that identifies one series
    n_days     -- number of lag columns fed into the PCA
    fill_value -- replacement for missing lags (default -999999, the value
                  this notebook has used so far)

    Returns a frame with the first three transformed columns. They hold
    principal components, although they keep the names of the first three
    lag columns ('<TARGET>_pca_<pca_col><n_days>_1' ...).

    NOTE(review): the lag values are scaled by the target maximum while the
    default sentinel is -999999, so the sentinel dominates the variance the
    PCA explains (the components printed below this cell are in the
    millions). Consider passing fill_value=0 - confirm the intent.
    NOTE: for `pca_col` != 'id' the result is on the aggregated level and is
    not merged back to the rows of `df` (that step is skipped here).
    The lags are positional, so rows are presumably expected to be ordered
    by day within each series - confirm against the caller.
    """
    print('PCA:', pca_col, n_days)

    # Explicit copy: the columns added below must never touch `df`
    pca_df = df[[pca_col,'d',TARGET]].copy()

    # If we are doing pca for other series "levels"
    # we need to agg first
    if pca_col != 'id':
        pca_df = pca_df.groupby([pca_col,'d'])[TARGET].agg(['sum']).reset_index()
        pca_df = pca_df.rename(columns={'sum': TARGET})

    # Scale by the maximum
    pca_df[TARGET] = pca_df[TARGET]/pca_df[TARGET].max()

    # Lags via the built-in group-wise shift (same result as
    # transform(lambda x: x.shift(l)), without a Python call per group)
    format_s = '{}_pca_'+pca_col+str(n_days)+'_{}'
    target_by_series = pca_df.groupby([pca_col])[TARGET]
    lag_columns = {
        format_s.format(TARGET, lag): target_by_series.shift(lag)
        for lag in range(1, n_days+1)
    }
    pca_columns = list(lag_columns)
    pca_df = pca_df.assign(**lag_columns)

    pca_df[pca_columns] = pca_df[pca_columns].fillna(fill_value)
    pca = PCA(random_state=SEED)

    # fit + transform kept as two steps (fit_transform would also work)
    pca.fit(pca_df[pca_columns])
    pca_df[pca_columns] = pca.transform(pca_df[pca_columns])

    print(pca.explained_variance_ratio_)

    # we will keep only 3 most "valuable" columns/dimensions
    keep_cols = pca_columns[:3]
    print('Columns to keep:', keep_cols)

    return pca_df[keep_cols]


# Make PCA
# Append the three PCA columns built from 7 lags on the 'id' level;
# the result is joined column-wise to the grid
grid_df = pd.concat([grid_df, make_pca(grid_df,'id',7)], axis=1)

PCA: id 7
[0.9577294  0.02673342 0.00694716 0.00340516 0.00216168 0.00163071
 0.00139247]
Columns to keep: ['sales_pca_id7_1', 'sales_pca_id7_2', 'sales_pca_id7_3']


In [30]:
# Take the first per-id group of the rows with d >= 1913-28 for a quick look
tail_groups = grid_df[grid_df['d']>=1913-28].groupby('id')
for id_, group in tail_groups:
    break
group

Unnamed: 0,id,d,sales,enc_state_id_mean,enc_state_id_std,enc_store_id_mean,enc_store_id_std,enc_cat_id_mean,enc_cat_id_std,enc_dept_id_mean,...,enc_item_id_state_id_std,enc_item_id_store_id_mean,enc_item_id_store_id_std,enc_tm_dw_item_id_mean,enc_tm_dw_item_id_std,enc_tm_dw_mean,enc_tm_dw_std,sales_pca_id7_1,sales_pca_id7_2,sales_pca_id7_3
45145359,FOODS_1_001_CA_1_validation,1885,0.0,1.576172,4.605469,1.639648,4.476562,2.109375,5.769531,1.442383,...,1.666016,0.779785,1.256836,0.744141,1.576172,1.708008,5.027344,-96354.21,-0.0007432287,-907.273008
45175849,FOODS_1_001_CA_1_validation,1886,,1.576172,4.605469,1.639648,4.476562,2.109375,5.769531,1.442383,...,1.666016,0.779785,1.256836,0.562988,1.197266,1.367188,4.082031,-96354.21,-0.0008517103,-907.272396
45206339,FOODS_1_001_CA_1_validation,1887,,1.576172,4.605469,1.639648,4.476562,2.109375,5.769531,1.442383,...,1.666016,0.779785,1.256836,0.563477,1.207031,1.262695,3.804688,276333.6,-521120.4,482872.773364
45236829,FOODS_1_001_CA_1_validation,1888,,1.576172,4.605469,1.639648,4.476562,2.109375,5.769531,1.442383,...,1.666016,0.779785,1.256836,0.553711,1.121094,1.249023,3.796875,654274.3,-939026.5,607426.06424
45267319,FOODS_1_001_CA_1_validation,1889,,1.576172,4.605469,1.639648,4.476562,2.109375,5.769531,1.442383,...,1.666016,0.779785,1.256836,0.59375,1.15332,1.256836,3.828125,1035378.0,-1170947.0,277822.496733
45297809,FOODS_1_001_CA_1_validation,1890,,1.576172,4.605469,1.639648,4.476562,2.109375,5.769531,1.442383,...,1.666016,0.779785,1.256836,0.688965,1.378906,1.422852,4.332031,1417539.0,-1170947.0,-254725.778064
45328299,FOODS_1_001_CA_1_validation,1891,,1.576172,4.605469,1.639648,4.476562,2.109375,5.769531,1.442383,...,1.666016,0.779785,1.256836,0.781738,1.494141,1.726562,5.121094,1798643.0,-939026.5,-584329.345817
45358789,FOODS_1_001_CA_1_validation,1892,,1.576172,4.605469,1.639648,4.476562,2.109375,5.769531,1.442383,...,1.666016,0.779785,1.256836,0.744141,1.576172,1.708008,5.027344,2176584.0,-521120.4,-459776.054631
45389279,FOODS_1_001_CA_1_validation,1893,,1.576172,4.605469,1.639648,4.476562,2.109375,5.769531,1.442383,...,1.666016,0.779785,1.256836,0.562988,1.197266,1.367188,4.082031,2549271.0,9.663192e-07,24003.992398
45419769,FOODS_1_001_CA_1_validation,1894,,1.576172,4.605469,1.639648,4.476562,2.109375,5.769531,1.442383,...,1.666016,0.779785,1.256836,0.563477,1.207031,1.262695,3.804688,2549271.0,9.663192e-07,24003.992398


In [32]:
#################################################################################
# Persist the current grid_df (keys, target and the feature columns created
# above) as a pickle in the cache directory
print('Save Mean/Std encoding')
grid_df.to_pickle('../cache/mean_encoding_df.pkl')

Save Mean/Std encoding


In [31]:
########################### Final list of new features
#################################################################################
# Column names and dtypes of the frame that was saved
grid_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46881677 entries, 0 to 46881676
Data columns (total 32 columns):
id                           category
d                            int16
sales                        float64
enc_state_id_mean            float16
enc_state_id_std             float16
enc_store_id_mean            float16
enc_store_id_std             float16
enc_cat_id_mean              float16
enc_cat_id_std               float16
enc_dept_id_mean             float16
enc_dept_id_std              float16
enc_state_id_cat_id_mean     float16
enc_state_id_cat_id_std      float16
enc_state_id_dept_id_mean    float16
enc_state_id_dept_id_std     float16
enc_store_id_cat_id_mean     float16
enc_store_id_cat_id_std      float16
enc_store_id_dept_id_mean    float16
enc_store_id_dept_id_std     float16
enc_item_id_mean             float16
enc_item_id_std              float16
enc_item_id_state_id_mean    float16
enc_item_id_state_id_std     float16
enc_item_id_store_id_mean    float1