Version 1.0.2, Python 2.7.14, conda 4.3.34

# Final Project - LightGBM

In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression

from prepare_data2 import k_fold_mean, loo_mean, smoothing_mean, expanding_mean
from prepare_data2 import make_grid, get_target, prepare_test
from prepare_data2 import create_df, downcast_dtypes

import matplotlib.pyplot as plt
%matplotlib inline

from tqdm import tqdm_notebook
from itertools import product
import gc

# Record the versions of the core libraries so the run can be reproduced.
for lib in (np, pd, lgb):
    print (lib.__name__, lib.__version__)

numpy 1.14.2
pandas 0.22.0
lightgbm 2.1.0


### Load the data:

In [2]:
# read_data is a project helper (module prepare_data); it returns the five raw
# tables in the order unpacked below.
from prepare_data import read_data
sales, items, categories, shops, test = read_data()
# Sanity check: number of rows/columns of the raw test table.
print(test.shape)

(214200, 3)


### Create a clean data frame with all the given information, which can be used for feature engineering:

In [3]:
# Build the base frame from the four raw tables with the project helper
# create_df (prepare_data2); it is the input for the grid, target and
# category-count features below.
df = create_df(sales, items, categories, shops) 

### Create data frame which is then used for training and validation including the target:

In [4]:
# Key columns that identify one (shop, item, month) observation.
index_3 = ['shop_id', 'item_id','date_block_num']

# make_grid / get_target are project helpers: the grid provides the rows, the
# target is item_cnt_day aggregated with 'sum' over the key columns.
grid = make_grid(df, index_3)
target = get_target(df, index_3, 'item_cnt_day', 'target', 'sum')

# Grid rows without a matching target row get a target of 0.
all_data = grid.merge(target, how='left', on=index_3).fillna(0)
all_data.shape

(10913804, 4)

In [5]:
# Preview the first rows of the training frame (keys + target).
all_data.head()

Unnamed: 0,shop_id,item_id,date_block_num,target
0,59,22154,0,1.0
1,59,21759,0,1.0
2,59,21347,0,1.0
3,59,22015,0,1.0
4,59,18977,0,1.0


In [6]:
# Build the test-set frame from the raw test table with the project helper
# prepare_test (prepare_data2).
test_data = prepare_test(test)

### Feature Engineering:

In [7]:
# Calendar features derived from the month index: blocks 0-11 -> 2013,
# 12-23 -> 2014, 24 and above -> 2015; month is 1..12.
all_data['month'] = (all_data.date_block_num % 12) + 1
all_data['year'] = np.where(
    all_data.date_block_num > 23,
    2015,
    np.where(all_data.date_block_num < 12, 2013, 2014),
)

# item_cnt_day aggregated with 'sum' per (shop, month), joined onto every row
# of that shop and month; rows without a match get 0.
target_shop = get_target(sales, ['shop_id', 'date_block_num'], 'item_cnt_day', 'target_shop', 'sum')
all_data = all_data.merge(target_shop, how='left', on=['shop_id', 'date_block_num']).fillna(0)

In [8]:
index_cols = ['shop_id', 'item_id']

# The test month has no target_shop value, so each test shop receives the mean
# of its target_shop rows (the join is on shop_id only, so all months of the
# shop contribute).
shop_avg = (
    test_data.merge(target_shop, how='left', on=['shop_id'])
    .groupby('shop_id', as_index=False)[['target_shop']]
    .mean()
)

# Shops without any target_shop row fall back to the overall training mean.
mean_target = all_data['target_shop'].mean()
test_data = test_data.merge(shop_avg, how='left', on='shop_id').fillna(mean_target)

In [9]:
# item_cnt_day aggregated with 'sum' per (item, month) by the project helper,
# joined onto the training frame; rows without a match get 0.
target_item = get_target(sales, ['item_id', 'date_block_num'], 'item_cnt_day', 'target_item', 'sum')
all_data = all_data.merge(target_item, how='left', on=['item_id', 'date_block_num']).fillna(0)

In [10]:
# Per-item mean of target_item (join on item_id only, so all months of the
# item contribute), used as the test-set value of the feature.
item_avg = (
    test_data.merge(target_item, how='left', on=['item_id'])
    .groupby('item_id', as_index=False)[['target_item']]
    .mean()
)

# Items without any target_item row fall back to the overall training mean.
mean_target = all_data['target_item'].mean()
test_data = test_data.merge(item_avg, how='left', on='item_id').fillna(mean_target)

In [11]:
# Order rows by shop, item, month (index labels are kept as they are).
all_data = all_data.sort_values(index_3)

Add all the information we have to `all_data`:

In [12]:
# Attach item, category and shop metadata; each join is a left join so no
# training row is dropped.
all_data = (
    all_data
    .merge(items, how='left', on='item_id')
    .merge(categories, how='left', on='item_category_id')
    .merge(shops, how='left', on='shop_id')
)

and then to `test_data`:

In [13]:
# Free-text name columns are not used as features.
name_cols = ['item_name', 'shop_name', 'item_category_name']

# Same metadata joins as for the training frame, then drop the names.
test_data = (
    test_data
    .merge(items, how='left', on='item_id')
    .merge(categories, how='left', on='item_category_id')
    .merge(shops, how='left', on='shop_id')
    .drop(name_cols, axis=1)
)
all_data = all_data.drop(name_cols, axis=1)

In [14]:
# Preview the training frame after adding calendar, shop/item totals and category id.
all_data.head()

Unnamed: 0,shop_id,item_id,date_block_num,target,month,year,target_shop,target_item,item_category_id
0,0,12,1,0.0,2,2013,6127.0,1.0,55
1,0,19,0,0.0,1,2013,5578.0,1.0,40
2,0,27,0,0.0,1,2013,5578.0,7.0,19
3,0,27,1,0.0,2,2013,6127.0,3.0,19
4,0,28,0,0.0,1,2013,5578.0,8.0,30


In [15]:
# Preview the test frame; it should carry the same feature columns as all_data (plus ID, minus target).
test_data.head()

Unnamed: 0,ID,shop_id,item_id,date_block_num,month,year,target_shop,target_item,item_category_id
0,0,5,5037,34,11,2015,1295.818182,90.714286,19
1,1,5,5320,34,11,2015,1295.818182,15.651483,55
2,2,5,5233,34,11,2015,1295.818182,72.428571,19
3,3,5,5232,34,11,2015,1295.818182,47.0,23
4,4,5,5268,34,11,2015,1295.818182,15.651483,20


Also add category features:

In [16]:
# Global mean of the target; used later as the fallback value for unseen keys.
mean_t = all_data['target'].mean()

In [17]:
# 'count' aggregations of item_category_id from the base frame at three
# granularities; each is left-joined onto the training frame with 0 for
# rows that have no match.

# per (shop, item, month)
category = get_target(df, index_3, 'item_category_id', 'category', 'count')
all_data = all_data.merge(category, how='left', on=index_3).fillna(0)
mean_c = all_data['category'].mean()

# per (shop, month)
category_shop = get_target(df, ['shop_id', 'date_block_num'], 'item_category_id','category_shop', 'count')
all_data = all_data.merge(category_shop, how='left', on=['shop_id', 'date_block_num']).fillna(0)

# per (item, month)
category_item = get_target(df, ['item_id', 'date_block_num'], 'item_category_id', 'category_item', 'count')
all_data = all_data.merge(category_item, how='left', on=['item_id', 'date_block_num']).fillna(0)

In [18]:
# Test value of `category`: mean of the matching `category` rows for each
# (shop, item) pair; pairs without any match fall back to mean_c.
pair_avg = (
    test_data.merge(category, how='left', on=index_cols)
    .groupby(index_cols, as_index=False)[['category']]
    .mean()
)
test_data = test_data.merge(pair_avg, how='left', on=index_cols).fillna(mean_c)

In [19]:
# Preview the test frame after adding the `category` feature.
test_data.head()

Unnamed: 0,ID,shop_id,item_id,date_block_num,month,year,target_shop,target_item,item_category_id,category
0,0,5,5037,34,11,2015,1295.818182,90.714286,19,1.444444
1,1,5,5320,34,11,2015,1295.818182,15.651483,55,0.269003
2,2,5,5233,34,11,2015,1295.818182,72.428571,19,1.8
3,3,5,5232,34,11,2015,1295.818182,47.0,23,1.0
4,4,5,5268,34,11,2015,1295.818182,15.651483,20,0.269003


In [20]:
# Test value of `category_shop`: per-shop mean of the category_shop rows
# (join on shop_id only).
shop_cat_avg = (
    test_data.merge(category_shop, how='left', on=['shop_id'])
    .groupby('shop_id', as_index=False)[['category_shop']]
    .mean()
)

# Shops without a match fall back to the training mean of the same feature.
mean_cat = all_data['category_shop'].mean()
test_data = test_data.merge(shop_cat_avg, how='left', on='shop_id').fillna(mean_cat)

In [21]:
# Test value of `category_item`: per-item mean of the category_item rows
# (join on item_id only, so all months of the item contribute).
temp = test_data.merge(category_item, how='left', on=['item_id'])
temp = temp.groupby('item_id', as_index=False)[['category_item']].mean()

# Items with no category_item row end up NaN after the join. Fill them with
# the training mean of the SAME feature (category_item); the previous code
# used the mean of category_shop here, a copy-paste slip from the cell above.
mean_cat = all_data.category_item.mean()
test_data = pd.merge(test_data, temp, how='left', on='item_id').fillna(mean_cat)

Mean of target:

In [22]:
# Mean target per (shop, item) pair over every row of all_data.
# NOTE(review): this mean spans all months in all_data, including the month
# that is later held out for validation — confirm this is intended.
target_mean = (
    all_data[index_cols + ['target']]
    .groupby(index_cols, as_index=False)[['target']]
    .mean()
    .rename(columns={'target': 'target_mean'})
)

# Training rows without a match get 0; test pairs without a match get the
# global target mean.
all_data = all_data.merge(target_mean, how='left', on=index_cols).fillna(0)
test_data = test_data.merge(target_mean, how='left', on=index_cols).fillna(mean_t)
all_data.shape

(10913804, 13)

Add the same features of `all_data` to the `test_data`:

In [23]:
# The test rows belong to the month right after the last training month.
next_block = all_data.date_block_num.max() + 1
test_data.loc[:, 'date_block_num'] = next_block

# Any training column still missing from the test frame is added as a
# constant 0 placeholder so both frames can be concatenated.
missing_cols = [name for name in all_data.columns if name not in test_data.columns]
for col in missing_cols:
    test_data.loc[:, col] = 0

In [24]:
# ID is a submission identifier, not a feature; all_data has no such column.
test_data = test_data.drop('ID', axis=1)

Concatenate test and train data for further feature engineering:

In [25]:
# Stack train and test with identical column order (that of all_data) and a
# fresh 0..n-1 index; the test rows come last.
features = all_data.columns.values
ntrain, ntest = all_data.shape, test_data.shape
merged_data = pd.concat(
    [all_data[features], test_data[features]],
    ignore_index=True,
    copy=False,
)

print(ntrain, ntest, merged_data.shape)

(10913804, 13) (214200, 13) (11128004, 13)


I am not interested in the names of shops, items or categories as features:

In [26]:
# The name columns ('item_name', 'item_category_name', 'shop_name') were
# already dropped from all_data and test_data before the concatenation, so no
# drop is needed on merged_data.
# downcast_dtypes is a project helper (prepare_data2); info() then shows the
# resulting dtypes and memory footprint.
merged_data = downcast_dtypes(merged_data)
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11128004 entries, 0 to 11128003
Data columns (total 13 columns):
shop_id             int32
item_id             int32
date_block_num      int32
target              float32
month               int32
year                int32
target_shop         float32
target_item         float32
item_category_id    int32
category            float32
category_shop       float32
category_item       float32
target_mean         float32
dtypes: float32(7), int32(6)
memory usage: 551.8 MB


In [27]:
# all_data is now contained in merged_data; drop the name and ask the garbage
# collector to reclaim memory (gc.collect() returns the number of unreachable
# objects it found).
del all_data
gc.collect()

74

Mean encoded features:

In [28]:
# Four mean encodings of `target` grouped by item_id. Each project helper adds
# a column called 'new_name', which is renamed right away to
# 'target_<suffix>'.
# NOTE(review): merged_data already contains the test rows, whose target is a
# 0 placeholder — confirm the helpers handle them as intended.
target_encoders = [
    (k_fold_mean, 'target_kf'),
    (loo_mean, 'target_loo'),
    (smoothing_mean, 'target_sm'),
    (expanding_mean, 'target_exp'),
]
for encode, encoded_name in target_encoders:
    merged_data = encode(merged_data, 'target', mean_t, 'item_id')
    merged_data = merged_data.rename(columns={'new_name': encoded_name})

In [29]:
# Same four mean encodings, this time of `category` grouped by shop_id, with
# mean_c as the fallback value. Each helper's 'new_name' column is renamed to
# 'category_<suffix>'.
category_encoders = [
    (k_fold_mean, 'category_kf'),
    (loo_mean, 'category_loo'),
    (smoothing_mean, 'category_sm'),
    (expanding_mean, 'category_exp'),
]
for encode, encoded_name in category_encoders:
    merged_data = encode(merged_data, 'category', mean_c, 'shop_id')
    merged_data = merged_data.rename(columns={'new_name': encoded_name})
merged_data.head()

Unnamed: 0,shop_id,item_id,date_block_num,target,month,year,target_shop,target_item,item_category_id,category,...,category_item,target_mean,target_kf,target_loo,target_sm,target_exp,category_kf,category_loo,category_sm,category_exp
0,0,12,1,0.0,2,2013,6127.0,1.0,55,0.0,...,1.0,0.0,0.030303,0.022222,0.235804,0.334274,0.269003,0.605392,0.603302,0.269003
1,0,19,0,0.0,1,2013,5578.0,1.0,40,0.0,...,1.0,0.0,0.030303,0.022727,0.237431,0.334274,0.269003,0.605392,0.603302,0.0
2,0,27,0,0.0,1,2013,5578.0,7.0,19,0.0,...,7.0,0.0,0.049037,0.056911,0.089902,0.334274,0.269003,0.605392,0.603302,0.0
3,0,27,1,0.0,2,2013,6127.0,3.0,19,0.0,...,3.0,0.0,0.049037,0.056911,0.089902,0.0,0.269003,0.605392,0.603302,0.0
4,0,28,0,0.0,1,2013,5578.0,8.0,30,0.0,...,8.0,0.0,0.170678,0.141414,0.16896,0.334274,0.269003,0.605392,0.603302,0.0


Lagged Features:

In [30]:
# Every non-key column gets lagged copies.
cols_to_rename = list(merged_data.columns.difference(index_3))

# Lags, in months.
shift_range = [1, 3, 5, 9]

for month_shift in tqdm_notebook(shift_range):
    # Copy the original (un-lagged) columns and move each row `month_shift`
    # months forward, so joining on the key attaches the values observed
    # `month_shift` months earlier.
    train_shift = merged_data[index_3 + cols_to_rename].copy()
    train_shift['date_block_num'] = train_shift['date_block_num'] + month_shift

    # '<column>_lag_<shift>' for every lagged column; key columns keep their name.
    lag_names = {col: '{}_lag_{}'.format(col, month_shift) for col in cols_to_rename}
    train_shift = train_shift.rename(columns=lag_names)

    # Rows without history at this lag get 0.
    merged_data = merged_data.merge(train_shift, on=index_3, how='left').fillna(0)

del train_shift




In [31]:
# Optional: don't use old data from year 2013 (disabled).
#merged_data = merged_data[merged_data['date_block_num'] >= 12] 

# List of all lagged features. Match the full '_lag_<shift>' suffix written by
# the lag loop instead of only the last character: the last-character test
# misses multi-digit shifts (e.g. 12) and would also pick up any unrelated
# column whose name happens to end in one of the shift digits.
lag_suffixes = tuple('_lag_{}'.format(shift) for shift in shift_range)
fit_cols = [col for col in merged_data.columns if col.endswith(lag_suffixes)]

# We will drop these at fitting stage: everything that is neither a lagged
# feature nor a key column, plus the month index itself.
to_drop_cols = list(set(merged_data.columns) - (set(fit_cols) | set(index_3))) + ['date_block_num']

# Category for each item
item_category_mapping = items[['item_id','item_category_id']].drop_duplicates()

In [32]:
# Left-join the item -> category mapping.
# NOTE(review): the join keys are both columns of item_category_mapping, so
# this merge adds no new column to merged_data; it looks redundant — confirm
# before removing.
merged_data = pd.merge(merged_data, item_category_mapping, how='left', on=['item_id','item_category_id'])
merged_data.head()

Unnamed: 0,shop_id,item_id,date_block_num,target,month,year,target_shop,target_item,item_category_id,category,...,month_lag_9,target_lag_9,target_exp_lag_9,target_item_lag_9,target_kf_lag_9,target_loo_lag_9,target_mean_lag_9,target_shop_lag_9,target_sm_lag_9,year_lag_9
0,0,12,1,0.0,2,2013,6127.0,1.0,55,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,19,0,0.0,1,2013,5578.0,1.0,40,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,27,0,0.0,1,2013,5578.0,7.0,19,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,27,1,0.0,2,2013,6127.0,3.0,19,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,28,0,0.0,1,2013,5578.0,8.0,30,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
# Columns excluded from the model inputs (current-month values and the target itself).
to_drop_cols

['target',
 'month',
 'category_shop',
 'year',
 'category',
 'category_kf',
 'category_loo',
 'target_kf',
 'category_item',
 'category_exp',
 'target_exp',
 'target_item',
 'target_mean',
 'target_shop',
 'target_sm',
 'target_loo',
 'category_sm',
 'item_category_id',
 'date_block_num']

In [34]:
# Preview the full feature frame including the lagged columns.
merged_data.head()

Unnamed: 0,shop_id,item_id,date_block_num,target,month,year,target_shop,target_item,item_category_id,category,...,month_lag_9,target_lag_9,target_exp_lag_9,target_item_lag_9,target_kf_lag_9,target_loo_lag_9,target_mean_lag_9,target_shop_lag_9,target_sm_lag_9,year_lag_9
0,0,12,1,0.0,2,2013,6127.0,1.0,55,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,19,0,0.0,1,2013,5578.0,1.0,40,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,27,0,0.0,1,2013,5578.0,7.0,19,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,27,1,0.0,2,2013,6127.0,3.0,19,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,28,0,0.0,1,2013,5578.0,8.0,30,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Split the data into train, validation and test:

In [35]:
# Time-based split on the month index: the highest block is the appended test
# month, the block before it is held out for validation, and everything
# earlier is used for training.
dates = merged_data['date_block_num']
last_block = dates.max()
prev_block = last_block-1

is_train = dates < prev_block
is_valid = dates == prev_block
is_test = dates == last_block

dates_train, dates_valid, dates_test = dates[is_train], dates[is_valid], dates[is_test]

# Model inputs: key columns shop_id/item_id plus the lagged features.
X_train = merged_data.loc[is_train].drop(to_drop_cols, axis=1)
X_valid = merged_data.loc[is_valid].drop(to_drop_cols, axis=1)
X_test = merged_data.loc[is_test].drop(to_drop_cols, axis=1)

NameError: name 'y' is not defined

In [36]:
# Targets for the same three splits. The test rows only carry the 0
# placeholder that was filled in when the test frame was aligned with the
# training columns, so y_test holds no real labels.
y_train, y_valid, y_test = (
    merged_data.loc[mask, 'target']
    for mask in (dates < prev_block, dates == prev_block, dates == last_block)
)

In [37]:
# Train + validation months together (everything before the test block).
before_test = dates < last_block
X = merged_data.loc[before_test].drop(to_drop_cols, axis=1)
y = merged_data.loc[before_test, 'target']

In [38]:
# Run the project's downcast_dtypes helper on each feature matrix, in the
# same order as before (train, valid, test).
X_train, X_valid, X_test = (
    downcast_dtypes(split) for split in (X_train, X_valid, X_test)
)

In [None]:
# Checkpoint the engineered feature frame so the expensive steps above do not
# have to be re-run, then read it back.
merged_data.to_pickle('Merged_data.pkl')
# Fixed: the loader was misspelled as an undefined name, which raises
# NameError; pandas' reader for to_pickle output is pd.read_pickle.
merged_data = pd.read_pickle('Merged_data.pkl')
# Alternative: persist via HDF5 instead of pickle
#store = HDFStore('store.h5')
#store['df'] = df  # save it
#store['df']  # load it

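The data is now ready for training and validation: the training data has been separated into a training and a validation set, and a model can be trained with XGBoost or another library (LightGBM is used below). After that, predictions are made for the prepared test data.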

In [42]:
# LightGBM hyper-parameters: squared-error regression, evaluated with RMSE;
# 75% feature and row subsampling (rows are re-bagged every iteration with a
# fixed bagging seed), 128 leaves per tree, at least 128 rows per leaf.
lgb_params = {
               'feature_fraction': 0.75,
               'metric': 'rmse',
               'nthread':1, 
               'min_data_in_leaf': 2**7, 
               'bagging_fraction': 0.75, 
               'learning_rate': 0.03, 
               'objective': 'mse', 
               'bagging_seed': 2**7, 
               'num_leaves': 2**7,
               'bagging_freq':1,
               'verbose':0 
              }

# Fit 100 boosting rounds on the training months and predict the held-out
# validation month.
lgb_model = lgb.train(lgb_params, lgb.Dataset(X_train, label=y_train), num_boost_round=100)
pred_lgb = lgb_model.predict(X_valid)

# The score is computed on the validation split (y_valid), not on the test
# set, so the message is labelled accordingly.
print('Validation R-squared for LightGBM is %f' % r2_score(y_valid, pred_lgb))

Test R-squared for LightGBM is 0.298000


In [40]:
# Per-feature importance values of the trained booster, in the column order of X_train.
lgb_model.feature_importance()

array([ 631, 1029,  469,  145,  980,   63,  192,  189,   74,  770,  759,
        615,  131,  979,  135,  205, 1090,  159,  285,  455,   70,   67,
        277,   36,   41,   61,    5,  151,  130,  128,  146,  190,  130,
         77,  279,   54,   43,   95,   32,   43,  120,   13,   16,   39,
          3,   98,  121,   71,   43,   70,   39,   30,   86,   40,   15,
         16,    3,   63,   60,   10,    5,    3,    0,   58,   58,   12,
         34,   66,   37,    9,   14,    6,   23,    9])

In [None]:
# Disabled experiment: custom LightGBM evaluation function that computes RMSE
# after clipping both labels and predictions to [0, 20]. Kept for reference;
# nothing in this cell is executed.
#LightGBM
#def lgb_rmse(preds, y):
#    y = np.array(list(y.get_label()))
#    score = np.sqrt(mean_squared_error(y.clip(0.,20.), preds.clip(0.,20.)))
#    return 'RMSE', score, False

In [None]:
# Disabled experiment: alternative training run with a validation set, the
# custom lgb_rmse metric and early stopping. Nothing in this cell is executed.
#params = {'learning_rate': 0.1, 'max_depth': 7, 'boosting': 'gbdt', 'objective': 'regression', 'metric': 'mse', 'is_training_metric': False, 'seed': 18}
#lgb_model = lgb.train(params, lgb.Dataset(X_train, label=y_train), 100, lgb.Dataset(X_valid, label=y_valid), feval=lgb_rmse, verbose_eval=10, early_stopping_rounds=20)

In [41]:
# Predict the test month and write the submission file.
# NOTE(review): the assignment relies on X_test having the same row order as
# `test` (ID was dropped earlier, so it cannot be joined back) — confirm.
# NOTE(review): predictions are written unclipped, while the disabled
# lgb_rmse metric in this notebook clips to [0, 20] — confirm whether
# clipping is wanted here.
test_predictions = lgb_model.predict(X_test)
test['item_cnt_month'] = test_predictions

submission = test[['ID','item_cnt_month']]
submission.to_csv('lgb_submission.csv', index=False)