In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
import gc
from itertools import product

pd.set_option('display.max_rows',99)
pd.set_option('display.max_columns',50)

import warnings
warnings.filterwarnings('ignore')

def downcast_dtypes(df):
    float_cols = [col for col in df if df[col].dtype == 'float64']
    int_cols = [col for col in df if df[col].dtype in ['int64','int32']]
    
    df[float_cols] = df[float_cols].astype(np.float32)
    df[int_cols] = df[int_cols].astype(np.int16)
    
    return(df)

In [None]:
sale_train = pd.read_csv('sales_train.csv')
test = pd.read_csv('test.csv')

In [None]:
sale_train[sale_train['item_id'] == 11373][['item_price']].sort_values(['item_price'])

In [None]:
sale_train[sale_train['item_id'] == 11365].sort_values(['item_price'])

In [None]:
sale_train['item_price'][2909818] = np.nan
sale_train['item_cnt_day'][2909818] = np.nan

In [None]:
sale_train['item_price'][2909818] = sale_train[(sale_train['shop_id'] ==12) & (sale_train['item_id'] == 11373) & (sale_train['date_block_num'] == 33)]['item_price'].median()
sale_train['item_cnt_day'][2909818] = round(sale_train[(sale_train['shop_id'] ==12) & (sale_train['item_id'] == 11373) & (sale_train['date_block_num'] == 33)]['item_cnt_day'].median())
sale_train['item_price'][885138] = np.nan
sale_train['item_price'][885138] = sale_train[(sale_train['item_id'] == 11365) & (sale_train['shop_id'] ==12) & (sale_train['date_block_num'] == 8)]['item_price'].median()

In [None]:
test_nrow = test.shape[0]
sale_train = sale_train.merge(test[['shop_id']].drop_duplicates(), how = 'inner')
sale_train['date'] = pd.to_datetime(sale_train['date'], format = '%d.%m.%Y')

In [None]:
sale_train.head()

In [None]:
#Creating grid
grid = []
index_cols = ['shop_id','item_id','date_block_num']

for block in sale_train['date_block_num'].unique():
    cur_shops = sale_train.loc[sale_train['date_block_num'] == block, 'shop_id'].unique()
    cur_items = sale_train.loc[sale_train['date_block_num'] == block, 'item_id'].unique()
    grid.append(np.array(list(product(*[cur_shops,cur_items,[block]])),dtype=np.int32))

grid = pd.DataFrame(np.vstack(grid),columns = index_cols, dtype=np.int32)
grid.head()

In [None]:
sale_train['item_cnt_day']=sale_train['item_cnt_day'].clip(0,20)
sale_train.item_cnt_day.hist()

In [None]:
gb_cnt = sale_train.groupby(index_cols)['item_cnt_day'].agg('sum').reset_index().rename(columns={'item_cnt_day':'item_cnt_month'})
gb_cnt['item_cnt_month'] = gb_cnt['item_cnt_month'].clip(0,20).astype(np.int)

In [None]:
gb_cnt['item_cnt_month'].hist()

In [None]:
train = pd.merge(grid,gb_cnt,how='left',on=index_cols).fillna(0)
train['item_cnt_month'] = train['item_cnt_month'].astype(int)
train = downcast_dtypes(train)

In [None]:
train.item_cnt_month.hist()

In [None]:
train.sort_values(['date_block_num','shop_id','item_id'],inplace = True)
train                

In [None]:
item = pd.read_csv('items.csv')
train = train.merge(item[['item_id','item_category_id']],on='item_id',how='left')
test = test.merge(item[['item_id','item_category_id']],on='item_id',how='left')

In [None]:
item_cat = pd.read_csv('item_categories.csv')

In [None]:
l_cat = list(item_cat.item_category_name)

In [None]:
for ind in range(0,1):

    l_cat[ind] = 'PC Headsets / Headphones'

for ind in range(1,8):

    l_cat[ind] = 'Access'

l_cat[8] = 'Tickets (figure)'

l_cat[9] = 'Delivery of goods'

for ind in range(10,18):

    l_cat[ind] = 'Consoles'

for ind in range(18,25):

    l_cat[ind] = 'Consoles Games'

l_cat[25] = 'Accessories for games'

for ind in range(26,28):

    l_cat[ind] = 'phone games'

for ind in range(28,32):

    l_cat[ind] = 'CD games'

for ind in range(32,37):

    l_cat[ind] = 'Card'

for ind in range(37,43):

    l_cat[ind] = 'Movie'

for ind in range(43,55):

    l_cat[ind] = 'Books'

for ind in range(55,61):

    l_cat[ind] = 'Music'

for ind in range(61,73):

    l_cat[ind] = 'Gifts'

for ind in range(73,79):

    l_cat[ind] = 'Soft'

for ind in range(79,81):

    l_cat[ind] = 'Office'

for ind in range(81,83):

    l_cat[ind] = 'Clean'

l_cat[83] = 'Elements of a food'

In [None]:
from sklearn.preprocessing import LabelEncoder
lb = LabelEncoder()
item_cat['item_cat_id_fix'] = lb.fit_transform(l_cat)

In [None]:
train = train.merge(item_cat[['item_category_id','item_cat_id_fix']],on='item_category_id',how='left')
test = test.merge(item_cat[['item_category_id','item_cat_id_fix']],on='item_category_id',how='left')

In [None]:
del(item, item_cat, grid, gb_cnt)
gc.collect()

In [None]:
#Encodings
Target = 'item_cnt_month'
global_mean = train[Target].mean()
y_tr = train[Target].values

In [None]:
mean_encoded_col = ['shop_id', 'item_id', 'item_category_id', 'item_cat_id_fix']
SEED = 0

for col in tqdm_notebook(mean_encoded_col):

    col_tr = train[[col] + [Target]]
   
    corrcoefs = pd.DataFrame(columns = ['Cor'])



    # 3.1.1 Mean encodings - KFold scheme

    from sklearn.model_selection import KFold

    kf = KFold(n_splits = 5, shuffle = False, random_state = SEED)



    col_tr[col + '_cnt_month_mean_Kfold'] = global_mean

    for tr_ind, val_ind in kf.split(col_tr):

        X_tr, X_val = col_tr.iloc[tr_ind], col_tr.iloc[val_ind]

        means = X_val[col].map(X_tr.groupby(col)[Target].mean())

        X_val[col + '_cnt_month_mean_Kfold'] = means

        col_tr.iloc[val_ind] = X_val

        # X_val.head()

    col_tr.fillna(global_mean, inplace = True)

    corrcoefs.loc[col + '_cnt_month_mean_Kfold'] = np.corrcoef(y_tr, col_tr[col + '_cnt_month_mean_Kfold'])[0][1]



    # 3.1.2 Mean encodings - Leave-one-out scheme

    item_id_target_sum = col_tr.groupby(col)[Target].sum()

    item_id_target_count = col_tr.groupby(col)[Target].count()

    col_tr[col + '_cnt_month_sum'] = col_tr[col].map(item_id_target_sum)

    col_tr[col + '_cnt_month_count'] = col_tr[col].map(item_id_target_count)

    col_tr[col + '_target_mean_LOO'] = (col_tr[col + '_cnt_month_sum'] - col_tr[Target]) / (col_tr[col + '_cnt_month_count'] - 1)

    col_tr.fillna(global_mean, inplace = True)

    corrcoefs.loc[col + '_target_mean_LOO'] = np.corrcoef(y_tr, col_tr[col + '_target_mean_LOO'])[0][1]





    # 3.1.3 Mean encodings - Smoothing

    item_id_target_mean = col_tr.groupby(col)[Target].mean()

    item_id_target_count = col_tr.groupby(col)[Target].count()

    col_tr[col + '_cnt_month_mean'] = col_tr[col].map(item_id_target_mean)

    col_tr[col + '_cnt_month_count'] = col_tr[col].map(item_id_target_count)

    alpha = 100

    col_tr[col + '_cnt_month_mean_Smooth'] = (col_tr[col + '_cnt_month_mean'] *  col_tr[col + '_cnt_month_count'] + global_mean * alpha) / (alpha + col_tr[col + '_cnt_month_count'])

    col_tr[col + '_cnt_month_mean_Smooth'].fillna(global_mean, inplace=True)

    corrcoefs.loc[col + '_cnt_month_mean_Smooth'] = np.corrcoef(y_tr, col_tr[col + '_cnt_month_mean_Smooth'])[0][1]





    # 3.1.4 Mean encodings - Expanding mean scheme

    cumsum = col_tr.groupby(col)[Target].cumsum() - col_tr[Target]

    sumcnt = col_tr.groupby(col).cumcount()

    col_tr[col + '_cnt_month_mean_Expanding'] = cumsum / sumcnt

    col_tr[col + '_cnt_month_mean_Expanding'].fillna(global_mean, inplace=True)

    corrcoefs.loc[col + '_cnt_month_mean_Expanding'] = np.corrcoef(y_tr, col_tr[col + '_cnt_month_mean_Expanding'])[0][1]



    train = pd.concat([train, col_tr[corrcoefs['Cor'].idxmax()]], axis = 1)

    print(corrcoefs.sort_values('Cor'))





In [None]:
Validation = False
train = train.loc[:,~train.columns.duplicated()]

if Validation == False:

    test['date_block_num'] = 34

    all_data = pd.concat([train, test], axis = 0)

    all_data = all_data.drop(columns = ['ID'])

else:

    all_data = train



del train, test, col_tr

gc.collect()


all_data = downcast_dtypes(all_data)

In [None]:
all_data

In [None]:
# Adding Lags
index_cols = ['shop_id', 'item_id', 'item_category_id', 'item_cat_id_fix', 'date_block_num']

cols_to_rename = list(all_data.columns.difference(index_cols))

print(cols_to_rename)

shift_range = [1, 2, 3, 4, 12]



for month_shift in tqdm_notebook(shift_range):

    train_shift = all_data[index_cols + cols_to_rename].copy()



    train_shift['date_block_num'] = train_shift['date_block_num'] + month_shift



    foo = lambda x: '{}_lag_{}'.format(x, month_shift) if x in cols_to_rename else x

    train_shift = train_shift.rename(columns=foo)



    all_data = pd.merge(all_data, train_shift, on=index_cols, how='left').fillna(0)



del train_shift

gc.collect()

In [None]:
all_data = all_data[all_data['date_block_num'] >= 12] # Don't use old data from year 2013

lag_cols = [col for col in all_data.columns if col[-1] in [str(item) for item in shift_range]]

all_data = downcast_dtypes(all_data)

### 2.3 Creating date features --------------------------------------------------------

In [None]:
dates_train = sale_train[['date', 'date_block_num']].drop_duplicates()

dates_test = dates_train[dates_train['date_block_num'] == 34-12]
dates_test['date_block_num'] = 34
dates_test['date'] = dates_test['date'] + pd.DateOffset(years=1)
dates_all = pd.concat([dates_train, dates_test])

In [None]:
dates_all['dow'] = dates_all['date'].dt.dayofweek
dates_all['year'] = dates_all['date'].dt.year
dates_all['month'] = dates_all['date'].dt.month
dates_all = pd.get_dummies(dates_all, columns=['dow'])

In [None]:
dow_col = ['dow_' + str(x) for x in range(7)]

date_features = dates_all.groupby(['year', 'month', 'date_block_num'])[dow_col].agg('sum').reset_index()

date_features['days_of_month'] = date_features[dow_col].sum(axis=1)

date_features['year'] = date_features['year'] - 2013



date_features = date_features[['month', 'year', 'days_of_month', 'date_block_num']]

all_data = all_data.merge(date_features, on = 'date_block_num', how = 'left')

date_columns = date_features.columns.difference(set(index_cols))


### 2.4 Scale feature columns --------------------------------------------------------

In [32]:
from sklearn.preprocessing import StandardScaler

train = all_data[all_data['date_block_num'] != all_data['date_block_num'].max()]

test = all_data[all_data['date_block_num']== all_data['date_block_num'].max()]

sc = StandardScaler()

to_drop_cols = ['date_block_num']

feature_columns = list(set(lag_cols + index_cols + list(date_columns)).difference(to_drop_cols))



train[feature_columns] = sc.fit_transform(train[feature_columns])

test[feature_columns] = sc.transform(test[feature_columns])

all_data = pd.concat([train, test], axis = 0)

all_data = downcast_dtypes(all_data)



del train, test, date_features, sale_train

gc.collect()


0

### 3. First-level model ------------------------------------------------------------------

In [35]:
# Save `date_block_num`, as we can't use them as features, but will need them to split the dataset into parts

dates = all_data['date_block_num']

last_block = dates.max()

print('Test `date_block_num` is %d' % last_block)

print(feature_columns)

#start_first_level_total = time.perf_counter()

scoringMethod = 'r2'; from sklearn.metrics import mean_squared_error; from math import sqrt

Test `date_block_num` is 34
['item_cat_id_fix_cnt_month_mean_Expanding_lag_12', 'item_cat_id_fix_cnt_month_mean_Expanding_lag_4', 'item_cat_id_fix_cnt_month_mean_Expanding_lag_1', 'shop_id_cnt_month_mean_Expanding_lag_12', 'item_id', 'shop_id_cnt_month_mean_Expanding_lag_2', 'item_category_id_cnt_month_mean_Expanding_lag_1', 'item_id_cnt_month_mean_Expanding_lag_1', 'item_cnt_month_lag_4', 'item_cnt_month_lag_3', 'item_category_id_cnt_month_mean_Expanding_lag_4', 'item_id_cnt_month_mean_Expanding_lag_12', 'item_cat_id_fix', 'month', 'item_cnt_month_lag_12', 'shop_id_cnt_month_mean_Expanding_lag_3', 'shop_id_cnt_month_mean_Expanding_lag_4', 'shop_id', 'days_of_month', 'year', 'item_category_id_cnt_month_mean_Expanding_lag_3', 'shop_id_cnt_month_mean_Expanding_lag_1', 'item_cat_id_fix_cnt_month_mean_Expanding_lag_2', 'item_cnt_month_lag_2', 'item_cnt_month_lag_1', 'item_cat_id_fix_cnt_month_mean_Expanding_lag_3', 'item_id_cnt_month_mean_Expanding_lag_4', 'item_id_cnt_month_mean_Expanding

In [37]:
# Train meta-features M = 15 (12 + 15 = 27)

num_first_level_models = 3

months_to_generate_meta_features = range(27,last_block +1)

mask = dates.isin(months_to_generate_meta_features)

Target = 'item_cnt_month'

y_all_level2 = all_data[Target][mask].values

X_all_level2 = np.zeros([y_all_level2.shape[0], num_first_level_models])



In [40]:
# Now fill `X_train_level2` with metafeatures

slice_start = 0

range(27, 35)

In [42]:
from sklearn.linear_model import (LinearRegression, SGDRegressor)
import lightgbm as lgb

sgdr= SGDRegressor(

        penalty = 'l2' ,

        random_state = SEED )
 
estimators = [sgdr]
estimators

[SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.1,
              eta0=0.01, fit_intercept=True, l1_ratio=0.15,
              learning_rate='invscaling', loss='squared_loss', max_iter=1000,
              n_iter_no_change=5, penalty='l2', power_t=0.25, random_state=0,
              shuffle=True, tol=0.001, validation_fraction=0.1, verbose=0,
              warm_start=False)]

In [43]:
import time

start_time = time.time()

In [48]:
for cur_block_num in tqdm_notebook(months_to_generate_meta_features):

    print('-' * 50)

    print('Start training for month %d'% cur_block_num)
    
    start_cur_month = time.perf_counter()

    cur_X_train = all_data.loc[dates <  cur_block_num][feature_columns]

    cur_X_test =  all_data.loc[dates == cur_block_num][feature_columns]


    cur_y_train = all_data.loc[dates <  cur_block_num, Target].values

    cur_y_test =  all_data.loc[dates == cur_block_num, Target].values



    # Create Numpy arrays of train, test and target dataframes to feed into models

    train_x = cur_X_train.values

    train_y = cur_y_train.ravel()

    test_x = cur_X_test.values

    test_y = cur_y_test.ravel()



    preds = []


    #Modeling



    sgdr= SGDRegressor(

        penalty = 'l2' ,

        random_state = SEED )

    lgb_params = {

                  'feature_fraction': 0.75,

                  'metric': 'rmse',

                  'nthread':1,

                  'min_data_in_leaf': 2**7,

                  'bagging_fraction': 0.75,

                  'learning_rate': 0.03,

                  'objective': 'mse',

                  'bagging_seed': 2**7,

                  'num_leaves': 2**7,

                  'bagging_freq':1,

                  'verbose':0

                  }



    estimators = [sgdr]



    for estimator in estimators:

        print('Training Model %d: %s'%(len(preds), estimator.__class__.__name__))

        start = time.perf_counter()

        estimator.fit(train_x, train_y)

        pred_test = estimator.predict(test_x)

        preds.append(pred_test)

        # pred_train = estimator.predict(train_x)

        # print('Train RMSE for %s is %f' % (estimator.__class__.__name__, sqrt(mean_squared_error(cur_y_train, pred_train))))

        # print('Test RMSE for %s is %f' % (estimator.__class__.__name__, sqrt(mean_squared_error(cur_y_test, pred_test))))

        run = time.perf_counter() - start

        print('{} runs for {:.2f} seconds.'.format(estimator.__class__.__name__, run))

        print()





    print('Training Model %d: %s'%(len(preds), 'lightgbm'))

    #start = time.perf_counter()

    estimator = lgb.train(lgb_params, lgb.Dataset(train_x, label=train_y), 300)

    pred_test = estimator.predict(test_x)

    preds.append(pred_test)

    # pred_train = estimator.predict(train_x)

    # print('Train RMSE for %s is %f' % ('lightgbm', sqrt(mean_squared_error(cur_y_train, pred_train))))

    # print('Test RMSE for %s is %f' % ('lightgbm', sqrt(mean_squared_error(cur_y_test, pred_test))))

    run = time.perf_counter() - start

    print('{} runs for {:.2f} seconds.'.format('lightgbm', run))

    print()





    print('Training Model %d: %s'%(len(preds), 'keras'))

    start = time.perf_counter()

    from keras.models import Sequential

    from keras.layers import Dense

    from keras.wrappers.scikit_learn import KerasRegressor



    def baseline_model():

    	# create model

        model = Sequential()

        model.add(Dense(20, input_dim=train_x.shape[1], kernel_initializer='uniform', activation='softplus'))

        model.add(Dense(1, kernel_initializer='uniform', activation = 'relu'))

        # Compile model

        model.compile(loss='mse', optimizer='Nadam', metrics=['mse'])

        # model.compile(loss='mean_squared_error', optimizer='adam')

        return model



    estimator = KerasRegressor(build_fn=baseline_model, verbose=1, epochs=5, batch_size = 55000)



    estimator.fit(train_x, train_y)

    pred_test = estimator.predict(test_x)

    preds.append(pred_test)



    run = time.perf_counter() - start

    print('{} runs for {:.2f} seconds.'.format('lightgbm', run))





    cur_month_run_total = time.perf_counter() - start_cur_month

    print('Total running time was {:.2f} minutes.'.format(cur_month_run_total/60))

    print('-' * 50)



    slice_end = slice_start + cur_X_test.shape[0]

    X_all_level2[ slice_start : slice_end , :] = np.c_[preds].transpose()

    slice_start = slice_end


HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))

--------------------------------------------------
Start training for month 27
Training Model 0: SGDRegressor
SGDRegressor runs for 11.64 seconds.

Training Model 1: lightgbm
lightgbm runs for 194.07 seconds.

Training Model 2: keras
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
lightgbm runs for 19.27 seconds.
Total running time was 3.57 minutes.
--------------------------------------------------
--------------------------------------------------
Start training for month 28
Training Model 0: SGDRegressor
SGDRegressor runs for 10.80 seconds.

Training Model 1: lightgbm
lightgbm runs for 198.36 seconds.

Training Model 2: keras
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
lightgbm runs for 19.84 seconds.
Total running time was 3.65 minutes.
--------------------------------------------------
--------------------------------------------------
Start training for month 29
Training Model 0: SGDRegressor
SGDRegressor runs for 11.21 seconds.

Training Model 1: lightgbm
lightgbm runs f

In [61]:
# Split train and test

test_nrow = len(preds[0])

X_train_level2 = X_all_level2[ : -test_nrow, :]

X_test_level2 = X_all_level2[ -test_nrow: , :]

y_train_level2 = y_all_level2[ : -test_nrow]

y_test_level2 = y_all_level2[ -test_nrow : ]

In [63]:
# 4. Ensembling -------------------------------------------------------------------

pred_list = {}



# A. Second level learning model via linear regression

print('Training Second level learning model via linear regression')



from sklearn.linear_model import (LinearRegression, SGDRegressor)

lr = LinearRegression()

lr.fit(X_train_level2, y_train_level2)

# Compute R-squared on the train and test sets.

# print('Train R-squared for %s is %f' %('test_preds_lr_stacking', sqrt(mean_squared_error(y_train_level2, lr.predict(X_train_level2)))))

test_preds_lr_stacking = lr.predict(X_test_level2)

train_preds_lr_stacking = lr.predict(X_train_level2)

print('Train RMSE for %s is %f' %('train_preds_lr_stacking', sqrt(mean_squared_error(y_train_level2, train_preds_lr_stacking))))

Training Second level learning model via linear regression
Train RMSE for train_preds_lr_stacking is 0.815870


In [70]:
pred_list['test_preds_lr_stacking'] = test_preds_lr_stacking

if Validation:

    print('Test R-squared for %s is %f' %('test_preds_lr_stacking', sqrt(mean_squared_error(y_test_level2, test_preds_lr_stacking))))





# B. Second level learning model via SGDRegressor

print('Training Second level learning model via SGDRegressor')

sgdr= SGDRegressor(

    penalty = 'l2' ,

    random_state = SEED )

Training Second level learning model via SGDRegressor


In [71]:
sgdr.fit(X_train_level2, y_train_level2)

# Compute R-squared on the train and test sets.

# print('Train R-squared for %s is %f' %('test_preds_lr_stacking', sqrt(mean_squared_error(y_train_level2, lr.predict(X_train_level2)))))

test_preds_sgdr_stacking = sgdr.predict(X_test_level2)

train_preds_sgdr_stacking = sgdr.predict(X_train_level2)

print('Train R-squared for %s is %f' %('train_preds_lr_stacking', sqrt(mean_squared_error(y_train_level2, train_preds_sgdr_stacking))))


Train R-squared for train_preds_lr_stacking is 0.818250


In [72]:
pred_list['test_preds_sgdr_stacking'] = test_preds_sgdr_stacking

if Validation:

    print('Test R-squared for %s is %f' %('test_preds_sgdr_stacking', sqrt(mean_squared_error(y_test_level2, test_preds_sgdr_stacking))))


In [73]:
pred_list

{'test_preds_sgdr_stacking': array([0.50737028, 0.30767959, 0.82742931, ..., 0.04457132, 0.02347315,
        0.04805526]),
 'test_preds_lr_stacking': array([0.54395551, 0.31808356, 0.90842739, ..., 0.04801198, 0.02955192,
        0.05232132])}

# Submission -------------------------------------------------------------------

In [76]:
if not Validation:

    submission = pd.read_csv('sample_submission.csv')



    ver = 6

    for pred_ver in ['lr_stacking', 'sgdr_stacking']:

        print(pred_list['test_preds_' + pred_ver].clip(0,20).mean())

        submission['item_cnt_month'] = pred_list['test_preds_' + pred_ver].clip(0,20)

        submission[['ID', 'item_cnt_month']].to_csv('%d_%s.csv' % (ver, pred_ver), index = False)




0.28339887459977325
0.2613999374887801
