## Objectives:
* Step 1: prepare the training and evaluation data set
* Step 2: training with random forest

In [1]:
import pandas as pd
import numpy as np
import datetime
import calendar
from sklearn import preprocessing
from sklearn import feature_extraction
import itertools
from collections import OrderedDict

# To use part or all the train set
training = True

#### Prepare the tables

In [2]:
#Load all Files (hey must be in input directory in a brother directory of the notebook)
data_load = {
    'item_categories': pd.read_csv('../input/item_categories.csv'), 
    'items': pd.read_csv('../input/items.csv'), 
    'sales_train': pd.read_csv('../input/sales_train_v2.csv'),
    'sample_submission': pd.read_csv('../input/sample_submission.csv'),
    'shops': pd.read_csv('../input/shops.csv'),
    'test': pd.read_csv('../input/test.csv')
}

In [3]:
Data = {}

# Sales data 
data_load['sales_train']['date'] = pd.to_datetime(data_load['sales_train']['date'], format = "%d.%m.%Y")

#Data['sales']['day'] = transactions['date'].dt.day
data_load['sales_train']['month'] = data_load['sales_train']['date'].dt.month
data_load['sales_train']['year'] = data_load['sales_train']['date'].dt.year

In [None]:
data_load['sales_train'].describe(include = 'all')

In [None]:
 Data['train'][( Data['train'].shop_id == 5) & ( Data['train'].item_id == 485)]

### Split the training/evaluation set, with similar pattern as the test set:
* All shops in both sets
* evaluation set on the last month
* unknown items in the evaluation set

In [4]:
Data['train'] = data_load['sales_train'].groupby(['date_block_num', 'shop_id', 'item_id'], as_index = False).agg({
    'item_price': np.mean,
    'item_cnt_day': np.sum
}).rename(columns = {'item_cnt_day': 'item_shop_count',
          'item_price': 'item_shop_price' })

if training: 
    
    # Split on date to create the evaluation set
    
    print("Preparing the evaluation set")
    
    condition = Data['train']['date_block_num']==33
    Data['evaluation'] = Data['train'][condition]
    Data['train'] = Data['train'][~condition]
    
    print("sizes:" ,Data['evaluation'].shape, Data['train'].shape)
    
else:
    
    # Prepare the test set
    
    print("Preparing the test set")

    Data['test'] = data_load['test'].copy()
    Data['test']['month'] = 11
    Data['test']['year'] = 2015
    Data['test']['date_block_num'] = 34

    Data['test'] = Data['test'][cols]

Preparing the evaluation set
sizes: (31531, 5) (1577593, 5)


In [None]:
 Data['train'][( Data['train'].shop_id == 5) & ( Data['train'].item_id == 485)]

### Winsorization - on training set only to evaluate the effect on prediction

In [5]:
price_limit = Data['train'].item_shop_price.quantile([0.0, 0.999])[0.999]
prices = Data['train'].item_shop_price
Data['train'].loc[(prices > price_limit), 'item_shop_price'] = price_limit

#### Add the missing rows

In [6]:
if training:
    set_list = ['train', 'evaluation']
else:
    set_list = ['train']
    
months = data_load['sales_train'].groupby(['month', 'year'], as_index = False).date_block_num.first()

for set_name in set_list:

    # Add the missing rows date_block_num,shop_id, item_id with item_cnt_month = 0
    
    print('Adding the missing rows for', set_name, 'set...')
    
    # From the assignment of week 3
    # It differs from old method, because we do not consider the shops without sales or the items not sold during a month
    # whereas before we were considering all combinations, and afterwards we were removing the couple shop, item
    # with no sales on the full period: that means 10M rows vs 14M and a mean value of 0.3343 which is too high compare to 
    # the test set: 0.28
    # However, it solves the issue of items not sold for months impacting the average
    index_cols = ['shop_id', 'item_id', 'date_block_num']

    # For every month we create a grid from all shops/items combinations from that month
    grid = [] 
    for block_num in Data[set_name]['date_block_num'].unique():
        cur_shops = Data[set_name][Data[set_name]['date_block_num']==block_num]['shop_id'].unique()
        cur_items = Data[set_name][Data[set_name]['date_block_num']==block_num]['item_id'].unique()
        grid.append(np.array(list(itertools.product(*[cur_shops, cur_items, [block_num]])),dtype='int32'))

    #turn the grid into pandas dataframe
    grid = pd.DataFrame(np.vstack(grid), columns = index_cols,dtype=np.int32)

    #join aggregated data to the grid
    Data[set_name] = pd.merge(grid,Data[set_name],how='left',on=index_cols)
    #sort the data
    Data[set_name].sort_values(['date_block_num','shop_id','item_id'],inplace=True)

    # Add the month and year
    
    Data[set_name] = Data[set_name].merge(months,
                   how = 'left',
                   on = ['date_block_num'])
    
    # For new rows missing values:
    
    Data[set_name].item_shop_count = Data[set_name].item_shop_count.fillna(0)

Adding the missing rows for train set...
Adding the missing rows for evaluation set...


#### Add the lagged features

In [7]:
# From http://mlwhiz.com/blog/2017/12/26/How_to_win_a_data_science_competition/

lag_variables  = ['item_shop_count', 'item_shop_price']
lags = [1, 3 , 6 , 12]
for lag in lags:
    
    print('Adding lag:', lag)
    
    train_df = Data['train'].copy()
    train_df.date_block_num+=lag
    train_df = train_df[['date_block_num','shop_id','item_id']+lag_variables]
    train_df.columns = ['date_block_num','shop_id','item_id']+ [lag_feat+'_lag_'+str(lag) for lag_feat in lag_variables]
    Data['train'] = pd.merge(Data['train'], train_df,on=['date_block_num','shop_id','item_id'] ,how='left')
    
    if training:
        Data['evaluation'] = pd.merge(Data['evaluation'], train_df,on=['date_block_num','shop_id','item_id'] ,how='left')
    else:
        Data['test'] = pd.merge(Data['test'], train_df,on=['date_block_num','shop_id','item_id'] ,how='left')

Adding lag: 1
Adding lag: 3
Adding lag: 6
Adding lag: 12


In [None]:
Data['train'][-25:]

In [None]:
a = Data['train'].copy()
b = Data['evaluation'].copy()

In [None]:
Data['train'] = a.copy()
Data['evaluation'] = b.copy()

#### Add the other features

In [9]:
def add_mean_encoding(enc_cols, 
                      mean_encodings, 
                      label_suffix, 
                      price_rename, 
                      count_rename, 
                      train_set, 
                      test_set):
             
    # Create the encoding on the train set
    tmp = Data[train_set].groupby(enc_cols, as_index = False).agg(mean_encodings)\
        .rename(columns = {'item_shop_price': price_rename,
                            'item_shop_count': count_rename})
    tmp.columns = [col[0] if col[-1]=='' else col[0] + '_' + col[-1] + label_suffix for col in tmp.columns.values]

    # Add the encoding on each set
 
    Data[train_set] = Data[train_set].merge(tmp,
            how = 'left',
            on = enc_cols
            )
    
    Data[test_set] =  Data[test_set].merge(tmp,
            how = 'left',
            on = enc_cols
            )

In [10]:
# Measures to use for the mean encoding
mean_encodings = {'item_shop_price':['max','mean'], 'item_shop_count':['max', 'mean']}

if training:
    test_set = 'evaluation'
else:
    test_set = 'test'
    
for set_name in ['train', test_set]:
    Data[set_name] = Data[set_name].merge(data_load['items'],
            how = 'left',
            on = 'item_id',
            ).drop(['item_name'], axis = 1)

### Add the mean encoding per item_category

print("Adding shop category stats")

add_mean_encoding(enc_cols=['shop_id', 'item_category_id'],
                  mean_encodings = mean_encodings,
                  label_suffix='_over_months',
                  price_rename='category_shop_price',
                  count_rename='category_shop_count',
                  train_set = 'train',
                  test_set = test_set)
    
# Add the average price per item and shop, over the train data set (average price will be missing for some items)

print("Adding shop item stats")

add_mean_encoding(enc_cols=['shop_id', 'item_id'],
                  mean_encodings = mean_encodings,
                  label_suffix='_over_months',
                  price_rename='item_shop_price',
                  count_rename='item_shop_count',
                  train_set = 'train',
                  test_set = test_set)

# Add the average price per item over the train data set (average price will be missing for some items)

print("Adding overall item stats")  

add_mean_encoding(enc_cols=['shop_id', 'item_id'],
                  mean_encodings = mean_encodings,
                  label_suffix='_over_all',
                  price_rename='item_price',
                  count_rename='item_count',
                  train_set = 'train',
                  test_set = test_set)

# Add the mean encodings per shop over all

print("Adding overall shop stats")   

add_mean_encoding(enc_cols=['shop_id', 'item_id'],
                  mean_encodings = mean_encodings,
                  label_suffix='_over_all',
                  price_rename='shop_price',
                  count_rename='shop_count',
                  train_set = 'train',
                  test_set = test_set)

# Remove the item_id

for set_name in set_list:
    Data[set_name] = Data[set_name].drop(['item_id', 'item_category_id', 'shop_id'], axis = 1)
    
Data['train'] = Data['train'].drop(['item_shop_price'], axis = 1)
if training:
    Data['evaluation'] = Data['evaluation'].drop(['item_shop_price'], axis = 1)

print("Done!")

Adding item category stats
Adding shop item stats
Adding overall item stats
Adding overall shop stats
Done!


#### Reorder the columns alphabetically to avoid issues with columns position

In [11]:
if training:
    set_list = ['train','evaluation']
else:
    set_list = ['train','test']

for set_name in set_list:
    Data[set_name].sort_index(axis=1, inplace=True)

#### Save data to save memory

In [12]:
import pickle
import gc
gc.collect()

case = '20180201a'

DATA_LEARNING_FILE = "../data/sales-" + case
DATA_EVALUATION_FILE = "../data/evaluation-" + case
DATA_TEST_FILE = "../data/test-" + case

Data['train'].to_pickle(DATA_LEARNING_FILE)
if training:
    Data['evaluation'].to_pickle(DATA_EVALUATION_FILE)
else:
    Data['test'].to_pickle(DATA_TEST_FILE)

In [None]:
# 20180127c : with windsorization 0.999 , removing rows for store, item without any sale
# 20180127d : idem without windsorization 
# 20180127e : idem with windsorization 0.9999
# 20180127f : idem with windsorization 0.99
# 20180127g : idem with windsorization 0.99, with full set
# 20180127h : idem with windsorization 0.999, with full set
# 20180128c : windsorization on train only 0.999, with train/eval split
# 20180128d : windsorization on train only 0.99, with train/eval split
# 20180128e : no windsorization, with train/eval split
# 20180128f : with windsor 0.999 on price, with train/eval split
# 20180128g : with windsor 0.999 on price, without eval
# 201801230a : with windsor 0.999 on price, with eval, with lagged item_count, shop_id, item_category_id
# 201801230b : with windsor 0.999 on price, without eval, with lagged item_count, shop_id, item_category_id
# 20180201a : with new mean encoding, lagged features, no shop,item, category ids

### Restart it to retrieve data (optional)

In [2]:
import pickle

training = True

case = '20180201a'

DATA_LEARNING_FILE = "../data/sales-" + case
DATA_EVALUATION_FILE = "../data/evaluation-" + case

Data = {}

Data['train'] = pd.read_pickle(DATA_LEARNING_FILE)
if training:
    Data['evaluation'] = pd.read_pickle(DATA_EVALUATION_FILE)   

In [None]:
Data['train'].describe()

#### Create train/eval set

In [14]:
clipping  = True

# Random split
#from sklearn.model_selection import train_test_split
#train_set, test_set = train_test_split(Data['train'], test_size = 0.2, random_state = 42)

# Remove the first year to decrease training time (and after we will add delayed values)
Data['train'] = Data['train'][Data['train'].date_block_num >11].sample(frac=1).reset_index(drop=True)

x_train = Data['train'].drop(['item_shop_count'], axis = 1)

if clipping:
    y_train = Data['train'].item_shop_count.clip(0,20)
else:
    y_train = Data['train'].item_shop_count

# I should remove the evaluation prediction rows with missings category

if training:
    x_eval = Data['evaluation'].drop(['item_shop_count'], axis = 1)
    y_eval = Data['evaluation'].item_shop_count

del(Data)

In [17]:
x_train.describe()

Unnamed: 0,category_shop_count_max_over_months,category_shop_count_mean_over_months,category_shop_price_max_over_months,category_shop_price_mean_over_months,date_block_num,item_count_max_over_all,item_count_mean_over_all,item_price_max_over_all,item_price_mean_over_all,item_shop_count_lag_1,...,item_shop_price_lag_3,item_shop_price_lag_6,item_shop_price_max_over_months,item_shop_price_mean_over_months,month,shop_count_max_over_all,shop_count_mean_over_all,shop_price_max_over_all,shop_price_mean_over_all,year
count,6186922.0,6186922.0,5554142.0,5554142.0,6186922.0,6186922.0,6186922.0,3750817.0,3750817.0,4898597.0,...,855295.0,818884.0,3750817.0,3750817.0,6186922.0,6186922.0,6186922.0,3750817.0,3750817.0,6186922.0
mean,31.27808,0.3387926,3239.089,754.8547,21.1578,2.126528,0.4021299,853.61,766.5754,0.4190263,...,847.795518,798.105608,853.61,766.5754,5.798793,2.126528,0.4021299,853.61,766.5754,2014.363
std,68.58849,1.969598,3178.09,1178.192,5.869434,9.731047,2.795261,1548.723,1457.555,3.892317,...,1562.323323,1448.916012,1548.723,1457.555,3.288499,9.731047,2.795261,1548.723,1457.555,0.4809363
min,0.0,0.0,3.0,3.0,12.0,0.0,-0.2,0.09,0.09,-2.0,...,0.5,0.1,0.09,0.09,1.0,0.0,-0.2,0.09,0.09,2014.0
25%,5.0,0.06407323,999.0,276.2345,16.0,0.0,0.0,229.0,201.7273,0.0,...,199.33,199.0,229.0,201.7273,3.0,0.0,0.0,229.0,201.7273,2014.0
50%,13.0,0.1534579,2299.0,397.7126,21.0,1.0,0.08695652,399.0,349.0,0.0,...,399.0,399.0,399.0,349.0,6.0,1.0,0.08695652,399.0,349.0,2014.0
75%,29.0,0.3439967,3799.0,985.7859,26.0,2.0,0.3333333,990.0,826.6477,0.0,...,959.0,899.0,990.0,826.6477,8.0,2.0,0.3333333,990.0,826.6477,2015.0
max,1644.0,470.2619,22490.0,22317.5,32.0,1644.0,650.1,22490.0,22490.0,1305.0,...,22490.0,22490.0,22490.0,22490.0,12.0,1644.0,650.1,22490.0,22490.0,2015.0


In [33]:
from sklearn.metrics import mean_squared_error

def model_evaluation(model_reg, x_train, y_train, x_test, y_test, use_average = True): 
    sales_predictions = model_reg.predict(x_train.fillna(-999))
    mse = mean_squared_error(y_train, sales_predictions)
    rmse_train = np.sqrt(mse)

    sales_predictions = pd.DataFrame({'pred': model_reg.predict(x_test.fillna(-999))})
    
    if use_average:
        # replace the shop, item rows with no values, with the average on the category 
        missing_shop_item_rows = pd.isnull(x_test.item_shop_price_mean_over_months)
        print('Missing lines for shop,items: ', len(x_test[missing_shop_item_rows]))
        sales_predictions.loc[missing_shop_item_rows, 'pred'] = x_test[missing_shop_item_rows].category_shop_count_mean_over_months
    
        # replace the shop, category with no values, with 0 (the shop is not selling this category)
        missing_shop_category_rows = pd.isnull(sales_predictions.pred)  
        print('Missing lines for shop,category: ', len(x_test[missing_shop_category_rows]))
        sales_predictions.loc[missing_shop_category_rows, 'pred'] = 0
    
    mse = mean_squared_error(y_test.clip(0,20), sales_predictions.pred.clip(0,20))
    rmse_test = np.sqrt(mse)

    print("train error: ", '{0:.3f}'.format(rmse_train), "evaluation error: ", '{0:.3f}'.format(rmse_test))

#### Training with Random Forest Regressor

In [21]:
from sklearn.ensemble import RandomForestRegressor

df_reg = RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=20,
           max_features=2, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=3, min_weight_fraction_leaf=0.0,
           n_estimators=25, n_jobs=5, oob_score=False, random_state=None,
           verbose=0, warm_start=False)
df_reg.fit(x_train.fillna(-999), y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=20,
           max_features=2, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=3, min_weight_fraction_leaf=0.0,
           n_estimators=25, n_jobs=5, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [34]:
model_evaluation(df_reg, x_train, y_train, x_eval, y_eval)

Missing lines for shop,items:  121115
Missing lines for shop,category:  5559
train error:  0.537 evaluation error:  1.011


In [28]:
x_eval.columns

Index(['category_shop_count_max_over_months',
       'category_shop_count_mean_over_months',
       'category_shop_price_max_over_months',
       'category_shop_price_mean_over_months', 'date_block_num',
       'item_count_max_over_all', 'item_count_mean_over_all',
       'item_price_max_over_all', 'item_price_mean_over_all',
       'item_shop_count_lag_1', 'item_shop_count_lag_12',
       'item_shop_count_lag_3', 'item_shop_count_lag_6',
       'item_shop_count_max_over_months', 'item_shop_count_mean_over_months',
       'item_shop_price_lag_1', 'item_shop_price_lag_12',
       'item_shop_price_lag_3', 'item_shop_price_lag_6',
       'item_shop_price_max_over_months', 'item_shop_price_mean_over_months',
       'month', 'shop_count_max_over_all', 'shop_count_mean_over_all',
       'shop_price_max_over_all', 'shop_price_mean_over_all', 'year'],
      dtype='object')

In [None]:
# with 20180127c
# max_depth=15, n_estimators=20, max_features: 3 : 1.004 / 1.403
# max_depth=20, n_estimators=20, max_features: 2 : 0.839 / 1.399
# max_depth=20, n_estimators=20, max_features: 3 : 0.818 / 1.394
# max_depth=20, n_estimators=50, max_features: 3 : 0.834 / 1.391 > best
# max_depth=25, n_estimators=20, max_features: 3 : 0.657 / 1.404
# max_depth=20, n_estimators=20, max_features: 4 : 0.795 / 1.410

# with 20180127d, max_depth=20, n_estimators=50, max_features: 3
# train error: 1.104 evaluation error:  4.596 > windsorization is essential!!

# with 20180127e, max_depth=20, n_estimators=25, max_features: 3
# train error:  0.949 evaluation error:  1.612 still not as good

# with 20180127f, max_depth=20, n_estimators=25, max_features: 3
# train error:  0.696 evaluation error:  1.027 > best but beware, 
#I may just be reducing the variance of the evaluation set :)  
# let check with the Kaggle :) 
# with clip 0-20 train error:  0.692 evaluation error:  0.924 (meaningless as it was windsored)

# with 20180127h, max_depth=20, n_estimators=50, max_features: 3, windsor:0.999, full set
# kaggle: 1.41

# with 20180127g, max_depth=20, n_estimators=50, max_features: 3, windsor:0.99, full set
# kaggle: 1.40 > best

# Conclusion: we have brought the results much closer 0.7 / 1.0 / 1.4
# but we are still higher than before ??? 
# does the category do help or harm?

# with 20180127f, removing category stat, max_depth=20, n_estimators=25, max_features: 3, windsor:0.99, full set
# 0.677 evaluation error:  1.043 : slightly worse

# with 20180128d, max_depth=20, n_estimators=25, max_features: 3, windsor on train 0.99
# train error:  0.601 evaluation error:  5.237
# with clipping 0-20: train error:  0.607 evaluation error:  0.979

# with 20180128c, max_depth=20, n_estimators=25, max_features: 3, windsor on train 0.999
# train error:  0.743 evaluation error:  5.039
# with clipping 0-20: train error:  0.743 evaluation error:  0.996

# with 20180128e, max_depth=20, n_estimators=25, max_features: 3, no windsor on train
# train error:  1.095 evaluation error:  4.702
# with clipping 0-20 at evaluation (y_eval and pred): train error:  1.094 evaluation error:  1.002
# with clipping 0-20 of y_train train error:  0.616 evaluation error:  0.975 > best

# with 20180128f, max_depth=20, n_estimators=25, max_features: 3,  windsor 0.999 on price only, clipping 0-20 on train
# train error:  0.616 evaluation error:  0.973 > best

# with 20180128g, max_depth=20, n_estimators=100, max_features: 3,  windsor 0.999 on price only, clipping 0-20 on train
# no eval
# kaggle: 1.02119

# with 20180130a, max_depth=20, n_estimators=25, max_features: 3,  with lagged features, item_id, category_item_id
# 0.567 evaluation error:  0.962 > overfitted...

# with 20180130b, max_depth=20, n_estimators=25, max_features: 3,  with lagged features, without item_id, category_item_id
# 0.579 evaluation error:  0.968 > actually slightly worse
# Kaggle:

# with 20180201a, train error:  0.537 evaluation error:  1.011 > overfit
# idem Xgboost eta = 0.3,max_depth=4, n_estimators=300, learning_rate=0.05:  0.876 evaluation error:  0.977

In [None]:
from sklearn.externals import joblib
joblib.dump(df_reg, '../models/randomforest_20180128g.pkl')
#df_reg = joblib.load('../models/randomforest_20180127g.pkl') 

In [None]:
importances = df_reg.feature_importances_
importances

### Xgboost

In [None]:
import xgboost as xgb

# You can experiment with many other options here, using the same .fit() and .predict()
# methods; see http://scikit-learn.org
# This example uses the current build of XGBoost, from https://github.com/dmlc/xgboost

eval_set = [(x_eval, y_eval)]

gbm = xgb.XGBRegressor(eta = 0.3,
                       max_depth=5, 
                       n_estimators=300, 
                       learning_rate=0.05,
                       n_jobs = 5).fit(x_train, y_train, eval_metric="rmse", eval_set=eval_set, verbose=True)

if training:
    model_evaluation(gbm, x_train, y_train, x_eval, y_eval, use_average = True)

#### Optimizing the hyperparameters

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'n_estimators': [10,50, 100], 'max_features': [2,4,6], 'max_depth': [5, 10, 20]},
]

forest_reg = RandomForestRegressor()

grid_search = GridSearchCV(forest_reg, param_grid, cv=5, n_jobs = 4,
                          scoring='neg_mean_squared_error')

grid_search.fit(x_train, y_train)

In [None]:
model_evaluation(grid_search.best_estimator_, x_train, y_train, x_test, y_test)

In [None]:
grid_search.best_estimator_

#### Training with GBRT

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

gbrt_reg = GradientBoostingRegressor(learning_rate = 0.1)
gbrt_reg.fit(x_train, y_train)

In [None]:
from sklearn.metrics import mean_squared_error

x_test =  test_set.drop(['item_cnt_month'], axis = 1)
y_test = test_set.item_cnt_month

model_evaluation(gbrt_reg, x_train, y_train, x_test, y_test)

## Result

* 20180120-01 with default randomForestRegressor, only 4 features, eval: 4.90 (not better through grid search)
* idem, but adding the item_category, eval: 5.4 vs 2.42 (train set)
* with GBRT, eval = 7.1 vs 7.36(train set) > not working
* with Random Forest, adding the min, max, mean of sales per store,item eval 4.86 vs 2.04
* idem, removing the item_id: train error:  2.16 test_error:  4.64
* idem, selecting the best estimator: train error:  1.93 test_error:  4.56
* adding the average price per item, per shop and overall, max_features 3, n_estimators=50 train error:  train error:  1.98 test_error:  4.45 (best)
* with one hot encoding of the shop_id: train error:  1.97 test_error:  4.90
* with one hot encoding of the item_category_id: train error:  2.07 test_error:  4.79
* with windsorization of the count_item_day, and the price the results improve a lot: train error:  1.08 test_error:  2.91
* adding the 0 values: train error:  0.33 test_error:  0.94 > kaggle 1.27 without putting missing items to 0 this does not change a thing actually)
* item category is useless > split and provide some price per category
* with new train/test set, not removing the new item prediction: train error:  0.34 test_error:  1.94 / Kaggle: 2.61 (I did not remove the 0 prediction there, nor retrain on the full set)
* with category stats randomforest_20180126b: tain_error:  0.35 test_error:  1.75 Kaggle: 2.4 
* idem, removing missing items: Kaggle: 2.42 it is worse, removing missing category is better!
* with full training: Kaggle: 2.58!!! maybe I am just tragically overfitting
* trying to reduce overfitting by putting max_features = 2/ min_samples_leaf=2: train error:  0.657 test_error:  1.805
* removing the mean, removing the artificial item deletion from validation set: train error:  0.339 evaluation error:  1.768 Kaggle: 

* with removal of missing shop, item rows, and replacement by avrage and 0, 100 estimators: train error:  0.400 evaluation error:  1.417

#### Submission preparation

In [None]:
import pickle

DATA_TEST_FILE = "../data/test-201801230b"

Data = {}

Data['test'] = pd.read_pickle(DATA_TEST_FILE)

Data['test'].head(20)

In [None]:
use_average = True

X_test = Data['test'].drop(['ID'], axis = 1)

# Option 1
# It trust the model will learn from the category count for the missing item - 
# for the missing category, I set the prediction to 0
# missing_shop_item_indices = pd.isnull(Data['test']['item_cnt_month_mean'])
# This is not improving the score!

predictions = pd.DataFrame({'pred': df_reg.predict(X_test.fillna(0)).clip(0,20)})

# replace the shop, item rows with no values, with the average on the category

if use_average:
        # replace the shop, item rows with no values, with the average on the category 
        missing_shop_item_rows = pd.isnull(X_test.item_shop_mean_price)
        print('Missing lines for shop,items: ', len(X_test[missing_shop_item_rows]))
        predictions.loc[missing_shop_item_rows, 'pred'] = X_test[missing_shop_item_rows].category_cnt_month_mean

# for the shop with not category, replace with 

missing_shop_category_rows = pd.isnull(predictions.pred)
print('Missing lines for shop,category: ', len(X_test[missing_shop_category_rows]))
predictions.loc[missing_shop_category_rows, 'pred'] = 0

# Create the submission file:

submission = data_load['sample_submission'].copy()

submission.loc[:, 'item_cnt_month'] = predictions.pred
submission.head(10)

In [None]:
SUBMISSION_FILE = "../data/sales_sub_20180130b.csv"

submission.to_csv(SUBMISSION_FILE, index = False)

### One-hot encoding (optional, only for DNN)

In [None]:
# One hot encoding of shop_id
# The test set has only a few shops, so we have to use scikitlearn onehotencoder

cols = ['shop_id']

enc = preprocessing.OneHotEncoder()

# FIT
enc.fit(Data['train'][cols])

# Transform
for set_name in ['train', 'test']:
    vec_data= pd.DataFrame(enc.transform(Data[set_name][cols]).toarray())
    vec_data.columns = ["shop_id_" + str(i) for i in range(enc.feature_indices_[1])]
    vec_data.index = Data[set_name].index
    Data[set_name] = Data[set_name].drop(cols, axis=1)
    Data[set_name] = Data[set_name].join(vec_data)
    
# One hot encoding of item_category_id
# The test set has only a few shops, so we have to use scikitlearn onehotencoder

cols = ['item_category_id']

enc = preprocessing.OneHotEncoder()

# FIT
enc.fit(Data['train'][cols])

# Transform
for set_name in ['train', 'test']:
    vec_data= pd.DataFrame(enc.transform(Data[set_name][cols]).toarray())
    vec_data.columns = ["item_category_id_" + str(i) for i in range(enc.feature_indices_[1])]
    vec_data.index = Data[set_name].index
    Data[set_name] = Data[set_name].drop(cols, axis=1)
    Data[set_name] = Data[set_name].join(vec_data)   

In [None]:
y_train.describe()

In [None]:
from sklearn.metrics import mean_squared_error


sales_predictions = pd.DataFrame(Data['evaluation'].item_cnt_month.copy())
sales_predictions.item_cnt_month = 0.28



mse = mean_squared_error(Data['evaluation'].item_cnt_month, sales_predictions.item_cnt_month)
rmse_test = np.sqrt(mse)
rmse_test

In [None]:
y_train.describe()

In [None]:
Data['train']['item_cnt_month_minus_12'] = 0
Data['train']['item_cnt_month_minus_3'] = 0
Data['train']['item_cnt_month_minus_1'] = 0 

for month in range(12, 33,1):
    condition = Data['train'].date_block_num == month
    Data['train'].loc[condition, 'item_cnt_month_minus_12'] = Data['train'][Data['train'].date_block_num == (month-12)]
    Data['train'].loc[condition, 'item_cnt_month_minus_3'] = Data['train'][Data['train'].date_block_num == (month-3)]
    Data['train'].loc[condition, 'item_cnt_month_minus_1'] = Data['train'][Data['train'].date_block_num == (month-1)]

if training:
    Data['evaluation']['item_cnt_month_minus_12'] = 0
    Data['evaluation']['item_cnt_month_minus_3'] = 0
    Data['evaluation']['item_cnt_month_minus_1'] = 0 

    Data['evaluation'].loc[:, 'item_cnt_month_minus_12'] = Data['train'][Data['train'].date_block_num == (33-12)]
    Data['evaluation'].loc[:, 'item_cnt_month_minus_3'] = Data['train'][Data['train'].date_block_num == (33-3)]
    Data['evaluation'].loc[:, 'item_cnt_month_minus_1'] = Data['train'][Data['train'].date_block_num == (33-1)]    

# Merge version for evaluation and test
Data['test']['item_cnt_month_minus_12'] = 0
Data['test']['item_cnt_month_minus_3'] = 0
Data['test']['item_cnt_month_minus_1'] = 0 