## Objectives:
* Step 1: prepare the training and evaluation data set
* Step 2: training with random forest

In [None]:
import pandas as pd
import numpy as np
import datetime
import calendar
from sklearn import preprocessing
from sklearn import feature_extraction
import itertools
from collections import OrderedDict

# When True, month 33 is held out of the training data as an evaluation set;
# when False, the whole history is used for training and no evaluation set is built.
training = True

#### Prepare the tables

In [None]:
# Load all the competition files (they must be in the ../input directory, next to the notebook's directory).
# Each entry maps a short table name to the DataFrame read from the matching CSV.
data_load = {
    'item_categories': pd.read_csv('../input/item_categories.csv'), 
    'items': pd.read_csv('../input/items.csv'), 
    'sales_train': pd.read_csv('../input/sales_train_v2.csv'),
    'sample_submission': pd.read_csv('../input/sample_submission.csv'),
    'shops': pd.read_csv('../input/shops.csv'),
    'test': pd.read_csv('../input/test.csv')
}

In [None]:
# Container for the derived train / evaluation / test frames.
Data = {}

# Sales data: parse the day.month.year strings once, then derive the calendar columns from them.
sale_dates = pd.to_datetime(data_load['sales_train']['date'], format="%d.%m.%Y")
data_load['sales_train']['date'] = sale_dates

#Data['sales']['day'] = transactions['date'].dt.day
data_load['sales_train']['month'] = sale_dates.dt.month
data_load['sales_train']['year'] = sale_dates.dt.year

In [None]:
# Spot check: show the raw daily sales rows of item 485 in shop 5.
data_load['sales_train'][(data_load['sales_train'].shop_id == 5) & (data_load['sales_train'].item_id == 485)]

In [None]:
# Summary statistics for every column of the raw sales table (numeric and non-numeric alike).
data_load['sales_train'].describe(include = 'all')

### Split the training/evaluation set, with similar pattern as the test set:
* All shops in both sets
* evaluation set on the last month
* unknown items in the evaluation set

In [None]:
import random

group_keys = ['date_block_num', 'shop_id', 'item_id']
sales_by_month = data_load['sales_train'].groupby(group_keys)

# Total number of items sold per (month, shop, item).
# The sum / reset_index(name=...) form replaces `.agg({'new_name': func})`: renaming
# through a dict on a single grouped column is rejected by recent pandas releases,
# while this form works on old and new versions alike.
Data['train'] = sales_by_month['item_cnt_day'].sum().reset_index(name='item_cnt_month')

# Average price per (month, shop, item). The mean is taken per sales line, not weighted
# by the number of items sold, to keep things simple (and avoid a division by 0).
tmp = sales_by_month['item_price'].mean().reset_index(name='item_mean_price_shop_month')

Data['train'] = Data['train'].merge(tmp,
                    how = 'left',
                    on = group_keys)

if training:

    # Split on date: the last month (date_block_num 33) becomes the evaluation set.
    # Explicit copies make both frames independent objects, so the later in-place
    # updates (.loc assignments) do not write into a slice of the original frame.
    condition = Data['train']['date_block_num'] == 33
    Data['evaluation'] = Data['train'][condition].copy()
    Data['train'] = Data['train'][~condition].copy()

    print("sizes:", Data['evaluation'].shape, Data['train'].shape)

    # The following part was disabled, because the evaluation set already presents a larger
    # proportion of unknown items than the test set.

    # Select 7.1% of the items to remove from the data set (363 out of 5100).
    # This may be too high, as some items could be appearing only in this last month: TO CHECK > in fact
    # the proportion of missing items is already too large.

    #list_item_ids = list(Data['train'].item_id.unique())
    #n = int(363/5100*21807)

    # Remove the items from training set

    #removed_item_ids = random.sample(list_item_ids, n)
    #print("Number of items removed from train set:", len(removed_item_ids))

    #condition = Data['train'].item_id.isin(removed_item_ids)
    #Data['train'] = Data['train'][~condition]

    #print(Data['evaluation'].shape, Data['train'].shape)

In [None]:
 # Spot check: monthly rows of item 485 in shop 5 in the training frame.
 Data['train'][( Data['train'].shop_id == 5) & ( Data['train'].item_id == 485)]

### Winsorization - on training set only to evaluate the effect on prediction

In [None]:
# Cap the monthly mean prices of the training set at their 99.9th percentile.
prices = Data['train'].item_mean_price_shop_month
price_limit = prices.quantile(0.999)
above_limit = prices > price_limit
Data['train'].loc[above_limit, 'item_mean_price_shop_month'] = price_limit

# Clipping of the counts must be done after the stats are calculated
#Data['train'].loc[:,'item_cnt_month'] = Data['train'].item_cnt_month.clip(0,20)

#### Add the missing rows

In [None]:
set_list = ['train', 'evaluation'] if training else ['train']

# One row per (month, year) pair together with its date_block_num.
months = data_load['sales_train'].groupby(['month', 'year'], as_index = False).date_block_num.first()

# Column order of the full grid; it must match the order of the factors given to itertools.product.
grid_columns = ['date_block_num', 'item_id', 'shop_id']

for set_name in set_list:

    # Complete the set with every (date_block_num, item_id, shop_id) combination built from
    # the values present in this set; the combinations without a sale get item_cnt_month = 0.

    print('Adding the missing rows for', set_name, 'set...')
    current = Data[set_name]
    axis_values = [sorted(current[column].unique()) for column in grid_columns]

    tmp = list(itertools.product(*axis_values))
    df = pd.DataFrame(tmp, columns=grid_columns)

    completed = df.merge(current,
                         how='left',
                         on=['shop_id', 'item_id', 'date_block_num'])

    # Add the month and year

    completed = completed.merge(months,
                                how='left',
                                on=['date_block_num'])

    # The new rows have no sale: their count is 0 (their price columns stay missing).

    completed['item_cnt_month'] = completed['item_cnt_month'].fillna(0)
    Data[set_name] = completed

In [None]:
# Spot check: the row of item 485 in shop 5 for month 6 after the grid completion.
Data['train'][ (Data['train'].date_block_num == 6) & ( Data['train'].shop_id == 5) & ( Data['train'].item_id == 485)]

In [None]:

### Add the lagged information on item count
print("Adding item count lagged values")

# Float defaults: the lagged counts written below are floats, so the columns keep one dtype.
Data['train']['item_cnt_month_minus_12'] = 0.0
Data['train']['item_cnt_month_minus_3'] = 0.0
Data['train']['item_cnt_month_minus_1'] = 0.0

# Go up to the last month actually present in the training frame (32 with an evaluation
# split, 33 without) instead of a hard-coded bound that skips month 33 on the full set.
last_month = int(Data['train'].date_block_num.max())

for it_month in range(12, last_month + 1):
    condition = Data['train'].date_block_num == it_month
    one_year_before = Data['train'].date_block_num == it_month - 12
    # Positional copy: this relies on every month holding the same (item, shop) grid in the
    # same order, which the "Add the missing rows" step builds.
    # `.values` replaces `as_matrix`, which no longer exists in recent pandas.
    Data['train'].loc[condition, 'item_cnt_month_minus_12'] = Data['train'].loc[one_year_before, 'item_cnt_month'].values


In [None]:
# Spot check: the row of item 485 in shop 5 for month 18, to eyeball the 12-month lag against month 6.
Data['train'][ (Data['train'].date_block_num == 18) & ( Data['train'].shop_id == 5) & ( Data['train'].item_id == 485)]

#### Add the other features

In [None]:
# 'train' must be first in those lists
set_list = ['train', 'test']
if training:
    set_list.append('evaluation')

# Prepare the test set: same key columns as the other sets, with constant date columns
# (month 11, year 2015, date_block_num 34).
print("Preparing the test set")

cols = ['ID', 'date_block_num', 'item_id', 'shop_id','month', 'year']

Data['test'] = data_load['test'].copy()
for column_name, constant in (('month', 11), ('year', 2015), ('date_block_num', 34)):
    Data['test'][column_name] = constant
Data['test'] = Data['test'][cols]

#print(Data['test'].columns.tolist())

### Add the item_category

print("Adding item category stats")

# Attach the category of every item; the item name itself is not used as a feature.
for set_name in set_list:
    Data[set_name] = Data[set_name].merge(data_load['items'],
            how = 'left',
            on = 'item_id',
            ).drop(['item_name'], axis = 1)

# Stats over the items and the months, per (shop, category), computed on the train set only.
# A list of functions followed by an explicit rename replaces `.agg({'new_name': func})`:
# renaming through a dict on a single grouped column is rejected by recent pandas releases.
tmp = (Data['train']
       .groupby(['shop_id', 'item_category_id'])['item_cnt_month']
       .agg(['mean', 'max'])
       .rename(columns={'mean': 'category_cnt_month_mean',
                        'max': 'category_cnt_month_max'})
       .reset_index())

for set_name in set_list:
    Data[set_name] = Data[set_name].merge(tmp,
            how = 'left',
            on = ['item_category_id', 'shop_id']
            )

# A (shop, category) pair absent from the train set keeps a NaN here
# (handled when creating the X/y and after prediction).

### Add the lagged information on item count
print("Adding item count lagged values")

# One order for the lag columns everywhere, so that the train frame and the
# evaluation / test frame end up with their feature columns in the same order.
lags = [12, 3, 1]
lag_keys = ['item_id', 'shop_id']

for lag in lags:
    # Float default: the lagged counts written below are floats.
    Data['train']['item_cnt_month_minus_' + str(lag)] = 0.0

# Go up to the last month present in the training frame (32 with an evaluation split,
# 33 without) instead of a hard-coded bound that skips month 33 on the full set.
last_month = int(Data['train'].date_block_num.max())

for it_month in range(12, last_month + 1):
    condition = Data['train'].date_block_num == it_month
    for lag in lags:
        earlier = Data['train'].date_block_num == (it_month - lag)
        # Positional copy: this relies on every month holding the same (item, shop) grid in
        # the same order, which the "Add the missing rows" step builds.
        # `.values` replaces `as_matrix`, which no longer exists in recent pandas.
        Data['train'].loc[condition, 'item_cnt_month_minus_' + str(lag)] = Data['train'].loc[earlier, 'item_cnt_month'].values

# The set to predict gets its lags by key: its (item, shop) rows are not the train grid.
# With an evaluation split the target is the evaluation month (33), otherwise the test
# month (34). NOTE: in training mode the test frame receives no lag columns.
if training:
    target_name, target_month = 'evaluation', 33
else:
    target_name, target_month = 'test', 34

for lag in lags:
    print(lag)
    lag_column = 'item_cnt_month_minus_' + str(lag)
    source_month = Data['train'].date_block_num == (target_month - lag)
    # Keep the merge keys and rename BEFORE merging: the evaluation frame already has an
    # 'item_cnt_month' column, so a late rename would hit the suffixed duplicates.
    tmp = (Data['train']
           .loc[source_month, lag_keys + ['item_cnt_month']]
           .rename(columns={'item_cnt_month': lag_column}))
    # (item, shop) pairs unknown to the train set keep a NaN lag.
    Data[target_name] = Data[target_name].merge(tmp,
            how = 'left',
            on = lag_keys
            )

  
    
### Add the stats on item_cnt_month for every couple store,item
# Only known values are for the train set

print("Adding item count stats")

# Stats over the months, per (shop, item).
# A list of functions followed by an explicit rename replaces `.agg({'new_name': func})`:
# renaming through a dict on a single grouped column is rejected by recent pandas releases.
tmp = (Data['train']
       .groupby(['shop_id', 'item_id'])['item_cnt_month']
       .agg(['mean', 'max'])
       .rename(columns={'mean': 'item_cnt_month_mean',
                        'max': 'item_cnt_month_max'})
       .reset_index())

# Warning: if the couple (item, shop) is not present in the train set for any month:
# * for the merge with the train set: this produces a 0 (all couples exist in the grid) > those rows
#   are removed later, as they are all the same and cannot be used to learn anything
# * for the merge with the evaluation/test set: this produces a NaN > those lines will be replaced by stats
# > this is wrong, the system cannot learn; we have to replace those lines in the eval/test after prediction

for set_name in set_list:
    Data[set_name] = Data[set_name].merge(tmp,
            how = 'left',
            on = ['item_id', 'shop_id']
            )
    
# Add the average price per item and shop, over the train data set (average price will be missing for some items)

print("Adding shop item price stats")

# Average over the months of the train set per item and shop.
# The mean is not weighted by the sales.
# mean / reset_index(name=...) replaces `.agg({'new_name': func})`: renaming through a
# dict on a single grouped column is rejected by recent pandas releases.
tmp = (Data['train']
       .groupby(['item_id', 'shop_id'])['item_mean_price_shop_month']
       .mean()
       .reset_index(name='item_shop_mean_price'))

for set_name in set_list:
    Data[set_name] = Data[set_name].merge(tmp,
            how = 'left',
            on = ['item_id', 'shop_id']
            )

# We remove the couples (item, shop) without any sale from the training set:
# they never had a price, so their mean price is missing.
condition = pd.isnull(Data['train'].item_shop_mean_price)
Data['train'] = Data['train'][~condition]

    #missing_shop_item_indices = pd.isnull(Data[set_name]['item_shop_mean_price'])
    #Data[set_name].loc[missing_shop_item_indices, 'item_shop_mean_price'] = -999

print("Adding overall item price stats")

# Add the average price per item over the train data set (average price will be missing for some items)
# The mean is not weighted by the sales
# mean / reset_index(name=...) replaces `.agg({'new_name': func})`: renaming through a
# dict on a single grouped column is rejected by recent pandas releases.
tmp = (Data['train']
       .groupby(['item_id'])['item_mean_price_shop_month']
       .mean()
       .reset_index(name='item_overall_mean_price'))

for set_name in set_list:
    Data[set_name] = Data[set_name].merge(tmp,
            how = 'left',
            on = ['item_id']
            )
    # The item with no sale in the training set will be addressed after prediction
    # missing_item_indices = pd.isnull(Data[set_name]['item_overall_mean_price'])
    # Data[set_name].loc[missing_item_indices, 'item_overall_mean_price'] = -999

# Remove the item_id TO PUT BACK

#for set_name in set_list:
#    Data[set_name] = Data[set_name].drop(['item_id', 'item_category_id'], axis = 1)

# The per-month price was only needed to build the price stats above; the test set never
# had this column, so it is dropped from the other sets.
Data['train'] = Data['train'].drop(['item_mean_price_shop_month'], axis = 1)
if training:
    Data['evaluation'] = Data['evaluation'].drop(['item_mean_price_shop_month'], axis = 1)

print("Done!")
#Data['evaluation'].head(10)

In [None]:
# Debug cell: shows the frame used as lag source for the test month (34).
# It relies on `lag` still holding the value left by the loops of the previous cell.
tmp = Data['train'][Data['train'].date_block_num == (34-lag)][['item_cnt_month']]
print(lag)
tmp

In [None]:
# Sanity check on the sizes of the evaluation and training frames after feature engineering.
if training:
    print(Data['evaluation'].shape, Data['train'].shape)

#### Save data to save memory

In [None]:
import pickle
import gc

# Run a garbage-collection pass before serialising the frames.
gc.collect()

DATA_LEARNING_FILE = "../data/sales-20180128g"
DATA_EVALUATION_FILE = "../data/evaluation-20180128g"
DATA_TEST_FILE = "../data/test-20180128g"

# Frames to write, in order: train, evaluation (only when it exists), test.
pickle_targets = [('train', DATA_LEARNING_FILE)]
if training:
    pickle_targets.append(('evaluation', DATA_EVALUATION_FILE))
pickle_targets.append(('test', DATA_TEST_FILE))

for frame_name, frame_path in pickle_targets:
    Data[frame_name].to_pickle(frame_path)

In [None]:
# 20180127c : with windsorization 0.999 , removing rows for store, item without any sale
# 20180127d : idem without windsorization 
# 20180127e : idem with windsorization 0.9999
# 20180127f : idem with windsorization 0.99
# 20180127g : idem with windsorization 0.99, with full set
# 20180127h : idem with windsorization 0.999, with full set
# 20180128c : windsorization on train only 0.999, with train/eval split
# 20180128d : windsorization on train only 0.99, with train/eval split
# 20180128e : no windsorization, with train/eval split
# 20180128f : with windsor 0.999 on price, with train/eval split
# 20180128g : with windsor 0.999 on price, without eval

### Restart it to retrieve data (optional)

In [None]:
import pickle

training = True

DATA_LEARNING_FILE = "../data/sales-20180128f"
DATA_EVALUATION_FILE = "../data/evaluation-20180128f"

# Rebuild the container from the pickled frames; the evaluation frame only exists
# for runs made with an evaluation split.
Data = {'train': pd.read_pickle(DATA_LEARNING_FILE)}
if training:
    Data['evaluation'] = pd.read_pickle(DATA_EVALUATION_FILE)

In [None]:
# Summary statistics of the numeric columns of the training frame.
Data['train'].describe()

#### Create train/eval set

In [None]:
clipping  = True

# Random split
#from sklearn.model_selection import train_test_split
#train_set, test_set = train_test_split(Data['train'], test_size = 0.2, random_state = 42)

# Drop the first year (date_block_num 0 to 11) to decrease training time, then shuffle the rows.
after_first_year = Data['train'].date_block_num > 11
Data['train'] = Data['train'][after_first_year].sample(frac=1).reset_index(drop=True)

# Target: the monthly count, optionally clipped to [0, 20].
y_train = Data['train'].item_cnt_month
if clipping:
    y_train = y_train.clip(0,20)

# Features: every column except the target.
x_train = Data['train'].drop(['item_cnt_month'], axis = 1)

# I should remove the evaluation prediction rows with missing category

if training:
    y_eval = Data['evaluation'].item_cnt_month
    x_eval = Data['evaluation'].drop(['item_cnt_month'], axis = 1)

# The frames are no longer needed once the X/y objects exist.
del Data

In [None]:
from sklearn.metrics import mean_squared_error

def _rmse(y_true, y_pred):
    """Return the root mean squared error between two equally sized 1-D inputs."""
    difference = np.asarray(y_true, dtype=float) - np.asarray(y_pred, dtype=float)
    return float(np.sqrt(np.mean(difference ** 2)))


def model_evaluation(model_reg, x_train, y_train, x_test, y_test, use_average = True): 
    """Print and return the train and evaluation RMSE of a fitted regressor.

    Parameters
    ----------
    model_reg : fitted estimator exposing ``predict``.
    x_train, y_train : features and target the model was trained on.
    x_test : DataFrame of evaluation features; missing values are replaced by 0
        before prediction. When ``use_average`` is True it must contain the
        ``item_shop_mean_price`` and ``category_cnt_month_mean`` columns.
    y_test : evaluation target.
    use_average : when True, rows whose ``item_shop_mean_price`` is missing get
        ``category_cnt_month_mean`` as prediction instead of the model output.

    Returns
    -------
    (rmse_train, rmse_test) : tuple of floats. The evaluation error is computed
        on targets and predictions both clipped to [0, 20].
    """
    rmse_train = _rmse(y_train, model_reg.predict(x_train))

    # The predictions share x_test's index and every mask below is a plain boolean array,
    # so the replacements are positional and work whatever index x_test carries.
    sales_predictions = pd.DataFrame({'pred': model_reg.predict(x_test.fillna(0))},
                                     index=x_test.index)

    if use_average:
        # replace the shop, item rows with no values, with the average on the category
        missing_shop_item_rows = pd.isnull(x_test.item_shop_mean_price).values
        print('Missing lines for shop,items: ', int(missing_shop_item_rows.sum()))
        sales_predictions.loc[missing_shop_item_rows, 'pred'] = (
            x_test.loc[missing_shop_item_rows, 'category_cnt_month_mean'].values)

    # replace the shop, category with no values, with 0 (the shop is not selling this category)
    missing_shop_category_rows = pd.isnull(sales_predictions.pred).values
    print('Missing lines for shop,category: ', int(missing_shop_category_rows.sum()))
    sales_predictions.loc[missing_shop_category_rows, 'pred'] = 0

    rmse_test = _rmse(np.clip(np.asarray(y_test, dtype=float), 0, 20),
                      np.clip(sales_predictions.pred.values, 0, 20))

    print("train error: ", '{0:.3f}'.format(rmse_train), "evaluation error: ", '{0:.3f}'.format(rmse_test))
    return rmse_train, rmse_test

#### Training with Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Only the settings that differ from the library defaults are passed. The original call
# spelled out every default as well, including `criterion='mse'` and
# `min_impurity_split=None`, which recent scikit-learn releases no longer accept.
df_reg = RandomForestRegressor(max_depth=20,
                               max_features=2,
                               min_samples_split=3,
                               n_estimators=100,
                               n_jobs=5)
df_reg.fit(x_train, y_train)

In [None]:
# with 20180127c
# max_depth=15, n_estimators=20, max_features: 3 : 1.004 / 1.403
# max_depth=20, n_estimators=20, max_features: 2 : 0.839 / 1.399
# max_depth=20, n_estimators=20, max_features: 3 : 0.818 / 1.394
# max_depth=20, n_estimators=50, max_features: 3 : 0.834 / 1.391 > best
# max_depth=25, n_estimators=20, max_features: 3 : 0.657 / 1.404
# max_depth=20, n_estimators=20, max_features: 4 : 0.795 / 1.410

# with 20180127d, max_depth=20, n_estimators=50, max_features: 3
# train error: 1.104 evaluation error:  4.596 > windsorization is essential!!

# with 20180127e, max_depth=20, n_estimators=25, max_features: 3
# train error:  0.949 evaluation error:  1.612 still not as good

# with 20180127f, max_depth=20, n_estimators=25, max_features: 3
# train error:  0.696 evaluation error:  1.027 > best but beware, 
#I may just be reducing the variance of the evaluation set :)  
# let check with the Kaggle :) 
# with clip 0-20 train error:  0.692 evaluation error:  0.924 (meaningless as it was windsored)

# with 20180127h, max_depth=20, n_estimators=50, max_features: 3, windsor:0.999, full set
# kaggle: 1.41

# with 20180127g, max_depth=20, n_estimators=50, max_features: 3, windsor:0.99, full set
# kaggle: 1.40 > best


# Conclusion: we have brought the results much closer 0.7 / 1.0 / 1.4
# but we are still higher than before ??? 
# does the category do help or harm?

# with 20180127f, removing category stat, max_depth=20, n_estimators=25, max_features: 3, windsor:0.99, full set
# 0.677 evaluation error:  1.043 : slightly worse

# with 20180128d, max_depth=20, n_estimators=25, max_features: 3, windsor on train 0.99
# train error:  0.601 evaluation error:  5.237
# with clipping 0-20: train error:  0.607 evaluation error:  0.979

# with 20180128c, max_depth=20, n_estimators=25, max_features: 3, windsor on train 0.999
# train error:  0.743 evaluation error:  5.039
# with clipping 0-20: train error:  0.743 evaluation error:  0.996

# with 20180128e, max_depth=20, n_estimators=25, max_features: 3, no windsor on train
# train error:  1.095 evaluation error:  4.702
# with clipping 0-20 at evaluation (y_eval and pred): train error:  1.094 evaluation error:  1.002
# with clipping 0-20 of y_train train error:  0.616 evaluation error:  0.975 > best

# with 20180128f, max_depth=20, n_estimators=25, max_features: 3,  windsor 0.999 on price only, clipping 0-20 on train
# train error:  0.616 evaluation error:  0.973 > best

# with 20180128g, max_depth=20, n_estimators=100, max_features: 3,  windsor 0.999 on price only, clipping 0-20 on train
# no eval
# kaggle: 1.02119

In [None]:
# Persist the fitted random forest so it can be reloaded later without retraining.
# NOTE(review): `sklearn.externals.joblib` only exists in older scikit-learn releases -
# confirm the installed version, or switch to the standalone `joblib` package.
from sklearn.externals import joblib
joblib.dump(df_reg, '../models/randomforest_20180128g.pkl')
#df_reg = joblib.load('../models/randomforest_20180127g.pkl') 

In [None]:
# Feature importances of the fitted forest, in the column order of x_train.
importances = df_reg.feature_importances_
importances

### Xgboost

In [None]:
import xgboost as xgb

# You can experiment with many other options here, using the same .fit() and .predict()
# methods; see http://scikit-learn.org
# This example uses the current build of XGBoost, from https://github.com/dmlc/xgboost
xgb_settings = dict(max_depth=3,
                    n_estimators=300,
                    learning_rate=0.05,
                    n_jobs=5)
gbm = xgb.XGBRegressor(**xgb_settings)
gbm.fit(x_train, y_train)

# Report the train / evaluation error when an evaluation set exists.
if training:
    model_evaluation(gbm, x_train, y_train, x_eval, y_eval, use_average = True)

#### Optimizing the hyperparameters

In [None]:
from sklearn.model_selection import GridSearchCV

# Search space: 3 x 3 x 3 = 27 combinations of forest size, features per split and depth.
param_grid = [
    {'n_estimators': [10,50, 100], 'max_features': [2,4,6], 'max_depth': [5, 10, 20]},
]

forest_reg = RandomForestRegressor()

# 5-fold cross-validation scored on the negated mean squared error, run on 4 jobs.
grid_search = GridSearchCV(forest_reg, param_grid, cv=5, n_jobs = 4,
                          scoring='neg_mean_squared_error')

grid_search.fit(x_train, y_train)

In [None]:
# Evaluate the best forest found by the grid search on the held-out evaluation month.
# The original call used x_test / y_test, which are only assigned in a later cell (from a
# `test_set` that exists only in commented-out code); x_eval / y_eval are the hold-out
# data this notebook actually builds, and they only exist when `training` is True.
if training:
    model_evaluation(grid_search.best_estimator_, x_train, y_train, x_eval, y_eval)

In [None]:
# Show the hyperparameters of the best estimator found by the grid search.
grid_search.best_estimator_

#### Training with GBRT

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosted regression trees with a learning rate of 0.1; every other setting is the library default.
gbrt_reg = GradientBoostingRegressor(learning_rate = 0.1)
gbrt_reg.fit(x_train, y_train)

In [None]:
from sklearn.metrics import mean_squared_error

# The original cell built x_test / y_test from `test_set`, which only exists in
# commented-out code (the random train/test split), so it failed with a NameError.
# The hold-out data this notebook builds is the evaluation month; it only exists
# when `training` is True.
if training:
    x_test = x_eval
    y_test = y_eval

    model_evaluation(gbrt_reg, x_train, y_train, x_test, y_test)

## Result

* 20180120-01 with default randomForestRegressor, only 4 features, eval: 4.90 (not better through grid search)
* idem, but adding the item_category, eval: 5.4 vs 2.42 (train set)
* with GBRT, eval = 7.1 vs 7.36(train set) > not working
* with Random Forest, adding the min, max, mean of sales per store,item eval 4.86 vs 2.04
* idem, removing the item_id: train error:  2.16 test_error:  4.64
* idem, selecting the best estimator: train error:  1.93 test_error:  4.56
* adding the average price per item, per shop and overall, max_features 3, n_estimators=50 train error:  train error:  1.98 test_error:  4.45 (best)
* with one hot encoding of the shop_id: train error:  1.97 test_error:  4.90
* with one hot encoding of the item_category_id: train error:  2.07 test_error:  4.79
* with windsorization of the count_item_day, and the price the results improve a lot: train error:  1.08 test_error:  2.91
* adding the 0 values: train error:  0.33 test_error:  0.94 > Kaggle 1.27 (without putting missing items to 0; this does not actually change a thing)
* item category is useless > split and provide some price per category
* with new train/test set, not removing the new item prediction: train error:  0.34 test_error:  1.94 / Kaggle: 2.61 (I did not remove the 0 prediction there, nor retrain on the full set)
* with category stats randomforest_20180126b: train_error:  0.35 test_error:  1.75 Kaggle: 2.4 
* idem, removing missing items: Kaggle: 2.42 it is worse, removing missing category is better!
* with full training: Kaggle: 2.58!!! maybe I am just tragically overfitting
* trying to reduce overfitting by putting max_features = 2/ min_samples_leaf=2: train error:  0.657 test_error:  1.805
* removing the mean, removing the artificial item deletion from validation set: train error:  0.339 evaluation error:  1.768 Kaggle: 

* with removal of missing shop, item rows, and replacement by average and 0, 100 estimators: train error:  0.400 evaluation error:  1.417

#### Submission preparation

In [None]:
import pickle

DATA_TEST_FILE = "../data/test-20180128g"

# Start from a fresh container holding only the pickled test features.
Data = {'test': pd.read_pickle(DATA_TEST_FILE)}

Data['test'].head(20)

In [None]:
use_average = True

X_test = Data['test'].drop(['ID'], axis = 1)

# Option 1
# It trusts the model to learn from the category count for the missing items -
# for the missing category, I set the prediction to 0
# missing_shop_item_indices = pd.isnull(Data['test']['item_cnt_month_mean'])
# This is not improving the score!

# The predictions share X_test's index and the masks below are plain boolean arrays,
# so the replacements are positional and work whatever index the test frame carries.
predictions = pd.DataFrame({'pred': df_reg.predict(X_test.fillna(0))}, index=X_test.index)

if use_average:
    # replace the shop, item rows with no values, with the average on the category
    missing_shop_item_rows = pd.isnull(X_test.item_shop_mean_price).values
    print('Missing lines for shop,items: ', int(missing_shop_item_rows.sum()))
    predictions.loc[missing_shop_item_rows, 'pred'] = (
        X_test.loc[missing_shop_item_rows, 'category_cnt_month_mean'].values)

# for the shops without the category, replace with 0

missing_shop_category_rows = pd.isnull(predictions.pred).values
print('Missing lines for shop,category: ', int(missing_shop_category_rows.sum()))
predictions.loc[missing_shop_category_rows, 'pred'] = 0

# Clip once every replacement is done, as model_evaluation does: clipping only the raw
# model output let the category averages written afterwards escape the [0, 20] range.
predictions['pred'] = predictions['pred'].clip(0, 20)

# Create the submission file:

submission = data_load['sample_submission'].copy()

# Positional assignment: assumes the test rows are in the same order as the sample
# submission rows (the original relied on both frames having the same default index).
submission.loc[:, 'item_cnt_month'] = predictions['pred'].values
submission.head(10)

In [None]:
# Write the submission as CSV, without the DataFrame index column.
SUBMISSION_FILE = "../data/sales_sub_20180128g.csv"

submission.to_csv(SUBMISSION_FILE, index = False)

### One-hot encoding (optional, only for DNN)

In [None]:
# One hot encoding of shop_id, then of item_category_id.
# The test set has only a few shops, so we have to use the scikit-learn OneHotEncoder:
# it is fitted on the train set and the same encoding is applied to both sets.

for encoded_column in ['shop_id', 'item_category_id']:
    cols = [encoded_column]

    enc = preprocessing.OneHotEncoder()

    # FIT
    enc.fit(Data['train'][cols])

    # Transform
    for set_name in ['train', 'test']:
        dense_codes = enc.transform(Data[set_name][cols]).toarray()
        vec_data = pd.DataFrame(dense_codes)
        vec_data.columns = [encoded_column + "_" + str(i) for i in range(enc.feature_indices_[1])]
        vec_data.index = Data[set_name].index
        # Swap the original column for its indicator columns.
        Data[set_name] = Data[set_name].drop(cols, axis=1).join(vec_data)

In [None]:
# Summary statistics of the training target.
y_train.describe()

In [None]:
from sklearn.metrics import mean_squared_error


# Baseline: predict the same constant (0.28) for every evaluation row and compute its RMSE.
sales_predictions = pd.DataFrame(Data['evaluation'].item_cnt_month.copy())
sales_predictions.item_cnt_month = 0.28



mse = mean_squared_error(Data['evaluation'].item_cnt_month, sales_predictions.item_cnt_month)
rmse_test = np.sqrt(mse)
rmse_test

In [None]:
# Summary statistics of the training target.
y_train.describe()

In [None]:
# Lagged item counts (1, 3 and 12 months back).
lags = [12, 3, 1]
lag_keys = ['item_id', 'shop_id']

for lag in lags:
    # Float default: the lagged counts written below are floats.
    Data['train']['item_cnt_month_minus_' + str(lag)] = 0.0

# Go up to the last month present in the training frame (32 with an evaluation split,
# 33 without) instead of a hard-coded upper bound.
last_month = int(Data['train'].date_block_num.max())

for month in range(12, last_month + 1):
    condition = Data['train'].date_block_num == month
    for lag in lags:
        earlier = Data['train'].date_block_num == (month - lag)
        # Only the item_cnt_month values of the earlier month are copied: the original
        # assigned the whole filtered DataFrame to a single column.
        # Positional copy: relies on every month holding the same (item, shop) grid in the
        # same order.
        Data['train'].loc[condition, 'item_cnt_month_minus_' + str(lag)] = Data['train'].loc[earlier, 'item_cnt_month'].values

if training:
    # The evaluation rows are not the train grid, so the lags are looked up by
    # (item, shop) with a merge; pairs unknown to the train set get 0.
    for lag in lags:
        lag_column = 'item_cnt_month_minus_' + str(lag)
        source_month = Data['train'].date_block_num == (33 - lag)
        lagged = (Data['train']
                  .loc[source_month, lag_keys + ['item_cnt_month']]
                  .rename(columns={'item_cnt_month': lag_column}))
        Data['evaluation'] = (Data['evaluation']
                              .drop([lag_column], axis=1, errors='ignore')
                              .merge(lagged, how='left', on=lag_keys))
        Data['evaluation'][lag_column] = Data['evaluation'][lag_column].fillna(0)

# Merge version for evaluation and test
Data['test']['item_cnt_month_minus_12'] = 0
Data['test']['item_cnt_month_minus_3'] = 0
Data['test']['item_cnt_month_minus_1'] = 0 