## Objectives:
* Step 1: prepare the training and evaluation data set
* Step 2: training with random forest

In [6]:
import pandas as pd
import numpy as np
import datetime
import calendar
from sklearn import preprocessing
from sklearn import feature_extraction
import itertools
from collections import OrderedDict

# To use part or all the train set
training = True

#### Prepare the tables

In [None]:
#Load all Files (hey must be in input directory in a brother directory of the notebook)
data_load = {
    'item_categories': pd.read_csv('../input/item_categories.csv'), 
    'items': pd.read_csv('../input/items.csv'), 
    'sales_train': pd.read_csv('../input/sales_train_v2.csv'),
    'sample_submission': pd.read_csv('../input/sample_submission.csv'),
    'shops': pd.read_csv('../input/shops.csv'),
    'test': pd.read_csv('../input/test.csv')
}

In [None]:
Data = {}

# Sales data 
data_load['sales_train']['date'] = pd.to_datetime(data_load['sales_train']['date'], format = "%d.%m.%Y")

#Data['sales']['day'] = transactions['date'].dt.day
data_load['sales_train']['month'] = data_load['sales_train']['date'].dt.month
data_load['sales_train']['year'] = data_load['sales_train']['date'].dt.year

In [None]:
data_load['sales_train'].describe(include = 'all')

### Split the training/evaluation set, with similar pattern as the test set:
* All shops in both sets
* evaluation set on the last month
* unknown items in the evaluation set

In [None]:
Data['train'] = data_load['sales_train'].groupby(['date_block_num', 'shop_id', 'item_id'], as_index = False).agg({
    'item_price': np.mean,
    'item_cnt_day': np.sum
}).rename(columns = {'item_cnt_day': 'item_shop_count',
          'item_price': 'item_shop_price' })

if training: 
    
    # Split on date to create the evaluation set
    
    print("Preparing the evaluation set")
    
    condition = Data['train']['date_block_num']==33
    Data['evaluation'] = Data['train'][condition]
    Data['train'] = Data['train'][~condition]
    
    print("sizes:" ,Data['evaluation'].shape, Data['train'].shape)
    
else:
    
    # Prepare the test set
    
    print("Preparing the test set")

    Data['test'] = data_load['test'].copy()
    Data['test']['month'] = 11
    Data['test']['year'] = 2015
    Data['test']['date_block_num'] = 34

    cols = ['ID', 'date_block_num', 'item_id', 'shop_id','month', 'year']
    Data['test'] = Data['test'][cols]

In [None]:
 Data['train'][( Data['train'].shop_id == 5) & ( Data['train'].item_id == 485)]

### Winsorization - on training set only to evaluate the effect on prediction

In [None]:
price_limit = Data['train'].item_shop_price.quantile([0.0, 0.999])[0.999]
prices = Data['train'].item_shop_price
Data['train'].loc[(prices > price_limit), 'item_shop_price'] = price_limit

#### Add the missing rows

In [None]:
if training:
    set_list = ['train', 'evaluation']
else:
    set_list = ['train']
    
months = data_load['sales_train'].groupby(['month', 'year'], as_index = False).date_block_num.first()

for set_name in set_list:

    # Add the missing rows date_block_num,shop_id, item_id with item_cnt_month = 0
    
    print('Adding the missing rows for', set_name, 'set...')
    
    # From the assignment of week 3
    # It differs from old method, because we do not consider the shops without sales or the items not sold during a month
    # whereas before we were considering all combinations, and afterwards we were removing the couple shop, item
    # with no sales on the full period: that means 10M rows vs 14M and a mean value of 0.3343 which is too high compare to 
    # the test set: 0.28
    # However, it solves the issue of items not sold for months impacting the average
    index_cols = ['shop_id', 'item_id', 'date_block_num']

    # For every month we create a grid from all shops/items combinations from that month
    grid = [] 
    for block_num in Data[set_name]['date_block_num'].unique():
        cur_shops = Data[set_name][Data[set_name]['date_block_num']==block_num]['shop_id'].unique()
        cur_items = Data[set_name][Data[set_name]['date_block_num']==block_num]['item_id'].unique()
        grid.append(np.array(list(itertools.product(*[cur_shops, cur_items, [block_num]])),dtype='int32'))

    #turn the grid into pandas dataframe
    grid = pd.DataFrame(np.vstack(grid), columns = index_cols,dtype=np.int32)

    #join aggregated data to the grid
    Data[set_name] = pd.merge(grid,Data[set_name],how='left',on=index_cols)
    #sort the data
    Data[set_name].sort_values(['date_block_num','shop_id','item_id'],inplace=True)

    # Add the month and year
    
    Data[set_name] = Data[set_name].merge(months,
                   how = 'left',
                   on = ['date_block_num'])
    
    # For new rows missing values:
    
    Data[set_name].item_shop_count = Data[set_name].item_shop_count.fillna(0)

#### Add the lagged features

In [None]:
# From http://mlwhiz.com/blog/2017/12/26/How_to_win_a_data_science_competition/

lag_variables  = ['item_shop_count', 'item_shop_price']
lags = [1, 3 , 6 , 12]
for lag in lags:
    
    print('Adding lag:', lag)
    
    train_df = Data['train'].copy()
    train_df.date_block_num+=lag
    train_df = train_df[['date_block_num','shop_id','item_id']+lag_variables]
    train_df.columns = ['date_block_num','shop_id','item_id']+ [lag_feat+'_lag_'+str(lag) for lag_feat in lag_variables]
    Data['train'] = pd.merge(Data['train'], train_df,on=['date_block_num','shop_id','item_id'] ,how='left')
    
    if training:
        Data['evaluation'] = pd.merge(Data['evaluation'], train_df,on=['date_block_num','shop_id','item_id'] ,how='left')
    else:
        Data['test'] = pd.merge(Data['test'], train_df,on=['date_block_num','shop_id','item_id'] ,how='left')

In [None]:
Data['train'][-25:]

In [None]:
a = Data['train'].copy()
b = Data['evaluation'].copy()

In [None]:
Data['train'] = a.copy()
Data['evaluation'] = b.copy()

#### Add the other features

In [None]:
def add_mean_encoding(enc_cols, 
                      mean_encodings, 
                      label_suffix, 
                      price_rename, 
                      count_rename, 
                      train_set, 
                      test_set):
             
    # Create the encoding on the train set
    tmp = Data[train_set].groupby(enc_cols, as_index = False).agg(mean_encodings)\
        .rename(columns = {'item_shop_price': price_rename,
                            'item_shop_count': count_rename})
    tmp.columns = [col[0] if col[-1]=='' else col[0] + '_' + col[-1] + label_suffix for col in tmp.columns.values]

    # Add the encoding on each set
 
    Data[train_set] = Data[train_set].merge(tmp,
            how = 'left',
            on = enc_cols
            )
    
    Data[test_set] =  Data[test_set].merge(tmp,
            how = 'left',
            on = enc_cols
            )

In [None]:
# Measures to use for the mean encoding
mean_encodings = {'item_shop_price':['max','mean'], 'item_shop_count':['max', 'mean']}

if training:
    test_set = 'evaluation'
else:
    test_set = 'test'
    
for set_name in ['train', test_set]:
    Data[set_name] = Data[set_name].merge(data_load['items'],
            how = 'left',
            on = 'item_id',
            ).drop(['item_name'], axis = 1)

### Add the mean encoding per item_category

print("Adding shop category stats")

add_mean_encoding(enc_cols=['shop_id', 'item_category_id'],
                  mean_encodings = mean_encodings,
                  label_suffix='_over_months',
                  price_rename='category_shop_price',
                  count_rename='category_shop_count',
                  train_set = 'train',
                  test_set = test_set)
    
# Add the average price per item and shop, over the train data set (average price will be missing for some items)

print("Adding shop item stats")

add_mean_encoding(enc_cols=['shop_id', 'item_id'],
                  mean_encodings = mean_encodings,
                  label_suffix='_over_months',
                  price_rename='item_shop_price',
                  count_rename='item_shop_count',
                  train_set = 'train',
                  test_set = test_set)

# Add the average price per item over the train data set (average price will be missing for some items)

print("Adding overall item stats")  

add_mean_encoding(enc_cols=['item_id'],
                  mean_encodings = mean_encodings,
                  label_suffix='_over_all',
                  price_rename='item_price',
                  count_rename='item_count',
                  train_set = 'train',
                  test_set = test_set)

# Add the mean encodings per shop over all

print("Adding overall shop stats")   

add_mean_encoding(enc_cols=['shop_id'],
                  mean_encodings = mean_encodings,
                  label_suffix='_over_all',
                  price_rename='shop_price',
                  count_rename='shop_count',
                  train_set = 'train',
                  test_set = test_set)

# Remove the item_id

for set_name in ['train', test_set]:
    Data[set_name] = Data[set_name].drop(['item_id', 'item_category_id', 'shop_id'], axis = 1)
    
Data['train'] = Data['train'].drop(['item_shop_price'], axis = 1)
if training:
    Data['evaluation'] = Data['evaluation'].drop(['item_shop_price'], axis = 1)

print("Done!")

#### Reorder the columns alphabetically to avoid issues with columns position

In [None]:
if training:
    set_list = ['train','evaluation']
else:
    set_list = ['train','test']

for set_name in set_list:
    Data[set_name].sort_index(axis=1, inplace=True)

In [None]:
Data['test'].head(10)

#### Save data to save memory

In [None]:
import pickle
import gc
gc.collect()

case = '20180202b'

DATA_LEARNING_FILE = "../data/sales-" + case
DATA_EVALUATION_FILE = "../data/evaluation-" + case
DATA_TEST_FILE = "../data/test-" + case

Data['train'].to_pickle(DATA_LEARNING_FILE)
if training:
    Data['evaluation'].to_pickle(DATA_EVALUATION_FILE)
else:
    Data['test'].to_pickle(DATA_TEST_FILE)

In [None]:
# 20180127c : with windsorization 0.999 , removing rows for store, item without any sale
# 20180127d : idem without windsorization 
# 20180127e : idem with windsorization 0.9999
# 20180127f : idem with windsorization 0.99
# 20180127g : idem with windsorization 0.99, with full set
# 20180127h : idem with windsorization 0.999, with full set
# 20180128c : windsorization on train only 0.999, with train/eval split
# 20180128d : windsorization on train only 0.99, with train/eval split
# 20180128e : no windsorization, with train/eval split
# 20180128f : with windsor 0.999 on price, with train/eval split
# 20180128g : with windsor 0.999 on price, without eval
# 201801230a : with windsor 0.999 on price, with eval, with lagged item_count, shop_id, item_category_id
# 201801230b : with windsor 0.999 on price, without eval, with lagged item_count, shop_id, item_category_id
# 20180201a : with new mean encoding, lagged features, no shop,item, category ids
# there was a mistake on the previous set, for the calculation of the item_price_mean_over_all 
# 20180202a: with proper calculation
# 20180202b: to prepare test set

### Restart it to retrieve data (optional)

In [7]:
import pickle

training = True

case = '20180202a'

DATA_LEARNING_FILE = "../data/sales-" + case
DATA_EVALUATION_FILE = "../data/evaluation-" + case

Data = {}

Data['train'] = pd.read_pickle(DATA_LEARNING_FILE)
if training:
    Data['evaluation'] = pd.read_pickle(DATA_EVALUATION_FILE) 

#### Create train/eval set

In [8]:
clipping  = True

# Random split
#from sklearn.model_selection import train_test_split
#train_set, test_set = train_test_split(Data['train'], test_size = 0.2, random_state = 42)

# Remove the first year to decrease training time (and after we will add delayed values)
Data['train'] = Data['train'][Data['train'].date_block_num >11].sample(frac=1).reset_index(drop=True)

x_train = Data['train'].drop(['item_shop_count'], axis = 1)

if clipping:
    y_train = Data['train'].item_shop_count.clip(0,20)
else:
    y_train = Data['train'].item_shop_count

# I should remove the evaluation prediction rows with missings category

if training:
    x_eval = Data['evaluation'].drop(['item_shop_count'], axis = 1)
    if clipping:
        y_eval = Data['evaluation'].item_shop_count.clip(0,20)
    else:
        y_eval = Data['evaluation'].item_shop_count

del(Data)

In [None]:
x_eval.describe()

In [11]:
from sklearn.metrics import mean_squared_error

def model_evaluation(model_reg, x_train, y_train, x_test, y_test, use_average = True): 
    sales_predictions = model_reg.predict(x_train)
    mse = mean_squared_error(y_train, sales_predictions)
    rmse_train = np.sqrt(mse)

    sales_predictions = pd.DataFrame({'pred': model_reg.predict(x_test)})
    
    if use_average:
        # replace the item rows with no values, with the average on the category for this shop
        missing_shop_item_rows =x_test.item_price_mean_over_all == -999
        print('Missing lines for shop,items: ', len(x_test[missing_shop_item_rows]))
        sales_predictions.loc[missing_shop_item_rows, 'pred'] = x_test[missing_shop_item_rows].category_shop_count_mean_over_months
    
        # replace the shop, category with no values, with 0 (the shop is not selling this category)
        missing_shop_category_rows = sales_predictions.pred == -999  
        print('Missing lines for shop,category: ', len(x_test[missing_shop_category_rows]))
        sales_predictions.loc[missing_shop_category_rows, 'pred'] = 0
    
    mse = mean_squared_error(y_test.clip(0,20), sales_predictions.pred.clip(0,20))
    rmse_test = np.sqrt(mse)

    print("train error: ", '{0:.3f}'.format(rmse_train), "evaluation error: ", '{0:.3f}'.format(rmse_test))

#### Training with Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

df_reg = RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=10,
           max_features=3, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=3, min_weight_fraction_leaf=0.0,
           n_estimators=150, n_jobs=5, oob_score=False, random_state=None,
           verbose=0, warm_start=False)
df_reg.fit(x_train.fillna(-999), y_train)

In [None]:
model_evaluation(df_reg, x_train.fillna(-999), y_train, x_eval.fillna(-999), y_eval, use_average = True)

In [None]:
x_eval.columns

In [None]:
# with 20180127c
# max_depth=15, n_estimators=20, max_features: 3 : 1.004 / 1.403
# max_depth=20, n_estimators=20, max_features: 2 : 0.839 / 1.399
# max_depth=20, n_estimators=20, max_features: 3 : 0.818 / 1.394
# max_depth=20, n_estimators=50, max_features: 3 : 0.834 / 1.391 > best
# max_depth=25, n_estimators=20, max_features: 3 : 0.657 / 1.404
# max_depth=20, n_estimators=20, max_features: 4 : 0.795 / 1.410

# with 20180127d, max_depth=20, n_estimators=50, max_features: 3
# train error: 1.104 evaluation error:  4.596 > windsorization is essential!!

# with 20180127e, max_depth=20, n_estimators=25, max_features: 3
# train error:  0.949 evaluation error:  1.612 still not as good

# with 20180127f, max_depth=20, n_estimators=25, max_features: 3
# train error:  0.696 evaluation error:  1.027 > best but beware, 
#I may just be reducing the variance of the evaluation set :)  
# let check with the Kaggle :) 
# with clip 0-20 train error:  0.692 evaluation error:  0.924 (meaningless as it was windsored)

# with 20180127h, max_depth=20, n_estimators=50, max_features: 3, windsor:0.999, full set
# kaggle: 1.41

# with 20180127g, max_depth=20, n_estimators=50, max_features: 3, windsor:0.99, full set
# kaggle: 1.40 > best

# Conclusion: we have brought the results much closer 0.7 / 1.0 / 1.4
# but we are still higher than before ??? 
# does the category do help or harm?

# with 20180127f, removing category stat, max_depth=20, n_estimators=25, max_features: 3, windsor:0.99, full set
# 0.677 evaluation error:  1.043 : slightly worse

# with 20180128d, max_depth=20, n_estimators=25, max_features: 3, windsor on train 0.99
# train error:  0.601 evaluation error:  5.237
# with clipping 0-20: train error:  0.607 evaluation error:  0.979

# with 20180128c, max_depth=20, n_estimators=25, max_features: 3, windsor on train 0.999
# train error:  0.743 evaluation error:  5.039
# with clipping 0-20: train error:  0.743 evaluation error:  0.996

# with 20180128e, max_depth=20, n_estimators=25, max_features: 3, no windsor on train
# train error:  1.095 evaluation error:  4.702
# with clipping 0-20 at evaluation (y_eval and pred): train error:  1.094 evaluation error:  1.002
# with clipping 0-20 of y_train train error:  0.616 evaluation error:  0.975 > best

# with 20180128f, max_depth=20, n_estimators=25, max_features: 3,  windsor 0.999 on price only, clipping 0-20 on train
# train error:  0.616 evaluation error:  0.973 > best

# with 20180128g, max_depth=20, n_estimators=100, max_features: 3,  windsor 0.999 on price only, clipping 0-20 on train
# no eval
# kaggle: 1.02119

# with 20180130a, max_depth=20, n_estimators=25, max_features: 3,  with lagged features, item_id, category_item_id
# 0.567 evaluation error:  0.962 > overfitted...

# with 20180130b, max_depth=20, n_estimators=25, max_features: 3,  with lagged features, without item_id, category_item_id
# 0.579 evaluation error:  0.968 > actually slightly worse
# Kaggle:

# with 20180201a, train error:  0.537 evaluation error:  1.011 > overfit
# idem Xgboost eta = 0.3,max_depth=4, n_estimators=300, learning_rate=0.05:  0.876 evaluation error:  0.977

# with 20180202a train error:  0.517 evaluation error:  0.994 > overfit 
# with 20180202a , nmax_depth = 15, n_estimators = 25
# train error:  0.641 evaluation error:  0.980 > better
# with 20180202a , nmax_depth = 12, n_estimators = 25
# train error:  0.711 evaluation error:  0.978
# with 20180202a , nmax_depth = 10, n_estimators = 25
# train error:  0.739 evaluation error:  0.977 (without use_average)
# train error:  0.739 evaluation error:  0.956 (with use_average) >> best

# 20180202b: kaggle: 0.99725

In [None]:
from sklearn.externals import joblib
joblib.dump(df_reg, '../models/randomforest_20180128g.pkl')
#df_reg = joblib.load('../models/randomforest_20180127g.pkl') 

In [None]:
importances = df_reg.feature_importances_
importances

### Linear Model 

In [13]:
from sklearn.linear_model import SGDRegressor

ln_reg = SGDRegressor(
    penalty='l2'
)

ln_reg.fit(x_train.fillna(0), y_train)



SGDRegressor(alpha=0.0001, average=False, epsilon=0.1, eta0=0.01,
       fit_intercept=True, l1_ratio=0.15, learning_rate='invscaling',
       loss='squared_loss', max_iter=5, n_iter=None, penalty='l2',
       power_t=0.25, random_state=None, shuffle=True, tol=None, verbose=0,
       warm_start=False)

In [14]:
model_evaluation(ln_reg, x_train.fillna(0), y_train, x_eval.fillna(0), y_eval, use_average = True)

Missing lines for shop,items:  0
Missing lines for shop,category:  0
train error:  7058554053649491.000 evaluation error:  10.487


### Xgboost

In [None]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error

# You can experiment with many other options here, using the same .fit() and .predict()
# methods; see http://scikit-learn.org
# This example uses the current build of XGBoost, from https://github.com/dmlc/xgboost


# replace the shop, item rows with no values, with the average on the category 
missing_item_rows = pd.isnull(x_eval.item_price_mean_over_all)

def clipped_rmse(preds, deval):  

    sales_predictions = pd.DataFrame({'pred': preds})
    sales_predictions.loc[missing_item_rows, 'pred'] = x_eval[missing_item_rows].category_shop_count_mean_over_months
    sales_predictions.loc[pd.isnull(sales_predictions.pred), 'pred'] = 0

    score = np.sqrt(mean_squared_error(y_eval.clip(0,20), sales_predictions.pred.clip(0,20)))
    return 'clipped_rmse', score

dtrain = xgb.DMatrix(x_train.fillna(-999).values, label=y_train.values, missing=-999)
dtest = xgb.DMatrix(x_eval.fillna(-999).values, label=y_eval.values, missing=-999)
evallist = [(dtest, 'eval')]

# specify parameters via map
param = {'max_depth':15,
         'min_child_weight': 5,
         'eta':0.02,
         'silent':0}

num_round= 100

gbm = xgb.train(param, dtrain, num_round, evallist, feval=clipped_rmse, early_stopping_rounds=20,  maximize=False)


In [None]:
sales_predictions = pd.DataFrame({'pred': gbm.predict(dtest, ntree_limit=gbm.best_ntree_limit)})
    
# replace the item rows with no values, with the average on the category for the shop
missing_shop_item_rows = pd.isnull(x_eval.item_price_mean_over_all)
print('Missing lines for items: ', len(x_eval[missing_shop_item_rows]))
sales_predictions.loc[missing_shop_item_rows, 'pred'] = x_eval[missing_shop_item_rows].category_shop_count_mean_over_months

# replace the shop, category with no values, with 0 (the shop is not selling this category)
missing_shop_category_rows = pd.isnull(sales_predictions.pred)
print('Missing lines for shop,category: ', len(x_eval[missing_shop_category_rows]))
sales_predictions.loc[missing_shop_category_rows, 'pred'] = 0

mse = mean_squared_error(y_eval.clip(0,20), sales_predictions.pred.clip(0,20))
rmse_eval = np.sqrt(mse)

print("evaluation error: ", '{0:.3f}'.format(rmse_eval))


In [None]:
x_eval.head(50)

#### Optimizing the hyperparameters

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'n_estimators': [10,50, 100], 'max_features': [2,4,6], 'max_depth': [5, 10, 20]},
]

forest_reg = RandomForestRegressor()

grid_search = GridSearchCV(forest_reg, param_grid, cv=5, n_jobs = 4,
                          scoring='neg_mean_squared_error')

grid_search.fit(x_train, y_train)

In [None]:
model_evaluation(grid_search.best_estimator_, x_train, y_train, x_test, y_test)

In [None]:
grid_search.best_estimator_

#### Training with GBRT

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

gbrt_reg = GradientBoostingRegressor(learning_rate = 0.1)
gbrt_reg.fit(x_train, y_train)

In [None]:
from sklearn.metrics import mean_squared_error

x_test =  test_set.drop(['item_cnt_month'], axis = 1)
y_test = test_set.item_cnt_month

model_evaluation(gbrt_reg, x_train, y_train, x_test, y_test)

#### Submission preparation

In [None]:
import pickle

DATA_TEST_FILE = "../data/test-20180202b"

Data = {}

Data['test'] = pd.read_pickle(DATA_TEST_FILE)

#Data['test'].head(10)

In [None]:
use_average = True

X_test = Data['test'].drop(['ID'], axis = 1)

# Option 1
# It trust the model will learn from the category count for the missing item - 
# for the missing category, I set the prediction to 0
# missing_shop_item_indices = pd.isnull(Data['test']['item_cnt_month_mean'])
# This is not improving the score!

predictions = pd.DataFrame({'pred': df_reg.predict(X_test.fillna(-999)).clip(0,20)})

# replace the shop, item rows with no values, with the average on the category

if use_average:
        # replace the shop, item rows with no values, with the average on the category 
        missing_item_rows = pd.isnull(X_test.item_price_mean_over_all)
        print('Missing lines for items: ', len(X_test[missing_item_rows]))
        predictions.loc[missing_item_rows, 'pred'] = X_test[missing_item_rows].category_shop_count_mean_over_months

# for the shop with not category, replace with 

missing_shop_category_rows = pd.isnull(predictions.pred)
print('Missing lines for shop,category: ', len(X_test[missing_shop_category_rows]))
predictions.loc[missing_shop_category_rows, 'pred'] = 0

# Create the submission file:

submission = data_load['sample_submission'].copy()

submission.loc[:, 'item_cnt_month'] = predictions.pred
submission.head(10)

In [None]:
SUBMISSION_FILE = "../data/sales_sub_20180202b.csv"

submission.to_csv(SUBMISSION_FILE, index = False)