#### Summary

Build a model using xgboost and verify it's performance on the validation set.


In [2]:
import pandas as pd
import xgboost as xgb
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

##### Get relevant data in monthly form

In [3]:
#INPUT_DIR = '../input/'
INPUT_DIR = '../input/competitive-data-science-predict-future-sales/'

In [4]:
sales_train = pd.read_csv(INPUT_DIR + 'sales_train.csv')

FileNotFoundError: [Errno 2] File b'../input/competitive-data-science-predict-future-sales/sales_train.csv' does not exist: b'../input/competitive-data-science-predict-future-sales/sales_train.csv'

In [None]:
sales_train.head()

##### 1. Aggregate features to get monthly data.

In [None]:
monthly_sales_data = sales_train[['date_block_num', 'shop_id', 'item_id', 'item_cnt_day']].groupby(
    ['date_block_num', 'shop_id', 'item_id']).sum()
monthly_sales_data.rename(columns={'item_cnt_day':'item_cnt_month'}, inplace=True)
monthly_sales_data.reset_index(['date_block_num', 'shop_id', 'item_id'], inplace=True)

In [None]:
monthly_sales_data['shop_id_item_id'] = list(zip(monthly_sales_data.shop_id,monthly_sales_data.item_id))

In [None]:
monthly_sales_data['shop_id_item_id_date_block_num'] = list(zip(monthly_sales_data.shop_id, 
                                                                monthly_sales_data.item_id,
                                                                monthly_sales_data.date_block_num))

In [None]:
shop_id_item_id_date_block_num_to_item_cnt_month = dict(zip(monthly_sales_data.shop_id_item_id_date_block_num,
                                                            monthly_sales_data.item_cnt_month))

In [None]:
monthly_sales_data['prev_month_sale'] = monthly_sales_data.apply(
    lambda x : shop_id_item_id_date_block_num_to_item_cnt_month.get((x.shop_id, x.item_id, x.date_block_num - 1), 0), 
    axis=1)

##### Add another column for item_category_id

In [None]:
items = pd.read_csv(INPUT_DIR + 'items.csv')

In [None]:
items.columns

In [None]:
item_id_to_item_category_id = dict(zip(items.item_id, items.item_category_id))

In [None]:
monthly_sales_data['item_category_id'] = monthly_sales_data['item_id'].apply(lambda x : item_id_to_item_category_id.get(x, -1))

In [None]:
monthly_sales_data.columns

In [None]:
monthly_sales_data['item_cnt_month'].clip(lower=0, upper=20, inplace=True)

In [None]:
monthly_sales_data.head()

##### Let us try to find the best model using cross validation.

In [None]:
def get_most_recent_item_cnt(shop_id, item_id, date_block_num):
    val = 0
    for i in range(date_block_num, -1, -1):
        if (shop_id, item_id, i) in shop_id_item_id_date_block_num_to_item_cnt_month.keys():
            val = shop_id_item_id_date_block_num_to_item_cnt_month.get((shop_id, item_id, i))
            break
            
    return val
            

In [None]:
def get_time_since_most_recent_sale(shop_id, item_id, date_block_num):
    i = 0
    for i in range(date_block_num, -1, -1):
        if (shop_id, item_id, i) in shop_id_item_id_date_block_num_to_item_cnt_month.keys():
            break
            
    return (date_block_num + 1 - i)
            

In [None]:
monthly_sales_data['most_recent_item_cnt'] = monthly_sales_data.apply(
    lambda x : get_most_recent_item_cnt(x.shop_id, x.item_id, x.date_block_num - 1), axis=1)

In [None]:
monthly_sales_data['time_since_last_sale'] = monthly_sales_data.apply(
    lambda x : get_time_since_most_recent_sale(x.shop_id, x.item_id, x.date_block_num - 1), axis=1)

In [None]:
monthly_sales_data['prev_item_cnt'] = monthly_sales_data.apply(
    lambda x : shop_id_item_id_date_block_num_to_item_cnt_month.get(
        (x.shop_id, x.item_id, x.date_block_num - 1), 0), axis=1)

In [None]:
np.sqrt(mean_squared_error(np.clip(monthly_sales_data.prev_item_cnt, 0 , 20),  
                           np.clip(monthly_sales_data.item_cnt_month, 0, 20)))

In [None]:
np.sqrt(mean_squared_error(np.clip(monthly_sales_data.most_recent_item_cnt, 0 , 20),  
                           np.clip(monthly_sales_data.item_cnt_month, 0, 20)))

#### What about those cases, when we do not have any entries in the training data.

We have couple of options here. We can take an average of the entire shop for the previous month or the average of a similar item in that same shop for the previous month (NOTE: how do you measure similarity )?
 

###### Let us start adding item_category_id to the daily training data, so that we could use that as a proxy if the corresponding item is not present.

In [None]:
sales_train['item_category_id'] = sales_train['item_id'].apply(lambda x : item_id_to_item_category_id.get(x))

In [None]:
monthly_sales_item_category = sales_train[
    ['date_block_num', 'shop_id', 'item_category_id', 'item_cnt_day']].groupby(
    ['date_block_num', 'shop_id', 'item_category_id']).sum()

monthly_sales_item_category.reset_index(['date_block_num', 'shop_id', 'item_category_id'], inplace=True)
monthly_sales_item_category.rename(columns={'item_cnt_day' : 'item_cnt_month'}, inplace=True)

In [None]:
monthly_sales_item_category.head()

###### So the idea is this . Let us compute the total number of items sold for the item category for a month and the total number of item_id's corresponding to that category. Dividing the two numbers would give us the average number of items sold  for a particular item_id corresponding that category.

In [None]:
monthly_sales_item_category['shop_id_item_category_id_date_block_num'] = list(
    zip(monthly_sales_item_category.shop_id,
        monthly_sales_item_category.item_category_id,
        monthly_sales_item_category.date_block_num))

In [None]:
shop_id_item_category_id_date_block_num_to_tot_cnt_per_month = dict(zip(
    monthly_sales_item_category.shop_id_item_category_id_date_block_num,
    monthly_sales_item_category.item_cnt_month))

In [None]:
monthly_num_items_per_cat = sales_train[['date_block_num', 'shop_id', 'item_category_id', 'item_id']].groupby(
    ['date_block_num', 'shop_id', 'item_category_id']).nunique()

In [None]:
monthly_num_items_per_cat.head()

In [None]:
monthly_num_items_per_cat.columns

In [None]:
monthly_num_items_per_cat.drop(columns=['date_block_num', 'shop_id', 'item_category_id'], inplace=True)

In [None]:
monthly_num_items_per_cat.head()

In [None]:
monthly_num_items_per_cat.reset_index(['date_block_num', 'shop_id', 'item_category_id'], inplace=True)

In [None]:
monthly_num_items_per_cat['shop_id_item_category_id_date_block_num'] = list(
    zip(monthly_num_items_per_cat.shop_id,
        monthly_num_items_per_cat.item_category_id,
        monthly_num_items_per_cat.date_block_num,))

In [None]:
shop_id_item_category_id_date_block_num_to_num_items = dict(zip(
    monthly_num_items_per_cat.shop_id_item_category_id_date_block_num,
    monthly_num_items_per_cat.item_id))

In [None]:
shop_id_item_category_id_date_block_num_to_tot_cnt_per_month.get((25, 42, 32), 0)/\
shop_id_item_category_id_date_block_num_to_num_items.get((25, 42, 32), 1)

###### Now, let design several predictors and see how well they perform on the validation set.

In [None]:
def get_prev_count_or_category_count(shop_id, item_id, item_category_id, date_block_num):
    if (shop_id, item_id, date_block_num) in shop_id_item_id_date_block_num_to_item_cnt_month.keys():
        val = shop_id_item_id_date_block_num_to_item_cnt_month.get((shop_id, item_id, date_block_num))
    else:
        val = shop_id_item_category_id_date_block_num_to_tot_cnt_per_month.get((shop_id, 
                                                                                item_category_id, 
                                                                                date_block_num), 0)/\
        shop_id_item_category_id_date_block_num_to_num_items.get((shop_id, item_category_id, date_block_num), 1)
    return val

In [None]:
def get_most_recent_item_cnt_or_category_count(shop_id, item_id, item_category_id, date_block_num):
    val = 0
    for i in range(date_block_num, -1, -1):
        if (shop_id, item_id, i) in shop_id_item_id_date_block_num_to_item_cnt_month.keys():
            val = shop_id_item_id_date_block_num_to_item_cnt_month.get((shop_id, item_id, i))
            break
    
    val = shop_id_item_category_id_date_block_num_to_tot_cnt_per_month.get((shop_id, 
                                                                            item_category_id, 
                                                                            date_block_num), 0)/\
    shop_id_item_category_id_date_block_num_to_num_items.get((shop_id, item_category_id, date_block_num), 1)
    
    return val
            

In [None]:
monthly_sales_data['most_recent_item_cnt_or_category_count'] = monthly_sales_data.apply(
    lambda x : get_most_recent_item_cnt_or_category_count(
        x.shop_id, x.item_id, x.item_category_id, x.date_block_num - 1), axis=1)

In [None]:
monthly_sales_data['prev_count_or_category_count'] = monthly_sales_data.apply(
    lambda x : get_prev_count_or_category_count(
        x.shop_id, x.item_id, x.item_category_id, x.date_block_num - 1), axis=1)

In [None]:
np.sqrt(mean_squared_error(np.clip(monthly_sales_data.most_recent_item_cnt_or_category_count, 0 , 20),  
                           np.clip(monthly_sales_data.item_cnt_month, 0, 20)))

In [None]:
np.sqrt(mean_squared_error(np.clip(monthly_sales_data.prev_count_or_category_count, 0 , 20),  
                           np.clip(monthly_sales_data.item_cnt_month, 0, 20)))

##### Let us try one more refinement, where we to iteratively find the average monthly count for the same item category for the most recent month.

In [None]:
def get_most_recent_item_cnt_or_most_recent_category_count(shop_id, item_id, item_category_id, date_block_num):
    val = 0
    for i in range(date_block_num, -1, -1):
        if (shop_id, item_id, i) in shop_id_item_id_date_block_num_to_item_cnt_month.keys():
            val = shop_id_item_id_date_block_num_to_item_cnt_month.get((shop_id, item_id, i))
            break
    
    if val != 0:
        for i in range(date_block_num, -1, -1):

            if (shop_id, item_category_id, i) in shop_id_item_category_id_date_block_num_to_tot_cnt_per_month.keys():
                val = shop_id_item_category_id_date_block_num_to_tot_cnt_per_month.get((shop_id, 
                                                                                        item_category_id, 
                                                                                        i))/\
                shop_id_item_category_id_date_block_num_to_num_items.get((shop_id, item_category_id, i))
                break
    
    return val
            

In [None]:
def get_prev_month_category_count(shop_id, item_id, item_category_id, date_block_num):
    val = shop_id_item_category_id_date_block_num_to_tot_cnt_per_month.get((shop_id, 
                                                                            item_category_id, 
                                                                            date_block_num), 0)/\
    shop_id_item_category_id_date_block_num_to_num_items.get((shop_id, item_category_id, date_block_num), 1)
    
    return val
            

In [None]:
def get_most_recent_category_count(shop_id, item_id, item_category_id, date_block_num): 
    val = 0
    for i in range(date_block_num, -1, -1):

        if (shop_id, item_category_id, i) in shop_id_item_category_id_date_block_num_to_tot_cnt_per_month.keys():
            val = shop_id_item_category_id_date_block_num_to_tot_cnt_per_month.get((shop_id, 
                                                                                    item_category_id, 
                                                                                    i))/\
            shop_id_item_category_id_date_block_num_to_num_items.get((shop_id, item_category_id, i))
            break
    return val
            

In [None]:
monthly_sales_data['most_recent_item_cnt_or_most_recent_category_count'] = monthly_sales_data.apply(
    lambda x : get_most_recent_item_cnt_or_most_recent_category_count(
        x.shop_id, x.item_id, x.item_category_id, x.date_block_num - 1), axis=1) 

In [None]:
monthly_sales_data['prev_month_category_count'] = monthly_sales_data.apply(
    lambda x : get_prev_month_category_count(
        x.shop_id, x.item_id, x.item_category_id, x.date_block_num - 1), axis=1) 

In [None]:
monthly_sales_data['most_recent_category_count'] = monthly_sales_data.apply(
    lambda x : get_most_recent_category_count(
        x.shop_id, x.item_id, x.item_category_id, x.date_block_num - 1), axis=1) 

In [None]:
np.sqrt(mean_squared_error(np.clip(monthly_sales_data.most_recent_item_cnt_or_most_recent_category_count, 0 , 20),  
                           np.clip(monthly_sales_data.item_cnt_month, 0, 20)))

#### Formalize what we did above to an xgboost algorithm.

In [None]:
X_COLUMNS = ['date_block_num', 'shop_id', 'item_id', 'item_category_id', 'prev_month_sale', 
             'prev_month_category_count']
#X_COLUMNS = ['date_block_num', 'shop_id', 'item_id', 'item_category_id', 'prev_count_or_category_count']
Y_COLUMN = 'item_cnt_month'

In [None]:
train_data = monthly_sales_data[(monthly_sales_data.date_block_num != 32) &(monthly_sales_data.date_block_num != 33)]
validation_data = monthly_sales_data[monthly_sales_data.date_block_num == 32]
test_data = monthly_sales_data[monthly_sales_data.date_block_num == 33]

In [None]:
train_data.date_block_num.describe()

In [None]:
X = train_data[X_COLUMNS]
Y = train_data[[Y_COLUMN]]

In [None]:
shops = pd.read_csv(INPUT_DIR + 'shops.csv')

In [None]:
shops.head()

In [None]:
item_categories = pd.read_csv(INPUT_DIR + 'item_categories.csv')

In [None]:
item_categories

In [None]:
new_X = validation_data[X_COLUMNS]
new_Y = validation_data[[Y_COLUMN]]

In [22]:
def build_xgboost_model(X, Y, new_X, new_Y, X_COLUMNS):
    xgb_train_data = xgb.DMatrix(X, Y, feature_names=X_COLUMNS)
    xgb_validation_data = xgb.DMatrix(new_X, new_Y, feature_names=X_COLUMNS)
    xgb_params = {'eta' : 0.1, 'eval_metric' : 'rmse'}
    model_obj = xgb.train(params=xgb_params,
                      dtrain=xgb_train_data,
                      num_boost_round=4000)
    validation_predictions = model_obj.predict(xgb_validation_data)
    print(np.sqrt(mean_squared_error(np.clip(validation_predictions, 0, 20), np.clip(new_Y, 0, 20))))

In [None]:
#build_xgboost_model(X, Y, new_X, new_Y, X_COLUMNS)
#2.14


In [None]:
#X_COLUMNS = ['date_block_num', 'shop_id', 'item_id', 'item_category_id', 'prev_count_or_category_count']
#X = train_data[X_COLUMNS]
#new_X = validation_data[X_COLUMNS]
#build_xgboost_model(X, Y, new_X, new_Y, X_COLUMNS)
#2.18


##### Test most recent month sale, as opposed to just the previous

In [None]:
#X_COLUMNS = ['date_block_num', 'shop_id', 'item_id', 'item_category_id', 'most_recent_item_cnt']
#X = train_data[X_COLUMNS]
#new_X = validation_data[X_COLUMNS]
#build_xgboost_model(X, Y, new_X, new_Y, X_COLUMNS)
# Result : 2.057547424002264


In [None]:
# X_COLUMNS = ['date_block_num', 'shop_id', 'item_id', 'item_category_id', 
#              'most_recent_item_cnt', 'most_recent_category_count']
# X = train_data[X_COLUMNS]
# new_X = validation_data[X_COLUMNS]
# build_xgboost_model(X, Y, new_X, new_Y, X_COLUMNS)
# 2.053

In [None]:
# X_COLUMNS = ['date_block_num', 'shop_id', 'item_id', 'item_category_id', 
#              'most_recent_item_cnt_or_category_count']
# X = train_data[X_COLUMNS]
# new_X = validation_data[X_COLUMNS]
# build_xgboost_model(X, Y, new_X, new_Y, X_COLUMNS)
# 2.13

In [None]:
# X_COLUMNS = ['date_block_num', 'shop_id', 'item_id', 'item_category_id', 
#              'most_recent_item_cnt_or_most_recent_category_count']
# X = train_data[X_COLUMNS]
# new_X = validation_data[X_COLUMNS]
# build_xgboost_model(X, Y, new_X, new_Y, X_COLUMNS)
# 2.09

In [None]:
X_COLUMNS = ['date_block_num', 'shop_id', 'item_id', 'item_category_id', 'most_recent_item_cnt',
             'time_since_last_sale']
X = train_data[X_COLUMNS]
new_X = validation_data[X_COLUMNS]
#build_xgboost_model(X, Y, new_X, new_Y, X_COLUMNS)


#### Do a pivot table and use the data in all the previous months to train a gradient boosting model.

In [23]:
daily_train = sales_train[(sales_train.date_block_num != 32) &(sales_train.date_block_num != 33)]
daily_validation = sales_train[sales_train.date_block_num == 32]
daily_test = sales_train[sales_train.date_block_num == 33]

In [24]:
daily_train_pivot = pd.pivot_table(daily_train,
                                   index=['shop_id', 'item_id'],
                                   values='item_cnt_day',
                                   columns=['date_block_num'],
                                   aggfunc='sum',
                                   fill_value=0)

In [25]:
daily_train_pivot.columns

Int64Index([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31],
           dtype='int64', name='date_block_num')

In [26]:
daily_train_pivot.reset_index(inplace=True)

In [27]:
daily_train_pivot.head()

date_block_num,shop_id,item_id,0,1,2,3,4,5,6,7,...,22,23,24,25,26,27,28,29,30,31
0,0,30,0,31,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,31,0,11,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,32,6,10,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,33,3,3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,35,1,14,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
daily_train_pivot[0].head(2)

0    0
1    0
Name: 0, dtype: int64

#### Check those cases where we have values present in the training data , but are set to 0

In [29]:
len(monthly_sales_data[monthly_sales_data.date_block_num == 31])

33486

In [30]:
len(daily_train_pivot[daily_train_pivot[31] != 0])

33440

In [31]:
daily_train_pivot['shop_id_item_id'] = list(zip(daily_train_pivot.shop_id, daily_train_pivot.item_id))

In [32]:
daily_train_pivot['isin_orig_data'] = daily_train_pivot['shop_id_item_id'].isin(monthly_sales_data[monthly_sales_data.date_block_num == 31].shop_id_item_id.values)

In [33]:
daily_train_pivot[(daily_train_pivot.isin_orig_data == True) &(daily_train_pivot[31] == 0)]

date_block_num,shop_id,item_id,0,1,2,3,4,5,6,7,...,24,25,26,27,28,29,30,31,shop_id_item_id,isin_orig_data
7107,2,4334,0,0,0,0,0,0,0,0,...,0,1,1,1,1,1,5,0,"(2, 4334)",True
11603,3,3688,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,8,0,"(3, 3688)",True
26860,5,13494,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,1,0,"(5, 13494)",True
48689,7,20338,0,0,0,0,0,0,0,0,...,0,0,0,0,0,2,2,0,"(7, 20338)",True
64629,12,19790,0,0,0,1,1,0,1,0,...,0,1,0,0,0,0,1,0,"(12, 19790)",True
70683,14,1969,0,0,0,0,0,0,0,0,...,0,0,22,3,2,0,1,0,"(14, 1969)",True
75289,14,15323,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,1,0,"(14, 15323)",True
85840,16,1495,0,0,0,0,0,0,0,0,...,2,2,0,1,2,1,1,0,"(16, 1495)",True
108876,19,1464,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,"(19, 1464)",True
108968,19,1673,0,0,0,0,0,0,0,0,...,1,0,1,0,1,1,0,0,"(19, 1673)",True


In [34]:
monthly_sales_data[(monthly_sales_data.shop_id_item_id == (2, 4334)) & (monthly_sales_data.date_block_num == 31) ]

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,shop_id_item_id,shop_id_item_id_date_block_num,prev_month_sale,item_category_id
1514588,31,2,4334,0.0,"(2, 4334)","(2, 4334, 31)",5.0,30


In [35]:
sales_train[(sales_train.shop_id == 59) & (sales_train.item_id == 21968) & (sales_train.date_block_num == 31) ]

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
2799747,12.08.2015,31,59,21968,949.0,-1.0
2799777,05.08.2015,31,59,21968,949.0,1.0


###### Looks like those correspond to items being being returned . Now, let us train the model.

In [36]:
daily_train_pivot['item_category_id'] = \
    daily_train_pivot['item_id'].apply(lambda x : item_id_to_item_category_id.get(x, -1))

In [37]:
daily_train_pivot.head()

date_block_num,shop_id,item_id,0,1,2,3,4,5,6,7,...,25,26,27,28,29,30,31,shop_id_item_id,isin_orig_data,item_category_id
0,0,30,0,31,0,0,0,0,0,0,...,0,0,0,0,0,0,0,"(0, 30)",False,40
1,0,31,0,11,0,0,0,0,0,0,...,0,0,0,0,0,0,0,"(0, 31)",False,37
2,0,32,6,10,0,0,0,0,0,0,...,0,0,0,0,0,0,0,"(0, 32)",False,40
3,0,33,3,3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,"(0, 33)",False,37
4,0,35,1,14,0,0,0,0,0,0,...,0,0,0,0,0,0,0,"(0, 35)",False,40


In [38]:
X_COLUMNS = ['shop_id', 'item_id', 'item_category_id'] + list(range(31))
#X_COLUMNS = ['shop_id', 'item_id', 'item_category_id', 1]

Y_COLUMN = 31

In [39]:
daily_train_pivot.columns

Index([         'shop_id',          'item_id',                  0,
                        1,                  2,                  3,
                        4,                  5,                  6,
                        7,                  8,                  9,
                       10,                 11,                 12,
                       13,                 14,                 15,
                       16,                 17,                 18,
                       19,                 20,                 21,
                       22,                 23,                 24,
                       25,                 26,                 27,
                       28,                 29,                 30,
                       31,  'shop_id_item_id',   'isin_orig_data',
       'item_category_id'],
      dtype='object', name='date_block_num')

In [40]:
#X = daily_train_pivot[X_COLUMNS].copy()
#Y = daily_train_pivot[[Y_COLUMN]].copy()
X = daily_train_pivot[daily_train_pivot['isin_orig_data'] == True][X_COLUMNS].copy()
Y = daily_train_pivot[daily_train_pivot['isin_orig_data'] == True][[Y_COLUMN]].copy()

In [42]:
str_X_COLUMNS = ['shop_id', 'item_id', 'item_category_id'] + [str(x) for x in list(range(31))]

In [43]:
X.columns = str_X_COLUMNS

In [44]:
X.head()

Unnamed: 0,shop_id,item_id,item_category_id,0,1,2,3,4,5,6,...,21,22,23,24,25,26,27,28,29,30
6126,2,32,40,0,0,0,0,0,0,0,...,2,0,2,0,0,1,0,0,0,0
6132,2,70,49,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
6145,2,482,73,1,1,1,1,1,2,1,...,1,0,0,0,4,1,0,0,0,1
6170,2,792,73,0,0,0,0,0,0,0,...,1,0,0,1,2,0,0,1,1,0
6173,2,806,49,0,0,0,0,0,0,0,...,1,1,0,1,0,0,1,2,0,0


In [45]:
X.columns

Index(['shop_id', 'item_id', 'item_category_id', '0', '1', '2', '3', '4', '5',
       '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17',
       '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29',
       '30'],
      dtype='object')

In [47]:
daily_train_validation = sales_train[sales_train.date_block_num != 33]


In [52]:
daily_train_validation_pivot = pd.pivot_table(daily_train_validation,
                                              index=['shop_id', 'item_id'],
                                              values='item_cnt_day',
                                              columns=['date_block_num'],
                                              aggfunc='sum',
                                              fill_value=0)

In [54]:
daily_train_validation_pivot.reset_index(inplace=True)

In [55]:
daily_train_validation_pivot['item_category_id'] = \
    daily_train_validation_pivot['item_id'].apply(lambda x : item_id_to_item_category_id.get(x, -1))

In [56]:
daily_train_validation_pivot.head()

date_block_num,shop_id,item_id,0,1,2,3,4,5,6,7,...,24,25,26,27,28,29,30,31,32,item_category_id
0,0,30,0,31,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,40
1,0,31,0,11,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,37
2,0,32,6,10,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,40
3,0,33,3,3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,37
4,0,35,1,14,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,40


In [57]:
daily_train_pivot.columns

Index([         'shop_id',          'item_id',                  0,
                        1,                  2,                  3,
                        4,                  5,                  6,
                        7,                  8,                  9,
                       10,                 11,                 12,
                       13,                 14,                 15,
                       16,                 17,                 18,
                       19,                 20,                 21,
                       22,                 23,                 24,
                       25,                 26,                 27,
                       28,                 29,                 30,
                       31,  'shop_id_item_id',   'isin_orig_data',
       'item_category_id'],
      dtype='object', name='date_block_num')

In [58]:
daily_train_pivot['shop_id']

0          0
1          0
2          0
3          0
4          0
5          0
6          0
7          0
8          0
9          0
10         0
11         0
12         0
13         0
14         0
15         0
16         0
17         0
18         0
19         0
20         0
21         0
22         0
23         0
24         0
25         0
26         0
27         0
28         0
29         0
          ..
411810    59
411811    59
411812    59
411813    59
411814    59
411815    59
411816    59
411817    59
411818    59
411819    59
411820    59
411821    59
411822    59
411823    59
411824    59
411825    59
411826    59
411827    59
411828    59
411829    59
411830    59
411831    59
411832    59
411833    59
411834    59
411835    59
411836    59
411837    59
411838    59
411839    59
Name: shop_id, Length: 411840, dtype: int64

In [59]:
VALID_X_COLUMNS = ['shop_id', 'item_id', 'item_category_id'] + list(range(1, 32))
#X_COLUMNS = ['date_block_num', 'shop_id', 'item_id', 'item_category_id', 'prev_count_or_category_count']
#VALID_X_COLUMNS = ['shop_id', 'item_id', 'item_category_id', 1]
VALID_Y_COLUMN = 32

In [60]:
daily_train_validation_pivot['shop_id_item_id'] = list(
    zip(daily_train_validation_pivot.shop_id, 
        daily_train_validation_pivot.item_id))

In [61]:
daily_train_validation_pivot['isin_orig_data'] = daily_train_validation_pivot['shop_id_item_id'].isin(
    monthly_sales_data[monthly_sales_data.date_block_num == 32].shop_id_item_id.values)

In [62]:
new_X = \
    daily_train_validation_pivot[daily_train_validation_pivot['isin_orig_data'] == True][VALID_X_COLUMNS].copy()
new_Y = daily_train_validation_pivot[daily_train_validation_pivot['isin_orig_data'] == True][[VALID_Y_COLUMN]].copy()

In [63]:
new_X.columns

Index([         'shop_id',          'item_id', 'item_category_id',
                        1,                  2,                  3,
                        4,                  5,                  6,
                        7,                  8,                  9,
                       10,                 11,                 12,
                       13,                 14,                 15,
                       16,                 17,                 18,
                       19,                 20,                 21,
                       22,                 23,                 24,
                       25,                 26,                 27,
                       28,                 29,                 30,
                       31],
      dtype='object', name='date_block_num')

In [64]:
new_X.columns = str_X_COLUMNS

In [65]:
new_Y.columns = [[Y_COLUMN]]

In [66]:
new_Y.columns

MultiIndex(levels=[[31]],
           codes=[[0]])

In [67]:
len(daily_train_validation_pivot[daily_train_validation_pivot['isin_orig_data'] == True])

29678

In [68]:
#build_xgboost_model(X, Y, new_X, new_Y, str_X_COLUMNS)

KeyboardInterrupt: 

In [107]:
sales_train.columns

Index(['date', 'date_block_num', 'shop_id', 'item_id', 'item_price',
       'item_cnt_day', 'item_category_id'],
      dtype='object')