# Modeling Future Sales Data for Shop-Item Pairs

This is a data set from a Kaggle competition representing sales for various physical stores over a period of approximately two years. The items for sale at each store vary by month.

For this model the following techniques were used:
* encoding total shop sales from previous months as features
* encoding total item sales from previous months as features
* encoding average item sales from previous time periods

This code currently achieves an MSE of approximately 1.56 on the test set.

In [9]:
import numpy as np 
import pandas as pd
from xgboost.sklearn import XGBModel
from sklearn.metrics import mean_squared_error

In [10]:
# Load the raw daily sales history and the shop/item pairs to predict for.
sales_train, test = (
    pd.read_csv(csv_name) for csv_name in ('sales_train.csv', 'test.csv')
)

# Create Mean Encodings 

In [11]:
# Roll daily sales up to one row per (shop, item, month) with the monthly total.
df = (
    sales_train
    .groupby(['shop_id', 'item_id', 'date_block_num'], as_index=False)['item_cnt_day']
    .sum()
    .rename(columns={'item_cnt_day': 'item_cnt_month'})
)

# Per-shop monthly total, broadcast back onto every row of that shop/month.
df['shop_sales'] = (
    df.groupby(['shop_id', 'date_block_num'], as_index=False)['item_cnt_month']
    .transform('sum')
)

# Per-item monthly total, broadcast back onto every row of that item/month.
df['item_sales'] = (
    df.groupby(['item_id', 'date_block_num'], as_index=False)['item_cnt_month']
    .transform('sum')
)

# Number of distinct shops that appear in each month.
df['shop_count'] = (
    df.groupby(['date_block_num'], as_index=False)['shop_id']
    .transform('nunique')
)

# Number of distinct items that appear in each month.
df['item_count'] = (
    df.groupby(['date_block_num'], as_index=False)['item_id']
    .transform('nunique')
)

# Ratio features: a shop's (item's) monthly total divided by the number of
# distinct shops (items) seen that month.
# NOTE(review): this divides a single shop's/item's total by the month-wide
# count, not the month-wide total -- confirm this is the intended "average".
df['shop_avg'] = df['shop_sales'].div(df['shop_count'])
df['item_avg'] = df['item_sales'].div(df['item_count'])

In [14]:
# build previous-month lag features for the rows of one target month
def calculate_train_cv(prev_time, current_time, previous_df, current_df, test=False):
    """Attach the previous month's aggregates to each (shop, item) row.

    Parameters
    ----------
    prev_time : not used inside the function; kept in the signature for callers.
    current_time : value written to ``date_block``; ``current_time % 12`` is
        written to ``date_block_modulus``.
    previous_df : frame for the preceding month providing ``item_cnt_month``
        and the ``shop_*`` / ``item_*`` aggregate columns.
    current_df : frame whose ``shop_id`` / ``item_id`` rows define the output;
        must also carry ``item_cnt_month`` unless ``test`` is true.
    test : when true, no ``item_cnt_month`` target column is added.

    Returns
    -------
    DataFrame with one row per row of ``current_df`` (given unique keys in
    ``previous_df``); pairs, shops or items absent from ``previous_df`` get
    NaN in the corresponding feature columns.
    """
    pair_keys = ['shop_id', 'item_id']

    # Last month's sales of the exact same shop/item pair.
    pair_lag = previous_df[pair_keys + ['item_cnt_month']].rename(
        columns={'item_cnt_month': 'item_cnt_month_lag'})
    features = current_df[pair_keys].merge(pair_lag, how='left', on=pair_keys)

    # Shop-level aggregates repeat on every row of a shop, so de-duplicate
    # them before joining to avoid multiplying rows.
    shop_level = previous_df[['shop_id', 'shop_sales', 'shop_count', 'shop_avg']].drop_duplicates()
    features = features.merge(shop_level, how='left', on='shop_id')

    # Same treatment for the item-level aggregates.
    item_level = previous_df[['item_id', 'item_sales', 'item_count', 'item_avg']].drop_duplicates()
    features = features.merge(item_level, how='left', on='item_id')

    if not test:
        # Target is attached positionally; left merges keep current_df's row order.
        features = features.assign(item_cnt_month=current_df['item_cnt_month'].values)

    return features.assign(date_block=current_time,
                           date_block_modulus=current_time % 12)

In [15]:
# get training and test data ready
# Build one feature frame per target month (blocks 1..33), each using the
# preceding month's aggregates as lag features.
monthly_frames = []
for i in range(1, 34):
    prev_time = i - 1
    current_time = i
    previous_df = df[df.date_block_num == prev_time]
    current_df = df[df.date_block_num == current_time]
    monthly_frames.append(
        calculate_train_cv(prev_time, current_time, previous_df, current_df, test=False))

# Stack once at the end: DataFrame.append was removed in pandas 2.0, and
# appending inside the loop re-copied the accumulated frame every iteration.
train = pd.concat(monthly_frames)

# Feature columns fed to the model (everything except the target).
col_keep = ['shop_id', 'item_id', 'item_cnt_month_lag', 'shop_sales', 'shop_count',
            'shop_avg', 'item_sales', 'item_count', 'item_avg',
            'date_block', 'date_block_modulus']

# Test rows are treated as block 34, with block 33 supplying the lag features.
test_features = calculate_train_cv(33, 34, df[df.date_block_num == 33], test, True)

# Clip the target to [0, 20]; predictions are clipped to the same range later.
y_train = train.item_cnt_month.clip(0, 20)
X_train = train.loc[:, col_keep]
X_test = test_features.loc[:, col_keep]

In [16]:
# Since this is time-series data, cross-validation was done by training on all
# data up to a specific month and then validating on the next month.
# This cell itself performs no validation: it fits on every training month and
# predicts the test rows for the submission file.

# `n_estimators` sets the number of boosting rounds in the scikit-learn
# wrapper; the previous `num_boost_rounds=1` keyword is not a parameter of this
# API, so it has been dropped.
xgb = XGBModel(max_depth=20, n_estimators=60,
               subsample=.95, colsample_bytree=.95,
               colsample_bylevel=.95, min_child_weight=100,
               learning_rate=.05)

xgb.fit(X_train, y_train)

# Despite the name, `y_test` holds the model's predictions for the test rows.
y_test = xgb.predict(X_test)

# Write predictions next to the test IDs, clipped to the same [0, 20] range
# that was applied to the training target.
submit = test.copy()
submit['item_cnt_month'] = y_test.clip(0, 20)
submit.loc[:, ['ID', 'item_cnt_month']].to_csv('results_moretrain.csv', index=False)