In [None]:
import os
# Move the working directory up one level; presumably so the relative 'tmp/data/...' paths used by the
# read/write calls in this notebook resolve from the project root — TODO confirm.
# NOTE(review): the path is relative, so re-running this cell climbs one more level each time;
# run it once per kernel session.
os.chdir('..')

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelBinarizer

# Notebook-wide display settings: five decimal places for floats, and up to 200 rows / 200 columns
# rendered before pandas truncates a frame.
pd.set_option('display.float_format', '{:.5f}'.format)
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)

In [None]:
# Load the processed training frame, print its (rows, columns) shape and preview the first rows.
train = pd.read_pickle('tmp/data/processed/train.pkl')
print(train.shape)
train.head()

In [None]:
# Load the processed test frame, print its (rows, columns) shape and preview the first rows.
test = pd.read_pickle('tmp/data/processed/test_plus_features.pkl')
print(test.shape)
test.head()

Join the train and test DataFrames so that time-lag features can be computed across both.

In [None]:
# Stack the test rows under the train rows, then order everything by shop, item and time so that
# each (shop_id, item_id) series is contiguous and chronological. The index is rebuilt as 0..n-1.
sort_keys = ['shop_id', 'item_id', 'date_block_num', 'month', 'year']
train = (
    pd.concat([train, test], ignore_index=True, sort=False)
    .sort_values(sort_keys)
    .reset_index(drop=True)
)
train.head()

## Add Time Lag Features
Any lagged value or rolling statistic used as a feature for a given month must be built from earlier months only (starting with the previous month), because the current month's values will not be available for the month we need to predict.

In [None]:
smoothing = 0.00000001


def calculate_lag_delta(df, current_feature, previous_feature, smoothing):
    """Return the relative change from ``previous_feature`` to ``current_feature``.

    Computes ``(current - previous) / (previous + smoothing)`` element-wise.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both columns.
    current_feature : str
        Name of the more recent column (the numerator's minuend).
    previous_feature : str
        Name of the older column used as the baseline.
    smoothing : float
        Small constant added to the denominator so a baseline of zero does not
        divide by zero.

    Returns
    -------
    pandas.Series
        Relative change per row; NaN wherever either input is NaN.
    """
    current = df[current_feature]
    previous = df[previous_feature]
    # The difference must be parenthesised: without it, division binds tighter than
    # subtraction and the expression collapses to `current + smoothing - 1`.
    return (current - previous) / previous.add(smoothing)

In [None]:
# item count lag
# For every (shop_id, item_id) series, copy item_cnt_month from N rows earlier into a lag column.
# NOTE(review): a row shift equals a month shift only if each pair has one row per month — confirm.
for months_back in (1, 2, 3, 6, 12):
    lag_col = 'item_cnt_lag_{}m'.format(months_back)
    train[lag_col] = train.groupby(['shop_id', 'item_id'])['item_cnt_month'].shift(months_back)
train.head()

In [None]:
# item count % change
# Each delta compares one lag column with the next-older lag column via calculate_lag_delta.
# NOTE(review): the 1m delta is built from the current month's item_cnt_month, which looks like the
# prediction target — confirm this does not leak the label into the features.
item_cnt_delta_specs = [
    ('item_cnt_lag_1m_delta', 'item_cnt_month', 'item_cnt_lag_1m'),
    ('item_cnt_lag_2m_delta', 'item_cnt_lag_1m', 'item_cnt_lag_2m'),
    ('item_cnt_lag_3m_delta', 'item_cnt_lag_2m', 'item_cnt_lag_3m'),
    ('item_cnt_lag_6m_delta', 'item_cnt_lag_3m', 'item_cnt_lag_6m'),
    ('item_cnt_lag_12m_delta', 'item_cnt_lag_6m', 'item_cnt_lag_12m'),
]
for delta_col, newer_col, older_col in item_cnt_delta_specs:
    train[delta_col] = calculate_lag_delta(train, newer_col, older_col, smoothing)
train.head()

In [None]:
# item count rolling stats
# Rolling mean / std / median of last month's count (item_cnt_lag_1m) within each (shop_id, item_id)
# series, over 3-, 6- and 12-row windows. Each window tolerates one missing value
# (min_periods = window - 1).
# reset_index(drop=True) throws away the grouped index so the result is assigned by position; this
# relies on train being ordered by shop_id/item_id with a 0..n-1 index at this point.
for stat_name in ('mean', 'std', 'median'):
    for window in (3, 6, 12):
        windowed = train.groupby(['shop_id', 'item_id']).rolling(window, min_periods=window - 1)['item_cnt_lag_1m']
        roll_col = 'item_cnt_{}m_roll_{}'.format(window, stat_name)
        train[roll_col] = getattr(windowed, stat_name)().reset_index(drop=True)
train.head()

In [None]:
# revenue lag
# For every (shop_id, item_id) series, copy revenue from N rows earlier into a lag column.
# NOTE(review): a row shift equals a month shift only if each pair has one row per month — confirm.
for months_back in (1, 2, 3, 6, 12):
    lag_col = 'revenue_lag_{}m'.format(months_back)
    train[lag_col] = train.groupby(['shop_id', 'item_id'])['revenue'].shift(months_back)
train.head()

In [None]:
# revenue % change
# Each delta compares one lag column with the next-older lag column via calculate_lag_delta.
# NOTE(review): the 1m delta is built from the current month's revenue, a column this notebook later
# drops as unavailable at prediction time — confirm this feature is not leaking current-month data.
revenue_delta_specs = [
    ('revenue_lag_1m_delta', 'revenue', 'revenue_lag_1m'),
    ('revenue_lag_2m_delta', 'revenue_lag_1m', 'revenue_lag_2m'),
    ('revenue_lag_3m_delta', 'revenue_lag_2m', 'revenue_lag_3m'),
    ('revenue_lag_6m_delta', 'revenue_lag_3m', 'revenue_lag_6m'),
    ('revenue_lag_12m_delta', 'revenue_lag_6m', 'revenue_lag_12m'),
]
for delta_col, newer_col, older_col in revenue_delta_specs:
    train[delta_col] = calculate_lag_delta(train, newer_col, older_col, smoothing)
train.head()

In [None]:
# revenue rolling stats
# Rolling mean / std / median of last month's revenue (revenue_lag_1m) within each (shop_id, item_id)
# series, over 3-, 6- and 12-row windows. Each window tolerates one missing value
# (min_periods = window - 1).
# reset_index(drop=True) throws away the grouped index so the result is assigned by position; this
# relies on train being ordered by shop_id/item_id with a 0..n-1 index at this point.
for stat_name in ('mean', 'std', 'median'):
    for window in (3, 6, 12):
        windowed = train.groupby(['shop_id', 'item_id']).rolling(window, min_periods=window - 1)['revenue_lag_1m']
        roll_col = 'revenue_{}m_roll_{}'.format(window, stat_name)
        train[roll_col] = getattr(windowed, stat_name)().reset_index(drop=True)
train.head()

In [None]:
# Distance of last month's value from its trailing rolling mean, for both item count and revenue.
for metric in ('item_cnt', 'revenue'):
    last_month = train['{}_lag_1m'.format(metric)]
    for window in (3, 6, 12):
        rolling_mean = train['{}_{}m_roll_mean'.format(metric, window)]
        train['{}_{}m_mean_delta'.format(metric, window)] = last_month - rolling_mean

# Add feature checking to see if last months sales were above average
#train['above_mean_item_cnt'] = 0
#train.loc[train['item_cnt_lag_1m'] > train['item_cnt_6m_roll_mean'], 'above_mean_item_cnt'] = 1

#train['above_mean_revenue'] = 0
#train.loc[train['revenue_lag_1m'] > train['revenue_6m_roll_mean'], 'above_mean_revenue'] = 1

# drop first 12 months due to lag
# (blocks 0-11 cannot have a complete 12-month history behind them)
has_full_history = train['date_block_num'] > 11
train = train[has_full_history].reset_index(drop=True)

# drop current month revenue since we won't know month 34 price
train = train.drop(columns='revenue')

train.head()

## Add binary features to training set

In [None]:
# Load the processed shops table (later merged onto train by shop_id) and preview it.
shops = pd.read_pickle('tmp/data/processed/shops.pkl')
shops.head()

In [None]:
# Load the processed categories table (later merged onto train by item_id) and preview it.
cats = pd.read_pickle('tmp/data/processed/categories.pkl')
cats.head()

In [None]:
# Load the processed shop/item table (later merged onto train by shop_id and item_id) and preview it.
# The previous path, 'tmp/data/processed/categoshop_itemsries.pkl', looked like "shop_items" pasted
# into the middle of "categories.pkl".
# NOTE(review): confirm the file on disk is really named shop_items.pkl.
shop_items = pd.read_pickle('tmp/data/processed/shop_items.pkl')
shop_items.head()

In [None]:
# Attach shop-, item- and shop/item-level columns with left joins so every train row is kept.
train = (
    train
    .merge(shops, on='shop_id', how='left')
    .merge(cats, on='item_id', how='left')
    .merge(shop_items, on=['shop_id', 'item_id'], how='left')
)

print(train.shape)
# Per-column summary of the null mask, one row per column, to spot features with missing values.
train.isnull().describe().T

## Save files for modeling & predicting on
Re-join the engineered test features onto the original test set so that rows keep the order expected in the Kaggle submission file.

In [None]:
# Block 34 is the month to predict: take its rows, remove the target column, and order by shop/item.
merge_keys = ['date_block_num', 'month', 'year', 'shop_id', 'item_id']
test_features = (
    train.loc[train['date_block_num'] == 34]
    .drop('item_cnt_month', axis=1)
    .sort_values(['shop_id', 'item_id'])
    .reset_index(drop=True)
)

# Re-read the original test frame and left-join the engineered features onto it, so the output keeps
# the original test rows in their original order.
# NOTE(review): any non-key column present in both frames comes back suffixed _x/_y — confirm that
# test_root carries only the key columns once revenue and item_cnt_month are dropped.
test_root = pd.read_pickle('tmp/data/processed/test_plus_features.pkl').drop(['revenue', 'item_cnt_month'], axis=1)
test = test_root.merge(test_features, how='left', on=merge_keys)

test.to_pickle('tmp/data/final/test.pkl')
print(test.shape)
test.head()

In [None]:
# Everything except block 34 (the month to predict) is training data; order it by shop, item and
# time, rebuild the index, and save it for modeling.
train = (
    train.loc[train['date_block_num'] != 34]
    .sort_values(['shop_id', 'item_id', 'date_block_num'])
    .reset_index(drop=True)
)
train.to_pickle('tmp/data/final/train.pkl')
print(train.shape)
train.head()