In [33]:
import baseline
import importlib
importlib.reload(baseline)
from baseline import *
import numpy as np 
import pandas as pd

In [22]:
# Data loading. Per the original note, memory usage is reduced by changing dtypes
# (presumably inside `loader` — confirm against baseline.py).
files = ['train.csv', 'test.csv', 'shops.csv', 'items.csv', 'item_categories.csv']

data = [loader(file_name) for file_name in files]

# Later cells refer to `train`, `shops` and `items` by name; bind them here so the notebook
# does not depend on variables left over from a previous kernel session.
# The names point at the very same objects that are handed to prepare_full_data(*data).
# NOTE(review): later cells also read train['revenue'] — confirm where that column is created.
train, shops, items = data[0], data[2], data[3]

In [23]:
# Combine the five loaded tables (passed positionally, in the order of `files`)
# into the single frame that every feature below is added to.
full_data = prepare_full_data(*data)

  


In [24]:
# Downcast column dtypes to shrink the frame. The return value is not used here;
# the dtypes printed by info() show that full_data itself was changed.
reduce_mem_usage(full_data)
full_data.info()

Mem. usage decreased to 105.42 Mb (82.8% reduction)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11054182 entries, 0 to 11054181
Data columns (total 8 columns):
 #   Column             Dtype  
---  ------             -----  
 0   date_block_num     int8   
 1   shop_id            int8   
 2   item_cnt_month     float16
 3   item_id            int16  
 4   city_id            int8   
 5   item_category_id   int8   
 6   main_category_id   int8   
 7   minor_category_id  int8   
dtypes: float16(1), int16(1), int8(6)
memory usage: 105.4 MB


In [30]:
# Check that the assembled data is valid before building features:
# expected dtype and value range for every column, plus missing-value and duplicate checks.
column_types = {
    'date_block_num': 'int8',
    'shop_id': 'int8',
    'city_id': 'int8',
    'item_id': 'int16',
    'item_cnt_month': 'float16',
    'item_category_id': 'int8',
    'main_category_id': 'int8',
    'minor_category_id': 'int8',
}
values_ranges = {
    'date_block_num': (0, 34),
    'shop_id': (0, 59),
    'item_id': (0, 22169),
    'item_cnt_month': (0, 669),
    'city_id': (0, 30),
    'item_category_id': (0, 83),
    'main_category_id': (0, 11),
    'minor_category_id': (0, 66),
}
full_data_validator = Validator(
    column_types=column_types,
    value_ranges=values_ranges,
    check_missing=True,
    check_duplicates=True,
)
# Last expression of the cell, so the validator's verdict is displayed as the cell output.
full_data_validator.fit_transform(full_data)

'Data is valid'

Feature extraction

In [32]:
# The EDA showed that many entities are missing for parts of the date range.
# For each key column below we therefore add a "<key>_history" feature holding the number
# of months that entity (for example a shop) exists.
history = [
    ('shop_id', 'shop_history'),
    ('item_id', 'item_history'),
    ('minor_category_id', 'minor_category_history'),
]
for key_col, history_col in history:
    full_data = history_features(df=full_data, agg=key_col, new_feature=history_col)

In [None]:
# Features from aggregations.
# Each entry is (group-by keys, name of the new column, aggregation spec for feat_from_agg).
# NOTE(review): these statistics are computed over the target (item_cnt_month) within the same
# date_block_num and, unlike the revenue aggregates later in the notebook, are not lagged in
# this cell — confirm that this does not leak the target into the features.
agg_list = [
    (['date_block_num', 'item_category_id'], 'avg_item_cnt_per_cat', {'item_cnt_month': 'mean'}),
    (['date_block_num', 'city_id', 'shop_id'], 'avg_item_cnt_per_city_per_shop', {'item_cnt_month': 'mean'}),
    (['date_block_num', 'shop_id'], 'avg_item_cnt_per_shop', {'item_cnt_month': 'mean'}),
    (['date_block_num', 'item_category_id', 'shop_id'], 'avg_item_cnt_per_cat_per_shop', {'item_cnt_month': 'mean'}),
    (['date_block_num', 'item_id'], 'avg_item_cnt_per_item', {'item_cnt_month': 'mean'}),
    (['date_block_num', 'item_category_id', 'shop_id'], 'med_item_cnt_per_cat_per_shop', {'item_cnt_month': 'median'}),
    (['date_block_num', 'main_category_id'], 'avg_item_cnt_per_main_cat', {'item_cnt_month': 'mean'}),
    (['date_block_num', 'minor_category_id'], 'avg_item_cnt_per_minor_cat', {'item_cnt_month': 'mean'}),
]


for agg, new_col, aggregation in agg_list:
    full_data = feat_from_agg(full_data, agg, new_col, aggregation, full_data)


# first_sales_date_block: the first month in which an item had any sale.
# This used to be min(item_cnt_month) per item, i.e. a sales count rather than a month number,
# which contradicts both the column name and the fill value of 34 below. It is now the smallest
# date_block_num among rows with a positive monthly count.
sold_rows = full_data.loc[full_data['item_cnt_month'] > 0, ['item_id', 'date_block_num']]
full_data = feat_from_agg(sold_rows, ['item_id'], 'first_sales_date_block', {'date_block_num': 'min'}, full_data)
del sold_rows

# Items with no positive-count row get 34, the block that is later split off as X_test.
full_data['first_sales_date_block'] = full_data['first_sales_date_block'].fillna(34)

In [18]:
# Downcast again now that the feature columns have been added, then list the resulting dtypes.
# As before, the return value is not used; info() reflects the change on full_data itself.
reduce_mem_usage(full_data)
full_data.info()

Mem. usage decreased to 337.35 Mb (30.4% reduction)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11054182 entries, 0 to 11054181
Data columns (total 20 columns):
 #   Column                          Dtype  
---  ------                          -----  
 0   date_block_num                  int8   
 1   shop_id                         int8   
 2   item_cnt_month                  float16
 3   item_id                         int16  
 4   city_id                         int8   
 5   item_category_id                int8   
 6   main_category_id                int8   
 7   minor_category_id               int8   
 8   shop_history                    int8   
 9   item_history                    float16
 10  minor_category_history          int8   
 11  avg_item_cnt_per_cat            float16
 12  avg_item_cnt_per_city_per_shop  float16
 13  avg_item_cnt_per_shop           float16
 14  avg_item_cnt_per_cat_per_shop   float16
 15  avg_item_cnt_per_item           float16
 16  med_item_cnt_per_c

In [20]:
# delta_price_lag_1: relative difference between an item's average price in the lagged month
# and the item's average price over all of `train`.
# NOTE(review): relies on a `train` DataFrame with an `item_price` column being in scope.
#
# The price means are stored as float32 rather than float16: float16 turns anything above 65504
# into inf and carries only an 11-bit significand, so price means would be rounded coarsely
# before the ratio is taken. float32 also matches the dtype used for the revenue delta.
full_data = feat_from_agg(df = train, agg = ['item_id'], new_col = 'item_avg_item_price', aggregation = {'item_price': ['mean']}, output_df = full_data)
full_data['item_avg_item_price'] = full_data['item_avg_item_price'].astype(np.float32)

full_data = feat_from_agg(df = train, agg = ['date_block_num','item_id'], new_col = 'date_item_avg_item_price', aggregation = {'item_price': ['mean']}, output_df = full_data)
full_data['date_item_avg_item_price'] = full_data['date_item_avg_item_price'].astype(np.float32)

# Only the lagged monthly price may be used as a feature; the same-month value is dropped below.
full_data = lag_features(full_data, 'date_item_avg_item_price', [1])
full_data['delta_price_lag_1'] = (full_data['date_item_avg_item_price_lag_1'] - full_data['item_avg_item_price']) / full_data['item_avg_item_price']

# The helper columns were only needed to build the delta.
del full_data['item_avg_item_price']
del full_data['date_item_avg_item_price']
del full_data['date_item_avg_item_price_lag_1']

In [None]:
# Attach category and city ids to the raw sales rows so they can be used as group-by keys.
# Each merge is skipped when its column already exists: running the cell a second time would
# otherwise make pandas suffix the duplicated column (item_category_id_x / item_category_id_y),
# and later group-bys on 'item_category_id' would fail.
if 'item_category_id' not in train.columns:
    train = train.merge(items.loc[:, ['item_id', 'item_category_id']], on = 'item_id', how = 'left')
if 'city_id' not in train.columns:
    train = train.merge(shops.loc[:, ['shop_id', 'city_id']], on = 'shop_id', how = 'left')

In [23]:
# Revenue aggregates (their lag-1 versions are built in the next cell).
# Every feature is the sum of `revenue` in `train` over the listed group-by keys.
revenue_groupings = {
    'sales_per_category_per_shop': ['date_block_num', 'item_category_id', 'shop_id'],
    'sales_per_shop': ['date_block_num', 'shop_id'],
    'sales_per_item': ['date_block_num', 'item_id'],
}
agg_list = [
    (group_keys, feature_name, {'revenue': 'sum'})
    for feature_name, group_keys in revenue_groupings.items()
]

for group_keys, feature_name, agg_spec in agg_list:
    full_data = feat_from_agg(train, group_keys, feature_name, agg_spec, output_df=full_data)
    

In [24]:
# Build the lag-1 version of every revenue aggregate, then delete the same-month column
# so that only the lagged column stays in full_data.
lag_dict = {
    revenue_col: [1]
    for revenue_col in ('sales_per_category_per_shop', 'sales_per_shop', 'sales_per_item')
}

for revenue_col in list(lag_dict):
    full_data = lag_features(df=full_data, col=revenue_col, lags=lag_dict[revenue_col])
    del full_data[revenue_col]

In [25]:
# delta_revenue_lag_1, built like the price delta:
# (shop revenue in the lagged month - the shop's average monthly revenue) / that average.
#
# sales_per_shop_lag_1 is a monthly total (sum of revenue per date_block_num and shop_id), so the
# baseline has to be on the same scale. Revenue is therefore summed per (month, shop) first and
# only then averaged per shop. Averaging the raw `train` rows directly, as was done before, would
# compare a monthly total against a per-row mean.
monthly_shop_revenue = train.groupby(['date_block_num', 'shop_id'], as_index=False)['revenue'].sum()
full_data = feat_from_agg(df = monthly_shop_revenue, agg = ['shop_id'], new_col = 'avg_sales_per_shop', aggregation = {'revenue': ['mean']}, output_df = full_data)
full_data['avg_sales_per_shop'] = full_data['avg_sales_per_shop'].astype(np.float32)

full_data['delta_revenue_lag_1'] = (full_data['sales_per_shop_lag_1'] - full_data['avg_sales_per_shop']) / full_data['avg_sales_per_shop']

# Drop the intermediates; only the relative delta is kept as a feature.
del monthly_shop_revenue
del full_data['avg_sales_per_shop']
del full_data['sales_per_shop_lag_1']


In [26]:
# Last-sale / first-sale features per (shop_id, item_id) pair.
# NOTE(review): "previous" means the previous ROW of the pair in full_data, so this assumes the
# frame is ordered by date_block_num and treats every row of a pair as an occurrence — confirm
# that this is the intended meaning of "last sale".
block_num = full_data['date_block_num']
blocks_by_pair = block_num.groupby([full_data['shop_id'], full_data['item_id']])

# date_block_num of the pair's previous row (NaN for the pair's first row).
previous_block = blocks_by_pair.shift(1)
# Smallest date_block_num in which the pair occurs.
first_block = blocks_by_pair.transform('min')

# Months since the previous occurrence; -1 marks rows that have no previous occurrence.
full_data['months_from_last_sale'] = (block_num - previous_block).fillna(-1)
# Months since the first occurrence of the pair.
full_data['months_from_first_sale'] = block_num - first_block

del block_num, blocks_by_pair, previous_block, first_block

In [27]:
# Replace every remaining missing value in full_data with 0.
# NOTE(review): this treats "no information" as the value 0 for every column alike — confirm
# that is acceptable for the delta_* features.
full_data = full_data.fillna(0)

In [28]:
# Persist the engineered feature table; index=False keeps the row index out of the file.
# NOTE(review): CSV does not store dtypes, so the downcast columns must be re-cast after reloading.
full_data.to_csv('full_data.csv', index = False)

Train/Test split

In [None]:
# Validation
# Block 34 is split off as X_test; all other blocks form X / y for time-ordered cross-validation.
# NOTE(review): TimeSeriesSplit is not imported in the visible cells — presumably it arrives
# through `from baseline import *`; confirm. It splits by row position, so the folds are only
# chronological if full_data is ordered by date_block_num.
is_test_block = full_data.date_block_num == 34

X_test = full_data[is_test_block].drop('item_cnt_month', axis = 1)

X = full_data[~is_test_block].drop('item_cnt_month', axis = 1)
y = full_data[~is_test_block]['item_cnt_month']

# A single splitter (it used to be constructed twice).
tss = TimeSeriesSplit(n_splits=3)

# Keep the positional (train, validation) indices of every fold. The previous loop overwrote
# X_train / X_val on each pass, so all folds except the last were computed and thrown away.
cv_folds = list(tss.split(X))

# Materialise only the last fold, which is the state the old loop left behind.
train_idxs, val_idxs = cv_folds[-1]
X_train, X_val = X.iloc[train_idxs], X.iloc[val_idxs]
y_train, y_val = y.iloc[train_idxs], y.iloc[val_idxs]
    

In [None]:
# Training
# Hold-out scheme by month: every block except 33 and 34 is used for fitting,
# block 33 for validation, block 34 is the test set.
X_train = full_data.loc[~full_data['date_block_num'].isin([33, 34])]
# pop() returns the target column and removes it from the feature frame in one step.
y_train = X_train.pop('item_cnt_month')

X_val = full_data.loc[full_data['date_block_num'] == 33]
y_val = X_val.pop('item_cnt_month')

# The test features get a fresh 0..n-1 index; the old index is discarded.
X_test = (
    full_data.loc[full_data['date_block_num'] == 34]
    .drop(columns='item_cnt_month')
    .reset_index(drop=True)
)