In [340]:
import baseline
import importlib
importlib.reload(baseline)
from baseline import *
import numpy as np 
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [124]:
#Data loading and reduce memory usage by changing dtypes
train = reduce_mem_usage(pd.read_csv('train.csv'))
test = reduce_mem_usage(pd.read_csv('test.csv'))
shops = reduce_mem_usage(pd.read_csv('shops.csv'))
items = reduce_mem_usage(pd.read_csv('items.csv'))
categories = reduce_mem_usage(pd.read_csv('item_categories.csv'))

Mem. usage decreased to 14.47 Mb (80.0% reduction)
Mem. usage decreased to  1.43 Mb (70.8% reduction)
Mem. usage decreased to  0.00 Mb (40.7% reduction)
Mem. usage decreased to  0.27 Mb (59.4% reduction)
Mem. usage decreased to  0.00 Mb (42.4% reduction)


In [125]:
#Check if our train data is valid
column_types = {'date_block_num': 'int8', 'shop_id': 'int8', 'item_id': 'int16', 'item_price': 'float16', 'item_cnt_day': 'float16'}
values_ranges = {'date_block_num': (0, 33), 'shop_id': (0, 59), 'item_id': (0, 22169), 'item_price': (0.07, 42980.0), 'item_cnt_day': (0, 669)}
Validator(column_types = column_types, value_ranges = values_ranges, check_missing = True, check_duplicates=True).fit_transform(train)

'Data is valid'

In [126]:
#Target - item_cnt_month
target_group = (train.groupby(['date_block_num', 'shop_id', 'item_id'])['item_cnt_day']
                .sum().rename('item_cnt_month').reset_index())
#From EDA step we do not see linear dependency between item_cnt_month and item_price.
#Feature 'revenue' will give us more imformation about target
train['revenue'] = train['item_price'] * train['item_cnt_day']

In [127]:
'''
According to EDA: we do have a lot of data without full range during analysing period
and also our test set contains shop_id&item_id pairs that are nor presented in train set at all
'''
columns = ['date_block_num', 'shop_id', 'item_id']
full_data = full_data_creation(df = train, agg_group = columns, periods = train.date_block_num.nunique())

In [128]:
#Merge between full data and train set
full_data = full_data.merge(target_group, on = columns, how = 'left')

In [129]:
#test set concatenation with full_data
test['date_block_num'] = 34
del test['ID']
full_data = pd.concat([full_data, test], keys = columns, ignore_index=True, sort = False)

  full_data = pd.concat([full_data, test], keys = columns, ignore_index=True, sort = False)


In [130]:
'''
We need:
1. fill all missing values as item&shop pairs + test set have been added
2. clip our target variable - original condition
'''
full_data = full_data.fillna(0)
full_data['item_cnt_month'] = full_data['item_cnt_month'].clip(0,20).astype(np.float16)

In [131]:
#Merge with other datasets
full_data = full_data.merge(shops, on = 'shop_id', how = 'left')
full_data = full_data.merge(items, on = 'item_id', how = 'left')
full_data = full_data.merge(categories, on = 'item_category_id', how = 'left')

In [132]:
#Columns we are planning to work
Work_columns = ['date_block_num', 'shop_id', 'item_cnt_month', 'item_id', 'city_id', 'item_category_id', 'main_category_id','minor_category_id']
full_data = full_data.loc[:, Work_columns]

#As we make transformations during DQL with shop_id, we will encode it with LabelEncoding
full_data['shop_id'] = LabelEncoder().fit_transform(full_data['shop_id'])

Feature exctraction

In [133]:
#From EDA we see a lot of missings in different features during the date period
#We will create feature_history with number of months (for example for shop) that feature exists
full_data = history_features(df = full_data, agg = 'shop_id', new_feature = 'shop_history')
full_data = history_features(df = full_data, agg = 'item_id', new_feature = 'item_history')
full_data = history_features(df = full_data, agg = 'minor_category_id', new_feature = 'minor_category_history')

In [None]:
#Features from aggregations
agg_list = [
    (['date_block_num', 'item_category_id'], 'avg_item_cnt_per_cat', {'item_cnt_month': 'mean'}),
    (['date_block_num', 'city_id'], 'avg_item_cnt_per_city', {'item_cnt_month': 'mean'}),
    (['date_block_num', 'shop_id'], 'avg_item_cnt_per_shop', {'item_cnt_month': 'mean'}),
    (['date_block_num', 'item_category_id', 'shop_id'], 'avg_item_cnt_per_cat_per_shop', {'item_cnt_month': 'mean'}),
    (['date_block_num', 'item_id'], 'avg_item_cnt_per_item', {'item_cnt_month': 'mean'}),
    (['date_block_num', 'item_category_id', 'shop_id'], 'med_item_cnt_per_cat_per_shop', {'item_cnt_month': 'median'}),
    (['date_block_num', 'main_category_id'], 'avg_item_cnt_per_main_cat', {'item_cnt_month': 'mean'}),
    (['date_block_num', 'minor_category_id'], 'avg_item_cnt_per_minor_cat', {'item_cnt_month': 'mean'}),
    (['date_block_num', 'shop_id', 'item_id'], 'avg_item_cnt_per_date_block', {'item_cnt_month': 'mean'}),
    (['item_id'], 'first_sales_date_block', {'item_cnt_month': 'min'})
]


for agg, new_col, aggregation in agg_list:
    full_data = feat_from_agg(full_data, agg, new_col, aggregation)
    


full_data['first_sales_date_block'] = full_data['first_sales_date_block'].fillna(34)

In [None]:
#Lags of aggregational features
#All aggregations will be delted to avoid data leakage
lag_dict = {'avg_item_cnt_per_cat': [1], 'avg_item_cnt_per_shop': [1], 'avg_item_cnt_per_item': [1],
            'avg_item_cnt_per_city': [1], 'avg_item_cnt_per_cat_per_shop': [1], 
            'med_item_cnt_per_cat_per_shop': [1], 'avg_item_cnt_per_main_cat': [1],
            'avg_item_cnt_per_minor_cat': [1], 'avg_item_cnt_per_date_block': [1,2,3],
            'item_cnt_month': [1]}


for feature, lags in lag_dict.items():
    full_data = lag_features(df = full_data, shift_col = feature, lags = lags)

In [142]:
#Features with last sales
full_data = last_sales(df = full_data, new_feature = 'item_shop_last_sale', item_shop = True)
full_data = last_sales(df = full_data, new_feature = 'item_last_sale', item_shop = False)

#On basis of items last sales - first sales
full_data['item_shop_first_sale'] = full_data['date_block_num'] - full_data.groupby(['item_id','shop_id'])['date_block_num'].transform('min')
full_data['item_first_sale'] = full_data['date_block_num'] - full_data.groupby('item_id')['date_block_num'].transform('min')

In [None]:
#Revenue&Item Price Features and their lags
train = train.merge(items.loc[:, ['item_id', 'item_category_id']], on = 'item_id', how = 'left')
train = train.merge(shops.loc[:, ['shop_id', 'city_id']], on = 'shop_id', how = 'left')

agg_list = [
    (['date_block_num', 'item_category_id', 'shop_id'], 'sales_per_category_per_shop', {'revenue': 'sum'}),
    (['date_block_num', 'shop_id'], 'sales_per_shop', {'revenue': 'sum'}),
    (['date_block_num', 'item_id'], 'sales_per_item', {'revenue': 'sum'}),
    (['item_id'], 'avg_item_price', {'item_price': 'mean'}),
    (['date_block_num','item_id'], 'avg_item_price_month', {'item_price': 'mean'})
]


for agg, new_col, aggregation in agg_list:
    train = feat_from_agg(train, agg, new_col, aggregation)

lag_dict = {'sales_per_category_per_shop': [1], 'sales_per_shop': [1],
            'sales_per_item': [1], 'avg_item_price': [1], 
            'avg_item_price_month': [1]}

for feature, lags in lag_dict.items():
    train = lag_features(df = train, shift_col = feature, lags = lags)
    

In [145]:
#Merging results
del train['item_price']
del train['revenue']
del train['item_category_id']
del train['city_id']
del train['item_cnt_day']
full_data = full_data.merge(train, on = ['date_block_num', 'shop_id', 'item_id'], how = 'left')

In [148]:
full_data = full_data.replace([np.inf, -np.inf], np.nan).fillna(0)

In [338]:
full_data.to_csv('full_data.csv', index = False)

Train/Test split

In [None]:
#Validation
tss = TimeSeriesSplit(n_splits=3)

X_test = full_data[full_data.date_block_num == 34].drop('item_cnt_month', axis = 1)

X = full_data[full_data.date_block_num != 34].drop('item_cnt_month', axis = 1)
y = full_data[full_data.date_block_num != 34]['item_cnt_month']
tss = TimeSeriesSplit(n_splits=3)

for train_idxs, val_idxs in tss.split(X):

    X_train, X_val = X.iloc[train_idxs], X.iloc[val_idxs]
    y_train, y_val = y.iloc[train_idxs], y.iloc[val_idxs]
    

In [None]:
#Training
X_train = full_data[~full_data.date_block_num.isin([33,34])]
y_train = X_train['item_cnt_month']
del X_train['item_cnt_month']

X_val = full_data[full_data['date_block_num']==33]
y_val = X_val['item_cnt_month']
del X_val['item_cnt_month']

X_test = full_data[full_data['date_block_num']==34].drop(columns='item_cnt_month')
X_test = X_test.reset_index()
del X_test['index']