## import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_profiling as pp
from itertools import product
from datetime import datetime
from dateutil.relativedelta import relativedelta
from calendar import monthrange
from sklearn.preprocessing import LabelEncoder

# Lift pandas' display truncation so wide/long frames are shown in full.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

## read data

In [2]:
# Load every input table from ./data. The reads are independent of each other.
# Fact table and the rows to predict:
train = pd.read_csv('./data/sales_train_b4fe.csv')
test = pd.read_csv('./data/test.csv')
sample_submission = pd.read_csv('./data/sample_submission.csv')

# Dimension tables joined onto the feature frame later on:
items = pd.read_csv('./data/items.csv')
cats = pd.read_csv('./data/item_categories_b4fe.csv')
shops = pd.read_csv('./data/shops_b4fe.csv')

### shops

In [3]:
# Attach latitude / longitude / population to each shop through its city name.
geo_columns = ['city', 'lat', 'lng', 'population']
geo_info = pd.read_csv('./data/worldcities_ru.csv', usecols=geo_columns)

# Left join keeps every shop; cities missing from the lookup get NaN geo fields.
shops = pd.merge(shops, geo_info, how='left', on='city')

# Only population is defaulted to 0 here; lat / lng keep their NaN for unmatched cities.
# City is forced to str so the label encoder sees a homogeneous column.
shops = shops.assign(
    population=shops['population'].fillna(0),
    city=shops['city'].astype(str),
)

# Integer code per distinct city name.
city_encoder = LabelEncoder()
shops['city_code'] = city_encoder.fit_transform(shops['city'])

### cats

In [4]:
# Integer-encode both category levels; each level gets its own freshly fitted encoder.
category_levels = ['category0', 'category1']
cats[category_levels] = cats[category_levels].astype(str)

for col in category_levels:
    cats[col + '_code'] = LabelEncoder().fit_transform(cats[col])

### create a dataframe of all possible combinations of shop_id and item_id and their monthly sales

In [5]:
index_cols = ['date_block_num', 'shop_id', 'item_id']

# For every month, take the cartesian product of the shops and the items that
# appear in that month's sales, so pairs with no sale are still present as rows.
matrix = []
for month, month_sales in train.groupby('date_block_num', sort=False):
    shop_ids = month_sales['shop_id'].unique()
    item_ids = month_sales['item_id'].unique()
    matrix.append(np.array(list(product([month], shop_ids, item_ids))))

# turn the grid into a dataframe, ordered by month, shop, item
df = pd.DataFrame(np.vstack(matrix), columns=index_cols).sort_values(by=index_cols)

# cnt: daily counts summed per (month, shop, item)
cnt_group = (
    train.groupby(index_cols)['item_cnt_day']
         .sum()
         .to_frame(name='item_cnt_month')
)
df = df.merge(cnt_group, on=index_cols, how='left')

# Clamp the monthly target to [0, 20]; grid rows with no sales stay NaN at this point.
df['item_cnt_month'] = df['item_cnt_month'].clip(lower=0, upper=20)

### concat all datasets

In [6]:
# test
# Tag the test pairs with date_block_num 34 (the block after the one used for
# validation, 33) so they can be stacked under the training grid.
test_ = test.drop('ID', axis=1)
test_['date_block_num'] = 34

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and produces the same stacked frame with a fresh RangeIndex.
df = pd.concat([df, test_], ignore_index=True)

# Fills the still-missing monthly counts (grid rows without sales and all test
# rows) with 0. This runs before the merges below, so NaNs introduced by those
# merges are not touched by it.
df = df.fillna(0)

# items
df = df.merge(items[['item_id','item_category_id']], on='item_id', how='left')

# cats
df = df.merge(cats[['item_category_id','category0_code','category1_code']], on='item_category_id', how='left')

# shops
df = df.merge(shops[['shop_id','city_code','lat','lng','population']], on='shop_id', how='left')

### downcast dtype


In [7]:
# Columns grouped by the narrow dtype they are cast to.
int8_cols = ['date_block_num','shop_id','city_code','item_category_id','category0_code','category1_code']
int16_cols = ['item_id']
int32_cols = ['population']
float16_cols = ['item_cnt_month','lat','lng']

dtype_dict = {
    np.int8: int8_cols,
    np.int16: int16_cols,
    np.int32: int32_cols,
    np.float16: float16_cols,
}

# Flatten {dtype: [columns]} into {column: dtype} and cast everything in one call.
target_dtypes = {}
for target, names in dtype_dict.items():
    target_dtypes.update(dict.fromkeys(names, target))

df = df.astype(target_dtypes)

### lag  function

In [8]:
def add_lags(df, lag_feature, lags, time_col='date_block_num'):
    """Add calendar-aligned lag columns of ``lag_feature`` per (shop_id, item_id).

    For every ``lag`` in ``lags`` a column ``'<lag_feature>_lag_<lag>'`` is
    written into ``df``. It holds the value ``lag_feature`` had for the same
    shop/item pair exactly ``lag`` time steps earlier, or NaN when the pair has
    no row for that earlier step.

    The lookup is done by shifting the time key and merging, not by
    ``groupby().shift()``. A positional shift returns the previous *row* of the
    pair, which is an older month whenever the pair is absent from some months.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``time_col``, ``shop_id``, ``item_id`` and ``lag_feature``.
        Modified in place.
    lag_feature : str
        Column to lag.
    lags : iterable of int
        Number of time steps to look back.
    time_col : str, default 'date_block_num'
        Integer time-step column the lags are measured on.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the lag columns added.
    """
    group_features = ['shop_id', 'item_id']
    keys = [time_col] + group_features

    # One value per key, so the left merge below can never multiply rows.
    history = df[keys + [lag_feature]].drop_duplicates(subset=keys, keep='last')

    for lag in lags:
        lag_name = '%s_lag_%d' % (lag_feature, lag)

        # A value observed at step t is the "lag" value for step t + lag.
        shifted = history.rename(columns={lag_feature: lag_name})
        # Widen first so narrow integer time columns cannot overflow.
        shifted[time_col] = shifted[time_col].astype('int64') + lag

        looked_up = df[keys].merge(shifted, on=keys, how='left')
        # The left merge keeps df's row order and count; assign positionally so
        # df's own index is irrelevant.
        df[lag_name] = looked_up[lag_name].to_numpy()

    return df

### target  encoding function

In [9]:
def add_encoding(df, group_features, on_feature, new_name, how='mean'):
    """Aggregate ``on_feature`` per group and return it as a mergeable frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Source rows.
    group_features : str or list of str
        Column(s) to group by.
    on_feature : str
        Column to aggregate.
    new_name : str
        Name given to the aggregated column.
    how : str or callable, default 'mean'
        Aggregation passed to ``SeriesGroupBy.agg``.

    Returns
    -------
    pandas.DataFrame
        One row per group, with the group columns plus ``new_name``.
        Unless ``on_feature`` is ``'revenue'`` (left at its aggregated dtype),
        the value column is stored as float16 when every value fits, and as
        float32 otherwise.
    """
    encoded = (
        df.groupby(group_features)[on_feature]
          .agg(how)
          .to_frame(name=new_name)
          .reset_index()
    )

    if on_feature != 'revenue':
        # float16 tops out at 65504; larger aggregates would silently become
        # inf, so only use it when the data actually fits.
        float16_limit = np.finfo(np.float16).max
        overflows = encoded[new_name].abs().max() > float16_limit
        compact_dtype = np.float32 if overflows else np.float16
        encoded[new_name] = encoded[new_name].astype(compact_dtype)

    return encoded

In [10]:
%%time
# add lag features for monthly cnt
df = add_lags(df=df, lag_feature='item_cnt_month', lags=[1,2,3])

# add target encoding feature and then lag feature
date_encoding_dict = {
      'avg_cnt_by_month':['date_block_num'] # month
     ,'avg_cnt_by_month_item':['date_block_num','item_id'] # month_item
     ,'avg_cnt_by_month_shop':['date_block_num','shop_id'] # month_shop
     ,'avg_cnt_by_month_shop_item':['date_block_num','shop_id','item_id']
     #,'target_month_category':['date_block_num','item_category_id'] # month_item_category
     #,'target_month_category0':['date_block_num','category0_le']
     ,'avg_cnt_by_month_shop_cat':['date_block_num','shop_id','item_category_id']
     ,'avg_cnt_by_month_shop_loc':['date_block_num','shop_id','city_code']
     #,'target_month_shop_category0':['date_block_num','shop_id','category0_le']
     #,'target_month_location':['date_block_num','location_le']
     ,'avg_cnt_by_month_item_loc':['date_block_num','item_id','city_code']
     }

for k,v in date_encoding_dict.items():
    group = add_encoding(df, group_features=v, new_name=k, on_feature='item_cnt_month')
    df = df.merge(group, on=v, how='left')
    #df = lag_feature(df, [1], k)
    df = add_lags(df=df, lag_feature=k, lags=[1])
    df.drop([k], axis=1, inplace=True)

CPU times: user 46.8 s, sys: 11.9 s, total: 58.7 s
Wall time: 59.3 s


In [11]:
%%time
# price mean encoding
price_encoding_dict = {
    'avg_price_by_item':['item_id']
    ,'avg_price_by_item_month':['date_block_num','item_id']
}

features_to_drop = list(price_encoding_dict.keys())

for k,v in price_encoding_dict.items():
    group = add_encoding(df = train, group_features=v, new_name=k, on_feature='item_price')
    df = df.merge(group, on=v, how='left')

# lags
lags = [1,2,3]
df = add_lags(df=df, lag_feature='avg_price_by_item_month', lags=lags)

for lag in lags:
    lag_name = 'delta_price_lag_%d'%(lag)
    features_to_drop.append(lag_name)
                        
    df[lag_name] = (df['%s_lag_%d'%('avg_price_by_item_month', lag)]\
                    - df['avg_price_by_item'])\
                   / df['avg_price_by_item']
        
df['delta_price_lag'] = df[['delta_price_lag_1',
                            'delta_price_lag_2',
                            'delta_price_lag_3']].max(1)

df['delta_price_lag'] = df['delta_price_lag'].fillna(0)

df.drop(features_to_drop, axis=1, inplace=True)

CPU times: user 11.1 s, sys: 2.96 s, total: 14.1 s
Wall time: 14.1 s


In [12]:
%%time
# revenue sum and mean encoding
train['revenue'] = train['item_cnt_day'] * train['item_price']

group1 = add_encoding(df=train, group_features=['date_block_num','shop_id'], on_feature='revenue',
                    new_name='total_revenue_by_month_shop', how='sum')

df = df.merge(group1, on=['date_block_num','shop_id'], how='left')

group2 = add_encoding(df=train, group_features=['shop_id'], on_feature='revenue', new_name='avg_revenue_by_shop')

df = df.merge(group2, on=['shop_id'], how='left')

df['delta_revenue'] = (df['total_revenue_by_month_shop'] - df['avg_revenue_by_shop']) / df['avg_revenue_by_shop']

df = add_lags(df=df, lag_feature='delta_revenue', lags=[1])

df.drop(['total_revenue_by_month_shop','avg_revenue_by_shop','delta_revenue'], axis=1, inplace=True)

CPU times: user 9.63 s, sys: 4.05 s, total: 13.7 s
Wall time: 13.8 s


### date feature

In [13]:
%%time
date0 = datetime.strptime(train['date'].min(), '%d.%m.%Y')

def get_year(date_num):
    date = date0 + relativedelta(months=date_num)
    return date.year

def get_month(date_num):
    date = date0 + relativedelta(months=date_num)
    return date.month

def get_days(date_num):
    date = date0 + relativedelta(months=date_num)
    return monthrange(date.year, date.month)[1]

calendar_dict = {}
for i in df['date_block_num'].unique():
    calendar_dict[i] = [get_year(i), get_month(i), get_days(i)]
    
for i, c in enumerate(['year','month','num_days']):
    df[c] = df['date_block_num'].map(lambda x: calendar_dict[x][i])
    df[c] = df[c].astype(np.int16)

CPU times: user 54.9 s, sys: 696 ms, total: 55.6 s
Wall time: 55.9 s


### item feature

In [14]:
df["months_after_first_sale_by_item_shop"] = df["date_block_num"]\
                                - df.groupby(["item_id","shop_id"])["date_block_num"].transform('min')

df["months_after_first_sale_by_item"] = df["date_block_num"]\
                                - df.groupby(["item_id"])["date_block_num"].transform('min')

df['is_first_sale_by_item_shop'] = np.where(df["months_after_first_sale_by_item_shop"]==0,1,0)
df['is_first_sale_by_item'] = np.where(df["months_after_first_sale_by_item"]==0,1,0)

df['is_first_sale_by_item_shop'] =df['is_first_sale_by_item_shop'].astype(np.int8)
df['is_first_sale_by_item'] =df['is_first_sale_by_item'].astype(np.int8)

In [15]:
# Keep only blocks 4 and later.
# NOTE(review): presumably this removes months whose 1-3 step lags cannot be
# complete; `> 2` would already leave three earlier months - confirm 3 is intended.
df = df.loc[df['date_block_num'] > 3]

In [16]:
# Peek at the first five rows of the engineered frame.
df.head(n=5)

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,item_category_id,category0_code,category1_code,city_code,lat,lng,population,item_cnt_month_lag_1,item_cnt_month_lag_2,item_cnt_month_lag_3,avg_cnt_by_month_lag_1,avg_cnt_by_month_item_lag_1,avg_cnt_by_month_shop_lag_1,avg_cnt_by_month_shop_item_lag_1,avg_cnt_by_month_shop_cat_lag_1,avg_cnt_by_month_shop_loc_lag_1,avg_cnt_by_month_item_loc_lag_1,avg_price_by_item_month_lag_1,avg_price_by_item_month_lag_2,avg_price_by_item_month_lag_3,delta_price_lag,delta_revenue_lag_1,year,month,num_days,months_after_first_sale_by_item_shop,months_after_first_sale_by_item,is_first_sale_by_item_shop,is_first_sale_by_item
0,0,2,19,0.0,40,10,20,8,44.59375,40.09375,141970,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2013,1,31,0,0,1,1
1,0,2,27,1.0,19,7,43,8,44.59375,40.09375,141970,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2013,1,31,0,0,1,1
2,0,2,28,0.0,30,13,52,8,44.59375,40.09375,141970,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2013,1,31,0,0,1,1
3,0,2,29,0.0,23,7,57,8,44.59375,40.09375,141970,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2013,1,31,0,0,1,1
4,0,2,32,0.0,40,10,20,8,44.59375,40.09375,141970,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2013,1,31,0,0,1,1


In [17]:
# Print each column's dtype together with the frame's memory usage.
df.info(memory_usage=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11056277 entries, 0 to 11056276
Data columns (total 33 columns):
 #   Column                                Dtype  
---  ------                                -----  
 0   date_block_num                        int8   
 1   shop_id                               int8   
 2   item_id                               int16  
 3   item_cnt_month                        float16
 4   item_category_id                      int8   
 5   category0_code                        int8   
 6   category1_code                        int8   
 7   city_code                             int8   
 8   lat                                   float16
 9   lng                                   float16
 10  population                            int32  
 11  item_cnt_month_lag_1                  float16
 12  item_cnt_month_lag_2                  float16
 13  item_cnt_month_lag_3                  float16
 14  avg_cnt_by_month_lag_1                float16
 15  avg_cnt_by_mo

In [18]:
%%time
train = df[df.date_block_num < 33]
validation = df[df.date_block_num == 33]
test = df[df.date_block_num == 34]

CPU times: user 1.72 s, sys: 427 ms, total: 2.15 s
Wall time: 2.19 s


In [19]:
%%time
train.to_csv('./data/train_b4md.csv', index=False)

CPU times: user 2min 58s, sys: 4.55 s, total: 3min 3s
Wall time: 3min 5s


In [20]:
%%time
validation.to_csv('./data/validation_b4md.csv', index=False)
test.to_csv('./data/test_b4md.csv', index=False)

CPU times: user 7.64 s, sys: 210 ms, total: 7.85 s
Wall time: 7.92 s


In [21]:
# Sanity check: (rows, columns) of the test block, 34.
df.loc[df['date_block_num'] == 34].shape

(214200, 33)