## import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_profiling as pp
from itertools import product
from datetime import datetime
from dateutil.relativedelta import relativedelta
from calendar import monthrange

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

## read data

In [2]:
cats = pd.read_csv('item_categories_b4fe.csv', usecols=['item_category_id','category0_code','category1_code'])
shops = pd.read_csv('shops_b4fe.csv', usecols=['shop_id','location_code'])
test = pd.read_csv('test.csv')
sample_submission = pd.read_csv('sample_submission.csv')
items = pd.read_csv('items.csv', usecols=['item_id','item_category_id'])
train = pd.read_csv('sales_train_b4fe.csv')

### create a dataframe of all possible combinations of shop_id and item_id and their monthly sales

In [3]:
matrix = []

for month in train['date_block_num'].unique():
    
    shop_ids = train.loc[train['date_block_num'] == month, 'shop_id'].unique()
    item_ids = train.loc[train['date_block_num'] == month, 'item_id'].unique()
    matrix.append(np.array(list(product([month], shop_ids, item_ids))))
    
# turn the grid into a dataframe
index_cols = ['date_block_num', 'shop_id', 'item_id']
df = pd.DataFrame(np.vstack(matrix), columns = index_cols)

df.sort_values(by=index_cols, inplace=True)

df.head()

Unnamed: 0,date_block_num,shop_id,item_id
114910,0,2,19
117150,0,2,27
120623,0,2,28
118316,0,2,29
114602,0,2,32


In [4]:
# cnt
cnt_group = train.groupby(index_cols)['item_cnt_day'].sum().to_frame(name='item_cnt_month')
df = df.merge(cnt_group, on=index_cols, how='left')
df['item_cnt_month'] = df['item_cnt_month'].clip(0,20)

In [5]:
# revenue
train['revenue'] = train['item_cnt_day'] * train['item_price']

### concat training set and testing set

In [6]:
test_ = test.drop('ID', axis=1)
test_['date_block_num'] = 34

df = df.append(test_, ignore_index=True)
df = df.fillna(0)

### concat all datasts

In [7]:
df = df.merge(items, on='item_id', how='left')
df = df.merge(cats, on='item_category_id', how='left')
df = df.merge(shops, on='shop_id', how='left')

int8_cols = ['date_block_num','shop_id','location_code','item_category_id','category0_code','category1_code']
int16_cols = ['item_id']
float16_cols = ['item_cnt_month']

dtype_dict = {np.int8:int8_cols, np.int16:int16_cols, np.float16:float16_cols}

for dtype, cols in dtype_dict.items():
    for col in cols:
        df[col] = df[col].astype(dtype)


df.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,item_category_id,category0_code,category1_code,location_code
0,0,2,19,0.0,40,10,20,0
1,0,2,27,1.0,19,7,43,0
2,0,2,28,0.0,30,13,52,0
3,0,2,29,0.0,23,7,57,0
4,0,2,32,0.0,40,10,20,0


In [8]:
def lag_feature( df,lags, cols ):
    for col in cols:
        print(col)
        tmp = df[["date_block_num", "shop_id","item_id",col]]
        for i in lags:
            shifted = tmp.copy()
            shifted.columns = ["date_block_num", "shop_id", "item_id", "%s_lag_%d"%(col, i)]
            shifted.date_block_num = shifted.date_block_num + i
            df = pd.merge(df, shifted, on=['date_block_num','shop_id','item_id'], how='left')
    return df

def add_lags(df, lag_feature, lags):
    for lag in lags:
        group_features = ['shop_id','item_id']
        lag_name = '%s_lag_%d'%(lag_feature, lag)
        df[lag_name] = df.groupby(group_features)[lag_feature].shift(lag)
        
    return df

#df = lag_feature(df, [1,2,3], ["item_cnt_month"] )
df = add_lags(df=df, lag_feature='item_cnt_month', lags=[1,2,3])

### target mean encoding

In [9]:
def add_encoding(df, group_features, on_feature, new_name, how='mean'):
    mean_group = df.groupby(group_features)[on_feature].agg(how).to_frame(name=new_name).reset_index()
    mean_group[new_name] = mean_group[new_name].astype(np.float16)
    
    return mean_group

In [10]:
%%time
date_encoding_dict = {
      'avg_cnt_by_month':['date_block_num'] # month
     ,'avg_cnt_by_month_item':['date_block_num','item_id'] # month_item
     ,'avg_cnt_by_month_shop':['date_block_num','shop_id'] # month_shop
     ,'avg_cnt_by_month_shop_item':['date_block_num','shop_id','item_id']
     #,'target_month_category':['date_block_num','item_category_id'] # month_item_category
     #,'target_month_category0':['date_block_num','category0_le']
     ,'avg_cnt_by_month_shop_cat':['date_block_num','shop_id','item_category_id']
     ,'avg_cnt_by_month_shop_loc':['date_block_num','shop_id','location_code']
     #,'target_month_shop_category0':['date_block_num','shop_id','category0_le']
     #,'target_month_location':['date_block_num','location_le']
     ,'avg_cnt_by_month_item_loc':['date_block_num','item_id','location_code']
     }

for k,v in date_encoding_dict.items():
    group = add_encoding(df, group_features=v, new_name=k, on_feature='item_cnt_month')
    df = df.merge(group, on=v, how='left')
    #df = lag_feature(df, [1], k)
    df = add_lags(df=df, lag_feature=k, lags=[1])
    df.drop([k], axis=1, inplace=True)

CPU times: user 36.1 s, sys: 8.93 s, total: 45 s
Wall time: 45.3 s


In [11]:
%%time
# price mean encoding
price_encoding_dict = {
    'avg_price_by_item':['item_id']
    ,'avg_price_by_item_month':['date_block_num','item_id']
}

features_to_drop = list(price_encoding_dict.keys())

for k,v in price_encoding_dict.items():
    print(k)
    group = add_encoding(df = train, group_features=v, new_name=k, on_feature='item_price')
    df = df.merge(group, on=v, how='left')
    
lags = [1,2,3]
df = add_lags(df=df, lag_feature='avg_price_by_item_month', lags=lags)

for lag in lags:
    lag_name = 'delta_price_lag_%d'%(lag)
    features_to_drop.append(lag_name)
                        
    df[lag_name] = (df['%s_lag_%d'%('avg_price_by_item_month', lag)]\
                    - df['avg_price_by_item'])\
                   / df['avg_price_by_item']
        
df['delta_price_lag'] = df[['delta_price_lag_1',
                            'delta_price_lag_2',
                            'delta_price_lag_3']].max(1)

df['delta_price_lag'] = df['delta_price_lag'].fillna(0)

df.drop(features_to_drop, axis=1, inplace=True)

avg_price_by_item
avg_price_by_item_month
CPU times: user 9.3 s, sys: 2.21 s, total: 11.5 s
Wall time: 11.5 s


In [12]:
%%time
# revenue sum and mean encoding
group1 = add_encoding(df=train, group_features=['date_block_num','shop_id'], on_feature='revenue',
                    new_name='total_revenue_by_month_shop', how='sum')

df = df.merge(group1, on=['date_block_num','shop_id'], how='left')

group2 = add_encoding(df=train, group_features=['shop_id'], on_feature='revenue', new_name='avg_revenue_by_shop')

df = df.merge(group2, on=['shop_id'], how='left')

df['delta_revenue'] = (df['total_revenue_by_month_shop'] - df['avg_revenue_by_shop']) / df['avg_revenue_by_shop']
    

df = add_lags(df=df, lag_feature='delta_revenue', lags=[1])

df.drop(['total_revenue_by_month_shop','avg_revenue_by_shop','delta_revenue'], axis=1, inplace=True)

CPU times: user 6.78 s, sys: 1.7 s, total: 8.49 s
Wall time: 8.51 s


In [13]:
%%time
date0 = datetime.strptime(train['date'].min(), '%d.%m.%Y')

def get_year(date_num):
    date = date0 + relativedelta(months=date_num)
    return date.year

def get_month(date_num):
    date = date0 + relativedelta(months=date_num)
    return date.month

def get_days(date_num):
    date = date0 + relativedelta(months=date_num)
    return monthrange(date.year, date.month)[1]

calendar_dict = {}
for i in df['date_block_num'].unique():
    calendar_dict[i] = [get_year(i), get_month(i), get_days(i)]
    
for i, c in enumerate(['year','month','num_days']):
    df[c] = df['date_block_num'].map(lambda x: calendar_dict[x][i])
    df[c] = df[c].astype(np.int8)

CPU times: user 48.6 s, sys: 403 ms, total: 49 s
Wall time: 49 s


In [14]:
df["first_sale_by_item_shop"] = df["date_block_num"]\
                                - df.groupby(["item_id","shop_id"])["date_block_num"].transform('min')

df["first_sale_by_item"] = df["date_block_num"] - df.groupby(["item_id"])["date_block_num"].transform('min')

In [15]:
df = df[df.date_block_sum > 3]

AttributeError: 'DataFrame' object has no attribute 'date_block_sum'

In [16]:
df.info(memory_usage=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11056323 entries, 0 to 11056322
Data columns (total 28 columns):
 #   Column                            Dtype  
---  ------                            -----  
 0   date_block_num                    int8   
 1   shop_id                           int8   
 2   item_id                           int16  
 3   item_cnt_month                    float16
 4   item_category_id                  int8   
 5   category0_code                    int8   
 6   category1_code                    int8   
 7   location_code                     int8   
 8   item_cnt_month_lag_1              float16
 9   item_cnt_month_lag_2              float16
 10  item_cnt_month_lag_3              float16
 11  avg_cnt_by_month_lag_1            float16
 12  avg_cnt_by_month_item_lag_1       float16
 13  avg_cnt_by_month_shop_lag_1       float16
 14  avg_cnt_by_month_shop_item_lag_1  float16
 15  avg_cnt_by_month_shop_cat_lag_1   float16
 16  avg_cnt_by_month_shop_loc_lag_1   

In [19]:
%%time
train = df[df.date_block_num < 33]
validation = df[df.date_block_num == 33]
test = df[df.date_block_num == 34]

CPU times: user 1.42 s, sys: 298 ms, total: 1.72 s
Wall time: 1.73 s


In [20]:
%%time
train.to_csv('train_b4md.csv', index=False)

CPU times: user 2min 17s, sys: 2.67 s, total: 2min 20s
Wall time: 2min 21s


In [21]:
%%time
validation.to_csv('validation_b4md.csv', index=False)
test.to_csv('test_b4md.csv', index=False)

CPU times: user 5.92 s, sys: 118 ms, total: 6.03 s
Wall time: 6.07 s


In [17]:
%%time
df.to_csv('data_b4md.csv', index=False)

KeyboardInterrupt: 