In [20]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', 99)

In [2]:
# Load the inputs: the base frame that the sales are merged onto, the cleaned
# monthly sales, the test CSV, and the item / shop lookup tables.
# NOTE(review): `test` is not referenced again in the cells shown here.
base = pd.read_feather('./input/basestructuredf.ftr')
sales = pd.read_feather('./input/cleanedmonthly.ftr')
test = pd.read_csv('./input/test.csv')
items = pd.read_feather('./input/items_data.ftr')
shops = pd.read_feather('./input/shops_data.ftr')

In [3]:
sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1104644 entries, 0 to 1104643
Data columns (total 8 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   year                  1104644 non-null  int64  
 1   month                 1104644 non-null  int64  
 2   shop_id               1104644 non-null  int64  
 3   item_id               1104644 non-null  int64  
 4   date_block_num        1104644 non-null  int64  
 5   item_cnt_month        1104644 non-null  float64
 6   median_monthly_price  1104644 non-null  float64
 7   mean_monthly_price    1104644 non-null  float64
dtypes: float64(3), int64(5)
memory usage: 67.4 MB


In [4]:
shops.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   shop_name  60 non-null     object
 1   shop_id    60 non-null     int64 
 2   location   60 non-null     object
dtypes: int64(1), object(2)
memory usage: 1.5+ KB


In [5]:
items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22170 entries, 0 to 22169
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   item_id           22170 non-null  int64 
 1   item_category_id  22170 non-null  int64 
 2   cat_code          22170 non-null  object
 3   cat_subcode       22170 non-null  object
dtypes: int64(2), object(2)
memory usage: 692.9+ KB


In [6]:
# Drop shop_name in place so that only shop_id and location remain in the lookup.
shops.drop(columns='shop_name', inplace=True)

In [7]:
# Outer-join the base frame with the monthly sales on the shared key columns.
# Rows without a matching sales record get NaN in the count and price columns.
combined = pd.merge(base, sales, 'outer', on=['year', 'date_block_num','month','shop_id', 'item_id'])

In [8]:
combined.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18648420 entries, 0 to 18648419
Data columns (total 8 columns):
 #   Column                Dtype  
---  ------                -----  
 0   shop_id               int8   
 1   item_id               int16  
 2   date_block_num        int64  
 3   month                 int64  
 4   year                  int64  
 5   item_cnt_month        float64
 6   median_monthly_price  float64
 7   mean_monthly_price    float64
dtypes: float64(3), int16(1), int64(3), int8(1)
memory usage: 1.0 GB


In [9]:
combined.isnull().sum()

shop_id                        0
item_id                        0
date_block_num                 0
month                          0
year                           0
item_cnt_month          17543776
median_monthly_price    17543776
mean_monthly_price      17543776
dtype: int64

In [10]:
# Left-join the shop lookup (adds the shop columns, keyed on shop_id).
combined= combined.merge(shops, how='left', on='shop_id')

In [11]:
# Left-join the item lookup (adds item_category_id, cat_code and cat_subcode, keyed on item_id).
combined = combined.merge(items, how='left', on='item_id')

In [12]:
combined.isnull().sum()

shop_id                        0
item_id                        0
date_block_num                 0
month                          0
year                           0
item_cnt_month          17543776
median_monthly_price    17543776
mean_monthly_price      17543776
location                       0
item_category_id               0
cat_code                       0
cat_subcode                    0
dtype: int64

In [14]:
# Sort by shop, item and date_block_num: the group-wise forward fill and shift
# steps further down depend on the row order inside each (shop_id, item_id) group.
combined.sort_values(by=['shop_id', 'item_id', 'date_block_num'], inplace=True)

### Fill NAs for item_cnt_month and the mean and median monthly prices - prices are forward filled within each shop/item pair, then all remaining NAs are set to zero

In [15]:
def fillnavals_price_item_cnt(df):
    """Fill missing prices and counts in ``df`` in place.

    The two price columns (``median_monthly_price`` and ``mean_monthly_price``)
    are forward filled inside each ``(shop_id, item_id)`` group, so a missing
    price takes the value of the closest preceding row of the same group.
    Every value that is still missing afterwards, in any column, is set to 0.

    The forward fill follows row order, so ``df`` should already be sorted by
    ``date_block_num`` within each group.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``shop_id``, ``item_id`` and the two price columns.
        Modified in place.

    Returns
    -------
    None
    """
    group_keys = ['shop_id', 'item_id']
    for price_col in ('median_monthly_price', 'mean_monthly_price'):
        # ``groupby(...).ffill()`` replaces the deprecated
        # ``groupby(...).fillna(method='ffill')`` and gives the same result.
        df[price_col] = df.groupby(group_keys)[price_col].ffill()
    # Whatever is still NaN (prices before the first observation, missing counts) becomes 0.
    df.fillna(value=0, inplace=True)

In [16]:
fillnavals_price_item_cnt(combined)

In [18]:
combined.isnull().sum()

shop_id                 0
item_id                 0
date_block_num          0
month                   0
year                    0
item_cnt_month          0
median_monthly_price    0
mean_monthly_price      0
location                0
item_category_id        0
cat_code                0
cat_subcode             0
dtype: int64

In [19]:
combined.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18648420 entries, 0 to 18648419
Data columns (total 12 columns):
 #   Column                Dtype  
---  ------                -----  
 0   shop_id               int8   
 1   item_id               int16  
 2   date_block_num        int64  
 3   month                 int64  
 4   year                  int64  
 5   item_cnt_month        float64
 6   median_monthly_price  float64
 7   mean_monthly_price    float64
 8   location              object 
 9   item_category_id      int64  
 10  cat_code              object 
 11  cat_subcode           object 
dtypes: float64(3), int16(1), int64(4), int8(1), object(3)
memory usage: 1.6+ GB


### Add months since last sale - takes a lot of time!

In [20]:
def months_since_last_sale(df):
    """Add a ``months_since_sale`` column to ``df`` in place.

    For each row the value is ``date_block_num - (b + 1)``, where ``b`` is the
    ``date_block_num`` of the most recent *earlier* row of the same
    ``(shop_id, item_id)`` group that has ``item_cnt_month > 0``.  A sale in the
    immediately preceding block therefore gives 0.  If no earlier row of the
    group has a sale, the value is ``date_block_num`` itself.  The first row of
    every group is NaN because it has no earlier row.

    "Earlier" means earlier in row order, so ``df`` must already be sorted by
    ``date_block_num`` within each ``(shop_id, item_id)`` group.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``shop_id``, ``item_id``, ``date_block_num`` and
        ``item_cnt_month`` columns.  Modified in place.

    Returns
    -------
    None
    """
    group_keys = [df['shop_id'], df['item_id']]

    # Vectorised replacement for a row-wise ``df.apply``:
    # block number + 1 on rows with a sale, NaN on every other row.
    sale_marker = (df['date_block_num'] + 1).where(df['item_cnt_month'] > 0)

    # Carry the latest marker forward inside each group; 0 means "no sale seen so far".
    latest_marker = sale_marker.groupby(group_keys).ffill().fillna(0)

    # Shift by one row so that a sale in the current row does not count for itself.
    previous_marker = latest_marker.groupby(group_keys).shift(1)

    df['months_since_sale'] = df['date_block_num'] - previous_marker

In [21]:
months_since_last_sale(combined)

In [22]:
# Re-sort by shop, item and date_block_num; ignore_index=True also renumbers the index 0..n-1.
combined.sort_values(['shop_id', 'item_id', 'date_block_num'], inplace=True, ignore_index=True)

In [30]:
# combined.to_feather('./input/temp/cons_stage1.ftr')

In [37]:
# Reload the stage-1 checkpoint.
# NOTE(review): the matching to_feather call in the previous cell is commented out,
# so this read relies on a file written by an earlier run.
combined = pd.read_feather('./input/temp/cons_stage1.ftr')

In [38]:
combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18648420 entries, 0 to 18648419
Data columns (total 13 columns):
 #   Column                Dtype  
---  ------                -----  
 0   shop_id               int8   
 1   item_id               int16  
 2   date_block_num        int64  
 3   month                 int64  
 4   year                  int64  
 5   item_cnt_month        float64
 6   median_monthly_price  float64
 7   mean_monthly_price    float64
 8   location              object 
 9   item_category_id      int64  
 10  cat_code              object 
 11  cat_subcode           object 
 12  months_since_sale     float64
dtypes: float64(4), int16(1), int64(4), int8(1), object(3)
memory usage: 1.6+ GB


### Add Lagging Values for item_cnt_month, mean and median prices

In [39]:
def add_lagging_values(df, num_months, attributes=['item_cnt_month']):
    """Append lagged copies of each attribute to ``df`` in place.

    For every attribute ``att`` and every lag ``k`` in ``1..num_months`` a
    column named ``'{att} - {k}'`` is created.  It holds the value of ``att``
    found ``k`` rows earlier inside the same ``(shop_id, item_id)`` group
    (NaN when the group has no such row).
    """
    for att in attributes:
        # The grouping depends only on the key columns, so build it once per attribute.
        per_pair = df.groupby(['shop_id', 'item_id'])[att]
        for lag in range(1, num_months + 1):
            df[f'{att} - {lag}'] = per_pair.shift(lag)

def add_lagging_deltas(df, num_lag_cols, attributes=['item_cnt_month']):
    """Append differences between adjacent lag columns to ``df`` in place.

    For every attribute ``att`` and ``k`` in ``1..num_lag_cols-1`` the column
    ``'delta_{att}_{k}-{k+1}'`` is set to ``'{att} - {k}'`` minus
    ``'{att} - {k+1}'``.  The lag columns must already exist.
    """
    for att in attributes:
        for newer in range(1, num_lag_cols):
            older = newer + 1
            newer_lag = df[f'{att} - {newer}']
            older_lag = df[f'{att} - {older}']
            df[f'delta_{att}_{newer}-{older}'] = newer_lag - older_lag
            
def add_lagging_deltas_squared(df, num_lag_cols, attributes=['item_cnt_month']):
    """Append differences of adjacent delta columns to ``df`` in place.

    For every attribute ``att`` and ``k`` in ``2..num_lag_cols-1`` the column
    ``'delta_sq_{att}_{k-1}-{k+1}'`` is set to ``'delta_{att}_{k-1}-{k}'``
    minus ``'delta_{att}_{k}-{k+1}'``.  Despite the name nothing is squared:
    the result is a second-order difference.  The delta columns must already
    exist.
    """
    for att in attributes:
        for mid in range(2, num_lag_cols):
            lo, hi = mid - 1, mid + 1
            recent_delta = df[f'delta_{att}_{lo}-{mid}']
            earlier_delta = df[f'delta_{att}_{mid}-{hi}']
            df[f'delta_sq_{att}_{lo}-{hi}'] = recent_delta - earlier_delta

#### For item count add lagging for 6 months, deltas and delta squared

In [40]:
# Lags 1-6 of item_cnt_month, then the differences between adjacent lags,
# then the differences between adjacent differences. Order matters: each
# call reads the columns created by the one before it.
add_lagging_values(combined, num_months=6, attributes=['item_cnt_month'])
add_lagging_deltas(combined, num_lag_cols=6, attributes=['item_cnt_month'])
add_lagging_deltas_squared(combined, num_lag_cols=6, attributes=['item_cnt_month'])

In [21]:
# combined.info()

#### For item prices add only a one-month lag; longer lags are not important, as confirmed by a random forest

In [42]:
# One-row lag of both price columns ('mean_monthly_price - 1', 'median_monthly_price - 1').
add_lagging_values(combined, num_months=1, attributes=['mean_monthly_price', 'median_monthly_price'])

In [22]:
# combined.info()

In [44]:
combined.sort_values(['shop_id', 'item_id', 'date_block_num'], inplace=True, ignore_index=True)

### Reduce the data size of the frame

In [45]:
def reduce_data_size(df):
    """Return ``df`` with its numeric columns cast to ``float32``.

    A column counts as numeric when its dtype is one of int8/16/32/64 or
    float16/32/64.  All other columns are passed through unchanged.  Integer
    columns become floats as well.
    """
    numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    dtype_of = df.dtypes
    # numpy dtypes compare equal to their string names, so ``in`` works on the list above.
    new_dtypes = {name: 'float32' for name in df.columns if dtype_of[name] in numerics}
    return df.astype(new_dtypes, copy=False)

In [46]:
# Cast every numeric column of combined to float32 (this includes the integer id columns).
combined = reduce_data_size(combined)

In [23]:
# combined.info()

In [95]:
# combined.to_feather('input/temp/cons_stage2.ftr')

In [116]:
# Reload the stage-2 checkpoint.
# NOTE(review): the matching to_feather call in the previous cell is commented out,
# so this read relies on a file written by an earlier run.
combined = pd.read_feather('input/temp/cons_stage2.ftr')

In [24]:
# combined.info()

### Add Mean encoded features

#### Add Mean of lagging item count per month for 3 months and six months


#### Add delta of item count per month with these average values

In [118]:
def add_avg_lagging(df, attributes=['item_cnt_month'], num_lagging_months=6):
    """Append row-wise means over the most recent lag columns to ``df`` in place.

    For every attribute ``att`` and window ``w`` in ``2..num_lagging_months``
    the column ``'avg_{w}_{att}'`` is the mean of ``'{att} - 1'`` up to
    ``'{att} - {w}'``.  The lag columns must already exist.
    """
    for att in attributes:
        for window in range(2, num_lagging_months + 1):
            lag_names = [f'{att} - {lag}' for lag in range(1, window + 1)]
            df[f'avg_{window}_{att}'] = df[lag_names].mean(axis=1)
            
def add_avg_trend_lagging(df, attributes=['item_cnt_month'], num_lagging_months=6):
    """Append trend columns that compare shorter and longer lag averages, in place.

    For every attribute ``att``:

    * ``'trend_{att}_avg2'`` is ``'{att} - 1'`` minus ``'avg_2_{att}'``;
    * for ``k`` in ``2..num_lagging_months-1``, ``'trend_avg{k}_avg{k+1}'`` is
      ``'avg_{k}_{att}'`` minus ``'avg_{k+1}_{att}'``.

    The lag and average columns must already exist (see ``add_lagging_values``
    and ``add_avg_lagging``).

    NOTE(review): the ``trend_avg{k}_avg{k+1}`` names do not contain the
    attribute, so with more than one attribute later ones overwrite earlier
    ones.  The names are left unchanged here to stay compatible.
    """
    for att in attributes:
        df[f'trend_{att}_avg2'] = df[f'{att} - 1'] - df[f'avg_2_{att}']
        for i in range(2, num_lagging_months):
            # Compare the k-month average with the (k+1)-month average.
            # Subtracting avg_{i} from itself produced columns of zeros.
            df[f'trend_avg{i}_avg{i+1}'] = df[f'avg_{i}_{att}'] - df[f'avg_{i+1}_{att}']

In [119]:
# With the defaults this adds avg_2_item_cnt_month ... avg_6_item_cnt_month.
add_avg_lagging(combined)

In [120]:
# add_avg_trend_lagging(combined)

In [123]:
# combined.to_feather('./input/temp/cons_stage3.ftr')

In [129]:
# Reload the stage-3 checkpoint.
# NOTE(review): the matching to_feather call in the previous cell is commented out,
# so this read relies on a file written by an earlier run.
combined = pd.read_feather('./input/temp/cons_stage3.ftr')

### Add mean encoded features based on categorical columns: item_id, shop_id, location, item_category_id, cat_code, cat_subcode

In [101]:
# Categorical columns; each one gets its own lagged mean-encoding feature.
groupby_first_cols = ['item_id', 'shop_id', 'location', 'item_category_id', 'cat_code', 'cat_subcode']

In [102]:
# Second grouping key: means are taken per (category value, date_block_num).
groupby_second_cols = ['date_block_num']

In [133]:
def add_groupby_means_lagging_1(df, first_col, second_col, mean_col):
    """Return ``df`` extended with lagged group means of ``mean_col``.

    For every pair ``(f, s)`` with ``f`` from ``first_col`` and ``s`` from
    ``second_col`` the mean of ``mean_col`` is computed per ``(f, s)`` group.
    Within each value of ``f`` the means are then shifted down by one row of
    the aggregated table, so each ``(f, s)`` combination receives the mean of
    the preceding ``s`` present for that ``f``.  The result is left-joined back
    as column ``'lag_avg_{f}_{s}'``.

    ``first_col`` and ``second_col`` are iterables of column names.  A new
    frame is returned; the caller has to rebind it.
    """
    for time_key in second_col:
        for cat_key in first_col:
            feature_name = f'lag_avg_{cat_key}_{time_key}'
            # One row per (category value, period), sorted by the group keys.
            group_means = df.groupby([cat_key, time_key])[mean_col].mean().reset_index()
            # Previous period's mean within the same category value.
            group_means[feature_name] = group_means.groupby([cat_key])[mean_col].shift(1)
            group_means = group_means.drop(columns=mean_col)
            df = df.merge(group_means, 'left', on=[cat_key, time_key])
    return df

In [134]:
# Adds one 'lag_avg_<col>_date_block_num' column per entry of groupby_first_cols.
combined = add_groupby_means_lagging_1(combined, groupby_first_cols, groupby_second_cols, 'item_cnt_month')

In [135]:
combined.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18648420 entries, 0 to 18648419
Data columns (total 41 columns):
 #   Column                                   Dtype  
---  ------                                   -----  
 0   shop_id                                  float32
 1   item_id                                  float32
 2   date_block_num                           float32
 3   month                                    float32
 4   year                                     float32
 5   item_cnt_month                           float32
 6   median_monthly_price                     float32
 7   mean_monthly_price                       float32
 8   location                                 object 
 9   item_category_id                         float32
 10  cat_code                                 object 
 11  cat_subcode                              object 
 12  months_since_sale                        float32
 13  item_cnt_month - 1                       float32
 14  item_cnt_month -

### Add mean encoded features based on categorical columns - item_category_id, cat_code, cat_subcode - grouped by shop_id (or location) and date_block_num

In [139]:
def add_multiple_groupby_means_lagging_1(df, first_col, cols, mean_col):
    """Return ``df`` extended with lagged means over two categorical keys.

    For every ``col`` in ``cols`` the mean of ``mean_col`` is computed per
    ``(first_col, col, date_block_num)`` group.  Within each
    ``(first_col, col)`` pair the means are shifted down by one row of the
    aggregated table, and the result is left-joined back as column
    ``'lag_avg_{first_col}_{col}_date'``.

    A new frame is returned; the caller has to rebind it.
    """
    for col in cols:
        pair_keys = [first_col, col]
        full_keys = pair_keys + ['date_block_num']
        lagged = df.groupby(full_keys)[mean_col].mean().reset_index()
        # Replace each mean with the one from the preceding row of the same pair.
        lagged[mean_col] = lagged.groupby(pair_keys)[mean_col].shift(1)
        lagged = lagged.rename(columns={mean_col: f'lag_avg_{first_col}_{col}_date'})
        df = df.merge(lagged, 'left', on=full_keys)
    return df

In [140]:
# Category columns that are combined with shop_id / location in the next cells.
cols = ['item_category_id', 'cat_code', 'cat_subcode']

In [141]:
# Lagged means per (shop_id, category column, date_block_num): adds 'lag_avg_shop_id_<col>_date'.
combined = add_multiple_groupby_means_lagging_1(combined, 'shop_id', cols, 'item_cnt_month')

In [143]:
# Same features keyed on location instead of shop_id: adds 'lag_avg_location_<col>_date'.
combined = add_multiple_groupby_means_lagging_1(combined, 'location', cols, 'item_cnt_month')

In [144]:
combined.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18648420 entries, 0 to 18648419
Data columns (total 47 columns):
 #   Column                                   Dtype  
---  ------                                   -----  
 0   shop_id                                  float32
 1   item_id                                  float32
 2   date_block_num                           float32
 3   month                                    float32
 4   year                                     float32
 5   item_cnt_month                           float32
 6   median_monthly_price                     float32
 7   mean_monthly_price                       float32
 8   location                                 object 
 9   item_category_id                         float32
 10  cat_code                                 object 
 11  cat_subcode                              object 
 12  months_since_sale                        float32
 13  item_cnt_month - 1                       float32
 14  item_cnt_month -

In [146]:
# combined.to_feather('./input/temp/cons_stage4.ftr')

In [117]:
# Reload the stage-4 checkpoint.
# NOTE(review): the matching to_feather call in the previous cell is commented out,
# so this read relies on a file written by an earlier run.
combined = pd.read_feather('./input/temp/cons_stage4.ftr')

## Create Dataframe to extract embeddings from shop_id, item_id, date_block_num

In [158]:
# Independent copy of the rows with date_block_num < 34, restricted to the
# three id columns and the target.
embeds = combined.loc[combined.date_block_num<34, ['shop_id', 'date_block_num', 'item_id', 'item_cnt_month']].copy()

In [159]:
embeds

Unnamed: 0,shop_id,date_block_num,item_id,item_cnt_month
0,2.0,0.0,1.0,0.0
1,2.0,1.0,1.0,0.0
2,2.0,2.0,1.0,0.0
3,2.0,3.0,1.0,0.0
4,2.0,4.0,1.0,0.0
...,...,...,...,...
18648414,59.0,29.0,22167.0,0.0
18648415,59.0,30.0,22167.0,0.0
18648416,59.0,31.0,22167.0,0.0
18648417,59.0,32.0,22167.0,0.0


### Save Training data

In [147]:
# Keep only rows with date_block_num > 5 (independent copy).
# NOTE(review): presumably the cut-off matches the six lag columns, which
# cannot all be populated for the first six blocks - confirm.
train_data = combined.loc[combined.date_block_num>5].copy()

In [148]:
train_data.isnull().sum()

shop_id                                    0
item_id                                    0
date_block_num                             0
month                                      0
year                                       0
item_cnt_month                             0
median_monthly_price                       0
mean_monthly_price                         0
location                                   0
item_category_id                           0
cat_code                                   0
cat_subcode                                0
months_since_sale                          0
item_cnt_month - 1                         0
item_cnt_month - 2                         0
item_cnt_month - 3                         0
item_cnt_month - 4                         0
item_cnt_month - 5                         0
item_cnt_month - 6                         0
delta_item_cnt_month_1-2                   0
delta_item_cnt_month_2-3                   0
delta_item_cnt_month_3-4                   0
delta_item

In [149]:
# Sort by shop, item and date_block_num; ignore_index=True renumbers the index 0..n-1 after the row filter.
train_data.sort_values(['shop_id', 'item_id', 'date_block_num'], inplace=True, ignore_index=True)

In [150]:
# Drop the same-row price columns; their one-row lags ('... - 1') stay in the frame.
# NOTE(review): presumably because the current month's price is not known at prediction time - confirm.
train_data.drop(columns=['median_monthly_price', 'mean_monthly_price'], inplace=True)

In [151]:
# Write the final training frame to disk in feather format.
train_data.to_feather('./input/train_data.ftr')

In [152]:
len(train_data.columns)

45