In [1]:
import numpy as np
import pandas as pd
import gc

TRAIN_PATH = '/kaggle/input/competitive-data-science-predict-future-sales/' 
items = pd.read_csv(TRAIN_PATH+'items.csv')
categories = pd.read_csv(TRAIN_PATH+'item_categories.csv')
shops = pd.read_csv(TRAIN_PATH+'shops.csv')
sales = pd.read_csv(TRAIN_PATH+'sales_train.csv')
test_df = pd.read_csv(TRAIN_PATH+'/test.csv')

Create the same features as in EDA and drop outliers

In [2]:
# yakutsk = shops['shop_name'].str.contains('!Якутск')
# shops.loc[yakutsk, 'shop_name'] = shops.loc[yakutsk, 'shop_name'].str.slice(start=1)

shops['city'] = shops['shop_name'].str.split(n=1, expand=True)[0]
other = shops['city'].isin(['Цифровой', 'Интернет-магазин', 'Выездная'])
shops.loc[other, 'city'] = 'Other'
shops['city_id'] = shops['city'].factorize()[0]

categories['primary_category'] = categories['item_category_name'].str.split(' -', n=1, expand=True)[0]
categories.loc[categories['primary_category'].str.startswith('Чистые носители'), 'primary_category'] = 'Чистые носители'
categories.loc[categories['primary_category'].str.startswith('Карты оплаты'), 'primary_category'] = 'Карты оплаты'
categories['primary_category_id'] = categories['primary_category'].factorize()[0]

sales['date'] = pd.to_datetime(sales['date'], format='%d.%m.%Y')
sales['revenue'] = sales['item_price'] * sales['item_cnt_day']
sales.loc[sales.shop_id==11, 'shop_id'] = 10
sales.loc[sales.shop_id==0, 'shop_id'] = 57
sales.loc[sales.shop_id==1, 'shop_id'] = 58
sales.drop(index=sales[sales['item_price'] >= 40000].index, inplace=True)
sales.drop(index=sales[sales['item_price'] < 0].index, inplace=True)
sales.drop(index=sales[sales['item_cnt_day'] >= 900].index, inplace=True)

sales = (sales.merge(shops, on='shop_id')
              .merge(items, on='item_id')
              .merge(categories, on='item_category_id')
              )

test_df['date_block_num'] = 34
test_df = (test_df.merge(items[['item_id', 'item_category_id']], on='item_id')
                  .merge(categories[['item_category_id', 'primary_category_id']], on='item_category_id')
                  .merge(shops[['shop_id', 'city_id']], on='shop_id')
                  )

shops.to_csv('shops.csv', index=False)
categories.to_csv('item_categories.csv', index=False)

In [3]:
def downcast_dtypes(df, copy=False):
    '''
    Changes column types in the dataframe:             
    `float64` type to `float32`
    `int64`   type to `int32`
    '''
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols = [c for c in df if df[c].dtype == "int64" and c != 'ID']
    df[float_cols] = df[float_cols].astype(np.float32, copy=copy)
    df[int_cols] = df[int_cols].astype(np.uint16, copy=copy)
    
    return df

Create a new dataset with a cartesian product of item_ids and shop_ids of every month starting from 12

In [4]:
%%time
import itertools

pair_cols = ['date_block_num', 'shop_id', 'item_id']

monthly_sets = sales[sales.date_block_num >= 12].groupby('date_block_num')[['shop_id', 'item_id']].agg(set).reset_index()
lists = monthly_sets.apply(lambda x: itertools.product([x.date_block_num], x.shop_id, x.item_id), axis=1).to_list()
new_pairs = pd.DataFrame(itertools.chain(*lists), columns=pair_cols, dtype=np.int32)
new_pairs = (new_pairs.merge(items[['item_id', 'item_category_id']], on='item_id')
                      .merge(categories[['item_category_id', 'primary_category_id']], on='item_category_id')
                      .merge(shops[['shop_id', 'city_id']], on='shop_id')
                      )
new_pairs = downcast_dtypes(new_pairs)

gc.collect()

CPU times: user 7.29 s, sys: 1.31 s, total: 8.6 s
Wall time: 8.6 s


0

In [5]:
from functools import reduce


def get_lags(df, lags, index_cols=None, downcast=True):
    if index_cols is None:
        index_cols = list(df.index.names)
        df = df.reset_index()
    
    lag_stats = [] 
    for lag in lags:
        lag_df = df.copy()
        lag_df['date_block_num'] = lag_df['date_block_num'] + lag
        lag_df.columns = [f'lag_{lag}_{col}' if col not in index_cols else col for col in lag_df.columns]
        lag_stats.append(lag_df)

    df_merged = reduce(lambda  left, right: pd.merge(left, right, on=index_cols), lag_stats)
    if downcast:
        df_merged = downcast_dtypes(df_merged)
    return df_merged


def create_stats(group_by_cols, agg_dict, rename_dict, df=sales, lags=[1, 2, 3, 12]):
    monthly_stats = (df.groupby(group_by_cols)
                       .agg(agg_dict)
                       .rename(columns=rename_dict, level=0)
                       )
    monthly_stats.columns = ['_'.join(x) for x in monthly_stats.columns.to_flat_index()]
    monthly_stats = get_lags(monthly_stats, lags)
    
    return monthly_stats, group_by_cols


def get_avg_sales(gb1, gb2, col_name, lags=[1, 2, 3, 12]):
    monthly_sales = sales.groupby(gb1)['item_cnt_day'].agg(item_cnt_month=sum)
    return create_stats(gb2, {'item_cnt_month': ['mean']}, {'item_cnt_month': col_name}, monthly_sales, lags)


def get_last_time_func(group_by_cols):
    sales_months = sales.groupby(group_by_cols)['date_block_num'].unique()
    
    def time_from_last_sale(row):
        key = tuple(row[c] for c in group_by_cols)
        if len(key) == 1:
            key = key[0]
        cur_date = row['date_block_num']
        if key not in sales_months:
            return np.nan
        m = sales_months[key]
        m = m[np.nonzero(m < cur_date)]
        if len(m) == 0:
            return np.nan
        return cur_date - m.max()
    
    return time_from_last_sale, sales_months

Create features by monthly aggregation on item-shop sales, and then taking their and averages by different categorical columns (shop, city, item, category, primary category, shop+category) lagged values.

Additional features are time from the last item-shop and item sale and the month number.

In [7]:
target = sales.groupby(['date_block_num', 'shop_id', 'item_id'])['item_cnt_day'].agg(item_cnt_month='sum')

In [8]:
%%time
lags = [1, 2, 3, 12]

features = [
    create_stats(['date_block_num', 'shop_id', 'item_id'], {'item_cnt_day': ['sum']}, {'item_cnt_day': 'target'}),
    create_stats(['date_block_num', 'item_id'], {'item_price': ['mean', 'max', 'min', 'median']}, {'item_price': 'item_price'}),
    get_avg_sales(['date_block_num', 'shop_id', 'item_id'], ['date_block_num', 'item_id'], 'item_sales'),
    get_avg_sales(['date_block_num', 'shop_id', 'item_id'], ['date_block_num', 'shop_id'], 'shop_sales'),
    get_avg_sales(['date_block_num', 'shop_id', 'city_id', 'item_id'], ['date_block_num', 'city_id'], 'city_sales'),
    get_avg_sales(['date_block_num', 'shop_id', 'item_id', 'item_category_id'], ['date_block_num', 'item_category_id'], 'cat_sales'),
    get_avg_sales(['date_block_num', 'shop_id', 'item_id', 'primary_category_id'], ['date_block_num', 'primary_category_id'], 'primarycat_sales'),
    get_avg_sales(['date_block_num', 'shop_id', 'item_id', 'item_category_id'], ['date_block_num', 'shop_id', 'item_category_id'], 'shop_cat_sales'),
    get_avg_sales(['date_block_num', 'shop_id', 'item_id', 'primary_category_id'], ['date_block_num', 'shop_id', 'primary_category_id'], 'shop_primarycat_sales'),
]

gc.collect()

CPU times: user 10.3 s, sys: 875 ms, total: 11.2 s
Wall time: 11.2 s


0

Merge features to the created pairs

In [10]:
%%time
index_cols = len(new_pairs.columns)

for stats, cols in features:
    new_pairs = new_pairs.merge(stats, on=cols, how='left')
    test_df = test_df.merge(stats, on=cols, how='left')
    del stats

new_pairs['month_num'] = (new_pairs['date_block_num'] % 12).astype(np.uint8)
test_df['month_num'] = 34%12

time_from_last_shop_item_sale, sales_months = get_last_time_func(['item_id', 'shop_id'])
new_pairs['time_from_last_shop_item_sale'] = new_pairs.apply(time_from_last_shop_item_sale, axis=1).fillna(999).astype(np.uint8)
test_df['time_from_last_shop_item_sale'] = test_df.apply(time_from_last_shop_item_sale, axis=1).fillna(999).astype(np.uint8)
del sales_months

time_from_last_item_sale, sales_months = get_last_time_func(['item_id'])
new_pairs['time_from_last_item_sale'] = new_pairs.apply(time_from_last_item_sale, axis=1).fillna(999).astype(np.uint8)
test_df['time_from_last_item_sale'] = test_df.apply(time_from_last_item_sale, axis=1).fillna(999).astype(np.uint8)
del sales_months

new_pairs = new_pairs.merge(target, on=['date_block_num', 'shop_id', 'item_id'], how='left')
new_pairs.sort_values('date_block_num', inplace=True)

del features
gc.collect()

CPU times: user 10min 33s, sys: 9.05 s, total: 10min 42s
Wall time: 10min 41s


0

In [11]:
new_pairs.time_from_last_shop_item_sale.describe()

count    6.424946e+06
mean     1.109228e+02
std      1.126534e+02
min      1.000000e+00
25%      3.000000e+00
50%      1.500000e+01
75%      2.310000e+02
max      2.310000e+02
Name: time_from_last_shop_item_sale, dtype: float64

Fill NaN's and downcast types

In [13]:
new_pairs.fillna(0, inplace=True)
new_pairs = downcast_dtypes(new_pairs)
new_pairs.reset_index(drop=True, inplace=True)  # to save current order

gc.collect()

0

In [14]:
new_pairs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6424946 entries, 0 to 6424945
Data columns (total 58 columns):
 #   Column                             Dtype  
---  ------                             -----  
 0   date_block_num                     int32  
 1   shop_id                            int32  
 2   item_id                            int32  
 3   item_category_id                   uint16 
 4   primary_category_id                uint16 
 5   city_id                            uint16 
 6   lag_1_target_sum                   float32
 7   lag_2_target_sum                   float32
 8   lag_3_target_sum                   float32
 9   lag_12_target_sum                  float32
 10  lag_1_item_price_mean              float32
 11  lag_1_item_price_max               float32
 12  lag_1_item_price_min               float32
 13  lag_1_item_price_median            float32
 14  lag_2_item_price_mean              float32
 15  lag_2_item_price_max               float32
 16  lag_2_item_price_m

In [16]:
new_pairs

Unnamed: 0,date_block_num,shop_id,item_id,item_category_id,primary_category_id,city_id,lag_1_target_sum,lag_2_target_sum,lag_3_target_sum,lag_12_target_sum,...,lag_3_shop_cat_sales_mean,lag_12_shop_cat_sales_mean,lag_1_shop_primarycat_sales_mean,lag_2_shop_primarycat_sales_mean,lag_3_shop_primarycat_sales_mean,lag_12_shop_primarycat_sales_mean,month_num,time_from_last_shop_item_sale,time_from_last_item_sale,item_cnt_month
0,12,2,27,19,5,1,0.0,0.0,0.0,0.0,...,2.616438,1.900000,2.946809,2.341667,2.565574,1.777778,0,12,1,0.0
1,12,42,18824,40,10,19,0.0,0.0,0.0,0.0,...,2.332278,1.983838,2.222065,2.092058,2.134831,1.833557,0,231,1,0.0
2,12,42,18825,40,10,19,0.0,0.0,0.0,0.0,...,2.332278,1.983838,2.222065,2.092058,2.134831,1.833557,0,2,1,1.0
3,12,6,2628,55,12,5,0.0,0.0,0.0,0.0,...,1.460208,1.289286,1.453925,1.381517,1.358537,1.237647,0,9,1,0.0
4,12,42,18826,40,10,19,0.0,0.0,0.0,0.0,...,2.332278,1.983838,2.222065,2.092058,2.134831,1.833557,0,5,1,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6424941,33,26,13217,47,11,13,0.0,0.0,0.0,0.0,...,0.000000,0.000000,1.593750,1.657143,1.833333,1.285714,9,2,1,0.0
6424942,33,26,13240,47,11,13,0.0,0.0,0.0,0.0,...,0.000000,0.000000,1.593750,1.657143,1.833333,1.285714,9,3,1,0.0
6424943,33,26,13252,47,11,13,0.0,0.0,0.0,0.0,...,0.000000,0.000000,1.593750,1.657143,1.833333,1.285714,9,3,1,0.0
6424944,33,58,564,78,14,28,0.0,0.0,0.0,0.0,...,0.000000,0.000000,2.181818,2.000000,1.857143,2.095238,9,231,1,0.0


Parquet is probably the best option to save dataframes in terms of file size, IO speed and (maybe) the RAM consumption. But, unfortunately, it doesn't support float16.

In [17]:
result_cols = list(new_pairs.columns[1:-1])  # all the cols except date_block_num and target
target_col = [new_pairs.columns[-1]]
new_pairs[['date_block_num']].to_parquet('date_block_num.parquet')
new_pairs[result_cols].to_parquet('X_train.parquet')
new_pairs[target_col].to_parquet('y_train.parquet')

gc.collect()

0

Same for the test dataset

In [20]:
test_df.fillna(0, inplace=True)
test_df = downcast_dtypes(test_df)

test_df = test_df.set_index('ID').sort_index()  # it's important to save test set in the correct order 

In [21]:
test_df

Unnamed: 0_level_0,shop_id,item_id,date_block_num,item_category_id,primary_category_id,city_id,lag_1_target_sum,lag_2_target_sum,lag_3_target_sum,lag_12_target_sum,...,lag_2_shop_cat_sales_mean,lag_3_shop_cat_sales_mean,lag_12_shop_cat_sales_mean,lag_1_shop_primarycat_sales_mean,lag_2_shop_primarycat_sales_mean,lag_3_shop_primarycat_sales_mean,lag_12_shop_primarycat_sales_mean,month_num,time_from_last_shop_item_sale,time_from_last_item_sale
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,5,5037,34,19,5,4,0.0,0.0,0.0,0.0,...,2.244898,2.537037,1.878049,1.931818,1.968553,2.107785,2.507246,10,2,1
1,5,5320,34,55,12,4,0.0,0.0,0.0,0.0,...,1.120000,1.183333,1.178571,1.182796,1.139241,1.180328,1.176991,10,231,231
2,5,5233,34,19,5,4,0.0,0.0,0.0,0.0,...,2.244898,2.537037,1.878049,1.931818,1.968553,2.107785,2.507246,10,1,1
3,5,5232,34,23,5,4,0.0,0.0,0.0,0.0,...,1.852941,1.809524,1.717391,1.931818,1.968553,2.107785,2.507246,10,3,1
4,5,5268,34,20,5,4,0.0,0.0,0.0,0.0,...,2.500000,2.425000,5.925926,1.931818,1.968553,2.107785,2.507246,10,231,231
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
214195,45,18454,34,55,12,20,0.0,0.0,0.0,0.0,...,1.116279,1.166667,1.347826,1.189873,1.222222,1.161290,1.401869,10,1,1
214196,45,16188,34,64,13,20,0.0,0.0,0.0,0.0,...,1.000000,1.157895,1.230769,1.564706,1.465753,1.254717,2.302083,10,231,1
214197,45,15757,34,55,12,20,0.0,0.0,0.0,0.0,...,1.116279,1.166667,1.347826,1.189873,1.222222,1.161290,1.401869,10,9,1
214198,45,19648,34,40,10,20,0.0,0.0,0.0,0.0,...,1.208333,1.309859,1.391304,1.209524,1.185185,1.238461,1.303797,10,231,1


In [22]:
test_df[result_cols].to_parquet('X_test.parquet')