## import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_profiling as pp
from itertools import product

# Lift pandas' row/column display truncation so wide feature frames print in full.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

## read data

In [2]:
# Lookup tables; the *_b4fe files are presumably outputs of an earlier
# preprocessing notebook -- TODO confirm their provenance.
item_cat = pd.read_csv(
    'item_categories_b4fe.csv',
    usecols=['item_category_id','category0_le','category1_le'],
)
shops = pd.read_csv(
    'shops_b4fe.csv',
    usecols=['shop_id','location_le'],
)

# Prediction rows and the submission template.
test = pd.read_csv('test.csv')
sample_submission = pd.read_csv('sample_submission.csv')

# Item -> category mapping and the sales records used for aggregation below.
items = pd.read_csv(
    'items.csv',
    usecols=['item_id','item_category_id'],
)
sales_train = pd.read_csv('sales_train_b4fe.csv')

### create a dataframe of all possible combinations of shop_id and item_id and their monthly sales

In [3]:
# Key columns of the grid, in the order the product below emits them.
index_cols = ['shop_id', 'item_id', 'date_block_num']

grid = []
month_of_sale = sales_train['date_block_num']

for month in month_of_sale.unique():
    # Shops and items that appear in this month's sales rows.
    sold_this_month = sales_train.loc[month_of_sale == month, ['shop_id', 'item_id']]
    month_shop_ids = sold_this_month['shop_id'].unique()
    month_item_ids = sold_this_month['item_id'].unique()

    # Every (shop, item) pairing for the month, whether or not it actually sold.
    month_pairs = list(product(month_shop_ids, month_item_ids, [month]))
    grid.append(np.array(month_pairs))

# Stack the per-month arrays into one dataframe
grid_df = pd.DataFrame(np.vstack(grid), columns=index_cols)

grid_df.head()

Unnamed: 0,shop_id,item_id,date_block_num
0,59,22154,0
1,59,2552,0
2,59,2554,0
3,59,2555,0
4,59,2564,0


In [4]:
# Monthly unit totals per (shop, item, month): net, purchases only, returns only.
count_keys = ['shop_id','item_id','date_block_num']
is_purchase = sales_train['item_cnt_day'] > 0
is_return = sales_train['item_cnt_day'] < 0

item_cnt = (
    sales_train.groupby(count_keys)['item_cnt_day']
               .sum()
               .to_frame(name='item_cnt_month')
)
item_cnt_pos = (
    sales_train[is_purchase].groupby(count_keys)['item_cnt_day']
                            .sum()
                            .to_frame(name='item_cnt_month_buy')
)
item_cnt_neg = (
    sales_train[is_return].groupby(count_keys)['item_cnt_day']
                          .sum()
                          .to_frame(name='item_cnt_month_return')
)

# Align the three aggregates side by side on the shared (shop, item, month) index.
item_cnt_group = pd.concat([item_cnt, item_cnt_pos, item_cnt_neg], axis=1)

item_cnt_group.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,item_cnt_month,item_cnt_month_buy,item_cnt_month_return
shop_id,item_id,date_block_num,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2,27,0,1.0,1.0,
2,27,17,1.0,1.0,
2,30,2,1.0,1.0,
2,30,5,1.0,1.0,
2,30,15,1.0,1.0,


In [5]:
# Per-row money amount: units times unit price (negative for rows with negative counts).
sales_train['profit'] = sales_train['item_price'] * sales_train['item_cnt_day']

money_keys = ['shop_id','item_id','date_block_num']
positive_rows = sales_train['item_cnt_day'] > 0
negative_rows = sales_train['item_cnt_day'] < 0

# Net amount over all rows, then the positive-count and negative-count parts separately.
item_profit = sales_train.groupby(money_keys)['profit'].sum().to_frame(name='profit')
item_revenue = (
    sales_train[positive_rows].groupby(money_keys)['profit'].sum().to_frame(name='revenue')
)
item_loss = (
    sales_train[negative_rows].groupby(money_keys)['profit'].sum().to_frame(name='loss')
)

item_price_group = pd.concat([item_profit, item_revenue, item_loss], axis=1)

In [6]:
# Attach the monthly count and money totals to the full grid; grid rows with
# no matching sales get 0 instead of NaN.
monthly_sales = grid_df
for monthly_total in (item_cnt, item_profit):
    monthly_sales = monthly_sales.merge(
        monthly_total.reset_index(),
        how='left',
        on=['shop_id','item_id','date_block_num'],
    ).fillna(0)

monthly_sales.head()

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,profit
0,59,22154,0,1.0,999.0
1,59,2552,0,0.0,0.0
2,59,2554,0,0.0,0.0
3,59,2555,0,0.0,0.0
4,59,2564,0,0.0,0.0


In [8]:
# Spot-check a single grid cell: shop 2, item 32, month 0.
spot_check = (
    (monthly_sales['shop_id'] == 2)
    & (monthly_sales['item_id'] == 32)
    & (monthly_sales['date_block_num'] == 0)
)
monthly_sales[spot_check]

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,profit
114602,2,32,0,0.0,0.0


### concat training set and testing set

In [9]:
# Tag the prediction rows as month 34 so they can be stacked under the
# training grid and receive lag features like any other month.
test_ = test.drop(columns='ID').assign(date_block_num=34)

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported equivalent. Columns missing from test_
# (the targets) come out as NaN for the month-34 rows.
data = (
    pd.concat([monthly_sales, test_], sort=False)
      .sort_values(by=['date_block_num','shop_id','item_id'])
      .reset_index(drop=True)
)
data.head()

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,profit
0,2,19,0,0.0,0.0
1,2,27,0,1.0,2499.0
2,2,28,0,0.0,0.0
3,2,29,0,0.0,0.0
4,2,32,0,0.0,0.0


In [11]:
# Row/column count of the stacked train + test frame.
data.shape

(11056323, 9)

In [10]:
# After sorting by date_block_num, the last rows are the month-34 (test) rows.
data.tail()

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,item_cnt_month_buy,item_cnt_month_return,profit,revenue,loss
11056318,59,22162,34,,,,,,
11056319,59,22163,34,,,,,,
11056320,59,22164,34,,,,,,
11056321,59,22166,34,,,,,,
11056322,59,22167,34,,,,,,


### merge all datasets

In [10]:
# Left-join each lookup table in turn; item_category_id must arrive (from
# `items`) before `item_cat` can be joined on it.
lookup_joins = [
    (items, 'item_id'),
    (item_cat, 'item_category_id'),
    (shops, 'shop_id'),
]
for lookup_table, join_key in lookup_joins:
    data = data.merge(lookup_table, on=join_key, how='left')

data.head()

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,profit,item_category_id,category0_le,category1_le,location_le
0,2,19,0,0.0,0.0,40,10,20,0
1,2,27,0,1.0,2499.0,19,7,43,0
2,2,28,0,0.0,0.0,30,13,52,0
3,2,29,0,0.0,0.0,23,7,57,0
4,2,32,0,0.0,0.0,40,10,20,0


In [13]:
# Row/column count after joining the item, category and shop attributes.
data.shape

(11056323, 13)

### target mean encoding

In [11]:
def add_sum_encoding(group_features, new_name, on_feature='item_cnt_month', df=None):
    """Compute the per-group mean of ``on_feature`` (a target *mean* encoding).

    Despite the historical name, this aggregates with ``mean`` and not ``sum``.
    The name is kept so existing callers continue to work.

    Parameters
    ----------
    group_features : list of str
        Columns to group by.
    new_name : str
        Name given to the aggregated column in the result.
    on_feature : str, default 'item_cnt_month'
        Column whose mean is computed. NaN values are skipped by pandas' mean.
    df : pandas.DataFrame, optional
        Frame to aggregate. When omitted, the notebook-level ``data`` frame is
        used, which matches the original behaviour.

    Returns
    -------
    pandas.DataFrame
        One row per group, with ``group_features`` as ordinary columns plus
        ``new_name``. Nothing is merged back; the caller does that.
    """
    frame = data if df is None else df
    return (
        frame.groupby(group_features)[on_feature]
             .mean()
             .to_frame(name=new_name)
             .reset_index()
    )

In [12]:
%%time
# New column name -> grouping columns for each mean encoding of the target.
encoding_dict = {
    'target_month': ['date_block_num'],  # month
    'target_month_item': ['date_block_num','item_id'],  # month_item
    'target_month_shop': ['date_block_num','shop_id'],  # month_shop
    'target_month_category': ['date_block_num','item_category_id'],  # month_item_category
    'target_month_category0': ['date_block_num','category0_le'],
    'target_month_shop_category': ['date_block_num','shop_id','item_category_id'],
    'target_month_shop_category0': ['date_block_num','shop_id','category0_le'],
    'target_month_location': ['date_block_num','location_le'],
    'target_month_item_location': ['date_block_num','item_id','location_le'],
}

# Compute each encoding and join it back onto `data` by its grouping columns.
for encoded_name, group_cols in encoding_dict.items():
    encoded = add_sum_encoding(group_features=group_cols, new_name=encoded_name)
    data = data.merge(encoded, on=group_cols, how='left')

CPU times: user 22.1 s, sys: 6.87 s, total: 29 s
Wall time: 29.5 s


In [16]:
# Preview the frame with the target_* encoding columns attached.
data.head()

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,item_cnt_month_buy,item_cnt_month_return,profit,revenue,loss,item_category_id,category0_le,category1_le,location_le,target_month,target_month_item,target_month_shop,target_month_category,target_month_category0,target_month_shop_category,target_month_shop_category0,target_month_location,target_month_item_location
0,2,19,0,,,,,,,40,10,20,0,2.079574,1.0,1.574176,1.864851,1.741395,1.136364,1.112108,1.574176,
1,2,27,0,1.0,1.0,,2499.0,2499.0,,19,7,43,0,2.079574,1.0,1.574176,2.656906,2.400388,1.9,1.777778,1.574176,1.0
2,2,28,0,,,,,,,30,13,52,0,2.079574,1.6,1.574176,3.458281,3.451728,2.103774,2.054688,1.574176,
3,2,29,0,,,,,,,23,7,57,0,2.079574,1.0,1.574176,2.127114,2.400388,1.206897,1.777778,1.574176,
4,2,32,0,,,,,,,40,10,20,0,2.079574,8.081081,1.574176,1.864851,1.741395,1.136364,1.112108,1.574176,


### lags

In [13]:
def add_lags(group_features, on_feature, lags, df=None, time_col='date_block_num'):
    """Add calendar-aligned lag columns of ``on_feature`` to the frame, in place.

    For each ``lag`` a column ``'<on_feature>_lag_<lag>'`` is added holding the
    value the same group had exactly ``lag`` periods earlier according to
    ``time_col``. If the group has no row for that earlier period, the lag is
    NaN.

    The previous implementation used ``groupby(...).shift(lag)``, which shifts
    by row position. Whenever a group was missing from some period, that
    returned the value of the most recent *present* row, so "lag_1" could
    really be several periods old.

    Parameters
    ----------
    group_features : list of str
        Columns identifying a series (e.g. shop and item).
    on_feature : str
        Column to lag.
    lags : iterable of int
        Lag distances, in units of ``time_col``.
    df : pandas.DataFrame, optional
        Frame to modify. Defaults to the notebook-level ``data`` frame.
    time_col : str, default 'date_block_num'
        Integer period column used to align lags.

    Returns
    -------
    pandas.DataFrame
        The same frame object, with the lag columns added.

    Raises
    ------
    ValueError
        If ``group_features + [time_col]`` does not uniquely identify rows,
        because a lagged value would then be ambiguous.
    """
    frame = data if df is None else df
    key_cols = list(group_features) + [time_col]

    # Value of on_feature addressed by (group..., period).
    lookup = frame.set_index(key_cols)[on_feature]
    if not lookup.index.is_unique:
        raise ValueError(
            'add_lags requires unique rows per %s' % (key_cols,)
        )

    for lag in lags:
        lag_feature = '%s_lag_%d'%(on_feature, lag)
        # For every row, build the key of the row `lag` periods earlier ...
        wanted = frame[key_cols].copy()
        wanted[time_col] = wanted[time_col] - lag
        # ... and fetch its value; keys that do not exist come back as NaN.
        wanted_index = pd.MultiIndex.from_frame(wanted)
        frame[lag_feature] = lookup.reindex(wanted_index).to_numpy()

    return frame

In [14]:
%%time

# Every mean-encoding column (prefixed 'target') plus the raw monthly count
# and money totals gets lagged per (shop, item).
target_features = [name for name in data.columns if name.startswith('target')]
lag_features = target_features+['item_cnt_month','profit']

for lagged_col in lag_features:
    data = add_lags(
        group_features=['shop_id','item_id'],
        on_feature=lagged_col,
        lags=[1, 2, 3, 6, 12],
    )


CPU times: user 46.4 s, sys: 6.77 s, total: 53.1 s
Wall time: 53.7 s


In [15]:
# Inspect one (shop, item) series to eyeball the lag columns.
one_series = (data['shop_id'] == 5) & (data['item_id'] == 5037)
data[one_series]

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,profit,item_category_id,category0_le,category1_le,location_le,target_month,target_month_item,target_month_shop,target_month_category,target_month_category0,target_month_shop_category,target_month_shop_category0,target_month_location,target_month_item_location,target_month_lag_1,target_month_lag_2,target_month_lag_3,target_month_lag_6,target_month_lag_12,target_month_item_lag_1,target_month_item_lag_2,target_month_item_lag_3,target_month_item_lag_6,target_month_item_lag_12,target_month_shop_lag_1,target_month_shop_lag_2,target_month_shop_lag_3,target_month_shop_lag_6,target_month_shop_lag_12,target_month_category_lag_1,target_month_category_lag_2,target_month_category_lag_3,target_month_category_lag_6,target_month_category_lag_12,target_month_category0_lag_1,target_month_category0_lag_2,target_month_category0_lag_3,target_month_category0_lag_6,target_month_category0_lag_12,target_month_shop_category_lag_1,target_month_shop_category_lag_2,target_month_shop_category_lag_3,target_month_shop_category_lag_6,target_month_shop_category_lag_12,target_month_shop_category0_lag_1,target_month_shop_category0_lag_2,target_month_shop_category0_lag_3,target_month_shop_category0_lag_6,target_month_shop_category0_lag_12,target_month_location_lag_1,target_month_location_lag_2,target_month_location_lag_3,target_month_location_lag_6,target_month_location_lag_12,target_month_item_location_lag_1,target_month_item_location_lag_2,target_month_item_location_lag_3,target_month_item_location_lag_6,target_month_item_location_lag_12,item_cnt_month_lag_1,item_cnt_month_lag_2,item_cnt_month_lag_3,item_cnt_month_lag_6,item_cnt_month_lag_12,profit_lag_1,profit_lag_2,profit_lag_3,profit_lag_6,profit_lag_12
7128317,5,5037,20,1.0,2599.0,19,7,43,26,0.333496,4.265306,0.223192,0.643194,0.714628,0.472527,0.471136,0.223192,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
7426591,5,5037,21,0.0,0.0,19,7,43,26,0.332541,1.215686,0.187401,0.557315,0.590686,0.318681,0.366197,0.187401,0.0,0.333496,,,,,4.265306,,,,,0.223192,,,,,0.643194,,,,,0.714628,,,,,0.472527,,,,,0.471136,,,,,0.223192,,,,,1.0,,,,,1.0,,,,,2599.0,,,,
7749663,5,5037,22,1.0,2599.0,19,7,43,26,0.380418,1.326531,0.228567,0.638692,0.85753,0.392857,0.547468,0.228567,1.0,0.332541,0.333496,,,,1.215686,4.265306,,,,0.187401,0.223192,,,,0.557315,0.643194,,,,0.590686,0.714628,,,,0.318681,0.472527,,,,0.366197,0.471136,,,,0.187401,0.223192,,,,0.0,1.0,,,,0.0,1.0,,,,0.0,2599.0,,,
8060317,5,5037,23,2.0,3998.0,19,7,43,26,0.52142,4.755102,0.322029,1.057702,1.136085,0.807107,0.742081,0.322029,2.0,0.380418,0.332541,0.333496,,,1.326531,1.215686,4.265306,,,0.228567,0.187401,0.223192,,,0.638692,0.557315,0.643194,,,0.85753,0.590686,0.714628,,,0.392857,0.318681,0.472527,,,0.547468,0.366197,0.471136,,,0.228567,0.187401,0.223192,,,1.0,0.0,1.0,,,1.0,0.0,1.0,,,2599.0,0.0,2599.0,,
8382522,5,5037,24,2.0,3998.0,19,7,43,26,0.368906,2.204082,0.228702,0.69988,0.672493,0.57754,0.452951,0.228702,2.0,0.52142,0.380418,0.332541,,,4.755102,1.326531,1.215686,,,0.322029,0.228567,0.187401,,,1.057702,0.638692,0.557315,,,1.136085,0.85753,0.590686,,,0.807107,0.392857,0.318681,,,0.742081,0.547468,0.366197,,,0.322029,0.228567,0.187401,,,2.0,1.0,0.0,,,2.0,1.0,0.0,,,3998.0,2599.0,0.0,,
8682994,5,5037,25,0.0,0.0,19,7,43,26,0.29596,0.723404,0.181893,0.49792,0.607537,0.407821,0.345955,0.181893,0.0,0.368906,0.52142,0.380418,,,2.204082,4.755102,1.326531,,,0.228702,0.322029,0.228567,,,0.69988,1.057702,0.638692,,,0.672493,1.136085,0.85753,,,0.57754,0.807107,0.392857,,,0.452951,0.742081,0.547468,,,0.228702,0.322029,0.228567,,,2.0,2.0,1.0,,,2.0,2.0,1.0,,,3998.0,3998.0,2599.0,,
8967573,5,5037,26,0.0,0.0,19,7,43,26,0.293146,0.673913,0.182341,0.429837,0.560862,0.322751,0.327674,0.182341,0.0,0.29596,0.368906,0.52142,0.333496,,0.723404,2.204082,4.755102,4.265306,,0.181893,0.228702,0.322029,0.223192,,0.49792,0.69988,1.057702,0.643194,,0.607537,0.672493,1.136085,0.714628,,0.407821,0.57754,0.807107,0.472527,,0.345955,0.452951,0.742081,0.471136,,0.181893,0.228702,0.322029,0.223192,,0.0,2.0,2.0,1.0,,0.0,2.0,2.0,1.0,,0.0,3998.0,3998.0,2599.0,
9245445,5,5037,27,0.0,0.0,19,7,43,26,0.302391,0.595745,0.192476,0.330322,0.4109,0.26257,0.207641,0.192476,0.0,0.293146,0.29596,0.368906,0.332541,,0.673913,0.723404,2.204082,1.215686,,0.182341,0.181893,0.228702,0.187401,,0.429837,0.49792,0.69988,0.557315,,0.560862,0.607537,0.672493,0.590686,,0.322751,0.407821,0.57754,0.318681,,0.327674,0.345955,0.452951,0.366197,,0.182341,0.181893,0.228702,0.187401,,0.0,0.0,2.0,0.0,,0.0,0.0,2.0,0.0,,0.0,0.0,3998.0,0.0,
9502170,5,5037,28,1.0,1299.0,19,7,43,26,0.31101,1.977273,0.191558,0.450824,0.589752,0.432749,0.409168,0.191558,1.0,0.302391,0.293146,0.29596,0.380418,,0.595745,0.673913,0.723404,1.326531,,0.192476,0.182341,0.181893,0.228567,,0.330322,0.429837,0.49792,0.638692,,0.4109,0.560862,0.607537,0.85753,,0.26257,0.322751,0.407821,0.392857,,0.207641,0.327674,0.345955,0.547468,,0.192476,0.182341,0.181893,0.228567,,0.0,0.0,0.0,1.0,,0.0,0.0,0.0,1.0,,0.0,0.0,0.0,2599.0,
9734436,5,5037,29,1.0,1499.0,19,7,43,26,0.285856,2.44186,0.182899,0.49447,0.534189,0.432927,0.377856,0.182899,1.0,0.31101,0.302391,0.293146,0.52142,,1.977273,0.595745,0.673913,4.755102,,0.191558,0.192476,0.182341,0.322029,,0.450824,0.330322,0.429837,1.057702,,0.589752,0.4109,0.560862,1.136085,,0.432749,0.26257,0.322751,0.807107,,0.409168,0.207641,0.327674,0.742081,,0.191558,0.192476,0.182341,0.322029,,1.0,0.0,0.0,2.0,,1.0,0.0,0.0,2.0,,1299.0,0.0,0.0,3998.0,


In [18]:
# Distribution of the monthly target, ordered by value.
# NOTE(review): display.max_rows is set to None at the top of the notebook,
# so this prints every distinct value.
data.item_cnt_month.value_counts().sort_index()

-22.0            1
-13.0            1
-6.0             1
-5.0             1
-4.0             2
-2.0            26
-1.0           883
 0.0       9236433
 1.0       1057965
 2.0        265402
 3.0        103238
 4.0         53348
 5.0         31978
 6.0         20758
 7.0         14070
 8.0         10428
 9.0          7552
 10.0         6028
 11.0         4481
 12.0         3718
 13.0         3052
 14.0         2450
 15.0         2081
 16.0         1755
 17.0         1577
 18.0         1296
 19.0         1109
 20.0         1027
 21.0          837
 22.0          719
 23.0          668
 24.0          538
 25.0          525
 26.0          474
 27.0          428
 28.0          408
 29.0          366
 30.0          341
 31.0          293
 32.0          298
 33.0          246
 34.0          228
 35.0          222
 36.0          200
 37.0          184
 38.0          185
 39.0          164
 40.0          151
 41.0          123
 42.0          134
 43.0          124
 44.0          126
 45.0       

In [16]:
# Missing targets become 0, then the target is limited to the range [0, 20].
data['item_cnt_month'] = data['item_cnt_month'].fillna(0).clip(lower=0, upper=20)

In [17]:
# Row/column count once the lag columns have been added.
data.shape

(11056323, 73)

In [26]:
# filter out date_block_num < 12
#data = data[data['date_block_num']>=12]

In [18]:
# The unlagged mean encodings are derived from the same month's target, so
# only their lagged versions are kept, to avoid data leakage.
data = data.drop(columns=target_features)

In [19]:
# Row/column count after removing the unlagged target_* columns.
data.shape

(11056323, 64)

In [24]:
# Preview the last rows of the feature frame.
data.tail()

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,item_cnt_month_buy,item_cnt_month_return,profit,revenue,loss,item_category_id,category0_le,category1_le,location_le,target_month_lag_1,target_month_lag_2,target_month_lag_3,target_month_lag_4,target_month_lag_5,target_month_lag_6,target_month_lag_7,target_month_lag_12,target_month_item_lag_1,target_month_item_lag_2,target_month_item_lag_3,target_month_item_lag_4,target_month_item_lag_5,target_month_item_lag_6,target_month_item_lag_7,target_month_item_lag_12,target_month_shop_lag_1,target_month_shop_lag_2,target_month_shop_lag_3,target_month_shop_lag_4,target_month_shop_lag_5,target_month_shop_lag_6,target_month_shop_lag_7,target_month_shop_lag_12,target_month_category_lag_1,target_month_category_lag_2,target_month_category_lag_3,target_month_category_lag_4,target_month_category_lag_5,target_month_category_lag_6,target_month_category_lag_7,target_month_category_lag_12,target_month_category0_lag_1,target_month_category0_lag_2,target_month_category0_lag_3,target_month_category0_lag_4,target_month_category0_lag_5,target_month_category0_lag_6,target_month_category0_lag_7,target_month_category0_lag_12,target_month_shop_category_lag_1,target_month_shop_category_lag_2,target_month_shop_category_lag_3,target_month_shop_category_lag_4,target_month_shop_category_lag_5,target_month_shop_category_lag_6,target_month_shop_category_lag_7,target_month_shop_category_lag_12,target_month_shop_category0_lag_1,target_month_shop_category0_lag_2,target_month_shop_category0_lag_3,target_month_shop_category0_lag_4,target_month_shop_category0_lag_5,target_month_shop_category0_lag_6,target_month_shop_category0_lag_7,target_month_shop_category0_lag_12,target_month_location_lag_1,target_month_location_lag_2,target_month_location_lag_3,target_month_location_lag_4,target_month_location_lag_5,target_month_location_lag_6,target_month_location_lag_7,target_month_location_lag_12,target_month_item_location_lag_1,target_month_item_location_lag_2,t
arget_month_item_location_lag_3,target_month_item_location_lag_4,target_month_item_location_lag_5,target_month_item_location_lag_6,target_month_item_location_lag_7,target_month_item_location_lag_12,item_cnt_month_lag_1,item_cnt_month_lag_2,item_cnt_month_lag_3,item_cnt_month_lag_4,item_cnt_month_lag_5,item_cnt_month_lag_6,item_cnt_month_lag_7,item_cnt_month_lag_12,item_cnt_month_buy_lag_1,item_cnt_month_buy_lag_2,item_cnt_month_buy_lag_3,item_cnt_month_buy_lag_4,item_cnt_month_buy_lag_5,item_cnt_month_buy_lag_6,item_cnt_month_buy_lag_7,item_cnt_month_buy_lag_12,item_cnt_month_return_lag_1,item_cnt_month_return_lag_2,item_cnt_month_return_lag_3,item_cnt_month_return_lag_4,item_cnt_month_return_lag_5,item_cnt_month_return_lag_6,item_cnt_month_return_lag_7,item_cnt_month_return_lag_12,profit_lag_1,profit_lag_2,profit_lag_3,profit_lag_4,profit_lag_5,profit_lag_6,profit_lag_7,profit_lag_12,revenue_lag_1,revenue_lag_2,revenue_lag_3,revenue_lag_4,revenue_lag_5,revenue_lag_6,revenue_lag_7,revenue_lag_12,loss_lag_1,loss_lag_2,loss_lag_3,loss_lag_4,loss_lag_5,loss_lag_6,loss_lag_7,loss_lag_12
11056318,59,22162,34,0.0,,,,,,40,10,20,30,0.289232,0.333141,0.308009,0.27606,0.285856,0.31101,0.302391,,0.227273,0.162791,0.333333,0.627907,0.511628,0.795455,1.659574,,0.145945,0.179744,0.237666,0.186361,0.175038,0.173386,0.192476,,0.221504,0.201342,0.243326,0.249288,0.252182,0.229466,0.233365,,0.226283,0.197367,0.215057,0.202767,0.20459,0.190396,0.211411,,0.100254,0.109834,0.111645,0.101617,0.124709,0.139254,0.140657,,0.125286,0.112623,0.111034,0.096835,0.111249,0.099159,0.130284,,0.145945,0.179744,0.237666,0.186361,0.175038,0.173386,0.192476,,0.0,0.0,1.0,0.0,0.0,1.0,1.0,,0.0,0.0,1.0,0.0,0.0,1.0,1.0,,0.0,0.0,1.0,0.0,0.0,1.0,1.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,349.0,0.0,0.0,349.0,349.0,,0.0,0.0,349.0,0.0,0.0,349.0,349.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
11056319,59,22163,34,0.0,,,,,,40,10,20,30,0.289232,0.333141,0.308009,,,,,,0.590909,0.465116,0.690476,,,,,,0.145945,0.179744,0.237666,,,,,,0.221504,0.201342,0.243326,,,,,,0.226283,0.197367,0.215057,,,,,,0.100254,0.109834,0.111645,,,,,,0.125286,0.112623,0.111034,,,,,,0.145945,0.179744,0.237666,,,,,,0.0,0.0,0.0,,,,,,0.0,0.0,0.0,,,,,,0.0,0.0,0.0,,,,,,0.0,0.0,0.0,,,,,,0.0,0.0,0.0,,,,,,0.0,0.0,0.0,,,,,,0.0,0.0,0.0,,,,,
11056320,59,22164,34,0.0,,,,,,37,10,8,30,0.289232,0.333141,0.308009,0.27606,0.285856,0.31101,0.302391,,0.340909,0.209302,0.47619,0.27907,0.139535,0.522727,0.574468,,0.145945,0.179744,0.237666,0.186361,0.175038,0.173386,0.192476,,0.26295,0.203846,0.195147,0.156834,0.169609,0.146812,0.196367,,0.226283,0.197367,0.215057,0.202767,0.20459,0.190396,0.211411,,0.176638,0.131965,0.12201,0.106557,0.112621,0.043222,0.128458,,0.125286,0.112623,0.111034,0.096835,0.111249,0.099159,0.130284,,0.145945,0.179744,0.237666,0.186361,0.175038,0.173386,0.192476,,0.0,0.0,0.0,1.0,0.0,0.0,2.0,,0.0,0.0,0.0,1.0,0.0,0.0,2.0,,0.0,0.0,0.0,1.0,0.0,0.0,2.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,699.0,0.0,0.0,1398.0,,0.0,0.0,0.0,699.0,0.0,0.0,1398.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
11056321,59,22166,34,0.0,,,,,,54,4,40,30,0.289232,0.333141,0.308009,0.27606,0.285856,0.31101,0.302391,0.380418,0.25,0.116279,0.261905,0.186047,0.232558,0.181818,0.085106,0.326531,0.145945,0.179744,0.237666,0.186361,0.175038,0.173386,0.192476,0.254666,0.184989,0.157558,0.146179,0.160465,0.19402,0.188088,0.162737,0.159602,0.29919,0.189629,0.184271,0.169565,0.1642,0.164114,0.121986,0.114123,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.025463,0.033943,0.03966,0.024024,0.013468,0.0,0.003333,0.009202,0.145945,0.179744,0.237666,0.186361,0.175038,0.173386,0.192476,0.254666,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11056322,59,22167,34,0.0,,,,,,49,4,38,30,0.289232,0.333141,0.308009,0.27606,0.285856,0.31101,0.302391,0.380418,0.840909,0.488372,0.690476,0.790698,0.767442,0.704545,0.808511,1.0,0.145945,0.179744,0.237666,0.186361,0.175038,0.173386,0.192476,0.254666,0.181534,0.164492,0.161848,0.22548,0.191245,0.162424,0.210402,0.178278,0.29919,0.189629,0.184271,0.169565,0.1642,0.164114,0.121986,0.114123,0.0125,0.02439,0.047619,0.057971,0.011765,0.0,0.012346,0.034483,0.025463,0.033943,0.03966,0.024024,0.013468,0.0,0.003333,0.009202,0.145945,0.179744,0.237666,0.186361,0.175038,0.173386,0.192476,0.254666,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
# Treat every integer-typed column (ids and label encodings) as categorical.
int_cols = data.select_dtypes(include=[int]).columns
data = data.astype({int_col: 'category' for int_col in int_cols})

In [33]:
# Shrink float columns to the smallest float dtype that can hold their values.
float_cols = data.select_dtypes(include=[float]).columns.tolist()
for float_col in float_cols:
    data[float_col] = pd.to_numeric(data[float_col], downcast='float')

In [45]:
%%time
# Time-based split: month 33 is held out for validation, month 34 holds the
# prediction rows, and everything else is training data.
# NOTE: this rebinds `test`, which earlier held the raw test.csv frame.
block_num = data['date_block_num']
train = data[~block_num.isin([33,34])]
validation = data[block_num == 33]
test = data[block_num == 34]

CPU times: user 1.63 s, sys: 1.05 s, total: 2.69 s
Wall time: 2.76 s


In [46]:
%%time
# Persist the training split. CSV stores no dtypes, so the category and
# downcast float dtypes set above are not kept in the file.
train.to_csv('train_b4md.csv', index=False)

CPU times: user 8min 56s, sys: 16.5 s, total: 9min 13s
Wall time: 9min 25s


In [47]:
%%time
# Persist the validation (month 33) and test (month 34) splits without the index.
validation.to_csv('validation_b4md.csv', index=False)
test.to_csv('test_b4md.csv', index=False)

CPU times: user 22.1 s, sys: 556 ms, total: 22.6 s
Wall time: 22.9 s
