In [101]:
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 100)

from itertools import product
import math
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from xgboost import XGBRegressor
from xgboost import plot_importance
import lightgbm as lgb

def plot_features(booster, figsize):    
    fig, ax = plt.subplots(1,1,figsize=figsize)
    return plot_importance(booster=booster, ax=ax)

import time
import sys
import gc
import pickle
sys.version_info

sys.version_info(major=3, minor=7, micro=4, releaselevel='final', serial=0)

In [102]:
train = pd.read_csv('./data/sales_train.csv')
test = pd.read_csv('./data/test.csv').set_index('ID')
sample_submission = pd.read_csv('./data/sample_submission.csv')
items = pd.read_csv('./data/items.csv')
cats = pd.read_csv('./data/item_categories.csv')
shops = pd.read_csv('./data/shops.csv')

In [103]:
train = train[train.item_price<100000]
train = train[train.item_cnt_day<1001]

In [104]:
median = train[(train.shop_id==32)&(train.item_id==2973)&(train.date_block_num==4)&(train.item_price>0)].item_price.median()
train.loc[train.item_price<0, 'item_price'] = median

In [105]:
# Якутск Орджоникидзе, 56
train.loc[train.shop_id == 0, 'shop_id'] = 57
test.loc[test.shop_id == 0, 'shop_id'] = 57
# Якутск ТЦ "Центральный"
train.loc[train.shop_id == 1, 'shop_id'] = 58
test.loc[test.shop_id == 1, 'shop_id'] = 58
# Жуковский ул. Чкалова 39м²
train.loc[train.shop_id == 10, 'shop_id'] = 11
test.loc[test.shop_id == 10, 'shop_id'] = 11

In [106]:
shops.loc[shops.shop_name == 'Сергиев Посад ТЦ "7Я"', 'shop_name'] = 'СергиевПосад ТЦ "7Я"'
shops['city'] = shops['shop_name'].str.split(' ').map(lambda x: x[0])
shops.loc[shops.city == '!Якутск', 'city'] = 'Якутск'
shops['city_code'] = LabelEncoder().fit_transform(shops['city'])
shops = shops[['shop_id','city_code']]

cats['split'] = cats['item_category_name'].str.split('-')
cats['type'] = cats['split'].map(lambda x: x[0].strip())
cats['type_code'] = LabelEncoder().fit_transform(cats['type'])
# if subtype is nan then type
cats['subtype'] = cats['split'].map(lambda x: x[1].strip() if len(x) > 1 else x[0].strip())
cats['subtype_code'] = LabelEncoder().fit_transform(cats['subtype'])
cats = cats[['item_category_id','type_code', 'subtype_code']]

items.drop(['item_name'], axis=1, inplace=True)

In [107]:
ts = time.time()
matrix = []
cols = ['date_block_num','shop_id','item_id']
for i in range(34):
    sales = train[train.date_block_num==i]
    matrix.append(np.array(list(product([i], sales.shop_id.unique(), sales.item_id.unique())), dtype='int16'))
    
matrix = pd.DataFrame(np.vstack(matrix), columns=cols)
matrix['date_block_num'] = matrix['date_block_num'].astype(np.int8)
matrix['shop_id'] = matrix['shop_id'].astype(np.int8)
matrix['item_id'] = matrix['item_id'].astype(np.int16)
matrix.sort_values(cols,inplace=True)
time.time() - ts

14.175607919692993

In [108]:
matrix.shape

(10913804, 3)

In [109]:
group = train.groupby(['date_block_num','shop_id','item_id']).agg({'item_cnt_day':'sum'}).reset_index()

In [110]:
group.shape

(1609123, 4)

In [111]:
len(train.shop_id.unique()) * len(train.item_id.unique()) * len(train.date_block_num.unique())

42260028

In [112]:
train['revenue'] = train['item_price'] *  train['item_cnt_day']

In [113]:
ts = time.time()
group = train.groupby(['date_block_num','shop_id','item_id']).agg({'item_cnt_day': ['sum']})
group.columns = ['item_cnt_month']
group.reset_index(inplace=True)

matrix = pd.merge(matrix, group, on=cols, how='left')
matrix['item_cnt_month'] = (matrix['item_cnt_month']
                                .fillna(0)
                                .clip(0,20) # NB clip target here
                                .astype(np.float16))
time.time() - ts

6.402884006500244

In [114]:
test['date_block_num'] = 34
test['date_block_num'] = test['date_block_num'].astype(np.int8)
test['shop_id'] = test['shop_id'].astype(np.int8)
test['item_id'] = test['item_id'].astype(np.int16)

In [115]:
test.head(1)

Unnamed: 0_level_0,shop_id,item_id,date_block_num
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,5,5037,34


In [116]:
matrix.head(1)

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month
0,0,2,19,0.0


In [117]:
ts = time.time()
matrix = pd.concat([matrix, test], ignore_index=True, sort=False, keys=cols)
matrix.fillna(0, inplace=True) # 34 month
time.time() - ts

0.09621095657348633

In [118]:
ts = time.time()
matrix = pd.merge(matrix, shops, on=['shop_id'], how='left')
matrix = pd.merge(matrix, items, on=['item_id'], how='left')
matrix = pd.merge(matrix, cats, on=['item_category_id'], how='left')
matrix['city_code'] = matrix['city_code'].astype(np.int8)
matrix['item_category_id'] = matrix['item_category_id'].astype(np.int8)
matrix['type_code'] = matrix['type_code'].astype(np.int8)
matrix['subtype_code'] = matrix['subtype_code'].astype(np.int8)
time.time() - ts

5.00059700012207

In [119]:
def lag_feature(df, lags, col):
    tmp = df[['date_block_num','shop_id','item_id',col]]
    for i in lags:
        shifted = tmp.copy()
        shifted.columns = ['date_block_num','shop_id','item_id', col+'_lag_'+str(i)]
        shifted['date_block_num'] += i
        df = pd.merge(df, shifted, on=['date_block_num','shop_id','item_id'], how='left')
    return df

In [120]:
ts = time.time()
matrix = lag_feature(matrix, [1,2,3,6,12], 'item_cnt_month')
time.time() - ts

59.80960273742676

In [121]:
ts = time.time()
group = matrix.groupby(['date_block_num']).agg({'item_cnt_month': ['mean']})
group.columns = [ 'date_avg_item_cnt' ]
group.reset_index(inplace=True)

matrix = pd.merge(matrix, group, on=['date_block_num'], how='left')
matrix['date_avg_item_cnt'] = matrix['date_avg_item_cnt'].astype(np.float16)
matrix = lag_feature(matrix, [1], 'date_avg_item_cnt')
matrix.drop(['date_avg_item_cnt'], axis=1, inplace=True)
time.time() - ts

19.455909729003906

In [122]:
ts = time.time()
group = matrix.groupby(['date_block_num', 'item_id']).agg({'item_cnt_month': ['mean']})
group.columns = [ 'date_item_avg_item_cnt' ]
group.reset_index(inplace=True)

matrix = pd.merge(matrix, group, on=['date_block_num','item_id'], how='left')
matrix['date_item_avg_item_cnt'] = matrix['date_item_avg_item_cnt'].astype(np.float16)
matrix = lag_feature(matrix, [1,2,3,6,12], 'date_item_avg_item_cnt')
matrix.drop(['date_item_avg_item_cnt'], axis=1, inplace=True)
time.time() - ts

61.01127099990845

In [123]:
ts = time.time()
group = matrix.groupby(['date_block_num', 'shop_id']).agg({'item_cnt_month': ['mean']})
group.columns = [ 'date_shop_avg_item_cnt' ]
group.reset_index(inplace=True)

matrix = pd.merge(matrix, group, on=['date_block_num','shop_id'], how='left')
matrix['date_shop_avg_item_cnt'] = matrix['date_shop_avg_item_cnt'].astype(np.float16)
matrix = lag_feature(matrix, [1,2,3,6,12], 'date_shop_avg_item_cnt')
#matrix.drop(['date_shop_avg_item_cnt'], axis=1, inplace=True)
time.time() - ts

59.91660714149475

In [124]:
### TEST
### TEST
### TEST
### TEST
### TEST
matrix[(matrix.date_block_num==27)&(matrix.shop_id==2)&(matrix.item_id==24)][['date_block_num','shop_id','item_id','date_shop_avg_item_cnt_lag_1','date_shop_avg_item_cnt']]

Unnamed: 0,date_block_num,shop_id,item_id,date_shop_avg_item_cnt_lag_1,date_shop_avg_item_cnt
9299441,27,2,24,,0.125122


In [125]:
### TEST
### TEST
### TEST
### TEST
### TEST
matrix[(matrix.date_block_num==26)&(matrix.shop_id==2)&(matrix.item_id==24)][['date_block_num','shop_id','item_id','date_shop_avg_item_cnt_lag_1','date_shop_avg_item_cnt']]

Unnamed: 0,date_block_num,shop_id,item_id,date_shop_avg_item_cnt_lag_1,date_shop_avg_item_cnt


In [25]:
ts = time.time()
group = matrix.groupby(['date_block_num', 'item_category_id']).agg({'item_cnt_month': ['mean']})
group.columns = [ 'date_cat_avg_item_cnt' ]
group.reset_index(inplace=True)

matrix = pd.merge(matrix, group, on=['date_block_num','item_category_id'], how='left')
matrix['date_cat_avg_item_cnt'] = matrix['date_cat_avg_item_cnt'].astype(np.float16)
matrix = lag_feature(matrix, [1], 'date_cat_avg_item_cnt')
matrix.drop(['date_cat_avg_item_cnt'], axis=1, inplace=True)
time.time() - ts

19.353145122528076

In [26]:
ts = time.time()
group = matrix.groupby(['date_block_num', 'shop_id', 'item_category_id']).agg({'item_cnt_month': ['mean']})
group.columns = ['date_shop_cat_avg_item_cnt']
group.reset_index(inplace=True)

matrix = pd.merge(matrix, group, on=['date_block_num', 'shop_id', 'item_category_id'], how='left')
matrix['date_shop_cat_avg_item_cnt'] = matrix['date_shop_cat_avg_item_cnt'].astype(np.float16)
matrix = lag_feature(matrix, [1], 'date_shop_cat_avg_item_cnt')
matrix.drop(['date_shop_cat_avg_item_cnt'], axis=1, inplace=True)
time.time() - ts

21.52137589454651

In [27]:
ts = time.time()
group = matrix.groupby(['date_block_num', 'shop_id', 'type_code']).agg({'item_cnt_month': ['mean']})
group.columns = ['date_shop_type_avg_item_cnt']
group.reset_index(inplace=True)

matrix = pd.merge(matrix, group, on=['date_block_num', 'shop_id', 'type_code'], how='left')
matrix['date_shop_type_avg_item_cnt'] = matrix['date_shop_type_avg_item_cnt'].astype(np.float16)
matrix = lag_feature(matrix, [1], 'date_shop_type_avg_item_cnt')
matrix.drop(['date_shop_type_avg_item_cnt'], axis=1, inplace=True)
time.time() - ts

22.126614809036255

In [28]:
ts = time.time()
group = matrix.groupby(['date_block_num', 'shop_id', 'subtype_code']).agg({'item_cnt_month': ['mean']})
group.columns = ['date_shop_subtype_avg_item_cnt']
group.reset_index(inplace=True)

matrix = pd.merge(matrix, group, on=['date_block_num', 'shop_id', 'subtype_code'], how='left')
matrix['date_shop_subtype_avg_item_cnt'] = matrix['date_shop_subtype_avg_item_cnt'].astype(np.float16)
matrix = lag_feature(matrix, [1], 'date_shop_subtype_avg_item_cnt')
matrix.drop(['date_shop_subtype_avg_item_cnt'], axis=1, inplace=True)
time.time() - ts

23.41545295715332

In [29]:
ts = time.time()
group = matrix.groupby(['date_block_num', 'city_code']).agg({'item_cnt_month': ['mean']})
group.columns = [ 'date_city_avg_item_cnt' ]
group.reset_index(inplace=True)

matrix = pd.merge(matrix, group, on=['date_block_num', 'city_code'], how='left')
matrix['date_city_avg_item_cnt'] = matrix['date_city_avg_item_cnt'].astype(np.float16)
matrix = lag_feature(matrix, [1], 'date_city_avg_item_cnt')
matrix.drop(['date_city_avg_item_cnt'], axis=1, inplace=True)
time.time() - ts

23.10524010658264

In [30]:
ts = time.time()
group = matrix.groupby(['date_block_num', 'item_id', 'city_code']).agg({'item_cnt_month': ['mean']})
group.columns = [ 'date_item_city_avg_item_cnt' ]
group.reset_index(inplace=True)

matrix = pd.merge(matrix, group, on=['date_block_num', 'item_id', 'city_code'], how='left')
matrix['date_item_city_avg_item_cnt'] = matrix['date_item_city_avg_item_cnt'].astype(np.float16)
matrix = lag_feature(matrix, [1], 'date_item_city_avg_item_cnt')
matrix.drop(['date_item_city_avg_item_cnt'], axis=1, inplace=True)
time.time() - ts

33.06318283081055

In [31]:
ts = time.time()
group = matrix.groupby(['date_block_num', 'type_code']).agg({'item_cnt_month': ['mean']})
group.columns = [ 'date_type_avg_item_cnt' ]
group.reset_index(inplace=True)

matrix = pd.merge(matrix, group, on=['date_block_num', 'type_code'], how='left')
matrix['date_type_avg_item_cnt'] = matrix['date_type_avg_item_cnt'].astype(np.float16)
matrix = lag_feature(matrix, [1], 'date_type_avg_item_cnt')
matrix.drop(['date_type_avg_item_cnt'], axis=1, inplace=True)
time.time() - ts

23.977633953094482

In [32]:
ts = time.time()
group = matrix.groupby(['date_block_num', 'subtype_code']).agg({'item_cnt_month': ['mean']})
group.columns = [ 'date_subtype_avg_item_cnt' ]
group.reset_index(inplace=True)

matrix = pd.merge(matrix, group, on=['date_block_num', 'subtype_code'], how='left')
matrix['date_subtype_avg_item_cnt'] = matrix['date_subtype_avg_item_cnt'].astype(np.float16)
matrix = lag_feature(matrix, [1], 'date_subtype_avg_item_cnt')
matrix.drop(['date_subtype_avg_item_cnt'], axis=1, inplace=True)
time.time() - ts

24.614383935928345

In [33]:
ts = time.time()
group = train.groupby(['item_id']).agg({'item_price': ['mean']})
group.columns = ['item_avg_item_price']
group.reset_index(inplace=True)

matrix = pd.merge(matrix, group, on=['item_id'], how='left')
matrix['item_avg_item_price'] = matrix['item_avg_item_price'].astype(np.float16)

group = train.groupby(['date_block_num','item_id']).agg({'item_price': ['mean']})
group.columns = ['date_item_avg_item_price']
group.reset_index(inplace=True)

matrix = pd.merge(matrix, group, on=['date_block_num','item_id'], how='left')
matrix['date_item_avg_item_price'] = matrix['date_item_avg_item_price'].astype(np.float16)

lags = [1,2,3,4,5,6]
matrix = lag_feature(matrix, lags, 'date_item_avg_item_price')

for i in lags:
    matrix['delta_price_lag_'+str(i)] = \
        (matrix['date_item_avg_item_price_lag_'+str(i)] - matrix['item_avg_item_price']) / matrix['item_avg_item_price']

def select_trend(row):
    for i in lags:
        if row['delta_price_lag_'+str(i)]:
            return row['delta_price_lag_'+str(i)]
    return 0
    
matrix['delta_price_lag'] = matrix.apply(select_trend, axis=1)
matrix['delta_price_lag'] = matrix['delta_price_lag'].astype(np.float16)
matrix['delta_price_lag'].fillna(0, inplace=True)

# https://stackoverflow.com/questions/31828240/first-non-null-value-per-row-from-a-list-of-pandas-columns/31828559
# matrix['price_trend'] = matrix[['delta_price_lag_1','delta_price_lag_2','delta_price_lag_3']].bfill(axis=1).iloc[:, 0]
# Invalid dtype for backfill_2d [float16]

fetures_to_drop = ['item_avg_item_price', 'date_item_avg_item_price']
for i in lags:
    fetures_to_drop += ['date_item_avg_item_price_lag_'+str(i)]
    fetures_to_drop += ['delta_price_lag_'+str(i)]

matrix.drop(fetures_to_drop, axis=1, inplace=True)

time.time() - ts

404.79270815849304

In [34]:
ts = time.time()
group = train.groupby(['date_block_num','shop_id']).agg({'revenue': ['sum']})
group.columns = ['date_shop_revenue']
group.reset_index(inplace=True)

matrix = pd.merge(matrix, group, on=['date_block_num','shop_id'], how='left')
matrix['date_shop_revenue'] = matrix['date_shop_revenue'].astype(np.float32)

group = group.groupby(['shop_id']).agg({'date_shop_revenue': ['mean']})
group.columns = ['shop_avg_revenue']
group.reset_index(inplace=True)

matrix = pd.merge(matrix, group, on=['shop_id'], how='left')
matrix['shop_avg_revenue'] = matrix['shop_avg_revenue'].astype(np.float32)

matrix['delta_revenue'] = (matrix['date_shop_revenue'] - matrix['shop_avg_revenue']) / matrix['shop_avg_revenue']
matrix['delta_revenue'] = matrix['delta_revenue'].astype(np.float16)

matrix = lag_feature(matrix, [1], 'delta_revenue')

matrix.drop(['date_shop_revenue','shop_avg_revenue','delta_revenue'], axis=1, inplace=True)
time.time() - ts

23.184242010116577

In [35]:
matrix['month'] = matrix['date_block_num'] % 12

In [36]:
days = pd.Series([31,28,31,30,31,30,31,31,30,31,30,31])
matrix['days'] = matrix['month'].map(days).astype(np.int8)

In [37]:
ts = time.time()
cache = {}
matrix['item_shop_last_sale'] = -1
matrix['item_shop_last_sale'] = matrix['item_shop_last_sale'].astype(np.int8)
for idx, row in matrix.iterrows():    
    key = str(row.item_id)+' '+str(row.shop_id)
    if key not in cache:
        if row.item_cnt_month!=0:
            cache[key] = row.date_block_num
    else:
        last_date_block_num = cache[key]
        matrix.at[idx, 'item_shop_last_sale'] = row.date_block_num - last_date_block_num
        cache[key] = row.date_block_num         
time.time() - ts

1528.8620750904083

In [38]:
ts = time.time()
cache = {}
matrix['item_last_sale'] = -1
matrix['item_last_sale'] = matrix['item_last_sale'].astype(np.int8)
for idx, row in matrix.iterrows():    
    key = row.item_id
    if key not in cache:
        if row.item_cnt_month!=0:
            cache[key] = row.date_block_num
    else:
        last_date_block_num = cache[key]
        if row.date_block_num>last_date_block_num:
            matrix.at[idx, 'item_last_sale'] = row.date_block_num - last_date_block_num
            cache[key] = row.date_block_num         
time.time() - ts

921.6269571781158

In [39]:
ts = time.time()
matrix['item_shop_first_sale'] = matrix['date_block_num'] - matrix.groupby(['item_id','shop_id'])['date_block_num'].transform('min')
matrix['item_first_sale'] = matrix['date_block_num'] - matrix.groupby('item_id')['date_block_num'].transform('min')
time.time() - ts

2.6026909351348877

In [40]:
ts = time.time()
def fill_na(df):
    for col in df.columns:
        if ('_lag_' in col) & (df[col].isnull().any()):
            if ('item_cnt' in col):
                df[col].fillna(0, inplace=True)         
    return df

matrix = fill_na(matrix)
time.time() - ts

7.919317960739136

In [41]:
matrix

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,city_code,item_category_id,type_code,subtype_code,item_cnt_month_lag_1,item_cnt_month_lag_2,item_cnt_month_lag_3,item_cnt_month_lag_6,item_cnt_month_lag_12,date_avg_item_cnt_lag_1,date_item_avg_item_cnt_lag_1,date_item_avg_item_cnt_lag_2,date_item_avg_item_cnt_lag_3,date_item_avg_item_cnt_lag_6,date_item_avg_item_cnt_lag_12,date_shop_avg_item_cnt_lag_1,date_shop_avg_item_cnt_lag_2,date_shop_avg_item_cnt_lag_3,date_shop_avg_item_cnt_lag_6,date_shop_avg_item_cnt_lag_12,date_cat_avg_item_cnt_lag_1,date_shop_cat_avg_item_cnt_lag_1,date_shop_type_avg_item_cnt_lag_1,date_shop_subtype_avg_item_cnt_lag_1,date_city_avg_item_cnt_lag_1,date_item_city_avg_item_cnt_lag_1,date_type_avg_item_cnt_lag_1,date_subtype_avg_item_cnt_lag_1,delta_price_lag,delta_revenue_lag_1,month,days,item_shop_last_sale,item_last_sale,item_shop_first_sale,item_first_sale
0,0,2,19,0.0,0,40,11,4,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,,0,31,-1,-1,0,0
1,0,2,27,1.0,0,19,5,10,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,,0,31,-1,-1,0,0
2,0,2,28,0.0,0,30,8,55,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,,0,31,-1,-1,0,0
3,0,2,29,0.0,0,23,5,16,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,,0,31,-1,-1,0,0
4,0,2,32,0.0,0,40,11,4,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,,0,31,-1,-1,0,0
5,0,2,33,1.0,0,37,11,1,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,,0,31,-1,-1,0,0
6,0,2,34,0.0,0,40,11,4,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,,0,31,-1,-1,0,0
7,0,2,35,0.0,0,40,11,4,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,,0,31,-1,-1,0,0
8,0,2,40,0.0,0,57,13,8,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,,0,31,-1,-1,0,0
9,0,2,41,0.0,0,57,13,8,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,,0,31,-1,-1,0,0


In [42]:
matrix.columns

Index(['date_block_num', 'shop_id', 'item_id', 'item_cnt_month', 'city_code',
       'item_category_id', 'type_code', 'subtype_code', 'item_cnt_month_lag_1',
       'item_cnt_month_lag_2', 'item_cnt_month_lag_3', 'item_cnt_month_lag_6',
       'item_cnt_month_lag_12', 'date_avg_item_cnt_lag_1',
       'date_item_avg_item_cnt_lag_1', 'date_item_avg_item_cnt_lag_2',
       'date_item_avg_item_cnt_lag_3', 'date_item_avg_item_cnt_lag_6',
       'date_item_avg_item_cnt_lag_12', 'date_shop_avg_item_cnt_lag_1',
       'date_shop_avg_item_cnt_lag_2', 'date_shop_avg_item_cnt_lag_3',
       'date_shop_avg_item_cnt_lag_6', 'date_shop_avg_item_cnt_lag_12',
       'date_cat_avg_item_cnt_lag_1', 'date_shop_cat_avg_item_cnt_lag_1',
       'date_shop_type_avg_item_cnt_lag_1',
       'date_shop_subtype_avg_item_cnt_lag_1', 'date_city_avg_item_cnt_lag_1',
       'date_item_city_avg_item_cnt_lag_1', 'date_type_avg_item_cnt_lag_1',
       'date_subtype_avg_item_cnt_lag_1', 'delta_price_lag',
       'delta

In [None]:
#INITIAL, DON'T TOUCH

features = [
    'date_block_num',
    'shop_id',
    'item_id',
    #'item_cnt_month',
    'city_code',
    'item_category_id',
    'type_code',
    'subtype_code',
    'item_cnt_month_lag_1',
    'item_cnt_month_lag_2',
    'item_cnt_month_lag_3',
    'item_cnt_month_lag_6',
    'item_cnt_month_lag_12',
    'date_avg_item_cnt_lag_1',
    'date_item_avg_item_cnt_lag_1',
    'date_item_avg_item_cnt_lag_2',
    'date_item_avg_item_cnt_lag_3',
    'date_item_avg_item_cnt_lag_6',
    'date_item_avg_item_cnt_lag_12',
    'date_shop_avg_item_cnt_lag_1',
    'date_shop_avg_item_cnt_lag_2',
    'date_shop_avg_item_cnt_lag_3',
    'date_shop_avg_item_cnt_lag_6',
    'date_shop_avg_item_cnt_lag_12',
    'date_cat_avg_item_cnt_lag_1',
    'date_shop_cat_avg_item_cnt_lag_1',
    'date_shop_type_avg_item_cnt_lag_1',
    'date_shop_subtype_avg_item_cnt_lag_1',
    'date_city_avg_item_cnt_lag_1',
    'date_item_city_avg_item_cnt_lag_1',
    'date_type_avg_item_cnt_lag_1',
    'date_subtype_avg_item_cnt_lag_1',
    'delta_price_lag',
    'delta_revenue_lag_1',
    'month',
    'days',
    'item_shop_last_sale',
    'item_last_sale',
    'item_shop_first_sale',
    'item_first_sale'
]

categorical_features = [
    'shop_id',
    'item_id',
    'city_code',
    'item_category_id',
    'type_code',
    'subtype_code',
]

In [63]:
features = [
    'date_block_num',
    'shop_id',
    'item_id',
    'city_code',
    'item_category_id',
    'type_code',
    'subtype_code',
    'item_cnt_month_lag_1',
    'item_cnt_month_lag_2',
    'item_cnt_month_lag_3',
    'item_cnt_month_lag_6',
    'item_cnt_month_lag_12',
    #'date_avg_item_cnt_lag_1',
    'date_item_avg_item_cnt_lag_1',
    'date_item_avg_item_cnt_lag_2',
    'date_item_avg_item_cnt_lag_3',
    'date_item_avg_item_cnt_lag_6',
    'date_item_avg_item_cnt_lag_12',
    'date_shop_avg_item_cnt_lag_1',
    'date_shop_avg_item_cnt_lag_2',
    'date_shop_avg_item_cnt_lag_3',
    'date_shop_avg_item_cnt_lag_6',
    'date_shop_avg_item_cnt_lag_12',
    'date_cat_avg_item_cnt_lag_1',
    'date_shop_cat_avg_item_cnt_lag_1',
    'date_shop_type_avg_item_cnt_lag_1',
    'date_shop_subtype_avg_item_cnt_lag_1',
    'date_city_avg_item_cnt_lag_1',
    #'date_item_city_avg_item_cnt_lag_1',
    'date_type_avg_item_cnt_lag_1',
    'date_subtype_avg_item_cnt_lag_1',
    #'delta_price_lag',
    #'delta_revenue_lag_1',
    'month',
    'days',
    #'item_shop_last_sale',
    #'item_last_sale',
    #'item_shop_first_sale',
    #'item_first_sale'
]

categorical_features = [
    'shop_id',
    'item_id',
    'city_code',
    'item_category_id',
    'type_code',
    'subtype_code',
]

params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    'num_leaves': 44,#71
    'max_depth': 19,#15
    'feature_fraction': 0.9639040863246328,#0.930233257263184
    'bagging_fraction': 0.9149216274612932, #0.9417022004702574
    'bagging_freq': 18,#3
    'learning_rate': 0.01,#0.05
    'early_stopping_round': 60,
    'lambda': 0.8378024755045912,#0.44026767245133913
    'min_split_gain': 7.20114889206731,#0.8178089989260697
    'min_child_samples': 39,#85
    'min_data_per_group': 369,#342
    'min_child_weight': 6.683586781569385,#2.155266936013428
    'cat_smooth': 0.05596554352078842,#1.0113231069171438
    'min_data_in_leaf': 69,#126
    'max_bin': 37,#157
    'verbosity' : -1
}

sales_test = matrix[matrix.date_block_num > 11]#11 moins bien

ROUNDS = 35000
traindf = sales_test[sales_test.date_block_num < 27]
testdf = sales_test[sales_test.date_block_num == 27]
traingbm = lgb.Dataset(traindf[features], label=traindf['item_cnt_month'], categorical_feature=categorical_features, free_raw_data=False)
testgbm = lgb.Dataset(testdf[features], label=testdf['item_cnt_month'], categorical_feature=categorical_features, free_raw_data=False)
m = lgb.train(params, traingbm, ROUNDS, valid_sets=[traingbm, testgbm], verbose_eval=1)
predsgbm = m.predict(testdf[features])
mse = mean_squared_error(testdf['item_cnt_month'],predsgbm)
rmse = math.sqrt(mse)
print('total rmse : {}'.format(rmse))
predsgbm = m.predict(testdf[testdf.item_first_sale==0][features])
mse = mean_squared_error(testdf[testdf.item_first_sale==0]['item_cnt_month'],predsgbm)
rmse = math.sqrt(mse)
print('total rmse new items: {}'.format(rmse))
print('with {} rows'.format(len(testdf[testdf.item_first_sale==0])))
predsgbm = m.predict(testdf[testdf.item_first_sale!=0][features])
mse = mean_squared_error(testdf[testdf.item_first_sale!=0]['item_cnt_month'],predsgbm)
rmse = math.sqrt(mse)
print('total rmse old items: {}'.format(rmse))
print('with {} rows'.format(len(testdf[testdf.item_first_sale!=0])))



[1]	training's rmse: 1.21266	valid_1's rmse: 1.09062
Training until validation scores don't improve for 60 rounds.
[2]	training's rmse: 1.20695	valid_1's rmse: 1.08675
[3]	training's rmse: 1.20133	valid_1's rmse: 1.08303
[4]	training's rmse: 1.1958	valid_1's rmse: 1.07933
[5]	training's rmse: 1.19035	valid_1's rmse: 1.07572
[6]	training's rmse: 1.18498	valid_1's rmse: 1.07219
[7]	training's rmse: 1.17968	valid_1's rmse: 1.06872
[8]	training's rmse: 1.17443	valid_1's rmse: 1.06555
[9]	training's rmse: 1.16928	valid_1's rmse: 1.06224
[10]	training's rmse: 1.1642	valid_1's rmse: 1.05922
[11]	training's rmse: 1.15922	valid_1's rmse: 1.056
[12]	training's rmse: 1.15432	valid_1's rmse: 1.05294
[13]	training's rmse: 1.14949	valid_1's rmse: 1.05004
[14]	training's rmse: 1.14474	valid_1's rmse: 1.04721
[15]	training's rmse: 1.14003	valid_1's rmse: 1.04436
[16]	training's rmse: 1.1354	valid_1's rmse: 1.04156
[17]	training's rmse: 1.13083	valid_1's rmse: 1.03884
[18]	training's rmse: 1.12636	vali

[148]	training's rmse: 0.852749	valid_1's rmse: 0.938183
[149]	training's rmse: 0.852049	valid_1's rmse: 0.937839
[150]	training's rmse: 0.850953	valid_1's rmse: 0.937957
[151]	training's rmse: 0.849871	valid_1's rmse: 0.937941
[152]	training's rmse: 0.848995	valid_1's rmse: 0.937931
[153]	training's rmse: 0.847988	valid_1's rmse: 0.938026
[154]	training's rmse: 0.846995	valid_1's rmse: 0.93816
[155]	training's rmse: 0.846063	valid_1's rmse: 0.938359
[156]	training's rmse: 0.84512	valid_1's rmse: 0.938402
[157]	training's rmse: 0.844439	valid_1's rmse: 0.938085
[158]	training's rmse: 0.843618	valid_1's rmse: 0.938097
[159]	training's rmse: 0.842888	valid_1's rmse: 0.938127
[160]	training's rmse: 0.84192	valid_1's rmse: 0.938139
[161]	training's rmse: 0.841088	valid_1's rmse: 0.938111
[162]	training's rmse: 0.8402	valid_1's rmse: 0.938239
[163]	training's rmse: 0.839517	valid_1's rmse: 0.938164
[164]	training's rmse: 0.838836	valid_1's rmse: 0.937785
[165]	training's rmse: 0.837999	vali

month32: [503]	training's rmse: 0.724181	valid_1's rmse: 0.907445
total rmse new items: 2.445944661463226
with 12986 rows
total rmse old items: 0.7054814700379023
with 205669 rows

month33: [792]	training's rmse: 0.707068	valid_1's rmse: 0.909176
total rmse new items: 1.7559395179766482
with 20900 rows
total rmse old items: 0.780717545271791
with 217272 rows

27. 0.897893 (training 0.803958 196 rounds)
28. 0.829226 (training 0.700592 920 rounds)
29. 0.778054 (training 0.74054 419 rounds)
30. 0.709879 (training 0.736382 450 rounds)
31. 0.773381 
32. 0.907445
33. 0.909176
34. 0.94116

In [None]:
m.save_model('m_cheat.txt')

In [46]:
np.sort(matrix.date_block_num.unique())

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34])

In [51]:
testdf = matrix[matrix.date_block_num == 34]
preds = m.predict(testdf[features])
testdf['item_cnt_month'] = preds

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [78]:
np.sort(matrix.item_first_sale.unique())

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34])

In [77]:
with pd.option_context('display.max_rows', 200, 'display.max_columns', 200):
    display(testdf[(testdf.item_cnt_month>=13.15905)&(testdf.item_cnt_month<=13.15906)])#(testdf.shopid==25)&

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,city_code,item_category_id,type_code,subtype_code,item_cnt_month_lag_1,item_cnt_month_lag_2,item_cnt_month_lag_3,item_cnt_month_lag_6,item_cnt_month_lag_12,date_avg_item_cnt_lag_1,date_item_avg_item_cnt_lag_1,date_item_avg_item_cnt_lag_2,date_item_avg_item_cnt_lag_3,date_item_avg_item_cnt_lag_6,date_item_avg_item_cnt_lag_12,date_shop_avg_item_cnt_lag_1,date_shop_avg_item_cnt_lag_2,date_shop_avg_item_cnt_lag_3,date_shop_avg_item_cnt_lag_6,date_shop_avg_item_cnt_lag_12,date_cat_avg_item_cnt_lag_1,date_shop_cat_avg_item_cnt_lag_1,date_shop_type_avg_item_cnt_lag_1,date_shop_subtype_avg_item_cnt_lag_1,date_city_avg_item_cnt_lag_1,date_item_city_avg_item_cnt_lag_1,date_type_avg_item_cnt_lag_1,date_subtype_avg_item_cnt_lag_1,delta_price_lag,delta_revenue_lag_1,month,days,item_shop_last_sale,item_last_sale,item_shop_first_sale,item_first_sale
11077464,34,36,20949,13.159053,16,71,14,58,16.0,0.0,0.0,0.0,0.0,0.258545,17.453125,0.0,0.0,0.0,0.0,0.060974,0.0,0.0,0.0,0.0,17.453125,16.0,0.067993,16.0,0.100525,18.0,0.237305,17.453125,0.016693,0.0,10,30,1,-1,1,31


In [52]:
test = pd.read_csv('./data/test.csv')
test.loc[test.shop_id == 0, 'shop_id'] = 57
test.loc[test.shop_id == 1, 'shop_id'] = 58
test.loc[test.shop_id == 10, 'shop_id'] = 11
test.head(1)

Unnamed: 0,ID,shop_id,item_id
0,0,5,5037


In [53]:
submission = pd.merge(test,testdf[['shop_id','item_id','item_cnt_month']],on=['shop_id','item_id'],how='left')
submission.head(2)

Unnamed: 0,ID,shop_id,item_id,item_cnt_month
0,0,5,5037,1.290375
1,1,5,5320,0.237166


In [54]:
na_df = submission[submission.isna().any(axis=1)]
na_df

Unnamed: 0,ID,shop_id,item_id,item_cnt_month


In [55]:
submission = submission[['ID','item_cnt_month']]
print(submission.shape)

(214200, 2)


In [56]:
print(submission['item_cnt_month'].min())
print(submission['item_cnt_month'].max())

-0.5587035678330609
20.74627341237353


In [57]:
submission['item_cnt_month'] = submission['item_cnt_month'].clip(0,20)
print(submission['item_cnt_month'].min())
print(submission['item_cnt_month'].max())

0.0
20.0


In [75]:
submission.to_csv('submission_cheat.csv',index=False)

In [74]:
submission.head(1)

Unnamed: 0,ID,item_cnt_month
0,0,1.290375


In [61]:
existingitemids = matrix[matrix.date_block_num<=33]['item_id'].unique()

In [63]:
len(existingitemids)

21806

In [68]:
newitemids = matrix[(matrix.date_block_num==34)&(~matrix.item_id.isin(existingitemids))]['item_id'].unique()
print(len(newitemids))
print(len(matrix[(matrix.date_block_num==34)]['item_id'].unique()))
print(len(matrix[(matrix.date_block_num==34)&(matrix.item_id.isin(existingitemids))]['item_id'].unique()) + len(newitemids))

363
5100
5100


In [73]:
with pd.option_context('display.max_rows', 200, 'display.max_columns', 200):
    display(testdf[(testdf.date_block_num==34)&(~testdf.item_id.isin(newitemids))])
    #display(testdf[testdf.item_id.isin([5320])])

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,city_code,item_category_id,type_code,subtype_code,item_cnt_month_lag_1,item_cnt_month_lag_2,item_cnt_month_lag_3,item_cnt_month_lag_6,item_cnt_month_lag_12,date_avg_item_cnt_lag_1,date_item_avg_item_cnt_lag_1,date_item_avg_item_cnt_lag_2,date_item_avg_item_cnt_lag_3,date_item_avg_item_cnt_lag_6,date_item_avg_item_cnt_lag_12,date_shop_avg_item_cnt_lag_1,date_shop_avg_item_cnt_lag_2,date_shop_avg_item_cnt_lag_3,date_shop_avg_item_cnt_lag_6,date_shop_avg_item_cnt_lag_12,date_cat_avg_item_cnt_lag_1,date_shop_cat_avg_item_cnt_lag_1,date_shop_type_avg_item_cnt_lag_1,date_shop_subtype_avg_item_cnt_lag_1,date_city_avg_item_cnt_lag_1,date_item_city_avg_item_cnt_lag_1,date_type_avg_item_cnt_lag_1,date_subtype_avg_item_cnt_lag_1,delta_price_lag,delta_revenue_lag_1,month,days,item_shop_last_sale,item_last_sale,item_shop_first_sale,item_first_sale
10913804,34,5,5037,1.290375,3,19,5,10,0.0,1.0,3.0,1.0,1.0,0.258545,0.568359,2.511719,2.833984,1.977539,1.299805,0.190063,0.205933,0.245117,0.180054,0.206055,0.379150,0.513672,0.456055,0.447021,0.190063,0.0,0.494141,0.337402,-0.222046,-0.048553,10,30,1,1,14,14
10913806,34,5,5233,1.961521,3,19,5,10,1.0,3.0,1.0,3.0,0.0,0.258545,0.954590,1.860352,3.572266,1.613281,0.000000,0.190063,0.205933,0.245117,0.180054,0.000000,0.379150,0.513672,0.456055,0.447021,0.190063,1.0,0.494141,0.337402,0.496826,-0.048553,10,30,1,1,7,7
10913807,34,5,5232,0.357890,3,23,5,16,0.0,0.0,1.0,0.0,0.0,0.258545,0.636230,1.116211,1.547852,0.000000,0.000000,0.190063,0.205933,0.245117,0.000000,0.000000,0.337402,0.374268,0.456055,0.349121,0.190063,0.0,0.494141,0.331787,0.505371,-0.048553,10,30,1,1,3,3
10913809,34,5,5039,1.207568,3,23,5,16,1.0,1.0,0.0,3.0,0.0,0.258545,0.659180,2.162109,2.546875,1.590820,0.899902,0.190063,0.205933,0.245117,0.180054,0.206055,0.337402,0.374268,0.456055,0.349121,0.190063,1.0,0.494141,0.331787,-0.210693,-0.048553,10,30,1,1,14,14
10913810,34,5,5041,0.778931,3,20,5,11,2.0,3.0,0.0,0.0,0.0,0.258545,1.409180,5.699219,0.000000,0.000000,0.000000,0.190063,0.205933,0.000000,0.000000,0.000000,1.141602,0.796387,0.456055,0.684082,0.190063,2.0,0.494141,1.022461,0.031342,-0.048553,10,30,1,1,2,2
10913811,34,5,5046,0.224733,3,55,13,2,0.0,0.0,0.0,0.0,1.0,0.258545,0.272705,0.209351,0.309570,0.795410,0.580078,0.190063,0.205933,0.245117,0.180054,0.206055,0.196899,0.158203,0.104492,0.158203,0.190063,0.0,0.146973,0.196899,0.080505,-0.048553,10,30,1,1,12,12
10913812,34,5,5319,1.158728,3,55,13,2,0.0,3.0,2.0,2.0,5.0,0.258545,0.590820,1.000000,1.213867,1.500000,5.238281,0.190063,0.205933,0.245117,0.180054,0.206055,0.196899,0.158203,0.104492,0.158203,0.190063,0.0,0.146973,0.196899,0.005886,-0.048553,10,30,1,1,12,12
10913813,34,5,5003,0.689390,3,20,5,11,0.0,0.0,0.0,0.0,0.0,0.258545,2.158203,0.279053,0.000000,0.000000,0.000000,0.190063,0.205933,0.000000,0.000000,0.000000,1.141602,0.796387,0.456055,0.684082,0.190063,0.0,0.494141,1.022461,0.001506,-0.048553,10,30,-1,1,2,2
10913814,34,5,4806,3.354863,3,30,8,55,3.0,2.0,5.0,6.0,2.0,0.258545,2.794922,2.744141,5.523438,4.203125,1.540039,0.190063,0.205933,0.245117,0.180054,0.206055,0.591797,0.834473,0.319092,0.834473,0.190063,3.0,0.328857,0.591797,-0.341797,-0.048553,10,30,1,1,22,22
10913815,34,5,4843,0.093806,3,20,5,11,0.0,1.0,0.0,0.0,0.0,0.258545,0.318115,4.765625,0.000000,0.000000,0.000000,0.190063,0.205933,0.000000,0.000000,0.000000,1.141602,0.796387,0.456055,0.684082,0.190063,0.0,0.494141,1.022461,-0.006435,-0.048553,10,30,1,1,2,2


In [49]:
sample_cheat = matrix[(matrix.date_block_num==33)&(matrix.shop_id==21)&(matrix.item_id==14127)][features]

In [50]:
sample_cheat.to_csv('sample_cheat.csv',index=False)

In [62]:
matrix.item_last_sale.unique()

array([-1,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 24, 23, 25, 26, 27, 28, 29, 30, 31, 32, 33])

In [60]:
group = matrix[matrix.item_id==14127]
group[group.item_last_sale!=-1][['date_block_num','shop_id','item_id','item_last_sale']]

Unnamed: 0,date_block_num,shop_id,item_id,item_last_sale
7785874,22,2,14127,1
8102104,23,2,14127,1
8432071,24,2,14127,1
8738946,25,2,14127,1
9023407,26,2,14127,1
9302908,27,2,14127,1
9560100,28,2,14127,1
9792490,29,2,14127,1
10016862,30,2,14127,1
10245665,31,2,14127,1


date_block_num: month OK
shop_id: shopid OK
item_id: itemid OK
city_code: shopcityid OK
item_category_id: itemcategoryid OK
type_code: itemcategorytypeid OK
subtype_code: itemcategorysubtypeid OK
item_cnt_month_lag_1,2,3,6,12: tsshopiditemid1,2,3,6,12 OK

date_avg_item_cnt_lag_1: monthly mean of target over all rows of a month MISSING IN MY MODEL

date_item_avg_item_cnt_lag_1,2,3,6,12: tsitemid1,2,3,6,12 OK (my model: 6decimals, cheat: 4)
date_shop_avg_item_cnt_lag_1,2,3,6,12: tsshopid1,2,3,6,12 OK (my model: 6decimals, cheat: 4)
date_cat_avg_item_cnt_lag_1: tsitemcategoryid1 OK (my model: 6decimals, cheat: 4)
date_shop_cat_avg_item_cnt_lag_1: tsitemcategoryidshopid1 OK (my model: 6decimals, cheat: 4)
date_shop_type_avg_item_cnt_lag_1: tsitemcategorytypeidshopid1 OK (my model: 6decimals, cheat: 4)
date_shop_subtype_avg_item_cnt_lag_1: tsitemcategorysubtypeidshopid1 OK (my model: 6decimals, cheat: 4)
date_city_avg_item_cnt_lag_1: tsshopcityid1 OK (my model: 6decimals, cheat: 4)

date_item_city_avg_item_cnt_lag_1: monthly mean over itemid/shopcityid MISSING IN MY MODEL

date_type_avg_item_cnt_lag_1: tsitemcategorytypeid1 OK (my model: 6decimals, cheat: 4)
date_subtype_avg_item_cnt_lag_1: tsitemcategorysubtypeid1 OK (my model: 6decimals, cheat: 4)

delta_price_lag: should be ratiolastavgprice/100 - 1 (OK at some decimals près, check whole dataset)

delta_revenue_lag_1: revenue by shop lag 1 - shoprevenue avg / shoprevenue DIFFERENT ON MY MODEL (ratio shopid revenue lag1 / shopid revenue lag 2)

month: monthofyear -1 OK BUT DIFFERENT WITH A +1 CONSTANT

days: nbdays OK

item_shop_last_sale: current month - last month there was a row for itemid/shopid combinartion MISSING IN MY MODEL (-1 if first appearance)
item_last_sale: MISSING IN MY MODEL but no need, seems buggy according to implementation
item_shop_first_sale: current month - month of first time combination itemid/shopid appeared

item_first_sale: monthssinceitemlaunched - 1 OK BUT DIFFERENT WITH A +1 CONSTANT

In [68]:
sales_test[sales_test.date_block_num == 27][features].to_csv('compare_cheat.csv')

differences noted:
- checker d'où viennent les 0 en + dans le code cheat
- comparer les targets

In [71]:
cheat = pd.DataFrame()
cheat['targetcheat'] = matrix[matrix.date_block_num == 27]['item_cnt_month']
cheat.head(2)

Unnamed: 0,targetcheat
9299440,0.0
9299441,0.0


In [72]:
cheat.to_csv('targets_cheat.csv',index=False)