In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
from fastai.imports import *
from fastai.transforms import *
from fastai.conv_learner import *
from fastai.model import *
from fastai.dataset import *
from fastai.sgdr import *
from fastai.plots import *
from fastai.dataset import *
from fastai.structured import *
from fastai.column_data import *

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

from math import ceil

from itertools import product
import gc
from dateutil import parser

In [2]:
torch.cuda.set_device(0)

In [3]:
PATH = "data/"
sz=224
arch=resnext101_64
batch_size=5

## Feature Engineering

In [6]:
cat_vars = [
    'date_block_num',
    'item_id',
    'month',
    'shop_id',
    'year',
    'item_category_id',
    'item_category_name'
]

dep = 'target'

In [7]:
item_categories = pd.read_csv(f'{PATH}item_categories.csv')
items = pd.read_csv(f'{PATH}items.csv')
shops = pd.read_csv(f'{PATH}shops.csv')
sales = pd.read_csv(f'{PATH}sales_train.csv')
sales_test = pd.read_csv(f'{PATH}test.csv')

In [None]:
index_cols  = ['shop_id', 'item_id', 'date_block_num']

# For every month we create a grid from all shops/items combinations from that month
grid = [] 
for block_num in sales['date_block_num'].unique():
    cur_shops = sales[sales['date_block_num']==block_num]['shop_id'].unique()
    cur_items = sales[sales['date_block_num']==block_num]['item_id'].unique()
    grid.append(np.array(list(product(*[cur_shops, cur_items, [block_num]])),dtype='int32'))

#turn the grid into pandas dataframe
grid = pd.DataFrame(np.vstack(grid), columns = index_cols,dtype=np.int32)

#get aggregated values for (shop_id, item_id, month)
gb = sales.groupby(index_cols,as_index=False).agg({'item_cnt_day':{'target':'sum'},'item_price': np.mean})

#fix column names
gb.columns = [col[0] if col[-1]=='' else col[-1] for col in gb.columns.values]
#join aggregated data to the grid
# all_data = pd.merge(grid,gb,how='left',on=index_cols).fillna(0)
all_data = pd.merge(grid,gb,how='left',on=index_cols)
#sort the data
all_data.sort_values(['date_block_num','shop_id','item_id'],inplace=True)

# del grid,cur_shops,cur_items,gb
# del sales
gc.collect()

In [None]:
all_data.head()

In [None]:
gb.head()

In [None]:
all_data.columns

In [None]:
def downcast_dtypes(df):
    '''
        Changes column types in the dataframe: 
                
                `float64` type to `float32`
                `int64`   type to `int32`
    '''
    
    # Select columns to downcast
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols =   [c for c in df if df[c].dtype == "int64"]
    
    # Downcast
    df[float_cols] = df[float_cols].astype(np.float32)
    df[int_cols]   = df[int_cols].astype(np.int32)
    
    return df

all_data.fillna(0.0,inplace=True)
all_data['target'] = all_data['target'].clip(0.0,40.0)
all_data['month'] = all_data['date_block_num'] % 12
all_data['year'] = all_data['date_block_num'] // 12

In [None]:
sales_test['date_block_num'] = 34
sales_test['month'] = sales_test['date_block_num']%12
sales_test['year'] = sales_test['date_block_num']//12

all_data = pd.concat([all_data,sales_test])

# del sales_test
gc.collect()

In [None]:
all_data = all_data.merge(items, on="item_id")
all_data.drop('item_name',axis = 1,inplace=True)
all_data.sort_values(['date_block_num','shop_id','item_id'],inplace=True)

all_data = downcast_dtypes(all_data)

gc.collect()

In [None]:
lag_variable = ['target']
lags = [1 ,2 ,3 ,4, 5, 12]
for lag in lags:
    sales_new_df = all_data[['date_block_num','shop_id','item_id']+lag_variable].copy()
    sales_new_df.date_block_num+=lag
    sales_new_df.columns = ['date_block_num','shop_id','item_id']+ [lag_feat+'_lag_'+str(lag) for lag_feat in lag_variable]
    all_data = pd.merge(all_data, sales_new_df,on=['date_block_num','shop_id','item_id'] ,how='left')
    del sales_new_df
    gc.collect()

for feat in all_data.columns:
    if 'target' in feat:
        all_data[feat]=all_data[feat].fillna(0)

In [None]:

all_data['avg_sales_per_shop_id'] = 0.0
all_data['avg_sales_per_item_id'] = 0.0
all_data['avg_sales_per_item_cat_id'] = 0.0

avg_shop_sales_per_month = all_data.groupby(['shop_id','date_block_num'],as_index=False).agg({'target':'mean'})
avg_item_sales_per_month = all_data.groupby(['item_id','date_block_num'],as_index=False).agg({'target':'mean'})
avg_item_cat_sales_per_month = all_data.groupby(['item_category_id','date_block_num', ],as_index=False).agg({'target':'mean'})

#lets calculate  6 lags
lag_variable = ['item_id','shop_id','item_category_id']
lags = [1 ,2 ,3 ,4, 5, 12]
curr = 0
for lag in lags:
    diff = lag - curr
    curr = lag
    avg_shop_sales_per_month.date_block_num+=diff
    avg_item_cat_sales_per_month.date_block_num+=diff
    avg_item_sales_per_month.date_block_num+=diff
    
    avg_shop_sales_per_month.columns = ['shop_id','date_block_num']+ [lag_variable[1]+'_lag_'+str(lag)]
    avg_item_sales_per_month.columns = ['item_id','date_block_num']+ [lag_variable[0]+'_lag_'+str(lag)]
    avg_item_cat_sales_per_month.columns = ['item_category_id','date_block_num']+ [lag_variable[2]+'_lag_'+str(lag)]
    
    all_data = pd.merge(all_data, avg_shop_sales_per_month,on=['date_block_num','shop_id'] ,how='left')
    all_data = pd.merge(all_data, avg_item_sales_per_month,on=['date_block_num','item_id'] ,how='left')
    all_data = pd.merge(all_data, avg_item_cat_sales_per_month,on=['date_block_num','item_category_id'] ,how='left')
    
    gc.collect()
del avg_item_cat_sales_per_month,avg_item_sales_per_month,avg_shop_sales_per_month

In [None]:
all_data.drop(columns=['mean'],inplace=True)
all_data.sort_values(['date_block_num','shop_id','item_id'],inplace=True)

In [None]:
all_data.fillna(0,inplace=True)
gc.collect()

In [None]:
all_data.to_pickle(PATH + 'all_data.pkl')

In [None]:
all_data = pd.read_pickle(PATH + 'all_data.pkl')

In [None]:
all_data.columns

### Adding category names

In [None]:
l_cat = list(item_categories.item_category_name)
for ind in range(0,1):
    l_cat[ind] = 'PC Headsets / Headphones'
for ind in range(1,8):
    l_cat[ind] = 'Access'
l_cat[8] = 'Tickets (figure)'
l_cat[9] = 'Delivery of goods'
for ind in range(10,18):
    l_cat[ind] = 'Consoles'
for ind in range(18,25):
    l_cat[ind] = 'Consoles Games'
l_cat[25] = 'Accessories for games'
for ind in range(26,28):
    l_cat[ind] = 'phone games'
for ind in range(28,32):
    l_cat[ind] = 'CD games'
for ind in range(32,37):
    l_cat[ind] = 'Card'
for ind in range(37,43):
    l_cat[ind] = 'Movie'
for ind in range(43,55):
    l_cat[ind] = 'Books'
for ind in range(55,61):
    l_cat[ind] = 'Music'
for ind in range(61,73):
    l_cat[ind] = 'Gifts'
for ind in range(73,79):
    l_cat[ind] = 'Soft'
for ind in range(79,81):
    l_cat[ind] = 'Office'
for ind in range(81,83):
    l_cat[ind] = 'Clean'
l_cat[83] = 'Elements of a food'

In [None]:
all_data['item_category_name'] = all_data.item_category_id.apply(
    lambda x: l_cat[x]
)

In [None]:
cat_list = list(all_data.item_category_name)

In [None]:
all_data.columns

In [None]:
for cat in cat_vars:
    all_data[cat] = all_data[cat].astype('category')
all_data['date_block_num'] = all_data['date_block_num'].cat.as_ordered()
all_data['month'] = all_data['month'].cat.as_ordered()
all_data['year'] = all_data['year'].cat.as_ordered()

In [None]:
# X_train = all_data[(all_data['date_block_num']>12)&(all_data['date_block_num']<33)].drop(['target','ID'], axis = 1)
# y_train = all_data[(all_data['date_block_num']>12)&(all_data['date_block_num']<33)]['target']
X_train = all_data[all_data['date_block_num']<33].drop(['target','ID'], axis = 1)
y_train = all_data[all_data['date_block_num']<33]['target']
X_val =  all_data[all_data['date_block_num']==33].drop(['target','ID'], axis = 1)
y_val = all_data[all_data['date_block_num'] == 33]['target']
X_test = all_data[(all_data['date_block_num']==34)].drop(['target'],axis=1)
del all_data

In [None]:
X_test = X_test.drop('ID', axis=1)

In [None]:
gc.collect()

In [None]:
X_train.to_pickle(PATH + 'X_train.pkl')
y_train.to_pickle(PATH + 'y_train.pkl')
X_val.to_pickle(PATH + 'X_val.pkl')
y_val.to_pickle(PATH + 'y_val.pkl')
X_test.to_pickle(PATH + 'X_test.pkl')

In [None]:
X_train.head().columns

In [None]:
X_train.tail()

In [None]:
X_test.head()

# Training

In [15]:
X_train = pd.read_pickle(PATH + 'X_train.pkl')
y_train = pd.read_pickle(PATH + 'y_train.pkl')
X_val = pd.read_pickle(PATH + 'X_val.pkl')
y_val = pd.read_pickle(PATH + 'y_val.pkl')
X_test = pd.read_pickle(PATH + 'X_test.pkl')

In [16]:
orig_len = len(X_test)
X_test = X_test.merge(sales_test, on=['item_id', 'shop_id'], how='inner')
X_test = X_test.set_index('ID').sort_index().reset_index().drop('ID', axis=1)
assert len(X_test) == orig_len  # sanity check

ValueError: Buffer dtype mismatch, expected 'Python object' but got 'long'

Exception ignored in: 'pandas._libs.lib.is_bool_array'
ValueError: Buffer dtype mismatch, expected 'Python object' but got 'long'


ValueError: Buffer dtype mismatch, expected 'Python object' but got 'long'

Exception ignored in: 'pandas._libs.lib.is_bool_array'
ValueError: Buffer dtype mismatch, expected 'Python object' but got 'long'


ValueError: Buffer dtype mismatch, expected 'Python object' but got 'long'

Exception ignored in: 'pandas._libs.lib.is_bool_array'
ValueError: Buffer dtype mismatch, expected 'Python object' but got 'long'


ValueError: Buffer dtype mismatch, expected 'Python object' but got 'long'

Exception ignored in: 'pandas._libs.lib.is_bool_array'
ValueError: Buffer dtype mismatch, expected 'Python object' but got 'long'


In [58]:
def rmse(y_pred, targ):
    return np.sqrt(((targ - y_pred)**2).mean())

def rmse_expm1(y_pred, targ):
    return rmse(y_pred=np.expm1(y_pred), targ=np.expm1(targ))

## _Deep Learning Approach_

In [75]:
X_train['target'] = y_train
X_val['target'] = y_val
X_test['target'] = 0

In [76]:
# For faster prototyping
X_train = X_train[X_train['date_block_num'] > 12]

In [77]:
trn_df, trn_y, nas, mapper = proc_df(X_train, 'target', do_scale=True)
val_df, val_y, nas, mapper = proc_df(
    X_val,
    dep,
    do_scale=True,
    mapper=mapper,
)
test_df, _, nas, mapper = proc_df(
    X_test,
    dep,
    do_scale=True,
    mapper=mapper,
)

In [78]:
# reduces volatility in the loss function
trn_y = np.log1p(trn_y)
val_y = np.log1p(val_y)

In [None]:
trn_df.to_pickle(PATH + 'trn_df.pkl')
val_df.to_pickle(PATH + 'val_df.pkl')
test_df.to_pickle(PATH + 'test_df.pkl')

In [None]:
trn_df.head()

In [None]:
test_df.head()

In [79]:
md = ColumnarModelData.from_data_frames(
    path=PATH,
    trn_df=trn_df,
    trn_y=trn_y,
    val_df=val_df,
    val_y=val_y,
    cat_flds=cat_vars,
    bs=128,
    test_df=test_df,
    is_reg=True,  # is regression
    is_multi=False, 
)

In [23]:
cat_sz = [(c, len(X_train[c].cat.categories)+1) for c in cat_vars]

In [24]:
cat_sz

[('date_block_num', 36),
 ('item_id', 22171),
 ('month', 13),
 ('shop_id', 61),
 ('year', 4),
 ('item_category_id', 85),
 ('item_category_name', 19)]

In [25]:
emb_szs = [(c, min(50, (c+1)//2)) for _,c in cat_sz]

In [26]:
emb_szs

[(36, 18), (22171, 50), (13, 7), (61, 31), (4, 2), (85, 43), (19, 10)]

## _First Attempt - No Regularization_

In [None]:
m = md.get_learner(
    emb_szs=emb_szs,
    n_cont=len(trn_df.columns) - len(cat_vars),
    emb_drop=.02,
    out_sz=1,
    szs=[1000, 500, 500],
    drops=[.001, .005, .005],
    use_bn=True,
)

In [None]:
m.crit = nn.MSELoss()

In [None]:
m.lr_find(1e-6)

In [None]:
m.sched.plot()

In [None]:
lr = 1e-5
m.fit(lrs=lr, n_cycle=2, metrics=[rmse, rmse_expm1], cycle_len=1, cycle_mult=2, wds=1e-6)

## _New Attempt - Adding Regularization_

The above model seems to be converging on _something_, but it's overfitting
far too much. Here we will increase dropout in the last linear layer
and include weight decay

In [98]:
gc.collect()

641

In [97]:
m = md.get_learner(
    emb_szs=emb_szs,
    n_cont=len(trn_df.columns) - len(cat_vars),
    emb_drop=.05,
    out_sz=1,
    szs=[1000, 500, 500],
    drops=[.001, .005, .01],
    use_bn=True,
    y_range=(np.log1p(0), np.log1p(20))
)

  for o in self.lins: kaiming_normal(o.weight.data)
  kaiming_normal(self.outp.weight.data)


In [96]:
??md.get_learner

In [None]:
m

In [99]:
m.crit = nn.MSELoss()

In [None]:
m.lr_find(1e-6, wds=1e-3)

In [None]:
m.sched.plot()

Interestingly, adding weight decay made the plot much more noisy. I'm not
sure what causes this, but it could be an intersting thing to investigate

In [None]:
lr = 1e-5
m.fit(lrs=lr, n_cycle=2, metrics=[rmse, rmse_expm1], cycle_len=1, cycle_mult=2, wds=1e-3)

In [None]:
lr = 1e-5
m.fit(lrs=lr, n_cycle=2, metrics=[rmse, rmse_expm1], wds=1e-3, cycle_len=3)

### _Simplifying the Model_

Here we will run the same code as above, except with less regularization and more
dropout

In [111]:
gc.collect()

116

In [105]:
m = md.get_learner(
    emb_szs=emb_szs,
    n_cont=len(trn_df.columns) - len(cat_vars),
    emb_drop=.1,
    out_sz=1,
    szs=[1000, 500],
    drops=[.1, .15],
    use_bn=True,
    y_range=(np.log1p(0), np.log1p(20))
)

  for o in self.lins: kaiming_normal(o.weight.data)
  kaiming_normal(self.outp.weight.data)


In [106]:
m.crit = nn.MSELoss()

In [None]:
m.lr_find()

In [None]:
m.sched.plot()

In [107]:
lr = 1e-4
m.fit(lrs=lr, n_cycle=2, metrics=[rmse, rmse_expm1], cycle_len=1, cycle_mult=2, wds=3e-4)

HBox(children=(IntProgress(value=0, description='Epoch', max=3), HTML(value='')))

100%|██████████| 45622/45622 [04:50<00:00, 157.27it/s, loss=0.0823]

KeyboardInterrupt: 

In [110]:
validation_predictions = m.predict()
np.sqrt((
    (np.expm1(val_y.reshape(len(val_y), 1)).clip(0,20) - np.expm1(validation_predictions).clip(0,20))**2
).mean())

0.96739256

In [92]:
np.argmax((np.expm1(val_y.reshape(len(val_y), 1).clip(0,20)) - np.expm1(validation_predictions.clip(0,20))))

151473

In [91]:
np.expm1(val_y[151473])

40.0

### Make Predictions on the Model

In [112]:
len(y_train)

result = m.predict(is_test=True)
predictions = np.expm1(result).clip(0, 20)
# predictions = result.clip(0,20)

In [None]:
??np.mean

## Gradient Boost Regressor

This is the most common methodology that I see in this Kaggle competition. Gradient boosting is
a well-established method for tabular data. It's historically been a great approach
in Kaggle competitions.

From peeking at other kernels, people have actually gotten worse results with
using the entire dataset with gradient boosting. The goal of the deep 
learning approach is to capture the entire dataset and come up with better generalization.

In [113]:
X_train['item_category_name'] = X_train['item_category_name'].cat.codes

In [114]:
X_test['item_category_name'] = X_test['item_category_name'].cat.codes

In [115]:
X_val['item_category_name'] = X_val['item_category_name'].cat.codes

In [116]:
for var in cat_vars:
    if var == 'item_category_name':
        continue # hack
    X_train[var] = X_train[var].astype('int32')
    X_val[var] = X_val[var].astype('int32')
    X_test[var] = X_test[var].astype('int32')

In [117]:
from xgboost.sklearn import XGBRegressor
from sklearn.metrics import mean_squared_error
from math import sqrt
params = {
    'objective': 'reg:linear',
    'max_depth':4 ,
    'learning_rate': 1,
    'silent': 1.0,
    'n_estimators': 30,
    'min_child_weight':10
}

bst = XGBRegressor(**params).fit(X_train, y_train,eval_metric='rmse')
preds = bst.predict(X_val)
sqrt(mean_squared_error(y_val, preds))



XGBoostError: b'[08:00:53] src/objective/regression_obj.cc:90: Check failed: (preds.size()) == (info.labels.size()) labels are not correctly providedpreds.size=5839576, label.size=10675678'

In [None]:
from xgboost.sklearn import XGBRegressor
from sklearn.metrics import mean_squared_error
from math import sqrt
params = {
    'objective': 'reg:linear',
    'max_depth':4 ,
    'learning_rate': 1,
    'silent': 1.0,
    'n_estimators': 30,
    'min_child_weight':10
}

X = pd.concat([X_train,X_val])
y = np.concatenate([y_train,y_val])
del X_train,X_val,y_train,y_val
gc.collect()

In [None]:
bst = XGBRegressor(**params).fit(X, y,eval_metric='rmse')

In [None]:
predictions = bst.predict(X_test)
predictions = np.clip(predictions,0.0,20.0)

## Submit It!

In [118]:
display(predictions)

array([[ 0.27717],
       [ 0.05289],
       [ 0.46136],
       ..., 
       [ 0.05169],
       [ 0.04469],
       [ 0.04236]], dtype=float32)

In [121]:
final_dataframe = pd.read_csv(f'{PATH}test.csv')

In [122]:
final_dataframe['item_cnt_month'] = predictions
final_dataframe = final_dataframe[['ID', 'item_cnt_month']]

In [123]:
final_dataframe.head()

Unnamed: 0,ID,item_cnt_month
0,0,0.277168
1,1,0.05289
2,2,0.461358
3,3,0.208736
4,4,0.162292


In [124]:
len(final_dataframe)

214200

In [125]:
np.unique(predictions)

array([  0.02227,   0.02241,   0.02243, ...,  19.2229 ,  19.25305,  19.27076], dtype=float32)

In [126]:
len(np.unique(final_dataframe['ID']))

214200

In [127]:
SUBM = 'sub/'
os.makedirs(SUBM, exist_ok=True)
final_dataframe.to_csv(f'{SUBM}subm.gz', compression='gzip', index=False)

In [128]:
FileLink(f'{SUBM}subm.gz')