In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import feather
from fastai import *
from fastai.tabular import *
from dataclasses import dataclass
from pathlib import Path
from typing import *


In [2]:
plt.style.use(['dark_background'])

In [5]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
default_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Data preparation

In [4]:
# Data root under the user's home directory, and the Rossmann competition folder inside it.
DATA_PATH = Path.home()/".fastai/data"
PATH = DATA_PATH/"competitions/rossmann"
# Read the `train_clean` / `test_clean` feather files into DataFrames.
# NOTE(review): the saved output of this cell is a TypeError about an unexpected
# 'nthreads' keyword in read_feather(), while later cells show data loaded fine.
# This looks like a stale run under a mismatched pandas / feather install — confirm the environment.
train_df = pd.read_feather(PATH/'train_clean')
test_df = pd.read_feather(PATH/'test_clean')

TypeError: read_feather() got an unexpected keyword argument 'nthreads'

In [7]:
train_df.head()

Unnamed: 0,index,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,...,AfterStateHoliday,BeforeStateHoliday,AfterPromo,BeforePromo,SchoolHoliday_bw,StateHoliday_bw,Promo_bw,SchoolHoliday_fw,StateHoliday_fw,Promo_fw
0,0,1,5,2015-07-31,5263,555,1,1,False,1,...,57,0,0,0,5.0,0.0,5.0,7.0,0.0,5.0
1,1,2,5,2015-07-31,6064,625,1,1,False,1,...,67,0,0,0,5.0,0.0,5.0,1.0,0.0,1.0
2,2,3,5,2015-07-31,8314,821,1,1,False,1,...,57,0,0,0,5.0,0.0,5.0,5.0,0.0,5.0
3,3,4,5,2015-07-31,13995,1498,1,1,False,1,...,67,0,0,0,5.0,0.0,5.0,1.0,0.0,1.0
4,4,5,5,2015-07-31,4822,559,1,1,False,1,...,57,0,0,0,5.0,0.0,5.0,1.0,0.0,1.0


In [8]:
test_df.head()

Unnamed: 0,index,Id,Store,DayOfWeek,Date,Open,Promo,StateHoliday,SchoolHoliday,Year,...,AfterStateHoliday,BeforeStateHoliday,AfterPromo,BeforePromo,SchoolHoliday_bw,StateHoliday_bw,Promo_bw,SchoolHoliday_fw,StateHoliday_fw,Promo_fw
0,0,1,1,4,2015-09-17,1.0,1,False,0,2015,...,105,0,0,0,0.0,0.0,4.0,0.0,0.0,1.0
1,1,2,3,4,2015-09-17,1.0,1,False,0,2015,...,105,0,0,0,0.0,0.0,4.0,0.0,0.0,1.0
2,2,3,7,4,2015-09-17,1.0,1,False,0,2015,...,115,0,0,0,0.0,0.0,4.0,0.0,0.0,1.0
3,3,4,8,4,2015-09-17,1.0,1,False,0,2015,...,115,0,0,0,0.0,0.0,4.0,0.0,0.0,1.0
4,4,5,9,4,2015-09-17,1.0,1,False,0,2015,...,105,0,0,0,0.0,0.0,4.0,0.0,0.0,1.0


In [9]:
# Columns passed to the data bunch as categorical inputs (`cat_names`); each one
# later gets its own embedding.
# NOTE(review): 22 names are listed here, but the categorical tensor reported after
# building the data bunch has 24 columns and the model has 24 embeddings. Presumably
# a transform (FillMissing?) adds two indicator columns and extends this list — TODO confirm.
cat_vars = ['Store', 'DayOfWeek', 'Year', 'Month', 'Day', 'StateHoliday', 'CompetitionMonthsOpen',
    'Promo2Weeks', 'StoreType', 'Assortment', 'PromoInterval', 'CompetitionOpenSinceYear', 'Promo2SinceYear',
    'State', 'Week', 'Events', 'Promo_fw', 'Promo_bw', 'StateHoliday_fw', 'StateHoliday_bw',
    'SchoolHoliday_fw', 'SchoolHoliday_bw']

# Columns passed to the data bunch as continuous inputs (`cont_names`); 16 in total.
cont_vars = ['CompetitionDistance', 'Max_TemperatureC', 'Mean_TemperatureC', 'Min_TemperatureC',
   'Max_Humidity', 'Mean_Humidity', 'Min_Humidity', 'Max_Wind_SpeedKm_h', 
   'Mean_Wind_SpeedKm_h', 'CloudCover', 'trend', 'trend_DE',
   'AfterStateHoliday', 'BeforeStateHoliday', 'Promo', 'SchoolHoliday']

# Number of rows in the full training table (displayed as the cell output).
n = len(train_df); n

844338

In [10]:
# Build two tiny frames (1000 rows each) for quick experiments.
n_samples = 2000
small_cont_vars = ['CompetitionDistance', 'Mean_Humidity']
small_cat_vars = ['Store', 'DayOfWeek', 'PromoInterval']

# Draw n_samples distinct row positions at random, then sort them ascending.
# The "train" half is therefore the 1000 lowest positions and the "test" half
# the 1000 highest. No seed is set, so the selection differs between runs.
idx = np.sort(np.random.permutation(range(n))[:n_samples])

# Select the rows, then keep only the small feature set plus the target.
small_train_df = train_df.iloc[idx[:1000]][small_cat_vars + small_cont_vars + ['Sales']]
small_test_df = train_df.iloc[idx[1000:]][small_cat_vars + small_cont_vars + ['Sales']]

In [11]:
small_train_df.head()

Unnamed: 0,Store,DayOfWeek,PromoInterval,CompetitionDistance,Mean_Humidity,Sales
1,2,5,"Jan,Apr,Jul,Oct",570.0,62,6064
359,361,5,"Feb,May,Aug,Nov",4490.0,55,9806
736,738,5,,5980.0,61,9504
1057,1060,5,"Feb,May,Aug,Nov",3430.0,61,11782
1448,337,4,"Feb,May,Aug,Nov",10600.0,56,6031


In [12]:
small_test_df.head()

Unnamed: 0,Store,DayOfWeek,PromoInterval,CompetitionDistance,Mean_Humidity,Sales
428325,833,2,"Mar,Jun,Sept,Dec",3290.0,70,4152
429166,564,1,"Jan,Apr,Jul,Oct",6540.0,79,4853
429260,658,1,"Jan,Apr,Jul,Oct",520.0,79,5163
429740,948,7,,1430.0,69,8869
429993,250,6,"Feb,May,Aug,Nov",3520.0,88,7589


In [13]:
# Target column for the regression.
dep_var = 'Sales'
# Re-read the training table from disk and keep only the model inputs, the target
# and the Date column. `.copy()` detaches the result from the frame that was read.
train_df = pd.read_feather(PATH/'train_clean')[cat_vars+cont_vars+[dep_var, 'Date']].copy()

In [14]:
len(test_df)

41088

In [15]:
# Choose the validation cut-off so that the hold-out is roughly as long as the test set.
# Take the Date stored at index label len(test_df), then use the largest index label
# that still has that same date, so the boundary falls on the last row of a date.
# NOTE(review): `train_df.Date[len(test_df)]` is a label lookup; this assumes the frame
# still has its default integer index (labels == positions) — confirm.
cut = train_df.Date[(train_df.Date == train_df.Date[len(test_df)])].index.max()
cut

41395

In [16]:
# Move Date out of the columns and into the index. This cell rebinds `train_df`,
# so running it a second time fails because the 'Date' column is gone.
train_df = train_df.set_index('Date')

In [17]:
# Positional split: the first `cut` rows become the validation set, the rest the training set.
# NOTE(review): `cut` was computed as the *largest* label of its date, and `iloc[:cut]`
# excludes position `cut`, so that one row ends up in training — confirm this is intended.
# This cell rebinds `train_df`; re-running it splits the already-reduced frame again.
train_df, valid_df = train_df.iloc[cut:], train_df.iloc[:cut]
len(train_df), len(valid_df)

(802943, 41395)

In [18]:
# Preprocessing transforms handed to the data bunch.
tfms = [FillMissing, Categorify]
# Build the fastai tabular data bunch from the train/validation frames, with `dep_var`
# as the target and the categorical / continuous column lists declared above.
# NOTE(review): `log_output=True` presumably makes the target log(Sales); the
# exp_rmspe metric exponentiates predictions and targets, which matches — confirm.
data = TabularDataBunch.from_df(PATH, train_df, valid_df, dep_var, tfms=tfms,
                    cat_names=cat_vars, cont_names=cont_vars, log_output=True)

In [19]:
data.train_ds.conts.shape, data.train_ds.cats.shape

(torch.Size([802943, 16]), torch.Size([802943, 24]))

## Model

In [20]:
# Cardinality of every categorical column, plus one extra slot per column.
# Reading `.cat.categories` requires the columns of `train_df` to already have
# categorical dtype (presumably set in place by the Categorify transform — TODO confirm).
cat_szs = [len(train_df[col_name].cat.categories) + 1 for col_name in cat_vars]
# (cardinality, embedding width) pairs: width is about half the cardinality, capped at 50.
emb_szs = [(card, min(50, (card + 1) // 2)) for card in cat_szs]

In [21]:
# Log of the largest Sales value seen in the training frame.
max_log_y = np.log(train_df['Sales'].max())
# Output range handed to the model: from 0 up to 20% above the largest log-target.
y_range = torch.tensor([0, max_log_y*1.2], device=default_device)

In [22]:
# Tabular network: one embedding per categorical column (sizes from `emb_szs`),
# len(cont_vars) continuous inputs, and a single output.
# [1000, 500] are the hidden layer widths (the printed model shows Linear(219 -> 1000)).
# NOTE(review): [0.001, 0.01] are presumably the per-layer dropout probabilities — confirm
# against the TabularModel signature of the installed fastai version.
model = TabularModel(emb_szs, len(cont_vars), 1, [1000,500], [0.001,0.01], emb_drop=0.04, y_range=y_range)

In [23]:
model

TabularModel(
  (embeds): ModuleList(
    (0): Embedding(1116, 50)
    (1): Embedding(8, 4)
    (2): Embedding(4, 2)
    (3): Embedding(13, 7)
    (4): Embedding(32, 16)
    (5): Embedding(3, 2)
    (6): Embedding(26, 13)
    (7): Embedding(27, 14)
    (8): Embedding(5, 3)
    (9): Embedding(4, 2)
    (10): Embedding(4, 2)
    (11): Embedding(24, 12)
    (12): Embedding(9, 5)
    (13): Embedding(13, 7)
    (14): Embedding(53, 27)
    (15): Embedding(22, 11)
    (16): Embedding(7, 4)
    (17): Embedding(7, 4)
    (18): Embedding(4, 2)
    (19): Embedding(4, 2)
    (20): Embedding(9, 5)
    (21): Embedding(9, 5)
    (22): Embedding(3, 2)
    (23): Embedding(3, 2)
  )
  (emb_drop): Dropout(p=0.04)
  (bn_cont): BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): Sequential(
    (0): Linear(in_features=219, out_features=1000, bias=True)
    (1): ReLU(inplace)
    (2): BatchNorm1d(1000, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  

In [24]:
def exp_rmspe(pred:Tensor, targ:Tensor) -> Rank0Tensor:
    """Root-mean-square percentage error measured after exponentiating both inputs.

    `pred` and `targ` are exponentiated first, so callers pass log-scale values.
    The relative error is taken with respect to the exponentiated target.
    """
    actual = torch.exp(targ)
    rel_err = (actual - torch.exp(pred)) / actual
    return (rel_err ** 2).mean().sqrt()

In [25]:
# Wrap the data bunch and the model in a fastai Learner.
learn = Learner(data, model)
# Train with mean-squared error (on whatever scale the data bunch provides the target).
learn.loss_fn = F.mse_loss
# Report exp_rmspe as the validation metric.
learn.metrics = [exp_rmspe]

## Train 

In [None]:
learn.fit_one_cycle(5, 1e-3, wd=0.2, pct_start=0.2)

In [None]:
learn.fit_one_cycle(5, 1e-3, wd=0.1, pct_start=0.3)

In [None]:
# Recompute the exponentiated RMSPE over the whole validation set by hand.
# Unlike averaging a per-batch metric, this sums squared errors over all rows first.

# Switch to evaluation mode so dropout is disabled and batch-norm uses its running
# statistics. This makes the score independent of the mode the model was left in.
learn.model.eval()
with torch.no_grad():
    pct_var,cnt = 0.,0
    for x,y in learn.data.valid_dl:
        out = learn.model(*x)
        cnt += y.size(0)
        # Flatten both tensors so a trailing singleton dimension on either side
        # cannot trigger an accidental (batch x batch) broadcast in the subtraction.
        y, out = torch.exp(y).reshape(-1), torch.exp(out).reshape(-1)
        pct_var += ((y - out)/y).pow(2).sum()
torch.sqrt(pct_var/cnt).item()

In [None]:
# Export every learned embedding matrix as a NumPy array.
# `.detach().cpu()` copies the weights to host memory without moving the model's
# own modules, so the model stays usable on its original device afterwards.
emb_weights = [e.weight.detach().cpu().numpy() for e in learn.model.embeds]

# The matrices have different shapes, so store them in an explicit 1-D object array.
# Recent NumPy versions refuse to build a ragged array implicitly.
embs = np.empty(len(emb_weights), dtype=object)
for i, w in enumerate(emb_weights):
    embs[i] = w

# Object arrays are pickled inside the .npy file; load with allow_pickle=True.
np.save(PATH/'embs.npy', embs)

## Reuse

In [2]:
import xgboost as xgb
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn import neighbors

  from numpy.core.umath_tests import inner1d


### Data Acquisition from Learner

In [26]:
# Pull the processed training inputs out of the data bunch as NumPy arrays:
# categorical inputs and continuous inputs separately.
cats_train = data.train_ds.cats.numpy()
conts_train = data.train_ds.conts.numpy()

In [27]:
cats_train.shape, conts_train.shape

((802943, 24), (802943, 16))

In [55]:
# Baseline feature matrix: categorical columns followed by continuous ones,
# side by side; the target comes straight from the training dataset.
x_train,y_train = np.concatenate((cats_train, conts_train), axis=1), data.train_ds.y.numpy()
x_train.shape, y_train.shape

((802943, 40), (802943,))

In [29]:
# Same extraction as for the training set, applied to the validation dataset.
cats_valid = data.valid_ds.cats.numpy()
conts_valid = data.valid_ds.conts.numpy()

In [30]:
cats_valid.shape, conts_valid.shape

((41395, 24), (41395, 16))

In [31]:
# Validation feature matrix and target, laid out like x_train / y_train.
x_valid,y_valid = np.concatenate((cats_valid, conts_valid), axis=1), data.valid_ds.y.numpy()

### RF Baseline

In [153]:
# Random-forest regressor used for both the baseline and the embedding-feature runs:
# 200 trees, depth capped at 35, all CPU cores.
# No random_state is set, so fitted forests (and scores) vary between runs.
m = RandomForestRegressor(n_estimators=200, max_depth=35,
                min_samples_split=2, min_samples_leaf=1, n_jobs=-1)
# Smaller, faster alternative kept for quick experiments:
#m = RandomForestRegressor(n_estimators=20, n_jobs=-1)

In [47]:
def rmse(pred,y):
    """Return the root-mean-square error between `pred` and `y` as a Python float."""
    squared_err = (pred - y) ** 2
    return math.sqrt(squared_err.mean())
def mae(pred,y):
    """Mean absolute *percentage* error measured after exponentiating both inputs.

    Despite the name, this is not a plain mean absolute error: `pred` and `y`
    are exponentiated first, and each absolute error is divided by the
    exponentiated target before averaging.
    """
    actual = np.exp(y)
    return np.absolute((np.exp(pred) - actual) / actual).mean()

def print_score(m, metric):
    """Print train/validation scores for a fitted model `m`.

    Reads the notebook-level globals x_train, y_train, x_valid and y_valid.
    The printed list holds, in order:
      * metric on the training set, metric on the validation set;
      * m.score(...) on train and validation, only when x_train is a NumPy array;
      * m.oob_score_, only when the model exposes that attribute.
    """
    scores = [metric(m.predict(features), target)
              for features, target in ((x_train, y_train), (x_valid, y_valid))]
    if isinstance(x_train, np.ndarray):
        scores += [m.score(x_train, y_train), m.score(x_valid, y_valid)]
    if hasattr(m, 'oob_score_'):
        scores.append(m.oob_score_)
    print(scores)

In [133]:
%time m.fit(x_train,y_train)

In [133]:
print_score(m, mae)

CPU times: user 6min 34s, sys: 2.04 s, total: 6min 36s
Wall time: 59.7 s
[0.04654200870419245, 0.14236726547932022, 0.9880543579737189, 0.8818696263048807]


### RF + Emb

In [32]:
# Load the embedding matrices exported by the training section of this notebook.
# The file holds an object array (one differently shaped matrix per categorical
# column), which NumPy >= 1.16.3 only loads when pickling is explicitly allowed.
# Pickle can execute arbitrary code: only load an embs.npy produced by this notebook.
embs = np.load(PATH/'embs.npy', allow_pickle=True)

In [33]:
# Replace each categorical code with its learned embedding vector.
# For column c, embs[c] is that column's embedding matrix and cats_*[:, c] holds the row indices.
embc_train = [embs[c][cats_train[:, c]] for c, _ in enumerate(data.train_ds.cat_names)]
embc_valid = [embs[c][cats_valid[:, c]] for c, _ in enumerate(data.train_ds.cat_names)]

# Feature matrix = all embedding blocks side by side, then the continuous columns.
x_train = np.concatenate([*embc_train, conts_train], axis=1)
x_valid = np.concatenate([*embc_valid, conts_valid], axis=1)

In [34]:
x_train.shape, x_valid.shape

((802943, 219), (41395, 219))

In [158]:
%time m.fit(x_train,y_train)

In [158]:
print_score(m, mae)

CPU times: user 44min 31s, sys: 7.59 s, total: 44min 39s
Wall time: 6min 40s
[0.0428516384693047, 0.12918475899251264, 0.9898736224745616, 0.9027333743575678]


### GBT Baseline

In [67]:
# Reset the features to the raw baseline (categorical codes + continuous columns),
# replacing the embedding-based matrices built in the previous section.
x_train,y_train = np.concatenate((cats_train, conts_train), axis=1), data.train_ds.y.numpy()
x_valid,y_valid = np.concatenate((cats_valid, conts_valid), axis=1), data.valid_ds.y.numpy()

In [67]:
# Wrap the feature arrays in xgboost DMatrix objects with their labels attached.
# x_train / x_valid are rebound from ndarrays to DMatrix here, so print_score
# skips its m.score(...) entries afterwards and prints only the two metric values.
x_train = xgb.DMatrix(x_train, label=y_train)
x_valid = xgb.DMatrix(x_valid, label=y_valid)
# Only the training set is watched during boosting; no validation entry is listed.
evallist = [(x_train, 'train')]

In [68]:
# Watch list for xgb.train (same assignment as in the previous cell; training set only).
evallist = [(x_train, 'train')]
# Booster hyper-parameters shared by the baseline and the embedding-feature runs.
# NOTE(review): 'silent' and 'reg:linear' are deprecated in newer xgboost releases
# (replaced by 'verbosity' and 'reg:squarederror') — confirm against the installed version.
param = {'nthread': -1,
         'max_depth': 7,
         'eta': 0.02,
         'silent': 1,
         'objective': 'reg:linear',
         'colsample_bytree': 0.7,
         'subsample': 0.7}
# Number of boosting rounds passed to xgb.train.
num_round = 3000

In [69]:
# GPU settings
param['gpu_id'] = 0
param['max_bin'] = 16
param['tree_method'] = 'gpu_hist'

In [70]:
%time m = xgb.train(param, x_train, num_round, evallist)

[0]	train-rmse:8.10297
[1]	train-rmse:7.94124
[2]	train-rmse:7.78274
[3]	train-rmse:7.62741
[4]	train-rmse:7.47518
[5]	train-rmse:7.32606
[6]	train-rmse:7.17988
[7]	train-rmse:7.03663
[8]	train-rmse:6.89628
[9]	train-rmse:6.75875
[10]	train-rmse:6.62396
[11]	train-rmse:6.49191
[12]	train-rmse:6.36246
[13]	train-rmse:6.23559
[14]	train-rmse:6.1113
[15]	train-rmse:5.98946
[16]	train-rmse:5.87008
[17]	train-rmse:5.7531
[18]	train-rmse:5.63845
[19]	train-rmse:5.52613
[20]	train-rmse:5.41606
[21]	train-rmse:5.30822
[22]	train-rmse:5.20249
[23]	train-rmse:5.09892
[24]	train-rmse:4.99744
[25]	train-rmse:4.89796
[26]	train-rmse:4.80049
[27]	train-rmse:4.70498
[28]	train-rmse:4.61138
[29]	train-rmse:4.51968
[30]	train-rmse:4.42981
[31]	train-rmse:4.34174
[32]	train-rmse:4.25544
[33]	train-rmse:4.17088
[34]	train-rmse:4.088
[35]	train-rmse:4.0068
[36]	train-rmse:3.92723
[37]	train-rmse:3.84928
[38]	train-rmse:3.77289
[39]	train-rmse:3.69803
[40]	train-rmse:3.62471
[41]	train-rmse:3.55279
[42]	tr

In [71]:
print_score(m, mae)

[0.08473571, 0.105619594]


### GBT + Emb

In [76]:
# Rebuild the embedding-based features (same construction as in the "RF + Emb" section).
# For column c, embs[c] is that column's embedding matrix and cats_*[:, c] holds the row indices.
embc_train = [embs[c][cats_train[:, c]] for c, _ in enumerate(data.train_ds.cat_names)]
embc_valid = [embs[c][cats_valid[:, c]] for c, _ in enumerate(data.train_ds.cat_names)]

# Feature matrix = all embedding blocks side by side, then the continuous columns.
x_train = np.concatenate([*embc_train, conts_train], axis=1)
x_valid = np.concatenate([*embc_valid, conts_valid], axis=1)

In [77]:
# Wrap the embedding-based feature arrays in xgboost DMatrix objects with their labels.
# As x_train is no longer an ndarray, print_score reports only the two metric values.
x_train = xgb.DMatrix(x_train, label=y_train)
x_valid = xgb.DMatrix(x_valid, label=y_valid)
# Only the training set is watched during boosting.
evallist = [(x_train, 'train')]

In [78]:
%time m = xgb.train(param, x_train, num_round, evallist)

[0]	train-rmse:8.10287
[1]	train-rmse:7.94105
[2]	train-rmse:7.78244
[3]	train-rmse:7.627
[4]	train-rmse:7.47469
[5]	train-rmse:7.32542
[6]	train-rmse:7.17912
[7]	train-rmse:7.03576
[8]	train-rmse:6.89528
[9]	train-rmse:6.7576
[10]	train-rmse:6.62269
[11]	train-rmse:6.49045
[12]	train-rmse:6.3609
[13]	train-rmse:6.23392
[14]	train-rmse:6.1095
[15]	train-rmse:5.98754
[16]	train-rmse:5.86802
[17]	train-rmse:5.75089
[18]	train-rmse:5.6361
[19]	train-rmse:5.52364
[20]	train-rmse:5.41341
[21]	train-rmse:5.30538
[22]	train-rmse:5.19954
[23]	train-rmse:5.09581
[24]	train-rmse:4.99416
[25]	train-rmse:4.89452
[26]	train-rmse:4.79687
[27]	train-rmse:4.70121
[28]	train-rmse:4.60743
[29]	train-rmse:4.51553
[30]	train-rmse:4.42547
[31]	train-rmse:4.33722
[32]	train-rmse:4.25076
[33]	train-rmse:4.16602
[34]	train-rmse:4.08296
[35]	train-rmse:4.0016
[36]	train-rmse:3.92187
[37]	train-rmse:3.84371
[38]	train-rmse:3.76711
[39]	train-rmse:3.69205
[40]	train-rmse:3.61851
[41]	train-rmse:3.54643
[42]	trai

In [79]:
print_score(m, mae)

[0.056473542, 0.085233375]
