In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from nb_008 import *

# Rossmann

## Data preparation

To create the feature-engineered files `train_clean` and `test_clean` from the initial data, run notebook nb009a first.

In [None]:
# Folder that holds the feather files read below.
PATH = Path('data/rossmann/')
# Load the pre-cleaned training and test frames.
train_df = pd.read_feather(PATH/'train_clean')
test_df = pd.read_feather(PATH/'test_clean')

In [None]:
train_df.head()

In [None]:
# Columns that are treated as categorical variables.
cat_vars = ['Store', 'DayOfWeek', 'Year', 'Month', 'Day', 'StateHoliday', 'CompetitionMonthsOpen',
    'Promo2Weeks', 'StoreType', 'Assortment', 'PromoInterval', 'CompetitionOpenSinceYear', 'Promo2SinceYear',
    'State', 'Week', 'Events', 'Promo_fw', 'Promo_bw', 'StateHoliday_fw', 'StateHoliday_bw',
    'SchoolHoliday_fw', 'SchoolHoliday_bw']

# Columns that are treated as continuous variables.
cont_vars = ['CompetitionDistance', 'Max_TemperatureC', 'Mean_TemperatureC', 'Min_TemperatureC',
   'Max_Humidity', 'Mean_Humidity', 'Min_Humidity', 'Max_Wind_SpeedKm_h', 
   'Mean_Wind_SpeedKm_h', 'CloudCover', 'trend', 'trend_DE',
   'AfterStateHoliday', 'BeforeStateHoliday', 'Promo', 'SchoolHoliday']

# Number of rows in the training frame; the sampling cells draw row positions from range(n).
n = len(train_df); n

In [None]:
# Draw 2000 distinct row positions and sort them: the 1000 smallest positions form the
# training sample, the 1000 largest the test sample.
idx = np.random.permutation(range(n))[:2000]
idx.sort()
small_train_df = train_df.iloc[idx[:1000]]
small_test_df = train_df.iloc[idx[1000:]]
small_cont_vars = ['CompetitionDistance', 'Mean_Humidity']
small_cat_vars =  ['Store', 'DayOfWeek', 'PromoInterval']
# Explicit copies: the transforms applied later assign columns in place, which would otherwise
# raise SettingWithCopyWarning on frames derived from `train_df`.
small_train_df = small_train_df[small_cat_vars+small_cont_vars + ['Sales']].copy()
small_test_df = small_test_df[small_cat_vars+small_cont_vars + ['Sales']].copy()

In [None]:
small_train_df.head()

In [None]:
small_test_df.head()

In [None]:
@dataclass
class TabularTransform():
    "Base class for transforms that modify a DataFrame in place, given its categorical and continuous column names."
    cat_names:Collection[str]
    cont_names:Collection[str]

    def __call__(self, df, test=False):
        "Run `apply_test` on `df` when `test` is true, `apply_train` otherwise. Returns nothing."
        if test: self.apply_test(df)
        else:    self.apply_train(df)

    def apply_train(self, df):
        "Transform the training frame; subclasses must override this."
        raise NotImplementedError

    def apply_test(self, df):
        "Transform the test frame; by default it is processed exactly like the training frame."
        self.apply_train(df)

In [None]:
class Categorify(TabularTransform):
    "Turn the `cat_names` columns into ordered pandas categoricals, reusing the training categories on test data."

    def apply_train(self, df):
        "Convert every categorical column of `df` in place and remember its categories."
        self.categories = {}
        for col in self.cat_names:
            df[col] = df[col].astype('category').cat.as_ordered()
            self.categories[col] = df[col].cat.categories

    def apply_test(self, df):
        "Re-encode every categorical column of `df` with the categories stored by `apply_train`."
        for col in self.cat_names:
            known = self.categories[col]
            df[col] = pd.Categorical(df[col], categories=known, ordered=True)

In [None]:
# Learn the categories on the training sample, then re-encode the test sample with those same categories.
categorify = Categorify(small_cat_vars, small_cont_vars)
categorify(small_train_df)
categorify(small_test_df, test=True)

In [None]:
small_test_df.head()

In [None]:
small_train_df['PromoInterval'].cat.codes

In [None]:
small_test_df['Store'].cat.codes

In [None]:
@dataclass
class FillNACont(TabularTransform):
    "Replace missing values in every continuous column with the constant `fill_val`."
    fill_val:float=0.

    def apply_train(self, df):
        "Fill the `cont_names` columns of `df` in place."
        for col in self.cont_names:
            filled = df[col].fillna(self.fill_val)
            df[col] = filled

In [None]:
# Work on a copy so the filled frame can be compared with the untouched `small_train_df` afterwards.
small_train_df1 = small_train_df.copy()
fillna = FillNACont(small_cat_vars, small_cont_vars)
fillna(small_train_df1)

In [None]:
pd.isnull(small_train_df['CompetitionDistance']).sum(), pd.isnull(small_train_df1['CompetitionDistance']).sum()

In [None]:
# How missing continuous values are replaced: by the column median or by its most common value.
FillStrategy = IntEnum('FillStrategy', 'MEDIAN COMMON')

@dataclass
class FillMissing(TabularTransform):
    "Fill missing values in continuous columns, optionally adding a boolean `<name>_na` indicator column."
    fill_strategy:FillStrategy=FillStrategy.MEDIAN
    add_col:bool=True

    def _add_na_col(self, df, name):
        "Add the `<name>_na` indicator column to `df` and register it as a categorical variable."
        na_name = name+'_na'
        df[na_name] = pd.isnull(df[name])
        if na_name in self.cat_names: return
        # A list is extended in place, exactly as before. Any other collection (e.g. a tuple, which
        # the `Collection[str]` annotation allows) has no `append`, so it is rebuilt as a list.
        if isinstance(self.cat_names, list): self.cat_names.append(na_name)
        else: self.cat_names = [*self.cat_names, na_name]

    def apply_train(self, df):
        "Compute a fill value for each continuous column with missing values, store it in `na_dict`, and fill `df` in place."
        self.na_dict = {}
        for name in self.cont_names:
            # Columns without any missing value are left untouched and get no `na_dict` entry.
            if not pd.isnull(df[name]).sum(): continue
            # The indicator must be computed before the column is filled.
            if self.add_col: self._add_na_col(df, name)
            if self.fill_strategy == FillStrategy.MEDIAN: filler = df[name].median()
            else: filler = df[name].dropna().value_counts().idxmax()
            df[name] = df[name].fillna(filler)
            self.na_dict[name] = filler

    def apply_test(self, df):
        "Fill `df` in place with the values stored by `apply_train`."
        for name in self.cont_names:
            # Only columns that had missing values at training time are processed here.
            if name not in self.na_dict: continue
            if self.add_col: self._add_na_col(df, name)
            df[name] = df[name].fillna(self.na_dict[name])

In [None]:
# Compute the fill values on the training sample, then reuse them on the test sample.
fill_missing = FillMissing(small_cat_vars, small_cont_vars)
fill_missing(small_train_df)
fill_missing(small_test_df, test=True)

In [None]:
small_train_df[small_train_df['CompetitionDistance_na'] == True]

In [None]:
small_test_df[small_test_df['CompetitionDistance_na'] == True]

In [None]:
from pandas.api.types import is_numeric_dtype, is_categorical_dtype

In [None]:
class TabularDataset():
    """Dataset built from a DataFrame; items are `((cats, conts), y)`.

    - `cats`: integer codes of the `cat_names` columns, shifted by one (so a pandas code of -1 becomes 0).
    - `conts`: the `cont_names` columns as float32, normalized with `stats` (means, stds).
    - `y`: the `dep_var` column, log-transformed when `log_output` is true.
    """
    def __init__(self, df, dep_var, cat_names=None, cont_names=None, stats=None, log_output=False):
        # A non-numeric target is replaced in `df` itself by its category codes.
        if not is_numeric_dtype(df[dep_var]): df[dep_var] = df[dep_var].cat.codes
        self.y = torch.tensor(df[dep_var].values)
        if log_output: self.y = torch.log(self.y.float())
        n_rows = len(self.y)
        if cat_names:
            # The columns are expected to be categorical already (e.g. via `Categorify`).
            codes = [col.cat.codes.values for _,col in df[cat_names].items()]
            self.cats = np.stack(codes, 1) + 1
        else: self.cats = np.zeros((n_rows,1))  # single dummy column when there is no categorical variable
        self.cats = LongTensor(self.cats.astype(np.int64))
        if cont_names:
            self.conts = np.stack([col.astype('float32').values for _,col in df[cont_names].items()], 1)
            # Reuse the given statistics (typically the training set's), else compute them column-wise.
            means, stds = stats if stats is not None else (self.conts.mean(0), self.conts.std(0))
            self.conts = (self.conts - means[None]) / stds[None]
            self.stats = means,stds
        else:
            self.conts = np.zeros((n_rows,1), dtype=np.float32)  # single dummy column
            self.stats = None
        self.conts = FloatTensor(self.conts)

    def __len__(self): return len(self.y)
    def __getitem__(self, idx): return ((self.cats[idx], self.conts[idx]), self.y[idx])

    @classmethod
    def from_dataframes(cls, train_df, test_df, dep_var, tfms=None, cat_names=None, cont_names=None, **kwargs):
        """Apply `tfms` to both frames and return `(train_ds, test_ds)`.

        `tfms` is a list of transform classes; each is instantiated with the current column names, run on
        `train_df`, then on `test_df` with `test=True`. The test dataset is normalized with the training
        statistics. When omitted, `cat_names` defaults to the categorical columns of `train_df` and
        `cont_names` to its numeric columns other than `dep_var`. Both frames are modified in place.
        """
        if cat_names is None: cat_names = [n for n in train_df.columns if is_categorical_dtype(train_df[n])]
        # Fixed: this previously called the undefined `is_numeric_dtype_dtype` and raised NameError.
        if cont_names is None: cont_names = [n for n in train_df.columns
                                             if is_numeric_dtype(train_df[n]) and not n==dep_var]
        if tfms is None: tfms = []
        for tfm in tfms:
            tfm = tfm(cat_names, cont_names)
            tfm(train_df)
            tfm(test_df, test=True)
            # A transform may add columns, so pick up its updated column names.
            cat_names, cont_names = tfm.cat_names, tfm.cont_names
        train_ds = cls(train_df, dep_var, cat_names, cont_names, **kwargs)
        return (train_ds, cls(test_df, dep_var, cat_names, cont_names, stats=train_ds.stats, **kwargs))

In [None]:
train_df = pd.read_feather(PATH/'train_clean')

In [None]:
# Draw 2000 distinct row positions and sort them: the 1000 smallest positions form the
# training sample, the 1000 largest the test sample.
idx = np.random.permutation(range(n))[:2000]
idx.sort()
small_train_df = train_df.iloc[idx[:1000]]
small_test_df = train_df.iloc[idx[1000:]]
small_cont_vars = ['CompetitionDistance', 'Mean_Humidity']
small_cat_vars =  ['Store', 'DayOfWeek', 'PromoInterval']
# Explicit copies: the transforms applied later assign columns in place, which would otherwise
# raise SettingWithCopyWarning on frames derived from `train_df`.
small_train_df = small_train_df[small_cat_vars+small_cont_vars + ['Sales']].copy()
small_test_df = small_test_df[small_cat_vars+small_cont_vars + ['Sales']].copy()

In [None]:
# Target column; with log_output=True the datasets store its logarithm.
dep_var = 'Sales'
tfms = [FillMissing, Categorify] #Fillmissing first so that the added columns are categorified
# from_dataframes instantiates each transform class, runs it on both frames, then builds the two datasets.
train_ds, valid_ds = TabularDataset.from_dataframes(small_train_df, small_test_df, dep_var, tfms, cat_names=small_cat_vars, 
                                                    cont_names=small_cont_vars, log_output=True)

In [None]:
train_ds[2]

In [None]:
train_ds.stats, valid_ds.stats

In [None]:
small_train_df.head()

In [None]:
dep_var = 'Sales'
# Reload the full training set and keep only the model inputs, the target and the date.
train_df = pd.read_feather(PATH/'train_clean')
train_df = train_df[cat_vars+cont_vars+[dep_var, 'Date']].copy()
# 'Date' becomes the index, so it is no longer a regular column.
train_df = train_df.set_index('Date')

In [None]:
# Hold out the first 10% of the rows (by position) for validation and train on the rest.
# NOTE: `train_df` is rebound here, so re-running this cell on its own shrinks it again;
# re-run the loading cell first.
cut = int(len(train_df) * 0.1)
# Positional slices, copied explicitly because the transforms applied next assign columns in place.
train_df,valid_df = train_df.iloc[cut:].copy(), train_df.iloc[:cut].copy()
len(train_df),len(valid_df)

In [None]:
# Transforms run in list order: fill missing continuous values with FillNACont's default, then categorify.
tfms = [FillNACont, Categorify]
# Both frames are transformed in place; log_output=True makes the datasets store log(Sales) as the target.
train_ds, valid_ds = TabularDataset.from_dataframes(train_df, valid_df, dep_var, tfms, cat_names=cat_vars, 
                                                    cont_names=cont_vars, log_output=True)

In [None]:
train_df.columns

In [None]:
len(train_ds), len(valid_ds)

In [None]:
data = DataBunch.create(train_ds, valid_ds, bs=64, num_workers=0)

In [None]:
x,y = next(iter(data.train_dl))

In [None]:
x[0].size(), x[1].size()

## Model

In [None]:
bn_drop_lin

In [None]:
class TabularModel(nn.Module):
    """Model for tabular data: one embedding per categorical variable, concatenated with the
    batch-normalized continuous variables and fed through a stack of linear blocks.

    - `emb_szs`: list of `(n_categories, embedding_size)` pairs, one per categorical variable.
    - `n_cont`: number of continuous variables.
    - `out_sz`: size of the final output.
    - `layers`: sizes of the hidden layers.
    - `drops`: dropout probability for each block after the first; zipped with the blocks, so it
      should hold one value per entry of `layers`.
    - `emb_drop`: dropout probability applied to the concatenated embeddings.
    - `y_range`: optional `(low, high)`; when given, the output is rescaled into that range.
    - `use_bn`: passed as the `bn` flag of every block except the first.
    - `is_reg`, `is_multi`: select the final activation (see the comments in `__init__`).
    """
    def __init__(self, emb_szs, n_cont, out_sz, layers, drops, emb_drop=0., y_range=None, use_bn=True,
                 is_reg=False, is_multi=False):
        super().__init__()
        self.embeds = nn.ModuleList([get_embedding(ni, nf) for ni,nf in emb_szs])
        self.emb_drop = nn.Dropout(emb_drop)
        self.bn_cont = nn.BatchNorm1d(n_cont)
        n_emb = sum(e.embedding_dim for e in self.embeds)
        self.n_emb,self.n_cont,self.y_range = n_emb,n_cont,y_range
        # Regression: no final activation, or a sigmoid when the output is rescaled to `y_range`.
        # Otherwise: log-softmax for multi-class, sigmoid if not.
        if is_reg: final_act = None if y_range is None else nn.Sigmoid()
        else:      final_act = nn.LogSoftmax() if is_multi else nn.Sigmoid()
        # Fixed: the `layers` argument used to be overwritten with an empty list before `sizes`
        # was computed, so every hidden layer was silently dropped.
        sizes = [n_emb + n_cont] + list(layers) + [out_sz]
        actns = [nn.ReLU(inplace=True)] * (len(sizes)-2) + [final_act]
        blocks = []
        # The first block gets no dropout and no batchnorm.
        for i,(n_in,n_out,dp,act) in enumerate(zip(sizes[:-1],sizes[1:],[0.]+list(drops),actns)):
            blocks += bn_drop_lin(n_in, n_out, bn=use_bn and i!=0, p=dp, actn=act)
        self.layers = nn.Sequential(*blocks)

    def forward(self, x_cat, x_cont):
        "Embed column `i` of `x_cat` with embedding `i`, append the normalized `x_cont`, and run the linear blocks."
        if self.n_emb != 0:
            x = [e(x_cat[:,i]) for i,e in enumerate(self.embeds)]
            x = torch.cat(x, 1)
            x = self.emb_drop(x)
        if self.n_cont != 0:
            x_cont = self.bn_cont(x_cont)
            x = torch.cat([x, x_cont], 1) if self.n_emb != 0 else x_cont
        x = self.layers(x)
        # Rescale the final output to [y_range[0], y_range[1]].
        if self.y_range is not None: x = (self.y_range[1] - self.y_range[0]) * x + self.y_range[0]
        return x.squeeze()

In [None]:
# Cardinality of each categorical variable, read from the categorified train_df. The +1 leaves room
# for index 0, because TabularDataset shifts every category code up by one.
cat_szs = [len(train_df[n].cat.categories)+1 for n in cat_vars]
# Embedding width: half the cardinality (rounded up), capped at 50.
emb_szs = [(c, min(50, (c+1)//2)) for c in cat_szs]
emb_szs

In [None]:
# The datasets store log(Sales), so the output range is expressed in log space.
max_log_y = np.log(np.max(train_df['Sales']))
# Upper bound is 1.2 times the largest log target seen in train_df; lower bound is 0.
y_range = torch.tensor([0, max_log_y*1.2], device=default_device)

In [None]:
# Positional arguments after emb_szs: n_cont, out_sz=1, layers=[1000,500], drops=[0.001,0.01].
# is_reg=True together with a y_range selects a sigmoid final activation rescaled into y_range.
model = TabularModel(emb_szs, len(cont_vars), 1, [1000,500], [0.001,0.01], emb_drop=0.04, y_range=y_range, is_reg=True)

In [None]:
def exp_rmspe(pred, targ):
    "Root mean squared percentage error between `exp(pred)` and `exp(targ)`, relative to `exp(targ)`."
    pred_exp = torch.exp(pred)
    targ_exp = torch.exp(targ)
    rel_err = (targ_exp - pred_exp) / targ_exp
    return rel_err.pow(2).mean().sqrt()

In [None]:
learn = Learner(data, model)
# Optimize mean squared error on the log targets and report exp_rmspe as the metric.
learn.loss_fn = F.mse_loss
learn.metrics = [exp_rmspe]

In [None]:
learn.lr_find()

In [None]:
learn.recorder.plot()

In [None]:
learn.fit_one_cycle(3, 1e-3)