In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [2]:
from fastai.structured import *
# from fastai.column_data import *
np.set_printoptions(threshold=50, edgeitems=25)

PATH='data/favorita/'

In [3]:
!ls {PATH}

holidays_events.csv        stores.csv
items.csv                  test.csv
oil.csv                    train.csv
[34mrf_rnn[m[m                     train_six_months_full_data
sample_submission.csv      transactions.csv


In [4]:
from IPython.display import HTML

## Moving averages
https://www.kaggle.com/paulorzp/log-ma-and-days-of-week-means-lb-0-529/code

In [5]:
dtypes = {'id':'uint32', 'item_nbr':'uint32', 'store_nbr':'uint8', 'onpromotion':'bool'}

In [6]:
train = pd.read_csv(f'{PATH}train.csv', usecols=[1, 2, 3, 4, 5], dtype=dtypes,
            converters={'unit_sales': lambda u: np.log1p(float(u)) if float(u) > 0 else 0},
            parse_dates=['date'], skiprows=range(1, 114176251))  # header counts as row 0

# 86672217 => Skip dates before 2016-08-01
# 114176250 => 2017-05-01

### Separate validation set before filling in missing data

**Problem:**  
About 25% of the ['item_nbr', 'store_nbr'] combinations are missing from the training set.  
The validation set therefore has to be split off *before* the missing rows are filled in, so that it has the same gaps as the raw data.

In [8]:
# last 2 weeks
valid = train[train.date>='2017-08-01']

In [9]:
# drop valid from train
# Filter by date rather than slicing off len(valid) trailing rows: train[:-0] is
# an empty slice, so an empty `valid` would silently empty `train`, and the
# positional slice is only correct while the rows are sorted by date.
train = train[train.date < '2017-08-01']; train.tail()

Unnamed: 0,date,store_nbr,item_nbr,unit_sales,onpromotion
9749817,2017-07-31,54,2105347,1.098612,False
9749818,2017-07-31,54,2106464,0.693147,False
9749819,2017-07-31,54,2110456,6.359574,False
9749820,2017-07-31,54,2113914,2.639057,True
9749821,2017-07-31,54,2116416,1.609438,False


In [10]:
valid = valid.reset_index(drop=True)

valid['dow'] = valid['date'].dt.dayofweek
valid['dom'] = valid['date'].dt.day

### create missing rows (items,stores,dates)

In [11]:
# creating records for all items, in all markets on all dates
# for correct calculation of daily unit sales averages.
u_dates = train.date.unique()
u_stores = train.store_nbr.unique()
u_items = train.item_nbr.unique()
# Chain set_index -> reindex -> reset_index without inplace=True so `train` is
# rebound only once, after the whole expansion has succeeded.
train = (
    train.set_index(['date', 'store_nbr', 'item_nbr'])
    .reindex(
        pd.MultiIndex.from_product(
            (u_dates, u_stores, u_items),
            names=['date','store_nbr','item_nbr']
        )
    )
    .reset_index()
)

del u_dates, u_stores, u_items

# Rows introduced by the reindex hold NaN in the value columns. Assign the
# filled columns back: `train.loc[:, col].fillna(..., inplace=True)` works on an
# intermediate object and is not guaranteed to write through to `train`.
train['unit_sales'] = train['unit_sales'].fillna(0)
train['onpromotion'] = train['onpromotion'].fillna(False).astype(bool)
# Most recent date present in train; taken as the max so it does not depend on
# which row happens to be last.
lastdate = train['date'].max()

In [12]:
train['dow'] = train['date'].dt.dayofweek
train['dom'] = train['date'].dt.day

In [13]:
os.makedirs(f'{PATH}rf_rnn', exist_ok=True)
train.to_feather(f'{PATH}rf_rnn/train_3_mo')

In [None]:
train = pd.read_feather(f'{PATH}rf_rnn/train_full_year_incomplete')
dtypes = {'id':'uint32', 'item_nbr':'uint32', 'store_nbr':'uint8', 'onpromotion': 'bool'}
lastdate = train.iloc[train.shape[0]-1].date

In [14]:
#Days of Week Means
#By tarobxl: https://www.kaggle.com/c/favorita-grocery-sales-forecasting/discussion/42948
ma_dw = train[['item_nbr','store_nbr','dow','unit_sales']].groupby(
        ['item_nbr','store_nbr','dow'])['unit_sales'].mean().to_frame('madw').reset_index()
ma_wk = ma_dw[['item_nbr','store_nbr','madw']].groupby(
        ['store_nbr', 'item_nbr'])['madw'].mean().to_frame('mawk').reset_index()

In [15]:
#Days of Month Means
ma_dm = train[['item_nbr','store_nbr','dom','unit_sales']].groupby(
    ['item_nbr','store_nbr','dom'])['unit_sales'].mean().to_frame('madm').reset_index()
ma_mo = ma_dm[['item_nbr','store_nbr','madm']].groupby(
    ['item_nbr','store_nbr'])['madm'].mean().to_frame('mamo').reset_index()

In [16]:
from datetime import timedelta

#Moving Averages
# Start with the mean of unit_sales per (item, store) over all of train, then
# join the mean over each trailing window of i days ending at `lastdate`.
ma_is = train[['item_nbr','store_nbr','unit_sales']].groupby(
        ['item_nbr','store_nbr'])['unit_sales'].mean().to_frame('mais')

for i in [55,34,21,13,8,5,3,2,1]:
    tmp = train[train.date>lastdate-timedelta(int(i))]
    tmpg = tmp.groupby(['item_nbr','store_nbr'])['unit_sales'].mean().to_frame('mais'+str(i))
    ma_is = ma_is.join(tmpg, how='left')

del tmp,tmpg

# Collapse the overall mean and all windowed means into their row-wise median.
ma_is['mais']=ma_is.median(axis=1)
# Keep only the keys and the final 'mais' column. Selecting by name avoids
# `drop(cols, 1, ...)`, whose positional `axis` argument newer pandas rejects.
ma_is = ma_is[['mais']].reset_index()

In [55]:
#Load test
test = pd.read_csv(f'{PATH}test.csv', dtype=dtypes, parse_dates=['date'])
test['dow'] = test['date'].dt.dayofweek
test['dom'] = test['date'].dt.day

In [18]:
# Each entry pairs an averages table with the key columns it is joined on.
# The tables are applied in this order to every frame, always as a left join.
average_tables = [
    (ma_is, ['item_nbr','store_nbr']),
    (ma_wk, ['item_nbr','store_nbr']),
    (ma_mo, ['item_nbr','store_nbr']),
    (ma_dw, ['item_nbr','store_nbr','dow']),
    (ma_dm, ['item_nbr','store_nbr','dom']),
]

# merge moving averages onto test df
for averages, keys in average_tables:
    test = pd.merge(test, averages, how='left', on=keys)

# merge moving averages onto valid df
for averages, keys in average_tables:
    valid = pd.merge(valid, averages, how='left', on=keys)

# merge moving averages onto train df
for averages, keys in average_tables:
    train = pd.merge(train, averages, how='left', on=keys)

In [24]:
DataFrameSummary(test).summary()

Unnamed: 0,id,date,store_nbr,item_nbr,onpromotion,dow,dom,avg_dow,avg_dom
count,3.37046e+06,,3.37046e+06,3.37046e+06,,3.37046e+06,3.37046e+06,3.28752e+06,3.28752e+06
mean,1.27182e+08,,27.5,1.2448e+06,,2.9375,23.5,0.856474,0.867372
std,972969,,15.5858,589836,,1.88642,4.60977,0.939628,0.995126
min,1.25497e+08,,1,96995,,0,16,0,0
25%,1.2634e+08,,14,805321,,1.75,19.75,0,0
50%,1.27182e+08,,27.5,1.29466e+06,,3,23.5,0.593464,0.561651
75%,1.28025e+08,,41,1.73002e+06,,4.25,27.25,1.40389,1.4556
max,1.28868e+08,,54,2.13424e+06,,6,31,7.72613,20.4699
counts,3370464,3370464,3370464,3370464,3370464,3370464,3370464,3287520,3287520
uniques,3370464,16,54,3901,2,7,16,979533,1704529


In [28]:
DataFrameSummary(valid).summary()

Unnamed: 0,date,store_nbr,item_nbr,unit_sales,onpromotion,dow,dom,avg_dow,avg_dom
count,,1.57097e+06,1.57097e+06,1.57097e+06,,1.57097e+06,1.57097e+06,1.57020e+06,1.57020e+06
mean,,28.307,1.17259e+06,1.6822,,2.89611,7.93831,1.4658,1.46402
std,,16.3061,587100,0.860944,,2.00734,4.33322,0.905538,0.95274
min,,1,96995,0,,0,1,0,0
25%,,13,692537,1.09861,,1,4,0.793082,0.750494
50%,,29,1.21391e+06,1.60944,,3,8,1.33097,1.35529
75%,,44,1.58352e+06,2.19722,,5,12,1.99155,2.04691
max,,54,2.12711e+06,8.49883,,6,15,7.72613,9.14928
counts,1570968,1570968,1570968,1570968,1570968,1570968,1570968,1570205,1570205
uniques,15,54,3854,35597,2,7,15,851001,1326055


In [32]:
DataFrameSummary(train).summary()

Unnamed: 0,date,store_nbr,item_nbr,unit_sales,onpromotion,dow,dom,avg_dow,avg_dom
count,,1.94249e+07,1.94249e+07,1.94249e+07,,1.94249e+07,1.94249e+07,1.94249e+07,1.94249e+07
mean,,27.5,1.22942e+06,0.851644,,2.96739,15.837,0.844942,0.844317
std,,15.5858,579373,1.04983,,2.01328,8.85423,0.944727,0.980262
min,,1,96995,0,,0,1,0,0
25%,,14,802832,0,,1,8,0,0
50%,,27.5,1.24676e+06,0.64658,,3,16,0.565945,0.522522
75%,,41,1.69595e+06,1.60944,,5,23.25,1.39663,1.43181
max,,54,2.12711e+06,9.64056,,6,31,7.72613,20.4699
counts,19424880,19424880,19424880,19424880,19424880,19424880,19424880,19424880,19424880
uniques,92,54,3910,82460,2,7,31,982725,2956970


In [29]:
# recalculate above w/out item_nbr for quick and dirty averages to fill nas in test...
av_ma_dw = ma_dw[['store_nbr','dow','madw']].groupby(['store_nbr','dow'])['madw'].mean().to_frame('av_madw').reset_index()
av_ma_wk = ma_wk[['store_nbr','mawk']].groupby(['store_nbr'])['mawk'].mean().to_frame('av_mawk').reset_index()

av_ma_dm = ma_dm[['store_nbr','dom','madm']].groupby(['store_nbr','dom'])['madm'].mean().to_frame('av_madm').reset_index()
av_ma_mo = ma_mo[['store_nbr','mamo']].groupby(['store_nbr'])['mamo'].mean().to_frame('av_mamo').reset_index()

av_ma_is = ma_is[['store_nbr','mais']].groupby(['store_nbr'])['mais'].mean().to_frame('av_mais').reset_index()

In [33]:
# merge moving averages onto test df
test = pd.merge(test, av_ma_is, how='left', on=['store_nbr'])
test = pd.merge(test, av_ma_wk, how='left', on=['store_nbr'])
test = pd.merge(test, av_ma_mo, how='left', on=['store_nbr'])
test = pd.merge(test, av_ma_dw, how='left', on=['store_nbr','dow'])
test = pd.merge(test, av_ma_dm, how='left', on=['store_nbr','dom'])

# # merge moving averages onto valid df
# valid = pd.merge(valid, av_ma_is, how='left', on=['store_nbr'])
# valid = pd.merge(valid, av_ma_wk, how='left', on=['store_nbr'])
# valid = pd.merge(valid, av_ma_mo, how='left', on=['store_nbr'])
# valid = pd.merge(valid, av_ma_dw, how='left', on=['store_nbr','dow'])
# valid = pd.merge(valid, av_ma_dm, how='left', on=['store_nbr','dom'])

In [None]:
test['av_dow'] = test['av_mais'] * test['av_madw'] / test['av_mawk']
test['av_dom'] = test['av_mais'] * test['av_madm'] / test['av_mamo']

In [44]:
# Where the item-level estimate is missing, fall back to the av_dow / av_dom
# columns. Assign the result back: calling fillna(..., inplace=True) on a column
# selected from the frame is not guaranteed to modify `test` itself.
test['avg_dow'] = test['avg_dow'].fillna(test['av_dow'])
test['avg_dom'] = test['avg_dom'].fillna(test['av_dom'])

In [51]:
test.drop(list(test.columns.values)[9:], axis=1, inplace=True)

In [57]:
train.to_feather(f'{PATH}rf_rnn/train_dom_dow_w_avgs')
valid.to_feather(f'{PATH}rf_rnn/valid_dom_dow_w_avgs')
test.to_feather(f'{PATH}rf_rnn/test_dom_dow_w_avgs')

## Calculate averages for train/test

In [20]:
# average sales level (mais) * day-of-week multiplier (madw / mawk)
# note: these are averages of log1p(unit_sales) values, not prices
train['avg_dow'] = train.mais  # default, kept wherever mawk is not > 0
pos_idx = train['mawk'] > 0  # avoid division by zero error
train_pos = train.loc[pos_idx]
train.loc[pos_idx, 'avg_dow'] = train_pos['mais'] * train_pos['madw'] / train_pos['mawk']
# train.loc[:, 'avg_dow'].fillna(train['avg_dow'].median(), inplace=True)  # fill w/ median instead of 0
# the helper columns are no longer needed once avg_dow is computed
train.drop(['mawk', 'madw'], axis=1, inplace=True)

In [21]:
# average sales level (mais) * day-of-month multiplier (madm / mamo)
# note: these are averages of log1p(unit_sales) values, not prices
train['avg_dom'] = train.mais  # default, kept wherever mamo is not > 0
pos_idx = train['mamo'] > 0  # avoid division by zero error
train_pos = train.loc[pos_idx]
train.loc[pos_idx, 'avg_dom'] = train_pos['mais'] * train_pos['madm'] / train_pos['mamo']
# train.loc[:, 'avg_dom'].fillna(train['avg_dom'].median(), inplace=True)
# drop the remaining helper columns (mais is dropped here, after both avg_* columns used it)
train.drop(['mais', 'mamo', 'madm'], axis=1, inplace=True); train.head()

Unnamed: 0,date,store_nbr,item_nbr,unit_sales,onpromotion,dow,dom,avg_dow,avg_dom
0,2017-05-01,1,103520,0.693147,False,0,1,0.73599,0.943082
1,2017-05-01,1,105574,1.609438,False,0,1,1.972015,1.691273
2,2017-05-01,1,105575,1.609438,False,0,1,2.216405,1.754545
3,2017-05-01,1,106716,0.693147,False,0,1,1.500419,0.995786
4,2017-05-01,1,108696,1.609438,False,0,1,1.076144,0.666585


In [22]:
# test: avg_dow = mais scaled by the day-of-week ratio madw / mawk
test['avg_dow'] = test.mais  # default, kept wherever mawk is not > 0
pos_idx = test['mawk'] > 0  # only divide where the denominator is positive
test_pos = test.loc[pos_idx]
test.loc[pos_idx, 'avg_dow'] = test_pos['mais'] * test_pos['madw'] / test_pos['mawk']
# test.loc[:, 'avg_dow'].fillna(test['avg_dow'].median(), inplace=True)
test.drop(['mawk', 'madw'], axis=1, inplace=True)

In [23]:
# test: avg_dom = mais scaled by the day-of-month ratio madm / mamo
test['avg_dom'] = test.mais  # default, kept wherever mamo is not > 0
pos_idx = test['mamo'] > 0  # only divide where the denominator is positive
test_pos = test.loc[pos_idx]
test.loc[pos_idx, 'avg_dom'] = test_pos['mais'] * test_pos['madm'] / test_pos['mamo']
# test.loc[:, "avg_dom"].fillna(test['avg_dom'].median(), inplace=True)
# drop the remaining helper columns now that both avg_* columns exist
test.drop(['mais', 'mamo', 'madm'], axis=1, inplace=True); test.head()

Unnamed: 0,id,date,store_nbr,item_nbr,onpromotion,dow,dom,avg_dow,avg_dom
0,125497040,2017-08-16,1,96995,False,2,16,0.170942,0.0
1,125497041,2017-08-16,1,99197,False,2,16,0.341187,0.176807
2,125497042,2017-08-16,1,103501,False,2,16,0.0,0.0
3,125497043,2017-08-16,1,103520,False,2,16,1.016295,0.751296
4,125497044,2017-08-16,1,103665,False,2,16,0.982138,1.229558


In [26]:
# valid: avg_dow = mais scaled by the day-of-week ratio madw / mawk
valid['avg_dow'] = valid.mais  # default, kept wherever mawk is not > 0
pos_idx = valid['mawk'] > 0  # only divide where the denominator is positive
valid_pos = valid.loc[pos_idx]
valid.loc[pos_idx, 'avg_dow'] = valid_pos['mais'] * valid_pos['madw'] / valid_pos['mawk']
# valid.loc[:, 'avg_dow'].fillna(valid['avg_dow'].median(), inplace=True)
valid.drop(['mawk', 'madw'], axis=1, inplace=True)

In [27]:
# valid: avg_dom = mais scaled by the day-of-month ratio madm / mamo
valid['avg_dom'] = valid.mais  # default, kept wherever mamo is not > 0
pos_idx = valid['mamo'] > 0  # only divide where the denominator is positive
valid_pos = valid.loc[pos_idx]
valid.loc[pos_idx, 'avg_dom'] = valid_pos['mais'] * valid_pos['madm'] / valid_pos['mamo']
# valid.loc[:, "avg_dom"].fillna(valid['avg_dom'].median(), inplace=True)
# drop the remaining helper columns now that both avg_* columns exist
valid.drop(['mais', 'mamo', 'madm'], axis=1, inplace=True); valid.head()

Unnamed: 0,date,store_nbr,item_nbr,unit_sales,onpromotion,dow,dom,avg_dow,avg_dom
0,2017-08-01,1,103520,1.386294,False,1,1,0.844002,0.943082
1,2017-08-01,1,103665,1.609438,False,1,1,1.048943,0.817444
2,2017-08-01,1,105574,2.197225,False,1,1,1.740511,1.691273
3,2017-08-01,1,105575,2.70805,False,1,1,2.529345,1.754545
4,2017-08-01,1,105693,0.693147,False,1,1,0.191487,0.0


In [58]:
# Release the temporaries created while computing avg_dow / avg_dom.
# valid_pos is included as well: it holds a slice of `valid` that is otherwise
# kept alive for the rest of the session.
del pos_idx, train_pos, test_pos, valid_pos

In [None]:
train.to_feather(f'{PATH}rf_rnn/train_w_averages_incomplete')
test.to_feather(f'{PATH}rf_rnn/test_w_averages_incomplete')

## Establish baseline (moving averages)

In [61]:
# Root Mean Squared Logarithmic Error
# kaggle definition
# np.square(np.log(y_pred + 1) - np.log(y_true + 1)).mean() ** 0.5

def rmsle(pred, targ):
    """Root-mean-squared difference between predictions and targets.

    Both arguments are expected to be log1p-scale values already, so the plain
    RMSE computed here corresponds to the RMSLE on the original scale.

    Returns the error as a float rounded to 4 decimal places.
    """
    squared_errors = np.square(pred - targ)
    root_mean = math.sqrt(squared_errors.mean())
    return round(root_mean, 4)

In [65]:
# testing a 1.25x multiplier for promotion items
# (the multiplier is applied to the log1p-scale averages)
promo_columns = {'avg_dow': 'avg_dow_promo', 'avg_dom': 'avg_dom_promo'}

# build the promo variants: dow first, then dom, for train and valid
for base_col, promo_col in promo_columns.items():
    for frame in (train, valid):
        frame[promo_col] = frame[base_col]
        frame.loc[frame['onpromotion'] == True, promo_col] *= 1.25

# drop avg_dow_promo / avg_dom_promo columns from train/valid again
for frame in (train, valid):
    for promo_col in promo_columns.values():
        frame.drop(promo_col, axis=1, inplace=True)

In [63]:
# The baseline predictions on full training set
print(rmsle(train.avg_dow, train.unit_sales))  #=> 0.7158
print(rmsle(train.avg_dom, train.unit_sales))  #=> 0.337
# 0.5269
# 0.5264

# baseline w/ promo multiplier
# The promo predictions are computed inline instead of reading the
# avg_dow_promo / avg_dom_promo columns: the cell that creates those columns
# also drops them, so reading them here raises AttributeError on an in-order run.
promo_factor = np.where(train['onpromotion'] == True, 1.25, 1.0)
print(rmsle(train.avg_dow * promo_factor, train.unit_sales))  #=> 0.7133
print(rmsle(train.avg_dom * promo_factor, train.unit_sales))
# 0.5292
# 0.5238

# 0.5985
# 0.5699
# 0.5993
# 0.5741

0.5985
0.5699
0.5993
0.5741


In [64]:
# baseline predictions on validation set
print(rmsle(valid.avg_dow, valid.unit_sales))  #=> 0.5138
print(rmsle(valid.avg_dom, valid.unit_sales))  #=> 0.2611
# 0.4915
# 0.483

# baseline w/ promo multiplier
# The promo predictions are computed inline instead of reading the
# avg_dow_promo / avg_dom_promo columns: the cell that creates those columns
# also drops them, so reading them here raises AttributeError on an in-order run.
promo_factor = np.where(valid['onpromotion'] == True, 1.25, 1.0)
print(rmsle(valid.avg_dow * promo_factor, valid.unit_sales))  #=> 0.5116
print(rmsle(valid.avg_dom * promo_factor, valid.unit_sales))
# 0.5013
# 0.4891

# 0.663
# 0.7611
# 0.6654
# 0.762

0.663
0.7611
0.6654
0.762


In [66]:
# average dom/dow for full and valid sets
print(rmsle(train[['avg_dow', 'avg_dom']].mean(axis=1), train.unit_sales))
print(rmsle(valid[['avg_dow', 'avg_dom']].mean(axis=1), valid.unit_sales))

0.5522
0.6641


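It appears that the mean of avg_dow and avg_dom is the best baseline predictor on the training set (0.5522 vs. 0.5985 for avg_dow and 0.5699 for avg_dom alone). On the validation set, however, avg_dow alone (0.663) is marginally better than the combined mean (0.6641), and avg_dom alone is clearly worse (0.7611).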

In [None]:
DataFrameSummary(valid).summary()

In [None]:
DataFrameSummary(test).summary()

### Submit test on kaggle

In [67]:
# set unit_sales
test['unit_sales'] = test[['avg_dom','avg_dow']].mean(axis=1)
# test.loc[test['onpromotion'] == True, 'unit_sales'] *= 1.25

# need to convert unit_sales back from log1p
# np.expm1 is applied to the whole column at once; the `pd.np` alias used
# previously is deprecated and no longer available in pandas 2.x.
test['unit_sales'] = np.expm1(test['unit_sales']) # restoring unit values

In [68]:
SUBM = f'{PATH}rf_rnn/subm/'
os.makedirs(SUBM, exist_ok=True)

test.to_csv(f'{SUBM}avg_dow_dom_v2.csv.gz', columns=['id','unit_sales'], index=False, compression='gzip')

In [71]:
!kaggle competitions submit -c favorita-grocery-sales-forecasting -f {SUBM}avg_dow_dom_v2.csv.gz -m "dow and dom averages v2"
#=> 0.537  avg_dow w/ promo multiplier
#=> 0.718  dow/dom averages -- why you so bad!?
#=> 0.718  dow/dom_v2 averages

Successfully submitted to Corporación Favorita Grocery Sales Forecasting

## Feature Engineering

In [None]:
train = pd.read_feather(f'{PATH}rf_rnn/train_w_averages')
test = pd.read_feather(f'{PATH}rf_rnn/test_w_averages')

In [None]:
# NOTE: need to convert dates back to datetime?!
train['date'] = train['date'].astype('datetime64[ns]')
test['date'] = test['date'].astype('datetime64[ns]')

In [None]:
items = pd.read_csv(f'{PATH}items.csv', low_memory=False)
stores = pd.read_csv(f'{PATH}stores.csv', low_memory=False)
holidays = pd.read_csv(f'{PATH}holidays_events.csv', parse_dates=['date'], low_memory=False)

### Stores

In [None]:
stores.head()

In [None]:
train = pd.merge(train, stores, how='left', on=['store_nbr'])
test = pd.merge(test, stores, how='left', on=['store_nbr'])

In [None]:
del stores

### Holidays

In [None]:
# looks like we only need to worry about local holidays...
holidays[(holidays['date'] >= '2017-08-16') & (holidays['date'] <= '2017-08-31')]

In [None]:
# Keep local holidays that are not flagged as transferred, and mark them.
local_holidays = holidays.loc[(holidays['locale'] == 'Local') & (holidays['transferred'] == False)].copy()
local_holidays['holiday'] = True
# Rename locale_name -> city so the frame can be joined on ['date', 'city'].
local_holidays = local_holidays.rename(index=str, columns={"locale_name": "city"}).drop(
    ['type','locale','description','transferred'], axis=1)
# Guarantee one row per (date, city): a repeated key would duplicate the
# matching train/test rows when this frame is left-merged onto them.
local_holidays = local_holidays.drop_duplicates(subset=['date', 'city'])
local_holidays.head()

In [None]:
train = pd.merge(train, local_holidays, how='left', on=['date','city'])
test = pd.merge(test, local_holidays, how='left', on=['date','city'])

In [None]:
# fill NaNs: rows without a matching local holiday are not holidays.
# Assign the filled column back, because `df.loc[:, col].fillna(..., inplace=True)`
# works on an intermediate object and is not guaranteed to update the frame.
train['holiday'] = train['holiday'].fillna(False).astype(bool)
test['holiday'] = test['holiday'].fillna(False).astype(bool)

In [None]:
del holidays, local_holidays

### Items

In [None]:
items.head()

In [None]:
# Attach the item attributes to both frames so train and test end up with the
# same feature columns (the stores and holidays merges are applied to both too).
train = pd.merge(train, items, how='left', on=['item_nbr'])
test = pd.merge(test, items, how='left', on=['item_nbr'])

In [None]:
pd.isnull(train).any()

In [None]:
del items

In [None]:
train.to_feather(f'{PATH}rf_rnn/train_w_features')
test.to_feather(f'{PATH}rf_rnn/test_w_features')

## Data prep - handle categorical variables

In [None]:
# train = pd.read_feather(f'{PATH}rf_rnn/train_w_features')
test = pd.read_feather(f'{PATH}rf_rnn/test_w_features')

In [None]:
# add date info
add_datepart(train, 'date')
add_datepart(test, 'date')

In [None]:
test.columns.values

In [None]:
test.drop(['Elapsed', 'Year'], axis=1, inplace=True)
train.drop(['Elapsed', 'Year'], axis=1, inplace=True)

In [None]:
cat_vars = ['store_nbr', 'item_nbr', 'onpromotion', 'city', 'state', 'type', 'cluster',
            'holiday', 'family', 'class', 'perishable']

In [None]:
for v in cat_vars:
    train[v] = train[v].astype('category').cat.as_ordered()

In [None]:
apply_cats(test, train)

In [None]:
# train has been reduced to last 3.5 months -> 2017-05-01
train.to_feather(f'{PATH}rf_rnn/train_w_categories')
test.to_feather(f'{PATH}rf_rnn/test_w_categories')

## Separate target from df

In [None]:
# last 3.5 months
train = pd.read_feather(f'{PATH}rf_rnn/train_w_categories')
# test = pd.read_feather(f'{PATH}rf_rnn/test_w_categories')

In [None]:
df, y, nas, mapper = proc_df(train, 'unit_sales', do_scale=True)

## Split training/validation

In [None]:
# Using Scikit-learn to split data into training and testing sets
# from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
# x_trn, x_val, y_trn, y_val = train_test_split(df, y, test_size = 0.20, random_state = 42)

# good validation set => 2017-7-26 -> 2017-8-9
# IDs => [18743184:22012344] (18743184 - 22012343)
# len(test) => 3370464

def split_val(df,a,b):
    """Split a DataFrame by row position into (training, validation) copies.

    Rows at positions a:b (normal slice semantics) form the validation frame;
    all remaining rows, in their original order, form the training frame.

    The split uses a positional boolean mask rather than dropping index labels,
    so it stays correct when the index contains repeated labels (label-based
    dropping would also remove rows outside a:b that share a label).
    """
    in_val = np.zeros(len(df), dtype=bool)
    in_val[a:b] = True
    val = df[in_val].copy()
    trn = df[~in_val].copy()
    return trn, val

x_trn,x_val = split_val(df,18743184,22012344)

def split_val_arr(arr,a,b):
    """Split an array along its first axis into (training, validation) parts.

    Elements at positions a:b form the validation array (a copy); everything
    else forms the training array. `axis=0` is passed to np.delete so that
    arrays with more than one dimension keep their shape instead of being
    flattened; for 1-D input the result is unchanged.
    """
    val = arr[a:b].copy()
    trn = np.delete(arr, slice(a,b), axis=0)
    return trn, val

y_trn,y_val = split_val_arr(y,18743184,22012344)

In [None]:
x_trn.shape, x_val.shape, y_trn.shape, y_val.shape

In [None]:
del train, df, y

## Train model

In [None]:
def rmsle(pred, targ):
    """Root-mean-squared difference between predictions and targets.

    Same definition as the rmsle in the baseline section of this notebook,
    repeated here so the modelling section has it available.
    Both arguments are expected to be log1p-scale values already.

    Returns the error as a float rounded to 4 decimal places.
    """
    squared_errors = np.square(pred - targ)
    root_mean = math.sqrt(squared_errors.mean())
    return round(root_mean, 4)

def print_score(m):
    """Print a fitted model's scores on the global train/validation split.

    Prints a list of: rmsle on (x_trn, y_trn), rmsle on (x_val, y_val),
    m.score on the training split and m.score on the validation split (both
    rounded to 4 decimals), followed by m.oob_score_ when the model has one.
    """
    trn_rmsle = rmsle(m.predict(x_trn), y_trn)
    val_rmsle = rmsle(m.predict(x_val), y_val)
    trn_score = round(m.score(x_trn, y_trn), 4)
    val_score = round(m.score(x_val, y_val), 4)
    scores = [trn_rmsle, val_rmsle, trn_score, val_score]
    if hasattr(m, 'oob_score_'):
        scores.append(m.oob_score_)
    print(scores)
    
# m.score => coefficient of determination (R^2)
#   proportion of the variance in the dependent variable that is predictable from the independent variable(s)
#   ratio of how much better the model is than the mean prediction (0);  1: perfect;  -*: worse than the mean

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
set_rf_samples(50000)

In [None]:
m = RandomForestRegressor()
%time m.fit(x_trn, y_trn)
print_score(m)    #=> [0.2952, 0.2656, 0.67486495050828088, 0.73733977357618774]

In [None]:
m = RandomForestRegressor(n_estimators=20, min_samples_leaf=3, n_jobs=-1)
%time m.fit(x_trn, y_trn)
print_score(m)    #=> [0.2857, 0.2547, 0.6956, 0.75860000000000005]

In [None]:
m = RandomForestRegressor(n_estimators=20, min_samples_leaf=3, max_features=0.5, n_jobs=-1)
%time m.fit(x_trn, y_trn)
print_score(m)    #=> [0.2856, 0.2536, 0.69569999999999999, 0.76060000000000005]

In [None]:
m = RandomForestRegressor(n_estimators=50, min_samples_leaf=5, max_features=0.5, n_jobs=-1)
%time m.fit(x_trn, y_trn)
print_score(m)    #=> [0.2825, 0.2498, 0.70220000000000005, 0.76770000000000005]

In [None]:
reset_rf_samples()
set_rf_samples(250000)

m = RandomForestRegressor(n_estimators=50, min_samples_leaf=5, max_features=0.5, n_jobs=-1)
%time m.fit(x_trn, y_trn)
print_score(m)    #=> [0.2769, 0.2488, 0.71409999999999996, 0.76949999999999996]

## Variable importances

In [None]:
features = list(x_trn.columns)

# Get numerical feature importances
importances = list(m.feature_importances_)

# Rank on the raw importances, most important first. Sorting before rounding
# keeps features whose importances round to the same value in their true order.
ranked = sorted(zip(features, importances), key = lambda x: x[1], reverse = True)

# List of tuples with variable and importance (rounded for display)
feature_importances = [(feature, round(importance, 2)) for feature, importance in ranked]

# Print out the feature and importances
for pair in feature_importances:
    print('Variable: {:20} Importance: {}'.format(*pair))

## Feature Reduction

In [None]:
least_important_features = [feature[0] for feature in feature_importances[15:]]

x_trn.drop(least_important_features, axis=1, inplace=True)
x_val.drop(least_important_features, axis=1, inplace=True)

In [None]:
reset_rf_samples()
set_rf_samples(500000)

m = RandomForestRegressor(n_estimators=50, min_samples_leaf=5, max_features=0.5, n_jobs=-1)
%time m.fit(x_trn, y_trn)
print_score(m)
#=> [0.2769, 0.2488, 0.71409999999999996, 0.76949999999999996]
#=> [0.2732, 0.2487, 0.72150000000000003, 0.76970000000000005]

In [None]:
features = list(x_trn.columns)

# Get numerical feature importances
importances = list(m.feature_importances_)

# Rank on the raw importances, most important first. Sorting before rounding
# keeps features whose importances round to the same value in their true order.
ranked = sorted(zip(features, importances), key = lambda x: x[1], reverse = True)

# List of tuples with variable and importance (rounded for display)
feature_importances = [(feature, round(importance, 2)) for feature, importance in ranked]

# Print out the feature and importances
for pair in feature_importances:
    print('Variable: {:20} Importance: {}'.format(*pair))

In [None]:
reset_rf_samples()
# full dataset - need to run on GPU

m = RandomForestRegressor(n_estimators=100, min_samples_leaf=25, max_features=0.5, n_jobs=8)
%time m.fit(x_trn, y_trn)
print_score(m)
#=> [0.2769, 0.2488, 0.71409999999999996, 0.76949999999999996]
#=> [0.1931, 0.248, 0.8609, 0.7711]  -- overfit?!   (n_est=50, min_leaf=5, 23min)
#=> [0.2484, 0.2466, 0.7699, 0.7737]  (37min)

## Feature Reduction / Data Expansion

In [None]:
train = pd.read_feather(f'{PATH}rf_rnn/train_w_categories')

In [None]:
# Extract the names of the least important features
# least_important_features = [feature[0] for feature in feature_importances[15:]]

least_important_features = list(train.columns)[11:-1]
# Add 'Year' / 'Month' only when the column exists and is not already listed:
# 'Year' is dropped before train_w_categories is written, and DataFrame.drop
# raises KeyError for a label that is not present.
extra_features = [col for col in ['Year', 'Month']
                  if col in train.columns and col not in least_important_features]
least_important_features.extend(extra_features)

In [None]:
train.drop(columns=least_important_features, inplace=True)

In [None]:
df, y, nas = proc_df(train, 'unit_sales', subset=30000000)

In [None]:
del train  # need to save memory

In [None]:
# Using Scikit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
x_trn, x_val, y_trn, y_val = train_test_split(df, y, test_size = 0.20, random_state = 42)

In [None]:
del df,y

In [None]:
m = RandomForestRegressor(n_jobs=-1)
%time m.fit(x_trn, y_trn)

In [None]:
print_score(m)   #=> [0.24, 0.5684, 0.946675430780943, 0.7008898192311388]

# slight increase in accuracy (increased data)
# large increase in duration (feature reduction & split over multiple cores)

### Apply to Test dataset

In [None]:
test = pd.read_feather(f'{PATH}rf_rnn/test_w_categories')

In [None]:
test.drop(columns=least_important_features, inplace=True)

In [None]:
# add in y value to test: 'unit sales'
test['unit_sales'] = 0.0

df_test, _, nas = proc_df(test, 'unit_sales', skip_flds=['id'], na_dict=nas)

In [None]:
# use rf to predict on test df
log_preds = m.predict(df_test)

In [None]:
test['unit_sales'] = np.expm1(log_preds) # re-scale predictions and add to df

In [None]:
SUBM = f'{PATH}rf_rnn/subm/'
os.makedirs(SUBM, exist_ok=True)

test.to_csv(f'{SUBM}rf_v3.csv.gz', columns=['id','unit_sales'], index=False, compression='gzip')

In [None]:
!kaggle competitions submit -c favorita-grocery-sales-forecasting -f {SUBM}rf_v3.csv.gz -m "random forest version 3"
# v1 => 1.223 -- throwaway
# v2 => 0.581
# v3 => 0.865 (last 3 months) -- throwaway
# v4 => 0.544 (last 3 months)

## Train only on last ~3 months

In [None]:
train = pd.read_feather(f'{PATH}rf_rnn/train_w_categories')

In [None]:
# Extract the names of the least important features
# least_important_features = [feature[0] for feature in feature_importances[9:]]

least_important_features = list(train.columns)[11:-1]
# Add 'Year' only when the column exists and is not already listed: 'Year' is
# dropped before train_w_categories is written, and DataFrame.drop raises
# KeyError for a label that is not present.
extra_features = [col for col in ['Year']
                  if col in train.columns and col not in least_important_features]
least_important_features.extend(extra_features)

In [None]:
train.drop(columns=least_important_features, inplace=True)

In [None]:
df, y, nas = proc_df(train, 'unit_sales')

In [None]:
del train  # need to save memory

In [None]:
m = RandomForestRegressor(n_estimators=100, min_samples_leaf=25, max_features=0.5, n_jobs=8, oob_score=True)
%time m.fit(df, y);

res = [rmsle(m.predict(df), y), round(m.score(df, y),4), m.oob_score_]
print(res)
#=> [0.2453, 0.7755]  do_scale=True
#=> [0.2453, 0.7756, 0.7388747115547389] do_scale=False

## Apply to Test Data

In [None]:
#  del x_trn, x_val, y_trn, y_val
del df, y

In [None]:
test = pd.read_feather(f'{PATH}rf_rnn/test_w_categories')

In [None]:
test.drop(columns=least_important_features, inplace=True)

In [None]:
# add in y value to test: 'unit sales'
test['unit_sales'] = 0.0

In [None]:
df_test, _, nas = proc_df(test, 'unit_sales', skip_flds=['id'], na_dict=nas)

In [None]:
# need to drop columns after scaling because mapper includes all columns
# df_test.drop(columns=least_important_features, inplace=True)

In [None]:
log_preds = m.predict(df_test)  # log1p values
# np.expm1 is applied to the whole prediction array at once; the `pd.np` alias
# used previously is deprecated and no longer available in pandas 2.x.
test['unit_sales'] = np.expm1(log_preds) # restoring unit values

In [None]:
SUBM = f'{PATH}rf_rnn/subm/'
csv = f'{SUBM}rf_v7.csv.gz'
os.makedirs(SUBM, exist_ok=True)

In [None]:
test.to_csv(csv, columns=['id','unit_sales'], index=False, compression='gzip')

In [None]:
!kaggle competitions submit -c favorita-grocery-sales-forecasting -f {csv} -m "random forest version 7 - more features including dom average"
# v4 => 0.544 (last 3 months)
# v5 => 0.803
# v6 => 0.801