In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [2]:
from fastai.structured import *
# from fastai.column_data import *
np.set_printoptions(threshold=50, edgeitems=25)

PATH='data/favorita/'

In [3]:
!ls {PATH}

holidays_events.csv        stores.csv
items.csv                  test.csv
oil.csv                    train.csv
[34mrf_rnn[m[m                     train_six_months_full_data
sample_submission.csv      transactions.csv


In [4]:
from IPython.display import HTML

# Standard ML workflow:

1. State the question and determine required data
2. Acquire the data in an accessible format
3. Identify and correct missing data points/anomalies as required
4. Prepare the data for the machine learning model
5. Establish a baseline model that you aim to exceed
6. Train the model on the training data
7. Make predictions on the test data
8. Compare predictions to the known test set targets and calculate performance metrics
9. If performance is not satisfactory, adjust the model, acquire more data, or try a different modeling technique
10. Interpret model and report results visually and numerically

## Moving averages
https://www.kaggle.com/paulorzp/log-ma-and-days-of-week-means-lb-0-529/code

In [10]:
dtypes = {'id':'uint32', 'item_nbr':'uint32', 'store_nbr':'uint8', 'unit_sales':'float32', 'onpromotion': np.dtype('bool')}

In [6]:
train = pd.read_csv(f'{PATH}train.csv', index_col='id', dtype=dtypes, parse_dates=['date'], skiprows=range(1, 86672217))  #Skip dates before 2016-08-01

  mask |= (ar1 == a)


In [7]:
# Create a record for every item, in every store, on every date, so that
# days without a recorded sale count as zero in the daily unit-sales averages.
u_dates = train.date.unique()
u_stores = train.store_nbr.unique()
u_items = train.item_nbr.unique()
train = (
    train
    .set_index(['date', 'store_nbr', 'item_nbr'])
    .reindex(
        pd.MultiIndex.from_product(
            (u_dates, u_stores, u_items),
            names=['date','store_nbr','item_nbr']
        )
    )
    .reset_index()
)

del u_dates, u_stores, u_items

# Assign the filled columns back explicitly: calling fillna(inplace=True) on
# the result of `train.loc[:, col]` mutates a temporary object, which warns on
# recent pandas and silently does nothing under Copy-on-Write.
train['unit_sales'] = train['unit_sales'].fillna(0) # fill NaNs
train['onpromotion'] = train['onpromotion'].fillna(False).astype(bool) # fill NaNs

# Latest date in the data; taken as the column max so it does not depend on
# which row happens to be last.
lastdate = train['date'].max()

In [9]:
train.loc[train['unit_sales'] < 0, 'unit_sales'] = 0 # eliminate negatives
# log1p conversion; `pd.np` was deprecated in pandas 1.0 and removed in 2.0,
# so call numpy directly (this is also vectorised over the whole column).
train['unit_sales'] = np.log1p(train['unit_sales'])
train['dow'] = train['date'].dt.dayofweek
train['dom'] = train['date'].dt.day

In [11]:
os.makedirs(f'{PATH}rf_rnn', exist_ok=True)
train.to_feather(f'{PATH}rf_rnn/train_full_year')

In [5]:
# Reload the densified, log-transformed training frame saved above.
train = pd.read_feather(f'{PATH}rf_rnn/train_full_year')
dtypes = {'id':'uint32', 'item_nbr':'uint32', 'store_nbr':'uint8', 'unit_sales':'float32', 'onpromotion': np.dtype('bool')}
# Latest date in the data; the column max does not depend on row order.
lastdate = train['date'].max()

In [6]:
# Day-of-week means of unit_sales per (item, store)
# By tarobxl: https://www.kaggle.com/c/favorita-grocery-sales-forecasting/discussion/42948
ma_dw = (
    train
    .groupby(['item_nbr','store_nbr','dow'])['unit_sales']
    .mean()
    .rename('madw')
    .reset_index()
)
# Per (store, item): the mean of its day-of-week means
ma_wk = (
    ma_dw
    .groupby(['store_nbr', 'item_nbr'])['madw']
    .mean()
    .rename('mawk')
    .reset_index()
)

In [7]:
# Day-of-month means of unit_sales per (item, store)
ma_dm = (
    train
    .groupby(['item_nbr','store_nbr','dom'])['unit_sales']
    .mean()
    .rename('madm')
    .reset_index()
)
# Per (item, store): the mean of its day-of-month means
ma_mo = (
    ma_dm
    .groupby(['item_nbr','store_nbr'])['madm']
    .mean()
    .rename('mamo')
    .reset_index()
)

In [8]:
from datetime import timedelta

# Moving averages of unit_sales per (item, store).
# 'mais' starts as the mean over the whole frame; each 'mais<N>' column is the
# mean over the last N days up to `lastdate`.
ma_is = train[['item_nbr','store_nbr','unit_sales']].groupby(
        ['item_nbr','store_nbr'])['unit_sales'].mean().to_frame('mais')

for i in [112,56,28,14,7,3,1]:
    tmp = train[train.date>lastdate-timedelta(int(i))]
    tmpg = tmp.groupby(['item_nbr','store_nbr'])['unit_sales'].mean().to_frame('mais'+str(i))
    ma_is = ma_is.join(tmpg, how='left')

del tmp,tmpg

# Collapse all averages (the full-period mean plus every window) to their median.
ma_is['mais']=ma_is.median(axis=1)
# Keep only the keys and the final 'mais' column. The columns are selected by
# name because the old `drop(labels, 1, inplace=True)` form passes `axis`
# positionally, which pandas 2.0 no longer accepts.
ma_is = ma_is.reset_index()[['item_nbr', 'store_nbr', 'mais']]

In [11]:
#Load test
test = pd.read_csv(f'{PATH}test.csv', dtype=dtypes, parse_dates=['date'])
test['dow'] = test['date'].dt.dayofweek
test['dom'] = test['date'].dt.day

In [None]:
# (average table, join keys) pairs, listed in the order their columns are appended
ma_joins = [
    (ma_is, ['item_nbr','store_nbr']),
    (ma_wk, ['item_nbr','store_nbr']),
    (ma_mo, ['item_nbr','store_nbr']),
    (ma_dw, ['item_nbr','store_nbr','dow']),
    (ma_dm, ['item_nbr','store_nbr','dom']),
]

# merge moving averages onto test df
for ma_frame, ma_keys in ma_joins:
    test = test.merge(ma_frame, how='left', on=ma_keys)

# merge moving averages onto train df
for ma_frame, ma_keys in ma_joins:
    train = train.merge(ma_frame, how='left', on=ma_keys)

# drop the helper references so a later `del` of the ma_* tables frees their memory
del ma_joins, ma_frame, ma_keys

In [14]:
del ma_is, ma_wk, ma_dw, ma_dm, ma_mo

In [28]:
train.to_feather(f'{PATH}rf_rnn/train_w_averages')
test.to_feather(f'{PATH}rf_rnn/test_w_averages')

## Calculate averages for train/test

In [31]:
# average sales * day-of-week multiplier
# ('mais' is the moving average of unit_sales; madw / mawk rescales it by how
# this weekday compares with the item/store's mean across weekdays)
train['avg_dow'] = train.mais
pos_idx = train['mawk'] > 0  # avoid division by zero error
train_pos = train.loc[pos_idx]
train.loc[pos_idx, 'avg_dow'] = train_pos['mais'] * train_pos['madw'] / train_pos['mawk']
# Assign the result back: fillna(inplace=True) on `train.loc[:, col]` acts on a
# temporary object and is not guaranteed to update `train`.
train['avg_dow'] = train['avg_dow'].fillna(0)
train.drop(['mawk', 'madw', 'dow'], axis=1, inplace=True)

In [32]:
# average sales * day-of-month multiplier
# ('mais' is the moving average of unit_sales; madm / mamo rescales it by how
# this day of the month compares with the item/store's mean across days)
train['avg_dom'] = train.mais
pos_idx = train['mamo'] > 0  # avoid division by zero error
train_pos = train.loc[pos_idx]
train.loc[pos_idx, 'avg_dom'] = train_pos['mais'] * train_pos['madm'] / train_pos['mamo']
# Assign the result back: fillna(inplace=True) on `train.loc[:, col]` acts on a
# temporary object and is not guaranteed to update `train`.
train['avg_dom'] = train['avg_dom'].fillna(0)
train.drop(['mais', 'mamo', 'madm', 'dom'], axis=1, inplace=True); train.head()

Unnamed: 0,date,store_nbr,item_nbr,unit_sales,onpromotion,avg_dow,avg_dom
0,2016-08-01,1,103520,0.869742,False,0.456681,0.542221
1,2016-08-01,1,103665,0.741276,False,0.515578,0.523228
2,2016-08-01,1,105574,1.124748,False,1.034287,1.000571
3,2016-08-01,1,105575,1.291725,False,1.209559,1.137517
4,2016-08-01,1,105577,0.741276,False,0.380684,0.587766


In [None]:
# average sales * day-of-week multiplier (same computation as for train)
test['avg_dow'] = test.mais
pos_idx = test['mawk'] > 0  # avoid division by zero error
test_pos = test.loc[pos_idx]
test.loc[pos_idx, 'avg_dow'] = test_pos['mais'] * test_pos['madw'] / test_pos['mawk']
# Assign the result back: fillna(inplace=True) on `test.loc[:, col]` acts on a
# temporary object and is not guaranteed to update `test`.
test['avg_dow'] = test['avg_dow'].fillna(0)
test.drop(['mawk', 'madw', 'dow'], axis=1, inplace=True)

In [34]:
# average sales * day-of-month multiplier (same computation as for train)
test['avg_dom'] = test.mais
pos_idx = test['mamo'] > 0  # avoid division by zero error
test_pos = test.loc[pos_idx]
test.loc[pos_idx, 'avg_dom'] = test_pos['mais'] * test_pos['madm'] / test_pos['mamo']
# Assign the result back: fillna(inplace=True) on `test.loc[:, col]` acts on a
# temporary object and is not guaranteed to update `test`.
test['avg_dom'] = test['avg_dom'].fillna(0)
test.drop(['mais', 'mamo', 'madm', 'dom'], axis=1, inplace=True); test.head()

Unnamed: 0,id,date,store_nbr,item_nbr,onpromotion,avg_dow,avg_dom
0,125497040,2017-08-16,1,96995,False,0.08973,0.0
1,125497041,2017-08-16,1,99197,False,0.15175,0.062994
2,125497042,2017-08-16,1,103501,False,0.0,0.0
3,125497043,2017-08-16,1,103520,False,0.503904,0.403077
4,125497044,2017-08-16,1,103665,False,0.629691,0.689135


In [54]:
del pos_idx, train_pos, test_pos

In [35]:
train.to_feather(f'{PATH}rf_rnn/train_w_averages')
test.to_feather(f'{PATH}rf_rnn/test_w_averages')

## Establish baseline (moving averages)

In [18]:
# Root Mean Squared Logarithmic Error
# kaggle definition
# np.square(np.log(y_pred + 1) - np.log(y_true + 1)).mean() ** 0.5

def rmsle(pred, targ):
    """Return the RMSLE of `pred` against `targ`, rounded to 4 decimals.

    Both arguments are expected to hold log1p-transformed values already, so
    this is simply the root of the mean squared difference between them.
    """
    squared_err = np.square(pred - targ)
    return round(math.sqrt(squared_err.mean()), 4)

In [30]:
# testing 25% more for promotion items
# train['m_average_promo'] = train['m_average']
# train.loc[train['onpromotion'] == True, 'm_average_promo'] *= 1.25

# drop m_average_promo columns from train/valid
# train.drop('m_average_promo', axis=1, inplace=True)
# valid.drop('m_average_promo', axis=1, inplace=True)

In [19]:
# The baseline predictions on full training set
# NOTE(review): no cell in this notebook creates a `m_average` column (the
# cells above produce `avg_dow` / `avg_dom`), so this line presumably relies on
# state from an earlier version of the notebook -- confirm before re-running.
rmsle(train.m_average, train.unit_sales)  #=> 0.7158

0.7158

In [23]:
# baseline predictions on full training set w/ promo multiplier
rmsle(train.m_average_promo, train.unit_sales)  #=> 0.7133

0.7133

In [27]:
# baseline predictions on validation set
# NOTE(review): `valid` is only created further down, in the "Separate
# validation set" section, and `m_average` is not created anywhere in this
# notebook -- this cell depends on out-of-order execution.
rmsle(valid.m_average, valid.unit_sales)  #=> 0.5138

0.5138

In [28]:
# baseline predictions on validation set w/ promo multiplier
rmsle(valid.m_average_promo, valid.unit_sales)  #=> 0.5116

0.5116

### Submit test on kaggle

In [42]:
# set unit_sales w/ promo multiplier
# NOTE(review): `m_average` is not created by any cell in this notebook;
# presumably it comes from an earlier version -- confirm before re-running.
test['unit_sales'] = test['m_average']
# the 1.25 factor is applied to the log1p-scale value (expm1 is only taken below)
test.loc[test['onpromotion'] == True, 'unit_sales'] *= 1.25

# need to convert unit_sales back from log1p
# (`pd.np` was removed in pandas 2.0, so call numpy directly)
test['unit_sales'] = np.expm1(test['unit_sales']) # restoring unit values

In [49]:
SUBM = f'{PATH}rf_rnn/subm/'
os.makedirs(SUBM, exist_ok=True)

test.to_csv(f'{SUBM}m_average_promo.csv.gz', columns=['id','unit_sales'], index=False, compression='gzip')

In [48]:
!kaggle competitions submit -c favorita-grocery-sales-forecasting -f {SUBM}m_average_promo.csv.gz -m "moving averages w/ promo multiplier"
#=> 0.537

Successfully submitted to Corporación Favorita Grocery Sales Forecasting

### Separate validation set

In [37]:
# last month
valid = train.loc[train.date>='2017-07-15']

In [38]:
# drop valid from train
# Rows are excluded by index label instead of slicing off the last len(valid)
# rows: `train[:-0]` would empty the frame when valid is empty, and the slice
# silently assumes the validation rows are exactly the trailing rows.
train = train.loc[~train.index.isin(valid.index)]; train.tail()

Unnamed: 0,date,store_nbr,item_nbr,unit_sales,onpromotion,m_average
75626563,2017-07-14,52,2122818,0.0,False,0.0
75626564,2017-07-14,52,2011459,0.0,False,0.0
75626565,2017-07-14,52,2126944,0.0,False,0.0
75626566,2017-07-14,52,2123839,0.0,False,0.0
75626567,2017-07-14,52,2011451,0.0,False,0.0


In [39]:
valid = valid.reset_index(drop=True); valid.head()

Unnamed: 0,date,store_nbr,item_nbr,unit_sales,onpromotion,m_average
0,2017-07-15,1,103520,0.693147,False,0.643326
1,2017-07-15,1,103665,0.0,False,1.343232
2,2017-07-15,1,105574,1.94591,False,1.507704
3,2017-07-15,1,105575,2.397895,False,2.322006
4,2017-07-15,1,105577,0.0,False,0.541284


## Feature Engineering

In [27]:
train = pd.read_feather(f'{PATH}rf_rnn/train_w_averages')
test = pd.read_feather(f'{PATH}rf_rnn/test_w_averages')

In [None]:
# NOTE: need to convert dates back to datetime?!
train['date'] = train['date'].astype('datetime64[ns]')
test['date'] = test['date'].astype('datetime64[ns]')

In [7]:
items = pd.read_csv(f'{PATH}items.csv', low_memory=False)
stores = pd.read_csv(f'{PATH}stores.csv', low_memory=False)
holidays = pd.read_csv(f'{PATH}holidays_events.csv', parse_dates=['date'], low_memory=False)

### Stores

In [43]:
stores.head()

Unnamed: 0,store_nbr,city,state,type,cluster
0,1,Quito,Pichincha,D,13
1,2,Quito,Pichincha,D,13
2,3,Quito,Pichincha,D,8
3,4,Quito,Pichincha,D,9
4,5,Santo Domingo,Santo Domingo de los Tsachilas,D,4


In [29]:
train = pd.merge(train, stores, how='left', on=['store_nbr'])
test = pd.merge(test, stores, how='left', on=['store_nbr'])

In [30]:
del stores

### Holidays

In [65]:
# looks like we only need to worry about local holidays...
holidays[(holidays['date'] >= '2017-08-16') & (holidays['date'] <= '2017-08-31')]

Unnamed: 0,date,type,locale,locale_name,description,transferred
327,2017-08-24,Holiday,Local,Ambato,Fundacion de Ambato,False


In [11]:
# Local holidays whose `transferred` flag is False, reduced to one row per
# (date, city). The uniqueness matters: this table is left-merged onto
# train/test on ['date', 'city'], and a repeated key would duplicate sales rows.
local_holidays = (
    holidays.loc[
        (holidays['locale'] == 'Local') & (holidays['transferred'] == False),
        ['date', 'locale_name'],
    ]
    .rename(columns={"locale_name": "city"})
    .drop_duplicates(['date', 'city'])
    .assign(holiday=True)
)
local_holidays.head()

Unnamed: 0,date,city,holiday
0,2012-03-02,Manta,True
2,2012-04-12,Cuenca,True
3,2012-04-14,Libertad,True
4,2012-04-21,Riobamba,True
5,2012-05-12,Puyo,True


In [31]:
train = pd.merge(train, local_holidays, how='left', on=['date','city'])
test = pd.merge(test, local_holidays, how='left', on=['date','city'])

In [32]:
# Rows with no matching local holiday come back as NaN from the left merge.
# Assign the filled column back: fillna(inplace=True) on `df.loc[:, col]` acts
# on a temporary object and is not guaranteed to update the frame.
train['holiday'] = train['holiday'].fillna(False).astype(bool) # fill NaNs
test['holiday'] = test['holiday'].fillna(False).astype(bool)

In [34]:
del holidays, local_holidays

### Items

In [18]:
items.head()

Unnamed: 0,item_nbr,family,class,perishable
0,96995,GROCERY I,1093,0
1,99197,GROCERY I,1067,0
2,103501,CLEANING,3008,0
3,103520,GROCERY I,1028,0
4,103665,BREAD/BAKERY,2712,1


In [35]:
# Attach item metadata (family, class, perishable) to both frames.
# The test merge must run too: later cells treat 'family', 'class' and
# 'perishable' as categorical columns of `test` (apply_cats) and predict on them.
train = pd.merge(train, items, how='left', on=['item_nbr'])
test = pd.merge(test, items, how='left', on=['item_nbr'])

In [36]:
pd.isnull(train).any()

date           False
store_nbr      False
item_nbr       False
unit_sales     False
onpromotion    False
avg_dow        False
avg_dom        False
city           False
state          False
type           False
cluster        False
holiday        False
family         False
class          False
perishable     False
dtype: bool

In [37]:
del items

In [38]:
train.to_feather(f'{PATH}rf_rnn/train_w_features')
test.to_feather(f'{PATH}rf_rnn/test_w_features')

## Data prep - handle categorical variables

In [45]:
# train = pd.read_feather(f'{PATH}rf_rnn/train_w_features')
test = pd.read_feather(f'{PATH}rf_rnn/test_w_features')

In [47]:
# add date info
add_datepart(train, 'date')
add_datepart(test, 'date')

In [55]:
test.columns.values

array(['id', 'store_nbr', 'item_nbr', 'onpromotion', 'avg_dow', 'avg_dom', 'city', 'state', 'type', 'cluster',
       'holiday', 'family', 'class', 'perishable', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
       'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start'], dtype=object)

In [54]:
test.drop(['Elapsed', 'Year'], axis=1, inplace=True)
train.drop(['Elapsed', 'Year'], axis=1, inplace=True)

In [56]:
cat_vars = ['store_nbr', 'item_nbr', 'onpromotion', 'city', 'state', 'type', 'cluster',
            'holiday', 'family', 'class', 'perishable']

In [57]:
for v in cat_vars:
    train[v] = train[v].astype('category').cat.as_ordered()

In [58]:
apply_cats(test, train)

In [64]:
# train has been reduced to last 3.5 months -> 2017-05-01
train.to_feather(f'{PATH}rf_rnn/train_w_categories')
test.to_feather(f'{PATH}rf_rnn/test_w_categories')

## Separate target from df

In [70]:
train = pd.read_feather(f'{PATH}rf_rnn/train_w_categories')
# test = pd.read_feather(f'{PATH}rf_rnn/test_w_categories')

In [65]:
df, y, nas, mapper = proc_df(train, 'unit_sales', do_scale=True)

## Split training/validation

In [113]:
# Using Scikit-learn to split data into training and testing sets
# from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
# x_trn, x_val, y_trn, y_val = train_test_split(df, y, test_size = 0.20, random_state = 42)

# good validation set => 2017-7-26 -> 2017-8-9
# IDs => [18743184:22012344] (18743184 - 22012343)
# len(test) => 3370464

def split_val(df,a,b):
    """Split `df` by row position into (training rows, validation rows).

    Rows at positions a:b become the validation frame and every other row, in
    the original order, becomes the training frame. Both are copies.

    Rows are excluded by position, not by index label, so a frame whose index
    contains repeated labels loses exactly the rows in a:b and no others.
    """
    in_val = np.zeros(len(df), dtype=bool)
    in_val[a:b] = True  # same slice semantics as df.iloc[a:b]
    val = df.iloc[a:b].copy()
    trn = df.iloc[~in_val].copy()
    return trn, val

x_trn,x_val = split_val(df,18743184,22012344)

def split_val_arr(arr,a,b):
    """Split `arr` along its first axis into (training part, validation part).

    Elements a:b form the validation part (a copy); the remainder forms the
    training part. `axis=0` is passed to np.delete so that arrays with more
    than one dimension keep their shape -- without it np.delete flattens the
    input first. One-dimensional input behaves exactly as before.
    """
    val = arr[a:b].copy()
    trn = np.delete(arr, slice(a,b), axis=0)
    return trn, val

y_trn,y_val = split_val_arr(y,18743184,22012344)

In [114]:
x_trn.shape, x_val.shape, y_trn.shape, y_val.shape

((20050848, 24), (3269160, 24), (20050848,), (3269160,))

## Train model

In [120]:
def rmsle(pred, targ):
    """Root mean squared error between two log1p-scale arrays, rounded to 4 decimals.

    Because `pred` and `targ` are already log1p values, this equals the RMSLE
    of the underlying unit sales.
    """
    residual = pred - targ
    mse = np.square(residual).mean()
    rmse = math.sqrt(mse)
    return round(rmse, 4)

def print_score(m):
    """Print [train RMSLE, valid RMSLE, train R^2, valid R^2] for fitted model `m`.

    Uses the notebook-level splits x_trn/y_trn and x_val/y_val. If the model
    exposes `oob_score_`, that value is appended (unrounded) to the list.
    """
    splits = ((x_trn, y_trn), (x_val, y_val))
    errors = [rmsle(m.predict(xs), ys) for xs, ys in splits]
    r2_scores = [round(m.score(xs, ys), 4) for xs, ys in splits]
    res = errors + r2_scores
    if hasattr(m, 'oob_score_'):
        res.append(m.oob_score_)
    print(res)
    
# m.score => coefficient of determination (R^2)
#   proportion of the variance in the dependent variable that is predictable from the independent variable(s)
#   ratio of how much better the model is than the mean prediction (0);  1: perfect;  -*: worse than the mean

In [117]:
from sklearn.ensemble import RandomForestRegressor

In [118]:
set_rf_samples(50000)

In [119]:
m = RandomForestRegressor()
%time m.fit(x_trn, y_trn)
print_score(m)    #=> [0.2952, 0.2656, 0.67486495050828088, 0.73733977357618774]

CPU times: user 14.5 s, sys: 5.22 s, total: 19.7 s
Wall time: 20.9 s
[0.2952, 0.2656, 0.67486495050828088, 0.73733977357618774]


In [121]:
m = RandomForestRegressor(n_estimators=20, min_samples_leaf=3, n_jobs=-1)
%time m.fit(x_trn, y_trn)
print_score(m)    #=> [0.2857, 0.2547, 0.6956, 0.75860000000000005]

CPU times: user 37.2 s, sys: 8.15 s, total: 45.3 s
Wall time: 12.9 s
[0.2857, 0.2547, 0.6956, 0.75860000000000005]


In [122]:
m = RandomForestRegressor(n_estimators=20, min_samples_leaf=3, max_features=0.5, n_jobs=-1)
%time m.fit(x_trn, y_trn)
print_score(m)    #=> [0.2856, 0.2536, 0.69569999999999999, 0.76060000000000005]

CPU times: user 23.7 s, sys: 8.77 s, total: 32.4 s
Wall time: 11.9 s
[0.2856, 0.2536, 0.69569999999999999, 0.76060000000000005]


In [123]:
m = RandomForestRegressor(n_estimators=50, min_samples_leaf=5, max_features=0.5, n_jobs=-1)
%time m.fit(x_trn, y_trn)
print_score(m)    #=> [0.2825, 0.2498, 0.70220000000000005, 0.76770000000000005]

CPU times: user 55.6 s, sys: 15.2 s, total: 1min 10s
Wall time: 16.7 s
[0.2825, 0.2498, 0.70220000000000005, 0.76770000000000005]


In [124]:
reset_rf_samples()
set_rf_samples(250000)

m = RandomForestRegressor(n_estimators=50, min_samples_leaf=5, max_features=0.5, n_jobs=-1)
%time m.fit(x_trn, y_trn)
print_score(m)    #=> [0.2769, 0.2488, 0.71409999999999996, 0.76949999999999996]

CPU times: user 4min 6s, sys: 16.2 s, total: 4min 22s
Wall time: 43.2 s
[0.2769, 0.2488, 0.71409999999999996, 0.76949999999999996]


## Variable importances

In [125]:
# Column names of the training frame and the fitted model's importances
features = list(x_trn.columns)
importances = list(m.feature_importances_)

# (feature, importance rounded to 2 decimals) pairs, most important first;
# the sort is stable, so ties keep their column order
feature_importances = sorted(
    ((feat_name, round(feat_score, 2)) for feat_name, feat_score in zip(features, importances)),
    key=lambda pair: pair[1],
    reverse=True,
)

# Print out the feature and importances
for feat_pair in feature_importances:
    print('Variable: {:20} Importance: {}'.format(*feat_pair))

Variable: avg_dom              Importance: 0.54
Variable: avg_dow              Importance: 0.27
Variable: onpromotion          Importance: 0.04
Variable: item_nbr             Importance: 0.03
Variable: store_nbr            Importance: 0.02
Variable: class                Importance: 0.02
Variable: city                 Importance: 0.01
Variable: state                Importance: 0.01
Variable: type                 Importance: 0.01
Variable: cluster              Importance: 0.01
Variable: family               Importance: 0.01
Variable: Week                 Importance: 0.01
Variable: Day                  Importance: 0.01
Variable: Dayofweek            Importance: 0.01
Variable: Dayofyear            Importance: 0.01
Variable: holiday              Importance: 0.0
Variable: perishable           Importance: 0.0
Variable: Month                Importance: 0.0
Variable: Is_month_end         Importance: 0.0
Variable: Is_month_start       Importance: 0.0
Variable: Is_quarter_end       Importance: 0.

## Feature Reduction

In [None]:
least_important_features = [feature[0] for feature in feature_importances[15:]]

x_trn.drop(least_important_features, axis=1, inplace=True)
x_val.drop(least_important_features, axis=1, inplace=True)

In [140]:
reset_rf_samples()
set_rf_samples(500000)

m = RandomForestRegressor(n_estimators=50, min_samples_leaf=5, max_features=0.5, n_jobs=-1)
%time m.fit(x_trn, y_trn)
print_score(m)
#=> [0.2769, 0.2488, 0.71409999999999996, 0.76949999999999996]
#=> [0.2732, 0.2487, 0.72150000000000003, 0.76970000000000005]

CPU times: user 6min 46s, sys: 17.9 s, total: 7min 4s
Wall time: 1min 4s
[0.2732, 0.2487, 0.72150000000000003, 0.76970000000000005]


In [141]:
# Column names of the reduced training frame and the refitted model's importances
features = list(x_trn.columns)
importances = list(m.feature_importances_)

# (feature, importance rounded to 2 decimals) pairs, most important first;
# the sort is stable, so ties keep their column order
feature_importances = sorted(
    ((feat_name, round(feat_score, 2)) for feat_name, feat_score in zip(features, importances)),
    key=lambda pair: pair[1],
    reverse=True,
)

# Print out the feature and importances
for feat_pair in feature_importances:
    print('Variable: {:20} Importance: {}'.format(*feat_pair))

Variable: avg_dom              Importance: 0.58
Variable: avg_dow              Importance: 0.23
Variable: item_nbr             Importance: 0.03
Variable: onpromotion          Importance: 0.03
Variable: class                Importance: 0.03
Variable: store_nbr            Importance: 0.01
Variable: city                 Importance: 0.01
Variable: state                Importance: 0.01
Variable: type                 Importance: 0.01
Variable: cluster              Importance: 0.01
Variable: family               Importance: 0.01
Variable: Week                 Importance: 0.01
Variable: Day                  Importance: 0.01
Variable: Dayofweek            Importance: 0.01
Variable: Dayofyear            Importance: 0.01


In [None]:
reset_rf_samples()
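# full dataset (no row subsampling) - needs a machine with more memory/cores; scikit-learn's RandomForestRegressor runs on the CPU, not the GPU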

m = RandomForestRegressor(n_estimators=50, min_samples_leaf=5, max_features=0.5, n_jobs=-1)
%time m.fit(x_trn, y_trn)
print_score(m)    #=> [0.2769, 0.2488, 0.71409999999999996, 0.76949999999999996]

## Feature Reduction / Data Expansion

In [7]:
train = pd.read_feather(f'{PATH}rf_rnn/train_w_categories')

In [131]:
# Extract the names of the least important features
# least_important_features = [feature[0] for feature in feature_importances[15:]]

# Columns from position 11 up to (not including) the last one are dropped.
least_important_features = list(train.columns)[11:-1]
# Also drop 'Year' / 'Month', but only when the frame actually has them and
# they are not already listed: an earlier cell removes 'Year' before
# train_w_categories is saved, and drop(columns=...) raises KeyError on a
# missing label.
least_important_features.extend(
    col for col in ['Year', 'Month']
    if col in train.columns and col not in least_important_features
)

In [135]:
train.drop(columns=least_important_features, inplace=True)

In [15]:
df, y, nas = proc_df(train, 'unit_sales', subset=30000000)

In [16]:
del train  # need to save memory

In [19]:
# Using Scikit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
x_trn, x_val, y_trn, y_val = train_test_split(df, y, test_size = 0.20, random_state = 42)

In [20]:
del df,y

In [23]:
m = RandomForestRegressor(n_jobs=-1)
%time m.fit(x_trn, y_trn)

CPU times: user 43min 26s, sys: 8.57 s, total: 43min 34s
Wall time: 8min 20s


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [24]:
print_score(m)   #=> [0.24, 0.5684, 0.946675430780943, 0.7008898192311388]

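# training error improved (0.2769 -> 0.2401, more data), but validation RMSLE is much worse than in
# the earlier runs (0.2488 -> 0.5682) -- possibly overfitting; note this split is random, not time-based
# large increase in duration (feature reduction & split over multiple cores)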

[0.2401, 0.5682, 0.9466230013003557, 0.701200428528316]


### Apply to Test dataset

In [63]:
test = pd.read_feather(f'{PATH}rf_rnn/test_w_categories')

In [64]:
test.drop(columns=least_important_features, inplace=True)

In [65]:
# add in y value to test: 'unit sales'
test['unit_sales'] = 0.0

df_test, _, nas = proc_df(test, 'unit_sales', skip_flds=['id'], na_dict=nas)

In [66]:
# use rf to predict on test df
log_preds = m.predict(df_test)

In [67]:
test['unit_sales'] = np.expm1(log_preds) # re-scale predictions and add to df

In [68]:
SUBM = f'{PATH}rf_rnn/subm/'
os.makedirs(SUBM, exist_ok=True)

test.to_csv(f'{SUBM}rf_v3.csv.gz', columns=['id','unit_sales'], index=False, compression='gzip')

In [69]:
!kaggle competitions submit -c favorita-grocery-sales-forecasting -f {SUBM}rf_v3.csv.gz -m "random forest version 3"
# v1 => 1.223 -- throwaway
# v2 => 0.581
# v3 => 0.865 (last 3 months) -- throwaway
# v4 => 0.544 (last 3 months)

Successfully submitted to Corporación Favorita Grocery Sales Forecasting

## Train only on last ~3 months

In [40]:
train = pd.read_feather(f'{PATH}rf_rnn/train_w_categories')

In [54]:
# Extract the names of the least important features
# least_important_features = [feature[0] for feature in feature_importances[9:]]

# Columns from position 11 up to (not including) the last one are dropped.
least_important_features = list(train.columns)[11:-1]
# Also drop 'Year', but only when the frame actually has it and it is not
# already listed: an earlier cell removes 'Year' before train_w_categories is
# saved, and drop(columns=...) raises KeyError on a missing label.
least_important_features.extend(
    col for col in ['Year']
    if col in train.columns and col not in least_important_features
)

In [55]:
train.drop(columns=least_important_features, inplace=True)

In [59]:
df, y, nas, mapper = proc_df(train, 'unit_sales', do_scale=True)

In [60]:
del train  # need to save memory

In [61]:
m = RandomForestRegressor(n_estimators=100, min_samples_leaf=50, n_jobs=-1)
%time m.fit(df, y)

CPU times: user 4h 23min 52s, sys: 20.7 s, total: 4h 24min 13s
Wall time: 34min 24s


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=50, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

### Apply to Test Data

In [None]:
test = pd.read_feather(f'{PATH}rf_rnn/test_w_categories')

In [None]:
test.drop(columns=least_important_features, inplace=True)

In [None]:
# add in y value to test: 'unit sales'
test['unit_sales'] = 0.0

In [None]:
df_test, _, nas, mapper = proc_df(test, 'unit_sales', skip_flds=['id'], na_dict=nas, do_scale=True, mapper=mapper)

In [None]:
test['unit_sales'] = np.expm1(m.predict(df_test)) # re-scale predictions and add to df

In [None]:
SUBM = f'{PATH}rf_rnn/subm/'
os.makedirs(SUBM, exist_ok=True)

In [None]:
test.to_csv(f'{SUBM}rf_v4.csv.gz', columns=['id','unit_sales'], index=False, compression='gzip')

In [None]:
!kaggle competitions submit -c favorita-grocery-sales-forecasting -f {SUBM}rf_v4.csv.gz -m "random forest version 4"
# v4 => 0.544 (last 3 months)