In [34]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [35]:
from fastai.structured import *
from fastai.column_data import *
np.set_printoptions(threshold=50, edgeitems=25)

PATH='data/favorita/'

In [36]:
!ls {PATH}

holidays_events.csv  oil.csv		    stores.csv	train.csv
items.csv	     rf_rnn		    test.csv	transactions.csv
models		     sample_submission.csv  tmp


In [37]:
from IPython.display import HTML

# Standard ML workflow:

1. State the question and determine required data
2. Acquire the data in an accessible format
3. Identify and correct missing data points/anomalies as required
4. Prepare the data for the machine learning model
5. Establish a baseline model that you aim to exceed
6. Train the model on the training data
7. Make predictions on the test data
8. Compare predictions to the known test set targets and calculate performance metrics
9. If performance is not satisfactory, adjust the model, acquire more data, or try a different modeling technique
10. Interpret model and report results visually and numerically

## Moving averages
https://www.kaggle.com/paulorzp/log-ma-and-days-of-week-means-lb-0-529/code

In [5]:
# Column dtypes handed to pd.read_csv when loading the Favorita CSV files.
dtypes = dict(
    id='uint32',
    item_nbr='uint32',
    store_nbr='uint8',
    unit_sales='float32',
    onpromotion=np.dtype('bool'),
)

In [6]:
# Load only the tail of train.csv: the skipped rows are the ones dated before
# 2016-08-01 (row 0, the header, is kept).
train = pd.read_csv(
    f'{PATH}train.csv',
    index_col='id',
    dtype=dtypes,
    parse_dates=['date'],
    skiprows=range(1, 86672217),
)

  mask |= (ar1 == a)


In [7]:
# Build one record for every (date, store, item) combination so that days
# without a recorded sale are included when daily unit-sales averages are
# computed.
u_dates = train.date.unique()
u_stores = train.store_nbr.unique()
u_items = train.item_nbr.unique()
train.set_index(['date', 'store_nbr', 'item_nbr'], inplace=True)
train = train.reindex(
    pd.MultiIndex.from_product(
        (u_dates, u_stores, u_items),
        names=['date','store_nbr','item_nbr']
    )
).reset_index()

del u_dates, u_stores, u_items

# Rows introduced by the reindex carry NaN: treat them as "nothing sold, not on
# promotion". The result is assigned back instead of calling
# fillna(inplace=True) on a .loc selection, because that selection can be a
# temporary copy, in which case the NaNs would silently stay in `train`.
train['unit_sales'] = train['unit_sales'].fillna(0)
train['onpromotion'] = train['onpromotion'].fillna(False)

# Most recent date in the data. Taken as a max so it does not depend on the
# last row happening to hold the latest date.
lastdate = train['date'].max()

In [8]:
# Floor negative unit_sales at zero, then move to log1p space.
# np.log1p is called directly: the `pd.np` alias used previously is deprecated
# and no longer exists in current pandas releases.
train['unit_sales'] = np.log1p(train['unit_sales'].clip(lower=0))
train['dow'] = train['date'].dt.dayofweek

In [9]:
# Checkpoint the frame to feather (creating the rf_rnn directory if needed) so
# it can be reloaded without repeating the cells above.
os.makedirs(f'{PATH}rf_rnn', exist_ok=True)
train.to_feather(f'{PATH}rf_rnn/train_full_year')

In [None]:
# Reload the checkpointed frame and recompute the most recent date.
# max() is used so the result does not rely on the rows being ordered by date.
train = pd.read_feather(f'{PATH}rf_rnn/train_full_year')
lastdate = train['date'].max()

In [10]:
# Day-of-week means
# By tarobxl: https://www.kaggle.com/c/favorita-grocery-sales-forecasting/discussion/42948

# madw: mean unit_sales for each (item, store, day-of-week) group.
ma_dw = (
    train
    .groupby(['item_nbr', 'store_nbr', 'dow'])['unit_sales']
    .mean()
    .rename('madw')
    .reset_index()
)

# mawk: mean of the madw values for each (store, item) pair.
ma_wk = (
    ma_dw
    .groupby(['store_nbr', 'item_nbr'])['madw']
    .mean()
    .rename('mawk')
    .reset_index()
)

In [11]:
from datetime import timedelta

# Moving averages of unit_sales per (item, store): the mean over the whole
# frame ('mais') plus the means over the last 112, 56, 28, 14, 7, 3 and 1 days
# counted back from `lastdate`.
ma_is = train[['item_nbr','store_nbr','unit_sales']].groupby(
        ['item_nbr','store_nbr'])['unit_sales'].mean().to_frame('mais')

for i in [112,56,28,14,7,3,1]:
    tmp = train[train.date>lastdate-timedelta(int(i))]
    tmpg = tmp.groupby(['item_nbr','store_nbr'])['unit_sales'].mean().to_frame('mais'+str(i))
    ma_is = ma_is.join(tmpg, how='left')

del tmp,tmpg

# Collapse the eight averages into a single value: their row-wise median.
ma_is['mais']=ma_is.median(axis=1)

# Keep only the two key columns and the median. The columns are selected by
# name because passing `axis` positionally to DataFrame.drop (as the previous
# `drop(cols, 1, inplace=True)` did) is rejected by current pandas.
ma_is = ma_is.reset_index()[['item_nbr', 'store_nbr', 'mais']]
ma_is.head()

Unnamed: 0,item_nbr,store_nbr,mais
0,96995,1,0.126638
1,96995,2,0.024755
2,96995,3,0.355917
3,96995,4,0.124828
4,96995,5,0.118639


In [12]:
# Read the test set and tag every row with its day of week.
test = (
    pd.read_csv(f'{PATH}test.csv', dtype=dtypes, parse_dates=['date'])
    .assign(dow=lambda frame: frame['date'].dt.dayofweek)
)

In [13]:
# Left-join the three moving-average tables onto the test frame:
# mais and mawk by (item, store), madw by (item, store, day of week).
test = (
    test
    .merge(ma_is, how='left', on=['item_nbr', 'store_nbr'])
    .merge(ma_wk, how='left', on=['item_nbr', 'store_nbr'])
    .merge(ma_dw, how='left', on=['item_nbr', 'store_nbr', 'dow'])
)

# Same three joins for the training frame.
train = (
    train
    .merge(ma_is, how='left', on=['item_nbr', 'store_nbr'])
    .merge(ma_wk, how='left', on=['item_nbr', 'store_nbr'])
    .merge(ma_dw, how='left', on=['item_nbr', 'store_nbr', 'dow'])
)

In [14]:
del ma_is, ma_wk, ma_dw

## Calculate m_average for train/test

In [15]:
# m_average = mais * madw / mawk wherever mawk > 0, otherwise plain mais;
# anything still missing becomes 0.
# Series.where keeps the ratio only on rows with mawk > 0, so rows with a zero
# or missing mawk never receive a divided value. The result is assigned back
# (rather than calling fillna(inplace=True) on a .loc selection, which may act
# on a temporary copy), and no frame-sized intermediate copy is left behind.
ratio = train['mais'] * train['madw'] / train['mawk']
train['m_average'] = ratio.where(train['mawk'] > 0, train['mais']).fillna(0)
del ratio

# The helper columns are no longer needed once m_average exists.
train.drop(columns=['mais', 'mawk', 'madw', 'dow'], inplace=True)
train.head()

Unnamed: 0,date,store_nbr,item_nbr,unit_sales,onpromotion,m_average
0,2016-08-01,1,103520,1.386294,False,0.681978
1,2016-08-01,1,103665,1.098612,False,0.810734
2,2016-08-01,1,105574,2.079442,False,1.83289
3,2016-08-01,1,105575,2.639057,False,2.371175
4,2016-08-01,1,105577,1.098612,False,0.551112


In [16]:
# m_average = mais * madw / mawk wherever mawk > 0, otherwise plain mais;
# anything still missing becomes 0.
# Series.where keeps the ratio only on rows with mawk > 0, so rows with a zero
# or missing mawk never receive a divided value. The result is assigned back
# (rather than calling fillna(inplace=True) on a .loc selection, which may act
# on a temporary copy).
ratio = test['mais'] * test['madw'] / test['mawk']
test['m_average'] = ratio.where(test['mawk'] > 0, test['mais']).fillna(0)
del ratio

# The helper columns are no longer needed once m_average exists.
test.drop(columns=['mais', 'mawk', 'madw', 'dow'], inplace=True)
test.head()

Unnamed: 0,id,date,store_nbr,item_nbr,onpromotion,m_average
0,125497040,2017-08-16,1,96995,False,0.114198
1,125497041,2017-08-16,1,99197,False,0.217388
2,125497042,2017-08-16,1,103501,False,0.0
3,125497043,2017-08-16,1,103520,False,0.763717
4,125497044,2017-08-16,1,103665,False,1.012761


In [17]:
train.to_feather(f'{PATH}rf_rnn/train_w_averages')
test.to_feather(f'{PATH}rf_rnn/test_w_averages')

## Establish baseline (moving averages)

In [18]:
# Root Mean Squared Logarithmic Error
# kaggle definition
# np.square(np.log(y_pred + 1) - np.log(y_true + 1)).mean() ** 0.5

def rmsle(pred, targ):
    """Return the root of the mean squared difference, rounded to 4 decimals.

    Both `pred` and `targ` are expected to hold log1p-transformed values
    already, so no logarithm is taken here.
    """
    gap = pred - targ
    mean_sq = np.square(gap).mean()
    root = math.sqrt(mean_sq)
    return round(root, 4)

In [30]:
# Experiment: scale the baseline by 1.25 for rows that are on promotion.
# m_average is derived from log1p-transformed sales, so the factor is applied
# to the log-scale value rather than to raw unit counts.
# This column has to exist for the cells that score `m_average_promo`;
# with the two lines below commented out those cells raise AttributeError.
train['m_average_promo'] = train['m_average']
train.loc[train['onpromotion'] == True, 'm_average_promo'] *= 1.25

# Optional cleanup once the promo scores have been recorded:
# train.drop('m_average_promo', axis=1, inplace=True)
# valid.drop('m_average_promo', axis=1, inplace=True)

In [19]:
# The baseline predictions on full training set
rmsle(train.m_average, train.unit_sales)  #=> 0.7158

0.7158

In [23]:
# baseline predictions on full training set w/ promo multiplier
rmsle(train.m_average_promo, train.unit_sales)  #=> 0.7133

0.7133

In [27]:
# baseline predictions on validation set
# NOTE: `valid` is only created in the "Separate validation set" section
# further down, so in a top-to-bottom run this cell fails until that section
# has been executed.
rmsle(valid.m_average, valid.unit_sales)  #=> 0.5138

0.5138

In [28]:
# baseline predictions on validation set w/ promo multiplier
# NOTE: needs `valid` (created in the "Separate validation set" section further
# down) to carry an `m_average_promo` column, i.e. that column must be present
# on `train` at the moment `valid` is split off from it.
rmsle(valid.m_average_promo, valid.unit_sales)  #=> 0.5116

0.5116

### Submit test on kaggle

In [42]:
# set unit_sales w/ promo multiplier (values are still in log1p space here)
test['unit_sales'] = test['m_average']
test.loc[test['onpromotion'] == True, 'unit_sales'] *= 1.25

# Convert unit_sales back from log1p to real units.
# np.expm1 is called directly on the column: the `pd.np` alias used previously
# is deprecated and no longer exists in current pandas releases.
test['unit_sales'] = np.expm1(test['unit_sales'])

In [49]:
SUBM = f'{PATH}rf_rnn/subm/'
os.makedirs(SUBM, exist_ok=True)

test.to_csv(f'{SUBM}m_average_promo.csv.gz', columns=['id','unit_sales'], index=False, compression='gzip')

In [48]:
!kaggle competitions submit -c favorita-grocery-sales-forecasting -f {SUBM}m_average_promo.csv.gz -m "moving averages w/ promo multiplier"
#=> 0.537

Successfully submitted to Corporación Favorita Grocery Sales Forecasting

### Separate validation set

In [37]:
# last month
valid = train.loc[train.date>='2017-07-15']

In [38]:
# drop valid from train
# Rows are selected by date (the complement of the `>= '2017-07-15'` filter that
# builds `valid`) instead of slicing off the last len(valid) rows: the slice
# only works when the frame is ordered by date, and `train[:-0]` would return
# an empty frame if `valid` had no rows.
train = train.loc[train.date < '2017-07-15']
train.tail()

Unnamed: 0,date,store_nbr,item_nbr,unit_sales,onpromotion,m_average
75626563,2017-07-14,52,2122818,0.0,False,0.0
75626564,2017-07-14,52,2011459,0.0,False,0.0
75626565,2017-07-14,52,2126944,0.0,False,0.0
75626566,2017-07-14,52,2123839,0.0,False,0.0
75626567,2017-07-14,52,2011451,0.0,False,0.0


In [39]:
valid = valid.reset_index(drop=True); valid.head()

Unnamed: 0,date,store_nbr,item_nbr,unit_sales,onpromotion,m_average
0,2017-07-15,1,103520,0.693147,False,0.643326
1,2017-07-15,1,103665,0.0,False,1.343232
2,2017-07-15,1,105574,1.94591,False,1.507704
3,2017-07-15,1,105575,2.397895,False,2.322006
4,2017-07-15,1,105577,0.0,False,0.541284


## Data prep - handle categorical variables

In [5]:
train = pd.read_feather(f'{PATH}rf_rnn/train_w_averages')
test = pd.read_feather(f'{PATH}rf_rnn/test_w_averages')

In [9]:
# add date info
add_datepart(train, 'date')
add_datepart(test, 'date')

In [16]:
cat_vars = ['store_nbr', 'item_nbr']

In [31]:
for v in cat_vars:
    train[v] = train[v].astype('category').cat.as_ordered()

In [32]:
apply_cats(test, train)

In [37]:
train.to_feather(f'{PATH}rf_rnn/train_w_categories')
test.to_feather(f'{PATH}rf_rnn/test_w_categories')

## Separate target from df

In [5]:
train = pd.read_feather(f'{PATH}rf_rnn/train_w_categories')
# test = pd.read_feather(f'{PATH}rf_rnn/test_w_categories')

In [7]:
# need to run on a sample otherwise memory error...
# A fixed random_state makes the 25% sample, and every score computed from it,
# repeatable from one run to the next.
train = train.sample(frac=0.25, random_state=42)
# train = train[42934968:].reset_index(drop=True)  # 2/15/2017

In [10]:
df, y, nas = proc_df(train, 'unit_sales')

## Split training/validation

In [28]:
# Using scikit-learn to split the data into training and validation sets
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; random_state fixes the shuffle
x_trn, x_val, y_trn, y_val = train_test_split(df, y, test_size = 0.20, random_state = 42)

In [16]:
x_trn.shape, x_val.shape, y_trn.shape, y_val.shape

((16520155, 17), (4130039, 17), (16520155,), (4130039,))

## Train model

In [38]:
def rmsle(pred, targ):
    """Return the root of the mean squared difference, rounded to 4 decimals.

    Both arguments are expected to be log1p values already, so the plain
    difference is squared and averaged without taking any logarithm.
    """
    gap = pred - targ
    mean_sq = np.square(gap).mean()
    root = math.sqrt(mean_sq)
    return round(root, 4)

def print_score(m):
    """Print a list of scores for fitted model `m`.

    The list holds, in order: rmsle on the training split, rmsle on the
    validation split, m.score on the training split, m.score on the validation
    split, and `m.oob_score_` when the model has that attribute. The splits
    are read from the notebook-level x_trn / y_trn / x_val / y_val.
    """
    scores = [
        rmsle(m.predict(x_trn), y_trn),
        rmsle(m.predict(x_val), y_val),
        m.score(x_trn, y_trn),
        m.score(x_val, y_val),
    ]
    if hasattr(m, 'oob_score_'):
        scores.append(m.oob_score_)
    print(scores)
    
# m.score => coefficient of determination (R^2)
#   proportion of the variance in the dependent variable that is predictable from the independent variable(s)
#   ratio of how much better the model is than the mean prediction (0);  1: perfect;  -*: worse than the mean

In [39]:
from sklearn.ensemble import RandomForestRegressor

In [20]:
m = RandomForestRegressor()
%time m.fit(x_trn, y_trn)
print_score(m)    #=> [0.2436, 0.5763, 0.9450707108116795, 0.6923930808790617]

CPU times: user 32min 10s, sys: 5.16 s, total: 32min 15s
Wall time: 32min 15s
[0.2436, 0.5763, 0.9450707108116795, 0.6923930808790617]


## Variable importances

In [21]:
features = list(df.columns)

# Numerical feature importances from the fitted forest
importances = list(m.feature_importances_)

# (name, importance rounded to 2 decimals) pairs, most important first
feature_importances = sorted(
    ((feature, round(importance, 2)) for feature, importance in zip(features, importances)),
    key=lambda pair: pair[1],
    reverse=True,
)

# Print one line per feature. A plain loop is used instead of a list
# comprehension, which would build a throwaway list of None values just to
# call print.
for pair in feature_importances:
    print('Variable: {:20} Importance: {}'.format(*pair))

Variable: m_average            Importance: 0.62
Variable: item_nbr             Importance: 0.15
Variable: store_nbr            Importance: 0.07
Variable: Elapsed              Importance: 0.05
Variable: onpromotion          Importance: 0.03
Variable: Day                  Importance: 0.03
Variable: Dayofyear            Importance: 0.03
Variable: Dayofweek            Importance: 0.02
Variable: Week                 Importance: 0.01
Variable: Year                 Importance: 0.0
Variable: Month                Importance: 0.0
Variable: Is_month_end         Importance: 0.0
Variable: Is_month_start       Importance: 0.0
Variable: Is_quarter_end       Importance: 0.0
Variable: Is_quarter_start     Importance: 0.0
Variable: Is_year_end          Importance: 0.0
Variable: Is_year_start        Importance: 0.0


## Feature Reduction / Data Expansion

In [7]:
train = pd.read_feather(f'{PATH}rf_rnn/train_w_categories')

In [8]:
# Names of the columns to discard before training.
# Alternative that derives them from the importances list instead:
# least_important_features = [feature[0] for feature in feature_importances[9:]]

# Columns at positions 11 up to (not including) the last one, plus Year and Month.
least_important_features = [*train.columns[11:-1], 'Year', 'Month']

In [9]:
train.drop(columns=least_important_features, inplace=True)

In [15]:
df, y, nas = proc_df(train, 'unit_sales', subset=30000000)

In [16]:
del train  # need to save memory

In [19]:
# Using scikit-learn to split the data into training and validation sets
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; random_state fixes the shuffle
x_trn, x_val, y_trn, y_val = train_test_split(df, y, test_size = 0.20, random_state = 42)

In [20]:
del df,y

In [23]:
m = RandomForestRegressor(n_jobs=-1)
%time m.fit(x_trn, y_trn)

CPU times: user 43min 26s, sys: 8.57 s, total: 43min 34s
Wall time: 8min 20s


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [24]:
print_score(m)   #=> [0.24, 0.5684, 0.946675430780943, 0.7008898192311388]

# slight increase in accuracy (increased data)
# large decrease in wall-clock duration (feature reduction & split over multiple cores):
# 32min 15s for the earlier single-core fit vs 8min 20s here

[0.2401, 0.5682, 0.9466230013003557, 0.701200428528316]


### Apply to Test dataset

In [63]:
test = pd.read_feather(f'{PATH}rf_rnn/test_w_categories')

In [64]:
test.drop(columns=least_important_features, inplace=True)

In [65]:
# add in y value to test: 'unit sales'
test['unit_sales'] = 0.0

df_test, _, nas = proc_df(test, 'unit_sales', skip_flds=['id'], na_dict=nas)

In [66]:
# use rf to predict on test df
log_preds = m.predict(df_test)

In [67]:
test['unit_sales'] = np.expm1(log_preds) # re-scale predictions and add to df

In [68]:
SUBM = f'{PATH}rf_rnn/subm/'
os.makedirs(SUBM, exist_ok=True)

test.to_csv(f'{SUBM}rf_v3.csv.gz', columns=['id','unit_sales'], index=False, compression='gzip')

In [69]:
!kaggle competitions submit -c favorita-grocery-sales-forecasting -f {SUBM}rf_v3.csv.gz -m "random forest version 3"
# v1 => 1.223 -- throwaway
# v2 => 0.581
# v3 =>  (last 3 months)

Successfully submitted to Corporación Favorita Grocery Sales Forecasting

## Train only on last ~3 months

In [40]:
train = pd.read_feather(f'{PATH}rf_rnn/train_w_categories')

In [54]:
# Names of the columns to discard before training.
# Alternative that derives them from the importances list instead:
# least_important_features = [feature[0] for feature in feature_importances[9:]]

# Columns at positions 11 up to (not including) the last one, plus Year.
least_important_features = [*train.columns[11:-1], 'Year']

In [55]:
train.drop(columns=least_important_features, inplace=True)

In [59]:
# With do_scale=True this call unpacks a fourth value, `mapper`, next to df, y and nas.
# NOTE(review): presumably `mapper` carries the fitted scaling and must be passed
# back to proc_df for any frame this model later predicts on — confirm against
# the fastai proc_df documentation.
# NOTE(review): no date filter is visible before this call, although the section
# is titled "last ~3 months" — confirm which rows are meant to be used.
df, y, nas, mapper = proc_df(train, 'unit_sales', do_scale=True)

In [60]:
del train  # need to save memory

In [61]:
m = RandomForestRegressor(n_estimators=100, min_samples_leaf=50, n_jobs=-1)
%time m.fit(df, y)

CPU times: user 4h 23min 52s, sys: 20.7 s, total: 4h 24min 13s
Wall time: 34min 24s


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=50, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)