# 1 Imports

In [1]:
# Load the autoreload extension and enable mode 2 for this session.
%load_ext autoreload
%autoreload 2

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
from fastai.imports import *
from fastai.structured import *

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from IPython.display import display

from sklearn import metrics

  from numpy.core.umath_tests import inner1d


In [3]:
# Data directory: the CSVs are read from here and cached frames are written to its tmp/ subfolder.
PATH = "data/grocery-sales/"

In [4]:
# List the files available in the data directory.
!ls {PATH}

holidays_events.csv
items.csv
oil.csv
sample_submission.csv
stores.csv
test.csv
tmp
train.csv
transactions.csv


# 2 Read data

In [5]:
# Explicit column dtypes passed to pd.read_csv for both train.csv and test.csv.
# 'onpromotion' is read as 'object' and converted to bool after loading.
# NOTE(review): the narrow numeric dtypes are presumably chosen to reduce memory — confirm.
types = {'id': 'int64',
         'item_nbr': 'int32',
         'store_nbr': 'int8',
         'unit_sales': 'float32',
         'onpromotion': 'object'}

In [6]:
%%time
# Read the full training set, parsing 'date' as datetime and applying the dtypes declared in `types`.
df_all = pd.read_csv(f'{PATH}train.csv', parse_dates=['date'], dtype=types, infer_datetime_format=True)

Wall time: 2min 24s


In [7]:
df_all.onpromotion.fillna(False, inplace=True)
df_all.onpromotion = df_all.onpromotion.map({'False': False, 'True': True})
df_all.onpromotion = df_all.onpromotion.astype(bool)

%time
df_all.to_feather(f'{PATH}tmp/raw_groceries')

Wall time: 0 ns


In [8]:
%time
df_all.describe(include='all')

Wall time: 0 ns


Unnamed: 0,id,date,store_nbr,item_nbr,unit_sales,onpromotion
count,125497000.0,125497040,125497000.0,125497000.0,125497000.0,125497040
unique,,1684,,,,2
top,,2017-07-01 00:00:00,,,,False
freq,,118194,,,,96028767
first,,2013-01-01 00:00:00,,,,
last,,2017-08-15 00:00:00,,,,
mean,62748520.0,,27.46458,972769.2,8.554856,
std,36227880.0,,16.33051,520533.6,23.60515,
min,0.0,,1.0,96995.0,-15372.0,
25%,31374260.0,,12.0,522383.0,2.0,


In [9]:
# Read the test set with the same parsing options and dtype map as the training data.
df_test = pd.read_csv(f'{PATH}test.csv', parse_dates=['date'], dtype=types, infer_datetime_format=True)

# Convert onpromotion from the strings 'True'/'False' (NaN where missing) to bool.
# Map first, then fill: a bool False inserted before the mapping is not a key of the
# string-keyed dict, so it would become NaN and end up as True after astype(bool).
df_test['onpromotion'] = (df_test['onpromotion']
                          .map({'False': False, 'True': True})
                          .fillna(False)
                          .astype(bool))

df_test.describe(include='all')

Unnamed: 0,id,date,store_nbr,item_nbr,onpromotion
count,3370464.0,3370464,3370464.0,3370464.0,3370464
unique,,16,,,2
top,,2017-08-27 00:00:00,,,False
freq,,210654,,,3171867
first,,2017-08-16 00:00:00,,,
last,,2017-08-31 00:00:00,,,
mean,127182300.0,,27.5,1244798.0,
std,972969.3,,15.58579,589836.2,
min,125497000.0,,1.0,96995.0,
25%,126339700.0,,14.0,805321.0,


In [10]:
# Inspect the last few rows of the training frame.
df_all.tail()

Unnamed: 0,id,date,store_nbr,item_nbr,unit_sales,onpromotion
125497035,125497035,2017-08-15,54,2089339,4.0,False
125497036,125497036,2017-08-15,54,2106464,1.0,True
125497037,125497037,2017-08-15,54,2110456,192.0,False
125497038,125497038,2017-08-15,54,2113914,198.0,True
125497039,125497039,2017-08-15,54,2116416,2.0,False


In [None]:
# Optional restart point: reload the frame cached under tmp/raw_groceries instead of re-parsing train.csv.
df_all = pd.read_feather(f'{PATH}tmp/raw_groceries')

In [11]:
# Floor unit_sales at zero (the raw column contains negative values), then move to log(1 + x) space.
df_all['unit_sales'] = np.log1p(np.clip(df_all['unit_sales'], a_min=0, a_max=None))

In [12]:
%time
add_datepart(df_all, 'date')

Wall time: 0 ns


In [13]:
def split_vals(a, n):
    """Split `a` at position `n` and return independent copies of both parts.

    Returns a tuple `(a[:n], a[n:])`; each part is produced with `.copy()`,
    so modifying either one does not touch `a`.
    """
    head = a[:n].copy()
    tail = a[n:].copy()
    return head, tail

In [14]:
# Hold out the last len(df_test) rows of df_all as the validation set, so it has
# the same number of rows as the test set; everything before that is training data.
n_valid = len(df_test)
n_trn = len(df_all) - n_valid
train, valid = split_vals(df_all, n_trn)
train.shape, valid.shape

((122126576, 18), (3370464, 18))

In [None]:
# These calls are not needed here because the data has no categorical columns.
# train_cats(raw_train)
# apply_cats(raw_valid, raw_train)

In [16]:
%%time
trn, y, _ = proc_df(train, 'unit_sales')
val, y_val, _ = proc_df(valid, 'unit_sales')

Wall time: 1min 17s


# 3 Models

In [17]:
def rmse(x, y):
    """Return the root-mean-squared difference between `x` and `y`.

    The inputs must support element-wise subtraction and the result must
    provide `.mean()` (e.g. NumPy arrays or pandas Series).
    """
    residual = x - y
    mean_sq = (residual ** 2).mean()
    return math.sqrt(mean_sq)

In [18]:
def print_score(m):
    """Print the training and validation scores of the fitted model `m`.

    Uses the notebook-level `x`, `y` (training) and `val`, `y_val`
    (validation). The printed list is, in order:
    [train RMSE, validation RMSE, m.score on train, m.score on validation],
    followed by `m.oob_score_` when the model exposes that attribute.
    """
    scores = [
        rmse(m.predict(x), y),
        rmse(m.predict(val), y_val),
        m.score(x, y),
        m.score(val, y_val),
    ]
    if hasattr(m, 'oob_score_'):
        scores.append(m.oob_score_)
    print(scores)

In [19]:
# fastai helper. NOTE(review): presumably makes each tree of the forests below train on a
# random sample of 1,000,000 rows rather than the full training set — confirm against fastai.
set_rf_samples(1_000_000)

In [21]:
%%time
# Materialize the training features once as a float32 NumPy array; the models below are
# fit on `x` and print_score predicts/scores on it.
x = np.array(trn, dtype=np.float32)

Wall time: 2min 21s


In [23]:
m = RandomForestRegressor(n_estimators=20, min_samples_leaf=100, n_jobs=-1)
%time m.fit(x, y)

Wall time: 57.7 s


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=100, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [24]:
# Printed order: [train RMSE, valid RMSE, train m.score, valid m.score] (+ oob_score_ if present).
print_score(m)

[0.772626601859738, 0.7666912927363381, 0.23243895711742335, 0.21766121689782525]


In [25]:
# Same forest with smaller leaves (min_samples_leaf=10).
# random_state is pinned so the result is reproducible and differences in score
# come from the hyper-parameter change rather than from random sampling.
m = RandomForestRegressor(n_estimators=20, min_samples_leaf=10, n_jobs=-1, random_state=42)

In [26]:
# Fit the model under IPython's %prun profiler.
%prun m.fit(x, y)

 

In [27]:
# Printed order: [train RMSE, valid RMSE, train m.score, valid m.score] (+ oob_score_ if present).
print_score(m)

[0.7064004305061968, 0.7123515776086922, 0.3583835046328697, 0.3246287346237222]


In [28]:
m = RandomForestRegressor(n_estimators=20, min_samples_leaf=3, n_jobs=-1)
%time m.fit(x, y)
print_score(m)

Wall time: 1min 9s
[0.6873410144449954, 0.7029391755419039, 0.39253937963519164, 0.34235837546079295]
