In [52]:
import numpy as np
import pandas as pd

from fastai.imports import *
from fastai.tabular import *

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

from pathlib import Path

## Config

In [2]:
data_pth = './avocado-prices.zip'

## Data

In [54]:
df_raw = (pd.read_csv(data_pth, low_memory=False, parse_dates=['Date'])
        .drop('Unnamed: 0', axis=1)).sort_values('Date')

In [55]:
df_raw.head().T

Unnamed: 0,11569,9593,10009,1819,9333
Date,2015-01-04 00:00:00,2015-01-04 00:00:00,2015-01-04 00:00:00,2015-01-04 00:00:00,2015-01-04 00:00:00
AveragePrice,1.75,1.49,1.68,1.52,1.64
Total Volume,27365.9,17723.2,2896.72,54956.8,1505.12
4046,9307.34,1189.35,161.68,3013.04,1.27
4225,3844.81,15628.3,206.96,35456.9,1129.5
4770,615.28,0,0,1561.7,0
Total Bags,13598.5,905.55,2528.08,14925.2,374.35
Small Bags,13061.1,905.55,2528.08,11264.8,186.67
Large Bags,537.36,0,0,3660.38,187.68
XLarge Bags,0,0,0,0,0


## Pre-processing

In [56]:
add_datepart(df_raw, 'Date')

Unnamed: 0,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,type,...,Day,Dayofweek,Dayofyear,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start,Is_year_end,Is_year_start,Elapsed
11569,1.75,27365.89,9307.34,3844.81,615.28,13598.46,13061.10,537.36,0.00,organic,...,4,6,4,False,False,False,False,False,False,1420329600
9593,1.49,17723.17,1189.35,15628.27,0.00,905.55,905.55,0.00,0.00,organic,...,4,6,4,False,False,False,False,False,False,1420329600
10009,1.68,2896.72,161.68,206.96,0.00,2528.08,2528.08,0.00,0.00,organic,...,4,6,4,False,False,False,False,False,False,1420329600
1819,1.52,54956.80,3013.04,35456.88,1561.70,14925.18,11264.80,3660.38,0.00,conventional,...,4,6,4,False,False,False,False,False,False,1420329600
9333,1.64,1505.12,1.27,1129.50,0.00,374.35,186.67,187.68,0.00,organic,...,4,6,4,False,False,False,False,False,False,1420329600
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8574,1.36,908202.13,142681.06,463136.28,174975.75,127409.04,103579.41,22467.04,1362.59,conventional,...,25,6,84,False,False,False,False,False,False,1521936000
9018,0.70,9010588.32,3999735.71,966589.50,30130.82,4014132.29,3398569.92,546409.74,69152.63,conventional,...,25,6,84,False,False,False,False,False,False,1521936000
18141,1.42,163496.70,29253.30,5080.04,0.00,129163.36,109052.26,20111.10,0.00,organic,...,25,6,84,False,False,False,False,False,False,1521936000
17673,1.70,190257.38,29644.09,70982.10,0.00,89631.19,89424.11,207.08,0.00,organic,...,25,6,84,False,False,False,False,False,False,1521936000


In [57]:
procs = [Categorify]

In [58]:
valid_idx = range(len(df_raw)-2000, len(df_raw))

In [59]:
dep_var = 'AveragePrice'
cat_names = ['type', 'region']
cont_names = [x for x in df_raw.columns if x not in (cat_names)]

In [60]:
for proc in procs:
    transformation = proc(cat_names, cont_names)
    transformation.apply_train(df_raw)

In [61]:
for cname in cat_names:
    df_raw[cname] = df_raw[cname].cat.codes

In [62]:
df_raw = df_raw.reset_index(drop=True)

In [63]:
df_raw.head()

Unnamed: 0,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,type,...,Day,Dayofweek,Dayofyear,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start,Is_year_end,Is_year_start,Elapsed
0,1.75,27365.89,9307.34,3844.81,615.28,13598.46,13061.1,537.36,0.0,1,...,4,6,4,False,False,False,False,False,False,1420329600
1,1.49,17723.17,1189.35,15628.27,0.0,905.55,905.55,0.0,0.0,1,...,4,6,4,False,False,False,False,False,False,1420329600
2,1.68,2896.72,161.68,206.96,0.0,2528.08,2528.08,0.0,0.0,1,...,4,6,4,False,False,False,False,False,False,1420329600
3,1.52,54956.8,3013.04,35456.88,1561.7,14925.18,11264.8,3660.38,0.0,0,...,4,6,4,False,False,False,False,False,False,1420329600
4,1.64,1505.12,1.27,1129.5,0.0,374.35,186.67,187.68,0.0,1,...,4,6,4,False,False,False,False,False,False,1420329600


In [64]:
tmp_path = Path('./tmp')

In [65]:
tmp_path.mkdir(exist_ok=True)

In [66]:
df_raw.to_feather(tmp_path/'raw')

In [67]:
df_raw = pd.read_feather(tmp_path/'raw')

In [68]:
df_raw

Unnamed: 0,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,type,...,Day,Dayofweek,Dayofyear,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start,Is_year_end,Is_year_start,Elapsed
0,1.75,27365.89,9307.34,3844.81,615.28,13598.46,13061.10,537.36,0.00,1,...,4,6,4,False,False,False,False,False,False,1420329600
1,1.49,17723.17,1189.35,15628.27,0.00,905.55,905.55,0.00,0.00,1,...,4,6,4,False,False,False,False,False,False,1420329600
2,1.68,2896.72,161.68,206.96,0.00,2528.08,2528.08,0.00,0.00,1,...,4,6,4,False,False,False,False,False,False,1420329600
3,1.52,54956.80,3013.04,35456.88,1561.70,14925.18,11264.80,3660.38,0.00,0,...,4,6,4,False,False,False,False,False,False,1420329600
4,1.64,1505.12,1.27,1129.50,0.00,374.35,186.67,187.68,0.00,1,...,4,6,4,False,False,False,False,False,False,1420329600
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18244,1.36,908202.13,142681.06,463136.28,174975.75,127409.04,103579.41,22467.04,1362.59,0,...,25,6,84,False,False,False,False,False,False,1521936000
18245,0.70,9010588.32,3999735.71,966589.50,30130.82,4014132.29,3398569.92,546409.74,69152.63,0,...,25,6,84,False,False,False,False,False,False,1521936000
18246,1.42,163496.70,29253.30,5080.04,0.00,129163.36,109052.26,20111.10,0.00,1,...,25,6,84,False,False,False,False,False,False,1521936000
18247,1.70,190257.38,29644.09,70982.10,0.00,89631.19,89424.11,207.08,0.00,1,...,25,6,84,False,False,False,False,False,False,1521936000


In [69]:
df, y = (df_raw.drop('AveragePrice', axis=1), df_raw['AveragePrice'])

## Modelling

In [70]:
def split_vals(a, n): return a[:n].copy(), a[n:].copy()

n_valid = 2000
n_trn = len(df) - n_valid
raw_train, raw_valid = split_vals(df_raw, n_trn)
X_train, X_valid = split_vals(df, n_trn)
y_train, y_valid = split_vals(y, n_trn)

In [71]:
mdl = RandomForestRegressor(n_jobs=-1)

In [72]:
mdl.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=-1, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [73]:
mdl.score(X_valid, y_valid)

0.3310735115873774

In [80]:
print('Train RMSE: ', np.sqrt(mean_squared_error(mdl.predict(X_train), y_train)))

Train RMSE:  0.044039644168379063


In [82]:
print('Test RMSE: ', np.sqrt(mean_squared_error(mdl.predict(X_valid), y_valid)))

Test RMSE:  0.2528532373927611
