<a href="https://colab.research.google.com/github/bogeholm/fastai-intro-to-ml/blob/master/1-intro-to-random-forests.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Course lecture
[1 - Introduction to Random Forests](http://course18.fast.ai/lessonsml1/lesson1.html)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/bogeholm/fastai-intro-to-ml/blob/master/1-intro-to-random-forests.ipynb)

### Install packages

In [37]:
# Uncomment to install utilities: https://github.com/bogeholm/dataworks
!pip install --quiet --upgrade git+git://github.com/bogeholm/dataworks.git

  Building wheel for dataworks (setup.py) ... [?25l[?25hdone


### Imports

In [0]:
import joblib
import numpy as np
import os
import pandas as pd
import sys

from dataworks.df_utils import *#inspect_df, summarize_df, add_datefields, add_nan_columns, numeric_nans, categorize_df

from IPython.display import display
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor

pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 10)

### Set data path

In [0]:
def get_basepath(basepath_local='', basepath_colab='drive/My Drive/'):
    """ Return path to base directory depending on whether the
        notebook is running in Google Colab or not
    """
    if 'google.colab' in sys.modules:
        # Notebook is running in Google Colab
        from google.colab import drive
        drive.mount('/content/drive')
        return basepath_colab
    else:
        return basepath_local

In [40]:
DATAPATH = get_basepath() + 'data/bulldozers/'
MODELPATH = get_basepath() + 'models/bulldozers/'
print(f'DATAPATH: {DATAPATH}')
print(f'MODELPATH: {MODELPATH}')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
DATAPATH: drive/My Drive/data/bulldozers/
MODELPATH: drive/My Drive/models/bulldozers/


In [0]:
df_raw = pd.read_csv(f'{DATAPATH}Train.zip', low_memory=False, parse_dates=['saledate'])

### Utility functions

In [0]:
def display_allrows(df):
    """ Override max rows and display them all
    """
    with pd.option_context('display.max_rows', len(df)):
            display(df)

### Inspect the data

In [43]:
df_raw.head()

Unnamed: 0,SalesID,SalePrice,MachineID,ModelID,datasource,...,Backhoe_Mounting,Blade_Type,Travel_Controls,Differential_Type,Steering_Controls
0,1139246,66000,999089,3157,121,...,,,,Standard,Conventional
1,1139248,57000,117657,77,121,...,,,,Standard,Conventional
2,1139249,10000,434808,7009,121,...,,,,,
3,1139251,38500,1026470,332,121,...,,,,,
4,1139253,11000,1057373,17311,121,...,,,,,


In [44]:
inspect = inspect_df(df_raw)
# Uncomment if you want to see statistics for all columns
display_allrows(inspect)

Unnamed: 0,column,null_fraction,nulls,type,is_numeric
0,SalesID,0.0,0,int64,True
1,state,0.0,0,object,False
2,fiProductClassDesc,0.0,0,object,False
3,fiBaseModel,0.0,0,object,False
4,fiModelDesc,0.0,0,object,False
5,ProductGroup,0.0,0,object,False
6,saledate,0.0,0,datetime64[ns],False
7,datasource,0.0,0,int64,True
8,ModelID,0.0,0,int64,True
9,MachineID,0.0,0,int64,True


In [45]:
summary = summarize_df(df_raw)
display_allrows(summary)

Unnamed: 0,type,ncols,ncols_w_nans,n_nans,n_total,nan_frac
0,datetime64[ns],1,0,0,401125,0.0
1,float64,2,0,0,802250,0.0
2,int64,6,0,0,2406750,0.0
3,object,44,22,6002766,17649500,0.34


### Add log price

In [0]:
df_proc = df_raw.copy(deep=True)
if 'SalePrice' in df_proc.columns:
    df_proc['LogSalePrice'] = np.log(df_proc['SalePrice'])
    df_proc.drop(columns=['SalePrice'], inplace=True)

In [47]:
df_proc.head()

Unnamed: 0,SalesID,MachineID,ModelID,datasource,auctioneerID,...,Blade_Type,Travel_Controls,Differential_Type,Steering_Controls,LogSalePrice
0,1139246,999089,3157,121,3.0,...,,,Standard,Conventional,11.09741
1,1139248,117657,77,121,3.0,...,,,Standard,Conventional,10.950807
2,1139249,434808,7009,121,3.0,...,,,,,9.21034
3,1139251,1026470,332,121,3.0,...,,,,,10.558414
4,1139253,1057373,17311,121,3.0,...,,,,,9.305651


### Extract date properties
See [Attributes of Pandas Timestamp](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Timestamp.html)

In [0]:
if 'saledate' in df_proc.columns:
    df_proc = add_datefields(df_proc, 'saledate', drop_original=True)

In [49]:
df_proc.head()

Unnamed: 0,SalesID,MachineID,ModelID,datasource,auctioneerID,...,saledate_is_month_start,saledate_is_quarter_end,saledate_is_quarter_start,saledate_quarter,saledate_week
0,1139246,999089,3157,121,3.0,...,False,False,False,4,46
1,1139248,117657,77,121,3.0,...,False,False,False,1,13
2,1139249,434808,7009,121,3.0,...,False,False,False,1,9
3,1139251,1026470,332,121,3.0,...,False,False,False,2,20
4,1139253,1057373,17311,121,3.0,...,False,False,False,3,30


### Add NaN indicator column
Here we add a columns indicating whether the original column contained a NaN

In [50]:
df_proc = add_nan_columns(df_proc)
df_proc.head()

Unnamed: 0,SalesID,MachineID,ModelID,datasource,auctioneerID,...,Backhoe_Mounting_isnull,Blade_Type_isnull,Travel_Controls_isnull,Differential_Type_isnull,Steering_Controls_isnull
0,1139246,999089,3157,121,3.0,...,True,True,True,False,False
1,1139248,117657,77,121,3.0,...,True,True,True,False,False
2,1139249,434808,7009,121,3.0,...,True,True,True,True,True
3,1139251,1026470,332,121,3.0,...,True,True,True,True,True
4,1139253,1057373,17311,121,3.0,...,True,True,True,True,True


### Handle numerical types containing NaN

In [51]:
stats = numeric_nans(df_raw)
stats

Unnamed: 0,column,null_fraction,nulls,type,is_numeric,num_uniques,uniques
0,auctioneerID,0.05,20136,float64,True,30,"[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, ..."
1,MachineHoursCurrentMeter,0.644,258360,float64,True,15152,"[0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, ..."


It seems that an appropriate value for unknown `auctioneerID` would be the _next_ value, whereas we will make a reasonable  choice and substitute unknown`MachineHoursCurrentMeter` with the median.

In [0]:
df_proc['auctioneerID'] = df_proc['auctioneerID'].fillna(
    df_proc['auctioneerID'].max() + 1
)

df_proc['MachineHoursCurrentMeter'] = df_proc['MachineHoursCurrentMeter'].fillna(
    df_proc['MachineHoursCurrentMeter'].median()
)

In [53]:
numeric_nans(df_proc)

Unnamed: 0,column,null_fraction,nulls,type,is_numeric,num_uniques,uniques


### Add categories
- http://benalexkeen.com/mapping-categorical-data-in-pandas/
- https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html

In [0]:
(df_cats, catcodes) = categorize_df(df_proc)

In [55]:
df_cats.head()

Unnamed: 0,SalesID,MachineID,ModelID,datasource,auctioneerID,...,Backhoe_Mounting_category,Blade_Type_category,Travel_Controls_category,Differential_Type_category,Steering_Controls_category
0,1139246,999089,3157,121,3.0,...,0,0,0,4,2
1,1139248,117657,77,121,3.0,...,0,0,0,4,2
2,1139249,434808,7009,121,3.0,...,0,0,0,0,0
3,1139251,1026470,332,121,3.0,...,0,0,0,0,0
4,1139253,1057373,17311,121,3.0,...,0,0,0,0,0


In [56]:
catcodes.head()

Unnamed: 0,column,n_categories,categories,codes
0,UsageBand,4,"Index(['High', 'Low', 'Medium'], dtype='object')",0 2 1 2 2 1 3 ...
1,fiModelDesc,4999,"Index(['100C', '104', '1066', '1066E', '1080',...",0 950 1 1725 2 331 3...
2,fiBaseModel,1950,"Index(['10', '100', '104', '1066', '1080', '10...",0 296 1 527 2 110 3...
3,fiSecondaryDesc,176,"Index([' MSR SPIN ACE', '#NAME?', '-2', '-3', ...",0 41 1 55 2 0 3 ...
4,fiModelSeries,123,"Index([' III', '#NAME?', '-1', '-1.50E+01', '-...",0 0 1 98 2 0 3 ...


### Inspect results of data processing

In [57]:
summarize_df(df_raw)

Unnamed: 0,type,ncols,ncols_w_nans,n_nans,n_total,nan_frac
0,datetime64[ns],1,0,0,401125,0.0
1,float64,2,0,0,802250,0.0
2,int64,6,0,0,2406750,0.0
3,object,44,22,6002766,17649500,0.34


In [58]:
summarize_df(df_cats)

Unnamed: 0,type,ncols,ncols_w_nans,n_nans,n_total,nan_frac
0,bool,44,0,0,17649500,0.0
1,float64,3,0,0,1203375,0.0
2,int16,4,0,0,1604500,0.0
3,int64,9,0,0,3610125,0.0
4,int8,40,0,0,16045000,0.0


### Split into train and test
Instead of doing it as in the course video where the first 12,000 rows are chosen as test or validation data, we will pick 12,000 rows at random

In [59]:
(nrows, ncols) = df_cats.shape
print('Data rows: {}'.format(nrows))

Data rows: 401125


In [0]:
n_test = 12000

np.random.seed(seed=23)
ind_all = df_cats.index.values
ind_rand = np.random.permutation(ind_all)

ind_test = np.sort(ind_rand[:n_test])
ind_train = np.sort(ind_rand[n_test:])

In [0]:
def split_vars(df: pd.DataFrame, colname: str) -> (pd.DataFrame, np.array):
    """ Return a copy of DataFrame df minus one column, and the column as values
    """
    resdf = df.copy(deep=True)
    y = np.array(resdf[colname].values)
    resdf.drop(columns=[colname], inplace=True)
    return resdf, y

In [0]:
df_train, y_train = split_vars(df_cats.iloc[ind_train], 'LogSalePrice')
df_test, y_test = split_vars(df_cats.iloc[ind_test], 'LogSalePrice')

In [63]:
print('Training data:', df_train.shape)
print('Training target:', y_train.shape)

print('Test data:', df_test.shape)
print('Test target:', y_test.shape)

Training data: (389125, 99)
Training target: (389125,)
Test data: (12000, 99)
Test target: (12000,)


### Fit model

In [0]:
model = RandomForestRegressor(n_jobs=-1)

In [65]:
%time model.fit(df_train, y_train)

CPU times: user 18min 3s, sys: 3.64 s, total: 18min 7s
Wall time: 9min 7s


RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=-1, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [0]:
# At a first glance, it makes sense to save the model. However, this takes up a
# whopping ~2.5 GB of disk space!
#os.makedirs(MODELPATH, exist_ok=True)
#joblib.dump(model, MODELPATH + 'bulldozers-ranforest.joblib')

In [0]:
def rmse(x, y):
    """ Root mean square error
    """
    return np.sqrt(((x-y)**2).mean())

In [68]:
# 0.0902435 in lecture
rmse(model.predict(df_train), y_train)

0.09495213705838801

In [69]:
# 0.250792 in lecture
rmse(model.predict(df_test), y_test)

0.26036101606827916

In [70]:
# 0.982957 in lecture
model.score(df_train, y_train)

0.9812593868688915

In [71]:
# 0.887675 in lecture
model.score(df_test, y_test)

0.859284897263788