### Course lecture
[1 - Introduction to Random Forests](http://course18.fast.ai/lessonsml1/lesson1.html)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/bogeholm/fastai-intro-to-ml/blob/master/1-intro-to-random-forests.ipynb)

### Install packages

In [1]:
# Uncomment to install utilities: https://github.com/bogeholm/dataworks
# (installs over https; GitHub no longer serves the unauthenticated git:// protocol)
#%pip install --upgrade git+https://github.com/bogeholm/dataworks.git

### Imports

In [2]:
import numpy as np
import pandas as pd
import sys

from dataworks.df_utils import *#inspect_df, summarize_df, add_datefields, add_nan_columns, numeric_nans, categorize_df

from IPython.display import display
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

# Keep rendered DataFrames compact: show at most 10 rows and 10 columns
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 10)

### Set data path

In [3]:
def get_datapath(basepath_local='data/', basepath_colab='My Drive/data/'):
    """ Return path to data directory depending on whether the
        notebook is running in Google Colab or not

        Parameters
        ----------
        basepath_local : str
            Path returned unchanged when not running in Colab.
        basepath_colab : str
            Data directory relative to the Google Drive mount point.

        Returns
        -------
        str
            `basepath_local` outside Colab; inside Colab the absolute path
            '/content/drive/' + `basepath_colab`.

        Side effect: inside Colab, Google Drive is mounted at /content/drive.
    """
    if 'google.colab' not in sys.modules:
        return basepath_local

    # Notebook is running in Google Colab
    from google.colab import drive
    mountpoint = '/content/drive'
    drive.mount(mountpoint)
    # Build the result from the mount point itself, so the path stays valid
    # even if the working directory is no longer /content.
    return mountpoint + '/' + basepath_colab

In [4]:
# The bulldozers data set lives in its own sub-directory of the data path
PATH = f"{get_datapath()}bulldozers/"
print(PATH)

data/bulldozers/


In [5]:
# Load the training set; 'saledate' is parsed into datetimes while reading
df_raw = pd.read_csv(
    f'{PATH}Train.zip',
    parse_dates=['saledate'],
    low_memory=False,
)

### Utility functions

In [6]:
def display_allrows(df):
    """ Display df in full: the 'display.max_rows' option is raised to the
        number of rows in df for the duration of the call only
    """
    n_rows = len(df)
    with pd.option_context('display.max_rows', n_rows):
        display(df)

### Inspect the data

In [7]:
# Preview the first rows of the raw data
df_raw.head()

Unnamed: 0,SalesID,SalePrice,MachineID,ModelID,datasource,...,Backhoe_Mounting,Blade_Type,Travel_Controls,Differential_Type,Steering_Controls
0,1139246,66000,999089,3157,121,...,,,,Standard,Conventional
1,1139248,57000,117657,77,121,...,,,,Standard,Conventional
2,1139249,10000,434808,7009,121,...,,,,,
3,1139251,38500,1026470,332,121,...,,,,,
4,1139253,11000,1057373,17311,121,...,,,,,


In [8]:
# Result of dataworks' inspect_df is kept in a variable; its display is disabled below.
# NOTE(review): the name `inspect` would shadow the stdlib module of the same
# name if that module is ever imported in this notebook.
inspect = inspect_df(df_raw)
#display_allrows(inspect)

In [9]:
# Per-dtype summary of the raw data (see the 'type' column of the output),
# shown without row truncation
summary = summarize_df(df_raw)
display_allrows(summary)

Unnamed: 0,type,ncols,ncols_w_nans,n_nans,n_total,nan_frac
0,datetime64[ns],1,0,0,401125,0.0
1,float64,2,0,0,802250,0.0
2,int64,6,0,0,2406750,0.0
3,object,44,22,6002766,17649500,0.34


### Add log price

In [10]:
# Start processing from an independent copy so df_raw is never modified
df_proc = df_raw.copy(deep=True)

# Replace the raw price by its natural logarithm (skipped if the column is absent)
has_price = 'SalePrice' in df_proc.columns
if has_price:
    log_price = np.log(df_proc['SalePrice'])
    df_proc['LogSalePrice'] = log_price
    df_proc = df_proc.drop(columns=['SalePrice'])

In [11]:
# Check that 'LogSalePrice' is now present (last column of the output)
df_proc.head()

Unnamed: 0,SalesID,MachineID,ModelID,datasource,auctioneerID,...,Blade_Type,Travel_Controls,Differential_Type,Steering_Controls,LogSalePrice
0,1139246,999089,3157,121,3.0,...,,,Standard,Conventional,11.09741
1,1139248,117657,77,121,3.0,...,,,Standard,Conventional,10.950807
2,1139249,434808,7009,121,3.0,...,,,,,9.21034
3,1139251,1026470,332,121,3.0,...,,,,,10.558414
4,1139253,1057373,17311,121,3.0,...,,,,,9.305651


### Extract date properties
See [Attributes of Pandas Timestamp](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Timestamp.html)

In [12]:
# Expand 'saledate' into the 'saledate_*' columns seen in the next output.
# The guard presumably makes re-running safe, since drop_original=True should
# remove 'saledate' after the first run — TODO confirm against dataworks.
if 'saledate' in df_proc.columns:
    df_proc = add_datefields(df_proc, 'saledate', drop_original=True)

In [13]:
# Check the new 'saledate_*' columns
df_proc.head()

Unnamed: 0,SalesID,MachineID,ModelID,datasource,auctioneerID,...,saledate_is_month_start,saledate_is_quarter_end,saledate_is_quarter_start,saledate_quarter,saledate_week
0,1139246,999089,3157,121,3.0,...,False,False,False,4,46
1,1139248,117657,77,121,3.0,...,False,False,False,1,13
2,1139249,434808,7009,121,3.0,...,False,False,False,1,9
3,1139251,1026470,332,121,3.0,...,False,False,False,2,20
4,1139253,1057373,17311,121,3.0,...,False,False,False,3,30


### Add NaN column
Here we add boolean columns indicating whether the corresponding original column contained a NaN

In [14]:
# Adds the '<column>_isnull' flags visible in the output below.
# NOTE(review): this cell has no guard and reassigns df_proc from itself;
# confirm add_nan_columns is idempotent before re-running it on the same kernel.
df_proc = add_nan_columns(df_proc)
df_proc.head()

Unnamed: 0,SalesID,MachineID,ModelID,datasource,auctioneerID,...,Backhoe_Mounting_isnull,Blade_Type_isnull,Travel_Controls_isnull,Differential_Type_isnull,Steering_Controls_isnull
0,1139246,999089,3157,121,3.0,...,True,True,True,False,False
1,1139248,117657,77,121,3.0,...,True,True,True,False,False
2,1139249,434808,7009,121,3.0,...,True,True,True,True,True
3,1139251,1026470,332,121,3.0,...,True,True,True,True,True
4,1139253,1057373,17311,121,3.0,...,True,True,True,True,True


### Handle numerical types containing NaN

In [15]:
# Numeric columns that contain NaNs.
# NOTE(review): computed on df_raw rather than df_proc — presumably equivalent
# at this point because no NaNs have been filled yet; confirm.
stats = numeric_nans(df_raw)
stats

Unnamed: 0,column,null_fraction,nulls,type,is_numeric,num_uniques,uniques
0,auctioneerID,0.05,20136,float64,True,30,"[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, ..."
1,MachineHoursCurrentMeter,0.644,258360,float64,True,15152,"[0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, ..."


In [16]:
# auctioneerID: missing values get a fresh ID, one above the largest existing ID
auctioneer_fill = df_proc['auctioneerID'].max() + 1
df_proc['auctioneerID'] = df_proc['auctioneerID'].fillna(auctioneer_fill)

# MachineHoursCurrentMeter: missing values get the median of the column
hours_fill = df_proc['MachineHoursCurrentMeter'].median()
df_proc['MachineHoursCurrentMeter'] = df_proc['MachineHoursCurrentMeter'].fillna(hours_fill)

In [17]:
# Re-check after filling: the empty table below means no numeric column is reported with NaNs
numeric_nans(df_proc)

Unnamed: 0,column,null_fraction,nulls,type,is_numeric,num_uniques,uniques


### Add categories
- http://benalexkeen.com/mapping-categorical-data-in-pandas/
- https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html

In [18]:
# categorize_df returns two frames: df_cats (with '<column>_category' integer
# columns, see df_cats.head()) and catcodes (one row per categorized column,
# listing its categories and codes, see catcodes.head())
(df_cats, catcodes) = categorize_df(df_proc)

In [19]:
# Check the new '<column>_category' columns
df_cats.head()

Unnamed: 0,SalesID,MachineID,ModelID,datasource,auctioneerID,...,Backhoe_Mounting_category,Blade_Type_category,Travel_Controls_category,Differential_Type_category,Steering_Controls_category
0,1139246,999089,3157,121,3.0,...,0,0,0,4,2
1,1139248,117657,77,121,3.0,...,0,0,0,4,2
2,1139249,434808,7009,121,3.0,...,0,0,0,0,0
3,1139251,1026470,332,121,3.0,...,0,0,0,0,0
4,1139253,1057373,17311,121,3.0,...,0,0,0,0,0


In [20]:
# One row per categorized column: number of categories, the categories, and the codes
catcodes.head()

Unnamed: 0,column,n_categories,categories,codes
0,UsageBand,4,"Index(['High', 'Low', 'Medium'], dtype='object')",0 2 1 2 2 1 3 ...
1,fiModelDesc,4999,"Index(['100C', '104', '1066', '1066E', '1080',...",0 950 1 1725 2 331 3...
2,fiBaseModel,1950,"Index(['10', '100', '104', '1066', '1080', '10...",0 296 1 527 2 110 3...
3,fiSecondaryDesc,176,"Index([' MSR SPIN ACE', '#NAME?', '-2', '-3', ...",0 41 1 55 2 0 3 ...
4,fiModelSeries,123,"Index([' III', '#NAME?', '-1', '-1.50E+01', '-...",0 0 1 98 2 0 3 ...


### Inspect results of data processing

In [21]:
# Summary of the raw data again, for comparison with the processed frame in the next cell
summarize_df(df_raw)

Unnamed: 0,type,ncols,ncols_w_nans,n_nans,n_total,nan_frac
0,datetime64[ns],1,0,0,401125,0.0
1,float64,2,0,0,802250,0.0
2,int64,6,0,0,2406750,0.0
3,object,44,22,6002766,17649500,0.34


In [22]:
# Summary after processing: per the output, no object columns and no NaNs remain
summarize_df(df_cats)

Unnamed: 0,type,ncols,ncols_w_nans,n_nans,n_total,nan_frac
0,bool,44,0,0,17649500,0.0
1,float64,3,0,0,1203375,0.0
2,int16,4,0,0,1604500,0.0
3,int64,9,0,0,3610125,0.0
4,int8,40,0,0,16045000,0.0


### Split into train and test

In [24]:
# Keep both dimensions; only the row count is reported here
nrows, ncols = df_cats.shape
print(f'Data rows: {nrows}')

Data rows: 401125


In [25]:
# Number of rows held out for testing
n_test = 12000
assert 0 < n_test < len(df_cats), 'n_test must leave rows for both splits'

# Seed the global NumPy RNG so the split is reproducible
np.random.seed(seed=23)

# Shuffle row *positions* (0..n-1) rather than index labels: the split is
# applied with positional .iloc, and labels only coincide with positions
# when the frame has a default 0..n-1 index.
ind_all = np.arange(len(df_cats))
ind_rand = np.random.permutation(ind_all)

# Sorting keeps each subset in the original row order
ind_test = np.sort(ind_rand[:n_test])
ind_train = np.sort(ind_rand[n_test:])

In [26]:
def split_vars(df: pd.DataFrame, colname: str) -> "tuple[pd.DataFrame, np.ndarray]":
    """ Return a copy of DataFrame df minus one column, and the column as values

        Parameters
        ----------
        df : pd.DataFrame
            Input frame; it is not modified.
        colname : str
            Name of the column to split off (raises KeyError if it is missing).

        Returns
        -------
        (pd.DataFrame, np.ndarray)
            An independent copy of df without `colname`, and the values of
            `colname` as a new NumPy array.
    """
    # np.array copies, so the returned values never alias df's data
    y = np.array(df[colname].values)
    # Explicit deep copy: callers may modify the result without touching df
    resdf = df.drop(columns=[colname]).copy(deep=True)
    return resdf, y

In [27]:
# .iloc selects rows by position, so ind_train / ind_test must hold row
# positions (not index labels). 'LogSalePrice' is split off as the target.
df_train, y_train = split_vars(df_cats.iloc[ind_train], 'LogSalePrice')
df_test, y_test = split_vars(df_cats.iloc[ind_test], 'LogSalePrice')

In [30]:
# Report the shape of the features and of the target for both splits
for label, part in (
    ('Training data:', df_train),
    ('Training target:', y_train),
    ('Test data:', df_test),
    ('Test target:', y_test),
):
    print(label, part.shape)

Training data: (389125, 99)
Training target: (389125,)
Test data: (12000, 99)
Test target: (12000,)


### Fit model

In [67]:
# Random forest with default hyper-parameters; per the scikit-learn docs,
# n_jobs=-1 runs on all available processors.
# NOTE(review): no random_state is passed — reproducibility of the fit then
# presumably relies on the global NumPy seed set for the split; confirm.
model = RandomForestRegressor(n_jobs=-1)

In [85]:
#model.fit(df_train, y_train)

In [86]:
#model.score(df_train, y_train)

In [87]:
def rmse(x, y):
    """ Root-mean-square error: the square root of the mean of the
        squared element-wise differences between x and y
    """
    squared_error = (x - y) ** 2
    return np.sqrt(squared_error.mean())