In [2]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

In [3]:
# Configuration shared by the cells below
data_file_path = "./data/home-data-for-ml-course/train.csv"  # CSV read into the working DataFrame
random_state = 0  # seed passed to both splits and to every random forest
test_size = 0.2  # test_size argument of the first train_test_split call (taken from the full data)
val_size = 0.2  # test_size argument of the second split, applied to the rows left after the first

In [4]:
# Load data
# Read the CSV at data_file_path into `df`, the DataFrame every later cell works from.
df = pd.read_csv(data_file_path)

In [5]:
# Basic EDA
# Left as the cell's last expression so the notebook renders the summary table
# (count, mean, std, min, quartiles, max per column).
df.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [6]:
# Report which dtypes occur among the columns of df
print("Unique data types:")
distinct_dtypes = np.unique(df.dtypes.values)
print(list(map(str, distinct_dtypes)))

# Columns whose dtype is not numeric
df_obj = df.select_dtypes(exclude=["number"])

# Columns whose dtype is numeric
df_num = df.select_dtypes(include=["number"])

Unique data types:
['int64', 'float64', 'object']


In [7]:
# Columns used as model inputs
features = [
    'LotArea',
    'YearBuilt',
    '1stFlrSF',
    '2ndFlrSF',
    'FullBath',
    'BedroomAbvGr',
    'TotRmsAbvGrd',
]

# Prediction target
y = df["SalePrice"]

# Feature matrix restricted to the chosen columns; head() previews the first rows
X = df[features]
X.head()

Unnamed: 0,LotArea,YearBuilt,1stFlrSF,2ndFlrSF,FullBath,BedroomAbvGr,TotRmsAbvGrd
0,8450,2003,856,854,2,3,8
1,9600,1976,1262,0,2,3,6
2,11250,2001,920,866,2,3,6
3,9550,1915,961,756,1,3,7
4,14260,2000,1145,1053,2,4,9


In [8]:
# Splitting
# Step 1: hold out the test set from the full data.
rest_X, test_X, rest_y, test_y = train_test_split(
    X, y, test_size=test_size, random_state=random_state
)
# Step 2: carve the validation set out of the remaining rows, so val_size is a
# fraction of what is left after step 1, not of the full data.
train_X, val_X, train_y, val_y = train_test_split(
    rest_X, rest_y, test_size=val_size, random_state=random_state
)

# [Handle Missing Values](./missing_values.ipynb)

# [Use Categorical Variables](./categorical_variables.ipynb)

In [9]:
# Baseline: random forest with default settings, scored on the validation split
mdl = RandomForestRegressor(random_state=random_state).fit(train_X, train_y)
val_pred = mdl.predict(val_X)
mae = mean_absolute_error(val_y, val_pred)
print(mae)

22154.19285856736


In [None]:
# Hyperparameter tuning
# Candidate forest sizes are the powers of two 2**0 .. 2**14 (1 up to 16384 trees).
n_trees_search = 2 ** np.arange(15)
maes = []  # validation MAE per candidate, in the same order as n_trees_search
for n_trees in n_trees_search:
    print(f"Training for n_trees = {n_trees}")
    mdl = RandomForestRegressor(
        n_estimators=n_trees, random_state=random_state
    ).fit(train_X, train_y)
    val_pred = mdl.predict(val_X)
    mae = mean_absolute_error(val_y, val_pred)
    maes.append(mae)

Training for n_trees = 1
Training for n_trees = 2
Training for n_trees = 4
Training for n_trees = 8
Training for n_trees = 16
Training for n_trees = 32
Training for n_trees = 64
Training for n_trees = 128
Training for n_trees = 256
Training for n_trees = 512
Training for n_trees = 1024
Training for n_trees = 2048
Training for n_trees = 4096
Training for n_trees = 8192
Training for n_trees = 16384


In [None]:
# Hyperparameter selection
# Show every candidate next to its validation MAE.
for candidate, score in zip(n_trees_search, maes):
    print(f"n_tree = {candidate}, mae = {score}")

# Keep the candidate with the lowest validation MAE (first one wins on ties).
mae_min = min(maes)
best_idx = maes.index(mae_min)
n_trees = n_trees_search[best_idx]
print(f"best = {n_trees}, mae = {mae_min}")

In [None]:
# Refit with the selected n_trees on the training and validation rows combined
train_X2 = pd.concat([train_X, val_X])
train_y2 = pd.concat([train_y, val_y])
mdl = RandomForestRegressor(random_state=random_state, n_estimators=n_trees)
# fit() stays as the last expression so the cell still displays the fitted estimator
mdl.fit(train_X2, train_y2)

In [None]:
# Score the refitted model on the held-out test split, which was not used for fitting
test_pred = mdl.predict(test_X)
mae = mean_absolute_error(test_y, test_pred)
print(mae)