In [4]:
# Import libraries
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

In [5]:
# Constants
random_state = 0  # shared seed for both data splits and every random forest
data_file_path = "./data/train.csv"  # training CSV, given as a relative path
val_size = 0.2  # fraction of the remaining (non-test) rows held out for validation
test_size = 0.2  # fraction of all rows held out as the final test set

In [6]:
# Load data
# Read the CSV at `data_file_path` into the DataFrame every later cell works from.
df = pd.read_csv(filepath_or_buffer=data_file_path)

In [7]:
# Basic EDA
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# A `count` below the total number of rows means that column has missing values.
df.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [68]:
# Distinct dtypes that occur across the columns
print("Unique data types:")
print(list(map(str, np.unique(df.dtypes.values))))

# Split the frame by dtype: first the non-numeric columns ...
df_obj = df.select_dtypes(exclude=["number"])

# ... then the numeric ones
df_num = df.select_dtypes(include=["number"])

Unique data types:
['int64', 'float64', 'object']


In [70]:
# Columns with missing data
print(df.shape)
# Per-column count of missing entries; only columns with at least one are shown.
missing_count_per_column = df.isna().sum()
has_missing = missing_count_per_column > 0
print(missing_count_per_column[has_missing])

(1460, 81)
LotFrontage      259
Alley           1369
MasVnrType       872
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64


In [71]:
# Dtypes of the columns that contain at least one missing value
is_incomplete = missing_count_per_column.gt(0)
columns_with_missing_values = missing_count_per_column.index[is_incomplete.to_numpy()]
print(df.dtypes.loc[columns_with_missing_values])

LotFrontage     float64
Alley            object
MasVnrType       object
MasVnrArea      float64
BsmtQual         object
BsmtCond         object
BsmtExposure     object
BsmtFinType1     object
BsmtFinType2     object
Electrical       object
FireplaceQu      object
GarageType       object
GarageYrBlt     float64
GarageFinish     object
GarageQual       object
GarageCond       object
PoolQC           object
Fence            object
MiscFeature      object
dtype: object


In [1]:
# Target and features
y = df["SalePrice"]

# Numeric columns without any missing entries
features_no_missing = [
    'LotArea',
    'YearBuilt',
    '1stFlrSF',
    '2ndFlrSF',
    'FullBath',
    'BedroomAbvGr',
    'TotRmsAbvGrd',
]
# Numeric columns that do contain missing entries
features_with_missing = ['LotFrontage', 'MasVnrArea', 'GarageYrBlt']
features = [*features_no_missing, *features_with_missing]

# Restrict the frame to the chosen feature columns and preview the first rows
X = df[features]
X.head()

NameError: name 'df' is not defined

In [73]:
# Splitting
# Stage 1: hold out `test_size` of all rows as the final test set.
rest_X, test_X, rest_y, test_y = train_test_split(
    X, y, test_size=test_size, random_state=random_state
)
# Stage 2: carve the validation set out of what is left, so `val_size` is a
# fraction of the non-test rows rather than of the full data set.
train_X, val_X, train_y, val_y = train_test_split(
    rest_X, rest_y, test_size=val_size, random_state=random_state
)

In [74]:
# Number of missing values per column (training split only)
print(train_X.shape)
missing_value_count_by_column = train_X.isna().sum()
# From here on `columns_with_missing_values` is a Series of counts keyed by column
# name; later cells read the column names from its `.index`.
columns_with_missing_values = missing_value_count_by_column.loc[lambda counts: counts > 0]
print(columns_with_missing_values)

(934, 10)
LotFrontage    162
MasVnrArea       5
GarageYrBlt     46
dtype: int64


In [75]:
# Handle missing values
# Option 1: Drop every column that has a missing value in the training split.
# Drawback: this also discards all the entries of those columns that are present.
incomplete_columns = columns_with_missing_values.index
missing_handled_train_X = train_X.drop(columns=incomplete_columns)
missing_handled_valid_X = val_X.drop(columns=incomplete_columns)

In [76]:
# Compare the column sets before and after dropping the incomplete columns
print(train_X.columns, missing_handled_train_X.columns, sep="\n")

Index(['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath',
       'BedroomAbvGr', 'TotRmsAbvGrd', 'LotFrontage', 'MasVnrArea',
       'GarageYrBlt'],
      dtype='object')
Index(['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath',
       'BedroomAbvGr', 'TotRmsAbvGrd'],
      dtype='object')


In [77]:
# Option 2: Keep every column and replace each missing value with the mean of that
# column (SimpleImputer's default strategy). Other methods exist, such as filling in
# with 0 or with the mode.
# The imputer is fitted on the training split only and then reused for the validation
# split, so no validation statistics leak into the fill values.
my_imputer = SimpleImputer()
# fit_transform/transform return plain arrays by default; pass the original index back
# in so the rows stay aligned with train_y / val_y, which keep the index from the split.
mean_imputed_train_X = pd.DataFrame(
    my_imputer.fit_transform(train_X), columns=train_X.columns, index=train_X.index
)
mean_imputed_val_X = pd.DataFrame(
    my_imputer.transform(val_X), columns=val_X.columns, index=val_X.index
)

In [78]:
# Note the difference in row counts of missing values
print("Number of rows in original data: " +
      str(train_X.shape[0]))
print("Number of rows with missing entries in original data: "
      + str(train_X[columns_with_missing_values.index].isnull().any(axis=1).sum()))
print("Number of rows in imputed data: " +
      str(mean_imputed_train_X.shape[0]))
print("Number of rows with missing entries in inputed data: "
      + str(mean_imputed_train_X[columns_with_missing_values.index].isnull().any(axis=1).sum()))

Number of rows in original data: 934
Number of rows with missing entries in original data: 209
Number of rows in imputed data: 934
Number of rows with missing entries in inputed data: 0


In [79]:
# Option 3: Impute data while keeping track of entries that were imputed by creating a new column.
def track_missing_entries_and_impute(dfin, columns=None):
    """Return a copy of ``dfin`` with a boolean ``<column>_missing`` flag per tracked column.

    Despite the name, this function does not impute anything: it only records
    where values are missing. The caller is expected to run an imputer on the
    returned frame afterwards.

    Parameters
    ----------
    dfin : pandas.DataFrame
        Frame to annotate. It is not modified; a copy is returned.
    columns : iterable of str, optional
        Columns to create flags for. When omitted, falls back to the notebook
        global ``columns_with_missing_values.index`` so existing calls keep
        producing the same set of flag columns for every split.

    Returns
    -------
    pandas.DataFrame
        Copy of ``dfin`` with one extra ``<column>_missing`` column per tracked
        column, True where the original value is null.
    """
    if columns is None:
        # Default to the shared global rather than inspecting `dfin` itself, so
        # every split gets the same flag columns.
        columns = columns_with_missing_values.index
    flagged = dfin.copy()
    for column in columns:
        flagged[column + "_missing"] = flagged[column].isnull()
    return flagged

In [80]:
# Add the "<column>_missing" indicator columns first, then impute with a fresh
# SimpleImputer fitted on the training split only.
tracked_missing_train_X = track_missing_entries_and_impute(train_X)
tracked_missing_val_X = track_missing_entries_and_impute(val_X)
my_imputer = SimpleImputer()
# Pass the original index back in: the imputer output carries no index by default, and
# a fresh RangeIndex would no longer line up with train_y / val_y.
imputed_tracked_missing_train_X = pd.DataFrame(
    my_imputer.fit_transform(tracked_missing_train_X),
    columns=tracked_missing_train_X.columns,
    index=tracked_missing_train_X.index,
)
imputed_tracked_missing_val_X = pd.DataFrame(
    my_imputer.transform(tracked_missing_val_X),
    columns=tracked_missing_val_X.columns,
    index=tracked_missing_val_X.index,
)

In [99]:
def score(train_X, val_X, train_y, val_y):
    """Fit a random forest on the training data and evaluate it on the validation data.

    The forest uses the notebook-level ``random_state`` as its seed and otherwise
    the estimator's default settings.

    Returns
    -------
    tuple
        ``(mae, model)``: the mean absolute error of the predictions on ``val_X``
        against ``val_y``, and the fitted ``RandomForestRegressor``.
    """
    forest = RandomForestRegressor(random_state=random_state)
    forest.fit(train_X, train_y)
    val_predictions = forest.predict(val_X)
    return mean_absolute_error(val_y, val_predictions), forest

# Validation MAE for three inputs: the features as split, the mean-imputed
# features, and the imputed features with the extra "_missing" indicator columns.
# Only the MAE (first element of score()'s result) is kept.
mae_regular = score(train_X, val_X, train_y, val_y)[0]
mae_imputed = score(
    mean_imputed_train_X, mean_imputed_val_X, train_y, val_y
)[0]
mae_missing_tracked = score(
    imputed_tracked_missing_train_X, imputed_tracked_missing_val_X, train_y, val_y
)[0]

print(f"mae_regular = {mae_regular}")
print(f"mae_imputed = {mae_imputed}")
print(f"mae_missing_tracked = {mae_missing_tracked}")

mae_regular = 22075.356076516076
mae_imputed = 22105.887148758648
mae_missing_tracked = 21948.07243691494


In [81]:
# Model definition and training
# Reuse score() rather than repeating its fit / predict / MAE steps inline: it builds
# the same RandomForestRegressor(random_state=random_state) and returns both the
# validation MAE and the fitted model.
mae, mdl = score(train_X, val_X, train_y, val_y)
print(mae)

22075.356076516076


In [82]:
# Hyperparameter tuning
# Sweep the forest size over powers of two (2**0 .. 2**14) and record the validation
# MAE of each fit; `maes` ends up in the same order as `n_trees_search`.
n_trees_search = 2 ** np.arange(15)
maes = []
for n_trees in n_trees_search:
    print(f"Training for n_trees = {n_trees}")
    mdl = RandomForestRegressor(random_state=random_state, n_estimators=n_trees)
    mdl.fit(train_X, train_y)
    val_predictions = mdl.predict(val_X)
    mae = mean_absolute_error(val_y, val_predictions)
    maes.append(mae)

Training for n_trees = 1
Training for n_trees = 2
Training for n_trees = 4
Training for n_trees = 8
Training for n_trees = 16
Training for n_trees = 32
Training for n_trees = 64
Training for n_trees = 128
Training for n_trees = 256
Training for n_trees = 512
Training for n_trees = 1024
Training for n_trees = 2048
Training for n_trees = 4096
Training for n_trees = 8192
Training for n_trees = 16384


In [83]:
# Hyperparameter selection
# List every (forest size, validation MAE) pair from the sweep.
for n in range(len(maes)):
    mae = maes[n]
    print(f"n_tree = {n_trees_search[n]}, mae = {mae}")

# Pick the forest size with the lowest validation MAE (first one on ties).
mae_min = min(maes)
best_position = maes.index(mae_min)
n_trees = n_trees_search[best_position]
print(f"best = {n_trees}, mae = {mae_min}")

n_tree = 1, mae = 32597.076923076922
n_tree = 2, mae = 26131.181623931625
n_tree = 4, mae = 24765.744658119656
n_tree = 8, mae = 23460.37980769231
n_tree = 16, mae = 22912.53579059829
n_tree = 32, mae = 22480.76295405983
n_tree = 64, mae = 22040.92172873423
n_tree = 128, mae = 21932.071305517402
n_tree = 256, mae = 21822.602706552705
n_tree = 512, mae = 21847.823539846293
n_tree = 1024, mae = 21817.65591593375
n_tree = 2048, mae = 21761.17050687846
n_tree = 4096, mae = 21758.8497092891
n_tree = 8192, mae = 21751.796534456258
n_tree = 16384, mae = 21727.26276778886
best = 16384, mae = 21727.26276778886


In [84]:
# Retraining with best hyperparameter, and using the validation set as well
# Stack the training and validation rows back together; the test split stays untouched.
train_X2 = pd.concat([train_X, val_X], axis=0)
train_y2 = pd.concat([train_y, val_y], axis=0)
mdl = RandomForestRegressor(random_state=random_state, n_estimators=n_trees)
mdl.fit(train_X2, train_y2)

In [85]:
# Estimate accuracy on data set not used for training
# The test split was used for neither fitting nor choosing n_trees.
test_predictions = mdl.predict(test_X)
mae = mean_absolute_error(test_y, test_predictions)
print(mae)

23809.82347210185
