In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd

# Load the Melbourne housing data from the local data directory.
housing_data = pd.read_csv("./data/melb_data.csv")

# Prediction target: the 'Price' column.
y = housing_data['Price']

# Candidate features: everything except the target, restricted to
# non-object (i.e. non-string) columns.
housing_predictors = housing_data.drop(columns = ['Price'])
X = housing_predictors.select_dtypes(exclude = ['object'])

# 80/20 train/validation split; the fixed random_state keeps it reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    train_size = 0.8,
    test_size = 0.2,
    random_state = 0,
)

In [2]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

def rate_dataset(X_train, X_val, y_train, y_val, n_estimators = 10, random_state = 0):
    """Score a feature set by the validation MAE of a random forest trained on it.

    A fresh RandomForestRegressor is fitted on the training data and used to
    predict the validation data; the mean absolute error of those predictions
    is returned, so lower values mean a better-performing dataset variant.

    Parameters
    ----------
    X_train, X_val
        Training and validation feature tables, passed to ``fit`` and
        ``predict`` respectively.
    y_train, y_val
        Targets corresponding to ``X_train`` and ``X_val``.
    n_estimators : int, default 10
        Number of trees in the forest.
    random_state : int, default 0
        Seed forwarded to the forest so repeated calls are comparable.

    Returns
    -------
    The mean absolute error between ``y_val`` and the model's predictions.
    """
    model = RandomForestRegressor(n_estimators = n_estimators, random_state = random_state)
    model.fit(X_train, y_train)

    predictions = model.predict(X_val)

    return mean_absolute_error(y_val, predictions)

In [4]:
# Approach 1: drop every column that has a missing value in the training set.
# The column list is derived from X_train only and then applied to both splits.
has_nulls = X_train.isna().any()
cols_with_nulls = has_nulls[has_nulls].index.tolist()

reduced_X_train = X_train.drop(columns = cols_with_nulls)
reduced_X_val = X_val.drop(columns = cols_with_nulls)

mae_dropped = rate_dataset(reduced_X_train, reduced_X_val, y_train, y_val)
print("MAE when dropping columns with missing values: ", round(mae_dropped, 0), sep='')

MAE when dropping columns with missing values: 183550.0


In [7]:
# Imputation (filling in means)

from sklearn.impute import SimpleImputer

# Fit the imputer on the training split only, then reuse the fitted imputer
# on the validation split.
imputer = SimpleImputer()

# The imputer output is wrapped in a new DataFrame; pass the column names and
# the original row index explicitly so the imputed frames stay aligned with
# y_train / y_val instead of silently getting a fresh 0..n-1 index.
imputed_X_train = pd.DataFrame(
    imputer.fit_transform(X_train),
    columns = X_train.columns,
    index = X_train.index,
)
imputed_X_val = pd.DataFrame(
    imputer.transform(X_val),
    columns = X_val.columns,
    index = X_val.index,
)

print("MAE when imputing: ", end='')
print(
    round(
        rate_dataset(imputed_X_train, imputed_X_val, y_train, y_val), 0
    )
)

MAE when imputing: 178166.0


In [8]:
# Imputation with indicators

# Work on copies so the indicator columns do not leak into X_train / X_val.
X_train_2 = X_train.copy()
X_val_2 = X_val.copy()

# For each column that had missing training values, record which rows were
# missing before the values get filled in.
for col in cols_with_nulls:
    X_train_2[col + '_was_missing'] = X_train_2[col].isnull()
    X_val_2[col + '_was_missing'] = X_val_2[col].isnull()

# Fit on the training split only, then reuse the fitted imputer on validation.
imputer = SimpleImputer()

# Rebuild the frames with their column names and original row index so they
# stay aligned with y_train / y_val rather than getting a fresh 0..n-1 index.
imputed_X_train_2 = pd.DataFrame(
    imputer.fit_transform(X_train_2),
    columns = X_train_2.columns,
    index = X_train_2.index,
)
imputed_X_val_2 = pd.DataFrame(
    imputer.transform(X_val_2),
    columns = X_val_2.columns,
    index = X_val_2.index,
)

print("MAE when imputing and listing imputations: ", end='')
print(
    round(
        rate_dataset(imputed_X_train_2, imputed_X_val_2, y_train, y_val), 0
    )
)

MAE when imputing and listing imputations: 178928.0


In [10]:
# Imputation performed better than dropping columns: in every column that has
# missing values, fewer than half of the entries are missing (see the counts
# below), so dropping those columns throws away useful data.

print(X_train.shape)

# Number of missing entries per column, showing only the affected columns.
missing_vals_per_col = X_train.isna().sum()
missing_vals_per_col.loc[missing_vals_per_col > 0]

(10864, 12)


Car               49
BuildingArea    5156
YearBuilt       4307
dtype: int64