In [1]:
import pandas as pd

# Load the housing data; the path is relative to the notebook's working directory.
housing_data = pd.read_csv('./data/melb_data.csv')

# Separate the prediction target ('Price') from the candidate features.
y = housing_data['Price']
X = housing_data.drop(columns = ['Price'])


from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed random_state keeps the split reproducible.
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X, y, train_size = 0.8, test_size = 0.2, random_state = 1
)


# dropping cols with missing values
# The columns are chosen by looking at the training split only, and the same
# columns are then removed from the validation split so both keep one schema.
# NOTE: a column whose missing values occur only in the validation split is NOT dropped.
cols_with_nones = [col_name for col_name in X_train_full.columns if X_train_full[col_name].isna().any()]
# Reassign instead of using inplace=True: the frames come out of train_test_split,
# and mutating them in place can emit SettingWithCopyWarning on some pandas versions.
X_train_full = X_train_full.drop(columns = cols_with_nones)
X_valid_full = X_valid_full.drop(columns = cols_with_nones)

In [2]:
# Categorical candidates: object-dtype columns with fewer than 10 distinct
# values in the training split (low cardinality is convenient to encode).
low_card_cols = [
    col_name
    for col_name, col_dtype in X_train_full.dtypes.items()
    if col_dtype == 'object' and X_train_full[col_name].nunique() < 10
]

In [3]:
# Numerical candidates: every training column stored as int64 or float64.
numerical_cols = [
    col_name
    for col_name, col_dtype in X_train_full.dtypes.items()
    if col_dtype in ['int64', 'float64']
]

In [4]:
# Final feature list: low-cardinality categorical columns first, then the numeric ones.
selected_predictive_features = [*low_card_cols, *numerical_cols]

# Copy the selected columns so later edits never write back into the *_full frames.
X_train = X_train_full.loc[:, selected_predictive_features].copy()
X_valid = X_valid_full.loc[:, selected_predictive_features].copy()

In [5]:
# Preview the first rows of the selected training features (categorical columns, then numeric ones).
X_train.head()

Unnamed: 0,Type,Method,Regionname,Rooms,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Propertycount
1041,h,S,Southern Metropolitan,3,11.2,3186.0,3.0,1.0,366.0,-37.9038,145.0001,10579.0
1989,h,S,Northern Metropolitan,3,7.8,3058.0,3.0,1.0,238.0,-37.7539,144.9612,11204.0
10157,h,S,Northern Metropolitan,3,5.2,3056.0,3.0,1.0,439.0,-37.77047,144.97005,11918.0
1711,u,S,Southern Metropolitan,2,11.4,3163.0,2.0,1.0,0.0,-37.8863,145.066,7822.0
11565,h,S,Western Metropolitan,4,11.0,3018.0,4.0,2.0,615.0,-37.87057,144.83623,5301.0


In [6]:
# Boolean mask indexed by column name: True where the column has object dtype.
s = X_train.dtypes == 'object'

# The labels at the True positions are the categorical feature names.
categorical_features = s.index[s].tolist()
categorical_features

['Type', 'Method', 'Regionname']

In [7]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

def rate_dataset(X_train, X_valid, y_train, y_valid):
    """Score a feature set by the validation error of a random forest.

    A ``RandomForestRegressor`` with 100 trees and ``random_state=0`` is
    fitted on ``X_train`` / ``y_train``; its predictions for ``X_valid`` are
    compared against ``y_valid``.

    Returns:
        The mean absolute error between ``y_valid`` and the predictions.
    """
    forest = RandomForestRegressor(n_estimators=100, random_state=0)
    forest.fit(X_train, y_train)
    return mean_absolute_error(y_valid, forest.predict(X_valid))

In [8]:
# Approach 1: drop every object-dtype (categorical) column and score what is left

X_train_drop_cat, X_valid_drop_cat = (
    frame.select_dtypes(exclude = ['object']) for frame in (X_train, X_valid)
)

mae_drop_cat = rate_dataset(X_train_drop_cat, X_valid_drop_cat, y_train, y_valid)
print("MAE when dropping categorical variables:", round(mae_drop_cat, 0))

MAE when dropping categorical variables: 176938.0


In [9]:
# use an Ordinal Encoder

from sklearn.preprocessing import OrdinalEncoder


# Work on copies so the shared X_train / X_valid frames keep their original text columns.
X_train_ordinal = X_train.copy()
X_valid_ordinal = X_valid.copy()

# The category -> code mapping is learned from the training split only and reused for validation.
# NOTE: OrdinalEncoder is used with its default settings here; per the scikit-learn docs that
# means transform() raises on a category that was not seen during fit.
ordinal_encoder = OrdinalEncoder()
# Plain column assignment replaces the columns, so they take the encoder's numeric dtype.
# `.loc[:, cols] = ...` writes in place on recent pandas and would leave the codes in
# object-dtype columns.
X_train_ordinal[categorical_features] = ordinal_encoder.fit_transform(X_train[categorical_features])
X_valid_ordinal[categorical_features] = ordinal_encoder.transform(X_valid[categorical_features])



print("MAE when using ordinal encoding:", end = ' ')
print(
    round(rate_dataset(
        X_train_ordinal, X_valid_ordinal, y_train, y_valid
    ), 0)
)

MAE when using ordinal encoding: 164359.0


In [10]:
# use a One-Hot Encoder

from sklearn.preprocessing import OneHotEncoder


# handle_unknown='ignore' is passed so categories that were not seen during fit do not raise.
# The encoder keeps its default sparse output, which is densified with .toarray().
# The former `sparse=False` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4, so this spelling works on both older and newer releases.
onehot_encoder = OneHotEncoder(handle_unknown = 'ignore')
onehot_cols_train = pd.DataFrame(
    onehot_encoder.fit_transform(X_train[categorical_features]).toarray()
)
onehot_cols_valid = pd.DataFrame(
    onehot_encoder.transform(X_valid[categorical_features]).toarray()
)

# The encoder returns bare arrays, so restore the row labels; otherwise the
# concat below would align rows on a fresh 0..n-1 index instead of the original one.
onehot_cols_train.index = X_train.index
onehot_cols_valid.index = X_valid.index

# The new columns get integer labels by default; cast them to str so the combined
# frame has only string column names.
onehot_cols_train.columns = onehot_cols_train.columns.astype(str)
onehot_cols_valid.columns = onehot_cols_valid.columns.astype(str)

# Remove the original text columns; their one-hot replacements are appended below.
X_train_numerical = X_train.drop(columns = categorical_features)
X_valid_numerical = X_valid.drop(columns = categorical_features)

X_train_onehot = pd.concat([X_train_numerical, onehot_cols_train], axis = 1)
X_valid_onehot = pd.concat([X_valid_numerical, onehot_cols_valid], axis = 1)

print("MAE when using one-hot encoding:",
    round(rate_dataset(
        X_train_onehot, X_valid_onehot, y_train, y_valid
    ), 0)
)

MAE when using one-hot encoding: 161965.0
