In [110]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

# Configuration: input CSV location, hold-out fractions, and the seed passed to every split.
data_file_path = "./data/home-data-for-ml-course/train.csv"
test_size = val_size = 0.2
random_state = 0

# Read the raw training table into memory.
df = pd.read_csv(filepath_or_buffer=data_file_path)

# Prediction target: the SalePrice column of the loaded table.
y = df["SalePrice"]

In [111]:
# Predictors are every column except the target.
X = df.drop(columns=["SalePrice"])

# First split: hold out `test_size` of all rows as the test set.
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=random_state,
)

# Second split: hold out `val_size` of the rows that remain after the first split
# for validation (a fraction of the remaining training rows, not of the full dataset).
train_X, val_X, train_y, val_y = train_test_split(
    train_X,
    train_y,
    test_size=val_size,
    random_state=random_state,
)

In [112]:
# Collect the names of the columns whose dtype is "object" and preview them.
is_obj = X.dtypes.eq("object")
obj_cols = is_obj.index[is_obj].tolist()
X.loc[:, obj_cols].head()

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
1,RL,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
2,RL,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
3,RL,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,...,Detchd,Unf,TA,TA,Y,,,,WD,Abnorml
4,RL,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal


In [113]:
# Option 1: Remove every object-dtype (categorical) column from both splits.
# Simple, but it throws away whatever signal those columns carry.
dropped_train_X, dropped_val_X = (
    frame.drop(columns=obj_cols) for frame in (train_X, val_X)
)

In [114]:
# Option 2: Ordinal encoding
# Not great if the column is a nominal variable: the values of a nominal variable have no
# inherent order, yet the encoded numbers suggest one.

# Some values of a categorical column may appear in the validation set but not in the training
# set. Ordinal encoding (fitted on the training set only) would then fail on the validation set.
# The workaround used here is to drop every column that has such unseen validation values.
# `isin` is used instead of Python set membership so that missing values are compared by pandas
# rather than by object identity.
good_obj_cols = [
    col for col in obj_cols
    if val_X[col].isin(train_X[col].unique()).all()
]
# Built with a comprehension (not a set difference) so the order is deterministic across runs.
bad_obj_cols = [col for col in obj_cols if col not in good_obj_cols]

# Drop the bad columns
oe_train_X = train_X.drop(columns=bad_obj_cols)
oe_val_X = val_X.drop(columns=bad_obj_cols)

# Fit on the training split only, then apply the same mapping to the validation split.
oe = OrdinalEncoder()
if good_obj_cols:  # nothing to encode when every categorical column was dropped
    oe_train_X[good_obj_cols] = oe.fit_transform(oe_train_X[good_obj_cols])
    oe_val_X[good_obj_cols] = oe.transform(oe_val_X[good_obj_cols])

In [115]:
# Option 3: One-hot encoding creates a boolean column for every unique value of a categorical column.
# The value of the new boolean column indicates whether that sample had that unique value of the
# categorical column. The advantage of one-hot encoding over ordinal encoding is that it does not
# assume an order between the unique values of a categorical column.

# However, much like ordinal encoding, some values of a categorical column can appear in the validation set but
# not in the training set. This is handled by setting the optional argument handle_unknown="ignore" when
# instantiating OneHotEncoder.

# A drawback of one-hot encoding is that it can introduce a lot of new entries and columns. By adding these new
# columns, it introduces a lot of extra dimensions, which isn't ideal. See Curse of Dimensionality.
# One way to handle this is to only include columns that have low cardinality (here: at most
# `max_cardinality` unique values, counted on the training split). A rule of thumb is to avoid
# one-hot encoding if a column has more than 15 unique values.
max_cardinality = 10
obj_cols_unq_cnt = train_X[obj_cols].nunique()
low_card_cols = list(obj_cols_unq_cnt[obj_cols_unq_cnt <= max_cardinality].index)
# Built with a comprehension (not a set difference) so the order is deterministic across runs.
high_card_cols = [col for col in obj_cols if col not in low_card_cols]

# Drop high cardinality columns
oh_train_X = train_X.drop(columns=high_card_cols)
oh_val_X = val_X.drop(columns=high_card_cols)

# One-hot encode low cardinality columns. Remember there are still other numeric columns.
# sparse_output=False returns a non-sparse numpy array.
ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
oh_train_arr = ohe.fit_transform(oh_train_X[low_card_cols])
oh_val_arr = ohe.transform(oh_val_X[low_card_cols])

# Drop low cardinality columns because they will be replaced
oh_train_X = oh_train_X.drop(columns=low_card_cols)
oh_val_X = oh_val_X.drop(columns=low_card_cols)

# Get column names of new one-hot encoded columns
oh_col_names = ohe.get_feature_names_out()

# Create DataFrames of one-hot encoded columns. The original row index is reused so the
# concatenation below lines the rows up correctly.
oh_train_cols = pd.DataFrame(data=oh_train_arr, columns=oh_col_names, index=oh_train_X.index)
oh_val_cols = pd.DataFrame(data=oh_val_arr, columns=oh_col_names, index=oh_val_X.index)

# Create final DataFrames by concatenation
oh_train_X = pd.concat([oh_train_X, oh_train_cols], axis=1)
oh_val_X = pd.concat([oh_val_X, oh_val_cols], axis=1)

In [116]:
# function for comparing different approaches
def score_dataset(X_train, X_valid, y_train, y_valid, n_estimators=100, random_state=0):
    """Fit a random forest on the training data and score it on the validation data.

    Parameters
    ----------
    X_train, y_train
        Features and target used to fit the model.
    X_valid, y_valid
        Features and target used to evaluate the fitted model.
    n_estimators : int, default 100
        Forwarded to ``RandomForestRegressor(n_estimators=...)``.
    random_state : int, default 0
        Forwarded to ``RandomForestRegressor(random_state=...)`` so that repeated
        calls with the same data give the same score.

    Returns
    -------
    The value of ``mean_absolute_error(y_valid, preds)`` for the model's
    predictions on ``X_valid`` (lower is better).
    """
    model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

In [119]:
# Score each encoding strategy with the same model settings and the same train/validation targets.
dropped_mae = score_dataset(
    X_train=dropped_train_X, X_valid=dropped_val_X, y_train=train_y, y_valid=val_y
)
oe_mae = score_dataset(
    X_train=oe_train_X, X_valid=oe_val_X, y_train=train_y, y_valid=val_y
)
oh_mae = score_dataset(
    X_train=oh_train_X, X_valid=oh_val_X, y_train=train_y, y_valid=val_y
)

# Report the mean absolute error of each approach (lower is better).
print(f"dropped_mae = {dropped_mae}")
print(f"oe_mae = {oe_mae}")
print(f"oh_mae = {oh_mae}")

# In this case, ordinal encoding and one-hot encoding are not all that different. A rule of thumb,
# ranking the approaches from best to worst, is: one-hot encoding > ordinal encoding > dropping.

dropped_mae = 17640.981538461536
oe_mae = 17065.055598290597
oh_mae = 17067.91175213675
