In [2]:
import pandas as pd

#read the data
train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')

#drop houses where the target is missing
train_data.dropna(axis=0, subset=['SalePrice'], inplace=True)

target = train_data.SalePrice

cols_with_missing = [col for col in train_data.columns if train_data[col].isnull().any()]

candidate_train_predictors = train_data.drop(['Id', 'SalePrice'] + cols_with_missing, axis=1)
candidate_test_predictors = test_data.drop(['Id'] + cols_with_missing, axis=1)

#cardinality stands for the number of unique values in a column
low_cardinality_cols = [cname for cname in candidate_train_predictors.columns if 
                               candidate_train_predictors[cname].nunique() < 10 and
                               candidate_train_predictors[cname].dtype == "object"]
numeric_cols = [cname for cname in candidate_train_predictors.columns if
                   candidate_train_predictors[cname].dtype in ['int64', 'float64']]

my_cols = low_cardinality_cols + numeric_cols
train_predictors = candidate_train_predictors[my_cols]
test_predictors = candidate_test_predictors[my_cols]

In [3]:
train_predictors.dtypes.sample(10)

ScreenPorch     int64
Foundation     object
HalfBath        int64
1stFlrSF        int64
PavedDrive     object
ExterQual      object
TotalBsmtSF     int64
YrSold          int64
Fireplaces      int64
LotShape       object
dtype: object

In [4]:
one_hot_encoded_training_predictors = pd.get_dummies(train_predictors)

In [5]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor

def get_mae(X, y):
    #multiply by -1 to make positive mae score instead of neg value
    return -1 * cross_val_score(RandomForestRegressor(50), X, y, scoring = 'neg_mean_absolute_error').mean()

predictors_without_categoricals = train_predictors.select_dtypes(exclude=['object'])

mae_without_categoricals = get_mae(predictors_without_categoricals, target)
mae_one_hot_encoded = get_mae(one_hot_encoded_training_predictors, target)

print('Mean Absolute Error when Dropping Categoricals: ' + str(int(mae_without_categoricals)))
print('Mean Abslute Error with One-Hot Encoding: ' + str(int(mae_one_hot_encoded)))

Mean Absolute Error when Dropping Categoricals: 18281
Mean Abslute Error with One-Hot Encoding: 18068


In [6]:
#it is important to keep alignment for cases when categorical values have different number of values in training vs testing data sets
one_hot_encoded_training_predictors = pd.get_dummies(train_predictors)
one_hot_encoded_test_predictors = pd.get_dummies(test_predictors)
final_train, final_test = one_hot_encoded_training_predictors.align(one_hot_encoded_test_predictors, join='left', axis=1)