In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error


In [3]:
melbourne_file_path = './melb_data.csv'
melbourne_data = pd.read_csv(melbourne_file_path) 
y = melbourne_data.Price
melbourne_features = ['Rooms', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude']
X = melbourne_data[melbourne_features]

train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0)
# Define model

forest_model = RandomForestRegressor(random_state=1)
def get_mae(X, y):
    # multiple by -1 to make positive MAE score instead of neg value returned as sklearn convention
    return -1 * cross_val_score(RandomForestRegressor(50), 
                                X, y, 
                                scoring = 'neg_mean_absolute_error').mean()


In [5]:
one_hot_encoded_training_predictors = pd.get_dummies(train_X)
predictors_without_categoricals = train_X.select_dtypes(exclude=['object'])

mae_without_categoricals = get_mae(predictors_without_categoricals, train_y)

mae_one_hot_encoded = get_mae(one_hot_encoded_training_predictors, train_y)

print('Mean Absolute Error when Dropping Categoricals: ' + str(int(mae_without_categoricals)))
print('Mean Abslute Error with One-Hot Encoding: ' + str(int(mae_one_hot_encoded)))



Mean Absolute Error when Dropping Categoricals: 187891
Mean Abslute Error with One-Hot Encoding: 187229


In [8]:
#aligning train and test data

one_hot_encoded_training_predictors = pd.get_dummies(train_X)
one_hot_encoded_test_predictors = pd.get_dummies(val_X)
final_train, final_test = one_hot_encoded_training_predictors.align(one_hot_encoded_test_predictors,
                                                                    join='left', 
                                                                    axis=1)


#The align command makes sure the columns show up in the same order in both datasets
#(it uses column names to identify which columns line up in each dataset.) The argument 
#join='left' specifies that we will do the equivalent of SQL's left join. That means, 
#if there are ever columns that show up in one dataset and not the other, we will keep 
#exactly the columns from our training data. The argument join='inner' would do what SQL 
#databases call an inner join, keeping only the columns showing up in both datasets. 
