In [9]:
import pandas as pd 
# Load the raw data. Training rows without a SalePrice are removed, since
# SalePrice is the value being predicted.
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
train_data = train_data.dropna(axis=0, subset=['SalePrice'])
target = train_data['SalePrice']

# Missing values are handled the simplest possible way: every column that has
# at least one NA in the training data is dropped from both data sets, because
# the focus here is on categorical variables rather than imputation.
has_missing = train_data.isnull().any()
cols_with_missing = has_missing[has_missing].index.tolist()

candidate_train_predictors = train_data.drop(columns=['Id', 'SalePrice'] + cols_with_missing)
candidate_test_predictors = test_data.drop(columns=['Id'] + cols_with_missing)

# Classify the remaining columns in one pass:
#   * object-typed columns with fewer than 10 distinct values are kept as
#     categoricals (a small number of one-hot columns each);
#   * int64 / float64 columns are kept as numeric predictors.
# Anything else is left out.
low_cardinality_cols = []
numeric_cols = []
for cname in candidate_train_predictors.columns:
    column = candidate_train_predictors[cname]
    if column.dtype == "object" and column.nunique() < 10:
        low_cardinality_cols.append(cname)
    if column.dtype in ['int64', 'float64']:
        numeric_cols.append(cname)

# Use the same column selection, in the same order, for both data sets.
my_cols = low_cardinality_cols + numeric_cols
train_predictors = candidate_train_predictors[my_cols]
test_predictors = candidate_test_predictors[my_cols]

In [10]:
# Spot-check the column types: show the dtype of 10 randomly chosen predictors.
# No seed is passed to sample(), so the selection differs from run to run.
train_predictors.dtypes.sample(10)

LotConfig     object
BsmtFinSF1     int64
PoolArea       int64
MoSold         int64
MSSubClass     int64
LotShape      object
CentralAir    object
Condition1    object
Street        object
Utilities     object
dtype: object

In [12]:
# One-hot encode the training predictors with pandas.get_dummies; the result is
# what gets scored as the "one-hot encoded" feature set.
one_hot_encoded_training_predictors = pd.get_dummies(train_predictors)

In [14]:
# investigate the effect of adding categorical variables to our model 
from sklearn.model_selection import cross_val_score 
from sklearn.ensemble import RandomForestRegressor

def get_mae(X, y, n_estimators=50, random_state=0):
    """Return the cross-validated mean absolute error of a random forest.

    Parameters
    ----------
    X : predictors, passed unchanged to ``cross_val_score``.
    y : target values, passed unchanged to ``cross_val_score``.
    n_estimators : int, default 50
        Number of trees in the ``RandomForestRegressor``.
    random_state : int or None, default 0
        Seed for the forest. A fixed seed makes the score reproducible and
        ensures that two calls on different feature sets differ only because
        of the features, not because of random tree construction. Pass
        ``None`` for an unseeded forest.

    Returns
    -------
    The mean over the cross-validation folds of the mean absolute error,
    as a positive number (lower is better).
    """
    model = RandomForestRegressor(n_estimators=n_estimators,
                                  random_state=random_state)
    # The 'neg_mean_absolute_error' scorer reports the error negated, so the
    # sign is flipped back to obtain a conventional, positive MAE.
    fold_scores = cross_val_score(model, X, y,
                                  scoring='neg_mean_absolute_error')
    return -1 * fold_scores.mean()

# Baseline feature set: the same predictors with every object-typed
# (categorical) column removed.
predictors_without_categoricals = train_predictors.select_dtypes(exclude=['object'])

# Score both feature sets with the same helper so the two errors are comparable.
mae_without_categoricals = get_mae(predictors_without_categoricals, target)
mae_one_hot_encoded = get_mae(one_hot_encoded_training_predictors, target)

# Report the errors truncated to whole numbers.
print('Mean Absolute Error when Dropping Categoricals:',
      int(mae_without_categoricals), sep='')
print('Mean Absolute Error with One-hot Encoding:',
      int(mae_one_hot_encoded), sep='')


Mean Absolute Error when Dropping Categoricals:18656
Mean Absolute Error with One-hot Encoding:18210


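Scikit-learn is sensitive to the ordering of columns, so if the columns of the training and test datasets get misaligned (a different set of columns, or the same columns in a different order), the results will be nonsense! This can happen when a categorical variable takes a different set of values in the training data than in the test data, because one-hot encoding then produces different columns for each dataset.
Ensure the test data is encoded in the same manner as the training data by using the DataFrame `align` method: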

In [16]:
# One-hot encode each data set on its own; the two results may have different
# columns if a category appears in only one of the data sets.
one_hot_encoded_training_predictors = pd.get_dummies(train_predictors)
one_hot_encoded_test_predictors = pd.get_dummies(test_predictors)

# Force the test frame onto the training frame's columns.
# join='left' keeps exactly the training columns, in training order: dummy
# columns found only in the test data are dropped, and dummy columns found
# only in the training data are added to the test frame.
# fill_value=0 makes those added columns all-zero ("category not present")
# instead of the default NaN, which would otherwise leave missing values in
# final_test.
final_train, final_test = one_hot_encoded_training_predictors.align(
    one_hot_encoded_test_predictors,
    join='left',
    axis=1,
    fill_value=0,
)