# ML workflow for the house prices dataset

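This notebook contains a machine learning workflow for the Kaggle house prices dataset. The workflow includes loading the data and the results of a previous exploratory data analysis (EDA), setting up a preprocessing pipeline, tuning an XGBoost regressor with a cross-validated grid search, and writing the test-set predictions to a submission file.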

In [50]:
# load libraries
import pandas as pd
import numpy as np
import pickle
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, QuantileTransformer
from sklearn.compose import ColumnTransformer
from xgboost import XGBRegressor
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV


# Print file names in directory
import os
# walk the Kaggle input directory and print the full path of every file found
for dirname, _, filenames in os.walk('/kaggle/input'):
    for path in (os.path.join(dirname, filename) for filename in filenames):
        print(path)

/kaggle/input/competitions/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/competitions/house-prices-advanced-regression-techniques/data_description.txt
/kaggle/input/competitions/house-prices-advanced-regression-techniques/train.csv
/kaggle/input/competitions/house-prices-advanced-regression-techniques/test.csv
/kaggle/input/eda-results/eda_config.pickle


In [51]:
# define necessary functions
def encode_ordinal(df, columns, order):
    """Overwrite ordered categorical columns with their float rank codes.

    Each column named in ``columns`` is replaced, in place, by the position
    of its value within ``order`` (0.0 for the first entry, 1.0 for the
    second, ...). Values that are not listed in ``order`` -- missing values
    included -- end up as NaN.

    Args:
        df: data frame to modify; it is changed in place.
        columns: names of the columns to recode.
        order: category labels from lowest to highest rank.

    Returns:
        The same data frame object that was passed in.
    """
    for name in columns:
        ranks = pd.Categorical(df[name], categories=order, ordered=True).codes.astype('float')
        # pandas reports values outside `order` with the code -1; expose them as missing
        ranks[ranks == -1] = np.nan
        df[name] = ranks
    return df

# define a function that runs a parameter grid search with cross-validation
# scoring defaults to negative RMSE so that results are in line with the Kaggle metric
def run_grid_search(estimator, param_grid, preprocessor, X, y, cv, scoring="neg_root_mean_squared_error"):
    """Grid-search a preprocessor + model pipeline with cross-validation.

    The preprocessor and estimator are chained into a two-step pipeline
    (steps ``'preprocessor'`` and ``'model'``), so keys in ``param_grid``
    must use the matching ``model__`` / ``preprocessor__`` prefixes.

    Args:
        estimator: model placed in the ``'model'`` step.
        param_grid: parameter grid handed to ``GridSearchCV``.
        preprocessor: transformer placed in the ``'preprocessor'`` step.
        X: training features.
        y: training target.
        cv: cross-validation splitter; create it beforehand with a fixed
            random state so every run uses the same folds.
        scoring: scoring name for ``GridSearchCV``; the default is the
            negative RMSE.

    Returns:
        A tuple ``(grid, results)``: the fitted ``GridSearchCV`` object and a
        data frame holding one column per ``model__`` parameter (prefix
        removed) plus ``mean_rmse`` (sign-flipped mean test score) and
        ``std_rmse`` (standard deviation of the test score), sorted by
        ascending ``mean_rmse``.
    """
    param_prefix = "param_model__"

    # chain preprocessing and model, then wrap the pipeline in the grid search
    search = GridSearchCV(
        estimator=Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('model', estimator),
        ]),
        param_grid=param_grid,
        cv=cv,
        scoring=scoring,
        return_train_score=True,
    )
    search.fit(X, y)

    # collect the model parameter columns, however many were searched
    cv_results = search.cv_results_
    param_columns = {}
    for name, values in cv_results.items():
        if name.startswith(param_prefix):
            param_columns[name.replace(param_prefix, "")] = values
    summary = pd.DataFrame(param_columns)

    # the score is a negated error, so flip the sign to report a positive value
    summary["mean_rmse"] = -cv_results["mean_test_score"]
    summary["std_rmse"] = cv_results["std_test_score"]

    return search, summary.sort_values("mean_rmse")

## Step 1: Load data and EDA results

In [52]:
# Kaggle input locations: competition tables and the pickled EDA configuration
train_file_path = '/kaggle/input/competitions/house-prices-advanced-regression-techniques/train.csv'
test_file_path = '/kaggle/input/competitions/house-prices-advanced-regression-techniques/test.csv'
submission_file_path = '/kaggle/input/competitions/house-prices-advanced-regression-techniques/sample_submission.csv'
eda_config_file_path = '/kaggle/input/eda-results/eda_config.pickle'

# read the three competition tables into pandas data frames
df, df_test, sub_df = (
    pd.read_csv(path)
    for path in (train_file_path, test_file_path, submission_file_path)
)

# restore the EDA configuration dictionary
# NOTE(review): unpickling runs arbitrary code, so only load this file from a trusted source
with open(eda_config_file_path, 'rb') as handle:
    eda = pickle.load(handle)

# move the target column out of the training frame
target = df.pop("SalePrice")

# limit how many columns and rows pandas displays
pd.set_option("display.max_columns", 50)
pd.set_option("display.max_rows", 50)

In [53]:
# list the entries stored in the EDA configuration
for key in eda:
    print(key)

dropped_columns
ordinal_maps
dtype_overrides
category_missing_levels
final_features
numeric_features
categorical_features
rare_category_features
skewed_features
outlier_sensitive_features
target_transform
numeric_predictors_correlation
categorical_predictors_etasquared
high_corr_pairs_numeric
high_association_pairs_categorical


First, the dataframe will be corrected based on the results of the exploratory data analysis.

In [54]:
# replace ordered categorical labels by their rank codes, in train and test alike
for col, mapping in eda['ordinal_maps'].items():
    df, df_test = (
        encode_ordinal(frame, [col], mapping)
        for frame in (df, df_test)
    )

# cast columns to the dtypes chosen during the EDA
for col, dtype in eda['dtype_overrides'].items():
    for frame in (df, df_test):
        frame[col] = frame[col].astype(dtype)

# log-transform the target to handle the usual right-skew of price distributions
# (this reassigns `target`, so re-running the cell would apply the transform twice)
target = np.log1p(target)

## Step 2: Pipeline set-up

The EDA previously identified several skewed numeric variables with outliers. Categorical variables were also shown to have some rare categories. The preprocessing pipeline takes this into account by applying a quantile transformation to the numeric variables and by grouping infrequent categories. Collapsing infrequent categories has the added benefit of reducing the number of columns required for one-hot encoding.

In [55]:
# split into numeric and categorical
numeric_features = eda['numeric_features']
categorical_features = eda['categorical_features']

In [56]:
# numeric branch: fill gaps with the median, then map each feature onto a normal
# distribution via its quantiles to dampen skew and outliers
# (the step keeps the name 'scaler' so parameter names stay unchanged)
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', QuantileTransformer(n_quantiles=1000, output_distribution='normal')),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# categorical branch: fill gaps with the most frequent category, then one-hot encode;
# categories seen in less than 1% of the rows are grouped into one infrequent
# category, which also receives unknown categories at prediction time if it exists
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(min_frequency=0.01, handle_unknown='infrequent_if_exist')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

In [57]:
# all model inputs: numeric features first, categorical features second
feature_columns = numeric_features + categorical_features

# route each feature group through its own preprocessing pipeline
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# restrict train and test data to the selected features
X, X_test = df[feature_columns], df_test[feature_columns]

## Step 3: Model training and cross-validation

The EDA has shown that there are a few pairs of variables with high correlation. Therefore, I will first use tree-based models since they are robust to multicollinearity.

In [58]:
# one seed for NumPy, the fold assignment and the model
rs = 42
np.random.seed(rs)

# shuffled 5-fold split, shared by every parameter combination
kf = KFold(n_splits=5, shuffle=True, random_state=rs)

# XGBoost candidates: 4 depths x 4 ensemble sizes x 2 learning rates
param_grid = {
    'model__max_depth': [3, 6, 9, 12],
    'model__n_estimators': [50, 100, 200, 250],
    'model__learning_rate': [0.05, 0.1],
}

estimator = XGBRegressor(random_state=rs)

# cross-validated search scored by negative RMSE; show the best combinations
grid_xgb, results_xgb = run_grid_search(
    estimator=estimator,
    param_grid=param_grid,
    preprocessor=preprocessor,
    X=X,
    y=target,
    cv=kf,
    scoring="neg_root_mean_squared_error",
)
results_xgb.head()

Unnamed: 0,learning_rate,max_depth,n_estimators,mean_rmse,std_rmse
19,0.1,3,250,0.131301,0.016746
18,0.1,3,200,0.131724,0.017629
3,0.05,3,250,0.132043,0.018642
2,0.05,3,200,0.133182,0.018639
17,0.1,3,100,0.133728,0.017973


In [59]:
# get the parameters with the best cross-validated RMSE straight from the search
# object instead of reading them positionally from the sorted results table
best_params = grid_xgb.best_params_
best_max_depth = best_params['model__max_depth']
best_n_estimators = best_params['model__n_estimators']
best_learning_rate = best_params['model__learning_rate']

# run_grid_search leaves GridSearchCV at its default refit=True, so the search has
# already refitted the best preprocessor + XGBRegressor pipeline on all of
# X / target; reuse that fitted pipeline instead of building and training an
# identical one a second time
model = grid_xgb.best_estimator_

# predict the (log-transformed) target for the test data
target_pred = model.predict(X_test)

In [60]:
# build the submission table: test ids plus predictions mapped back from the
# log scale to prices, as required for the kaggle competition
submission = df_test[['Id']].assign(SalePrice=np.expm1(target_pred))

# save in .csv format without the index column
submission.to_csv("submission_XGBRegressor.csv", index=False)

The basic XGB regressor reaches a test RMSE of 0.133 on the Kaggle leaderboard. This is very close to the cross-validated RMSE of about 0.131, so there is no sign of overfitting. However, there are several things left to do to try to increase the accuracy of the predictions.

## To-do

- Extract feature importance
- Add feature engineering
- Use ensemble learning