# Ames housing: penalised regression with h2o and category encoders

Contents
 - start
 - basis functions
 - Various encoders
 - regression with h2o
 - save model and save predictions
 
Notes


Sources:
 - 

Copyright (C) 2018 Alan Chalk  
Please do not distribute or publish without permission.

## Start

**packages**

In [None]:
import os
import numpy as np
import pandas as pd
import pickle

from sklearn import preprocessing

import h2o
from h2o.estimators.glm import H2OGeneralizedLinearEstimator

**Directories and paths**

In [None]:
# Show the current working directory: the relative paths below are resolved against it
print(os.getcwd())

# Data locations: dirPData holds the pickles loaded further down;
# dirRawData is defined for completeness
dirRawData, dirPData = "../input/", "../PData/"

**Functions**

In [None]:
def fn_MAE(actuals, predictions):
    """Return the mean absolute error, rounded to a whole number with np.round.

    actuals, predictions: array-likes that support element-wise subtraction
    (numpy arrays in this notebook).
    """
    abs_errors = np.abs(predictions - actuals)
    mean_abs_error = np.mean(abs_errors)
    return np.round(mean_abs_error)

def fn_RMSE(actuals, predictions):
    """Return the root mean squared error, rounded to a whole number with np.round.

    actuals, predictions: array-likes that support element-wise subtraction
    (numpy arrays in this notebook).
    """
    residuals = predictions - actuals
    mean_sq_error = np.mean(residuals ** 2)
    return np.round(np.sqrt(mean_sq_error))

**Load data**

Do not use the one-hot version!!


In [None]:
# Path of the pickled dictionary that contains the full data set
f_name = os.path.join(dirPData, '02_df.pickle')

# NOTE: unpickling can execute arbitrary code - only load pickle files you trust
with open(f_name, "rb") as f:
    dict_ = pickle.load(f)

# The data frame itself is stored under the 'df_all' key
df_all = dict_['df_all']

# f_name and dict_ stay in scope; the clean-up below is commented out
#del f_name, dict_

In [None]:
# Load the dictionary that describes the modelling variables
f_name = os.path.join(dirPData, '02_vars.pickle')
with open(f_name, "rb") as f:
    dict_ = pickle.load(f)

# Unpack the dependent variable and the numeric / categorical independent variables
var_dep, vars_ind_numeric, vars_ind_categorical = (
    dict_[key] for key in ('var_dep', 'vars_ind_numeric', 'vars_ind_categorical')
)

# The dictionary is not needed once its entries have been unpacked
del dict_

In [None]:
# Boolean row masks derived from the 'fold' column:
#   train  = folds 0-5
#   val    = folds 6-7
#   design = folds 0-7 (train and val together)
#   test   = folds 8-9
fold_id = df_all['fold']
idx_train = fold_id.isin(range(6))
idx_val = fold_id.isin([6, 7])
idx_design = fold_id.isin(range(8))
idx_test = fold_id.isin([8, 9])

**Prepare basis functions**

In [None]:
# Only "truly continuous" variables get basis functions; here that is approximated
# as numeric variables with more than 8 distinct values (quick to code, not
# necessarily "right").
# A threshold of >8 made sklearn crash, but h2o is fine with it.
n_distinct = df_all[vars_ind_numeric].nunique()
vars_ind_tospline = n_distinct.index[(n_distinct > 8).values].tolist()

In [None]:
def fn_tosplines(x, name=None):
    """Expand a numeric Series into linear-spline ("hinge") basis columns.

    Knots are the 10th, 20th, 40th, 60th, 80th and 90th percentiles of the
    non-zero values of x, with duplicate knots collapsed. For every knot k a
    column max(0, x - k) is created.

    Parameters
    ----------
    x : pandas.Series
        The variable to expand.
    name : optional
        Base name for the output columns. Defaults to x.name, so the function
        no longer depends on a global loop variable.

    Returns
    -------
    pandas.DataFrame
        Carries the same index as x (so it can be concatenated column-wise
        with the frame x came from). The first column holds x unchanged and
        is called <name>; the basis columns follow as <name>_0, <name>_1, ...

    Raises
    ------
    ValueError
        If no name is given and x has no name.
    """
    col = x.name if name is None else name
    if col is None:
        raise ValueError("fn_tosplines needs a named Series or an explicit name")

    values = x.values
    # hack: remove zeros to avoid issues where lots of values are zero
    # (NaN values compare as non-zero and would turn the percentiles into NaN)
    nonzero = values[values != 0]
    if nonzero.size:
        knots = np.unique(np.percentile(nonzero, [10, 20, 40, 60, 80, 90]))
    else:
        # nothing to take percentiles of: return the column without basis functions
        knots = np.array([])
    print(col, knots)

    # keep the caller's index so that a later pd.concat(axis=1) aligns row by row
    df_ptiles = pd.DataFrame({col: values}, index=x.index)
    for idx, knot in enumerate(knots):
        df_ptiles['{}_{}'.format(col, idx)] = np.maximum(0, values - knot)
    return df_ptiles

Now update df_all with splines / basis functions

In [None]:
# Replace every variable selected for splining by its basis-function expansion.
# NOTE: `var` is a module-level name and is therefore also visible inside
# fn_tosplines; rename it with care.
for var in vars_ind_tospline:
    # expansion = the original column followed by its basis-function columns
    df_ptiles = fn_tosplines(df_all[var])
    # drop the raw column; it comes back as part of df_ptiles
    df_all.drop(columns=[var], inplace=True)
    vars_ind_numeric.remove(var)
    # NOTE(review): concat with axis=1 aligns on the row index, so df_ptiles must
    # carry the same index as df_all - confirm
    df_all = pd.concat([df_all, df_ptiles], axis=1, sort=False)
    # register var again together with the new basis-function columns
    vars_ind_numeric.extend(df_ptiles.columns.tolist())

In [None]:
# Full list of model inputs: categorical variables first, then the numeric ones
# (the numeric list includes the spline basis columns added above)
vars_ind = vars_ind_categorical + vars_ind_numeric

In [None]:
# For convenience keep the dependent variable as a flat 1-D numpy array y;
# rows are in the same order as df_all, so the idx_* masks can index it
y = df_all[var_dep].values.ravel()

**HCCV**

Let's try different ways of encoding a high-cardinality categorical variable (HCCV), using `neighborhood` as the example.

In [None]:
# The five categorical variables with the most distinct levels
level_counts = df_all[vars_ind_categorical].nunique()
level_counts.sort_values(ascending=False).head(5)

In [None]:
from category_encoders import *

In [None]:
# Keep an untouched copy of the neighborhood labels, so that encoded values written
# to 'neighborhood' in the examples below can be compared with the original labels
df_all['neighborhood_cat'] = df_all['neighborhood']
# Target values of the training rows only (passed to the supervised encoder below)
y_train = y[idx_train]

**Ordinal Encoder**

In [None]:
# Work on a copy of the training rows so that df_all is left untouched
df_train = df_all[idx_train].copy()

# Note the double square brackets: they preserve the DataFrame type
# (a one-column frame rather than a Series)
nbhd_labels = df_train[['neighborhood_cat']]

enc = preprocessing.OrdinalEncoder()
arr_enc = enc.fit(nbhd_labels).transform(nbhd_labels)
df_train['neighborhood'] = arr_enc

# Original labels side by side with their ordinal codes
df_train[['neighborhood_cat', 'neighborhood']].head(5)

In [None]:
# Distinct ordinal codes assigned to the training neighborhoods, in ascending order
np.sort(df_train['neighborhood'].unique())

**Target Encoder**

In [None]:
# Start again from a fresh copy of the training rows
# (this discards the ordinal codes written to df_train above)
df_train = df_all[idx_train].copy()

In [None]:
# Mean sale price per neighborhood in the training rows - a reference point
# for the target-encoded values computed in the next cell
df_train.groupby('neighborhood_cat')[['saleprice']].mean()

In [None]:
# Target-encode the 'neighborhood' column only; 'neighborhood_cat' is not listed
# in cols, so it keeps the original labels for comparison
enc = TargetEncoder(cols=['neighborhood'])

# Supervised encoder: fit and transform the training rows in a single call,
# with y_train as the target
df_encoded = enc.fit_transform(df_train, y_train)
# Original label next to its encoded value
df_encoded[['neighborhood_cat', 'neighborhood']].head(5)

When transforming the training data with a supervised encoder, use the `fit_transform()` method rather than `fit().transform()`, because the two do not necessarily produce the same result. The difference can be seen with the LeaveOneOut encoder: `fit_transform()` performs a nested cross-validation on the training data (to reduce over-fitting of the downstream model), whereas `transform()` uses all of the training data for scoring (to obtain estimates that are as accurate as possible).

**BinaryEncoder**

In [None]:
# Fresh copy of the training rows for the binary-encoder example
df_train = df_all[idx_train].copy()

In [None]:
# Binary-encode the neighborhood labels
enc = BinaryEncoder(cols=['neighborhood_cat'])

# Double square brackets: pass a one-column DataFrame rather than a Series
enc.fit(df_train[['neighborhood_cat']])
df_encoded = enc.transform(df_train[['neighborhood_cat']])
# Put the original labels next to the encoded columns
# (the assignment aligns on the row index)
df_encoded['neighborhood_cat'] = df_train['neighborhood_cat']
df_encoded.head()


In [None]:
# Last rows of the encoded frame, for comparison with the first rows shown above
df_encoded.tail()

**Hash encoder**

In [None]:
# Fresh copy of the training rows, re-indexed 0..n-1 (the old index is dropped)
# NOTE(review): the reset presumably keeps df_train aligned with the encoder
# output when the labels are attached in the next cell - confirm
df_train = df_all[idx_train].copy()
df_train.reset_index(inplace=True, drop=True)

In [None]:
# Hash the neighborhood labels into n_components=3 output columns
enc = HashingEncoder(cols=['neighborhood_cat'], n_components=3)

# Double square brackets: pass a one-column DataFrame rather than a Series
enc.fit(df_train[['neighborhood_cat']])
df_encoded = enc.transform(df_train[['neighborhood_cat']])
# Put the original labels next to the hashed columns
# (the assignment aligns on the row index)
df_encoded['neighborhood_cat'] = df_train['neighborhood_cat']
df_encoded.head()


**LeaveOneOutEncoder**

I will try this one here - you should experiment with as many of them as you have time for.

In [None]:
# Leave-one-out encode every categorical variable, with sigma=0.3
# NOTE(review): check the category_encoders docs for the exact effect of sigma
enc = LeaveOneOutEncoder(cols=vars_ind_categorical, sigma=0.3)
# Fit on the design rows (folds 0-7) only, then encode all rows
enc.fit(df_all[idx_design], y[idx_design]) # should really use fit_transform
df_temp = enc.transform(df_all)
# NOTE(review): df_temp is only displayed here; the h2o frame built below is
# created from df_all, not from df_temp - confirm this is intended
df_temp.head()

### "elastic net" regression

**start h2o**

In [None]:
# Initialise h2o on port 54321
h2o.init(port=54321)
# Disabled alternative:
#h2o.connect()

**Load data into h2o**

In [None]:
# Upload only the columns needed for modelling: inputs, dependent variable and fold.
# var_dep is concatenated with lists here, so it must itself be a list.
h2o_df_all = h2o.H2OFrame(df_all[vars_ind + var_dep + ['fold']],
                          destination_frame = 'df_all')

In [None]:
def _as_h2o_mask(mask):
    """Convert a boolean pandas mask into an H2OFrame of 0/1 integers."""
    return h2o.H2OFrame(mask.astype('int').values)


# Row selectors held in h2o, one per data split
idx_h2o_train = _as_h2o_mask(idx_train)
idx_h2o_val = _as_h2o_mask(idx_val)
idx_h2o_design = _as_h2o_mask(idx_design)
idx_h2o_test = _as_h2o_mask(idx_test)

**lambda_search over lambda (alpha fixed at 0.99) given an identity link**

In [None]:
# Gaussian GLM with identity link and an elastic-net penalty.
# alpha is fixed at 0.99; only lambda is searched (200 values, smallest one
# being lambda_min_ratio times the largest), without early stopping of the
# search and with 10-fold cross-validation.
# NOTE(review): no seed is set, so the cross-validation fold assignment may
# differ between runs - confirm whether that matters for reproducibility
model = H2OGeneralizedLinearEstimator(
    alpha=0.99,
    family='gaussian',
    link='identity',
    lambda_search=True,
    lambda_min_ratio=1e-7,
    nlambdas=200,
    early_stopping=False,
    nfolds=10,
)

# Fit on the design rows (folds 0-7); the response column name is hard-coded
model.train(
    x=vars_ind,
    y='saleprice',
    training_frame=h2o_df_all[idx_h2o_design, :],
)
    

In [None]:
# Alias the fitted model as glm_bst, the name used for scoring below
glm_bst = model

In [None]:
def _predict_as_array(h2o_rows):
    """Score the selected rows of h2o_df_all with glm_bst; return a flat numpy array."""
    h2o_pred = glm_bst.predict(h2o_df_all[h2o_rows, :])
    return h2o_pred.as_data_frame().values.ravel()


# Predictions per split.
# Train and val rows were both part of the design rows used to fit the model,
# so only the test figure is out of sample.
bst_pred_train = _predict_as_array(idx_h2o_train)
bst_pred_val = _predict_as_array(idx_h2o_val)
bst_pred_test = _predict_as_array(idx_h2o_test)

# Rounded mean absolute error per split
print('train error', fn_MAE(y[idx_train], bst_pred_train))
print('val error',   fn_MAE(y[idx_val], bst_pred_val))
print('test error',  fn_MAE(y[idx_test], bst_pred_test))

h2o.show_progress()

# AC run gives
#train error 11950.0
#val error 12077.0
#test error 13667.0
# And these should be reproduced by this code

In [None]:
# Shut down the h2o cluster initialised earlier in the notebook
h2o.cluster().shutdown()