## Reimplementation of XGB After Feature Extraction With H2O Auto-Encoder 

In [41]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import minmax_scale
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(color_codes=True)
import gc
import h2o
from h2o.estimators.deeplearning import H2OAutoEncoderEstimator

### Load Dataset

In [42]:
# Read the train and test CSVs from the relative input directory and report their sizes.
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')
print(f'train set shape: {train.shape}')
print(f'test set shape: {test.shape}')

train set shape: (4459, 4993)
test set shape: (49342, 4992)


In [43]:
# Keep the target column and the row identifiers of both sets;
# a later cell drops these columns from the frames.
y_train = train['target']
ids = pd.concat([train['ID'], test['ID']])

#### Remove part of the original features

In [44]:
# Strip the non-feature columns in place so only feature columns remain.
train.drop(columns=['ID', 'target'], inplace=True)
test.drop(columns=['ID'], inplace=True)

In [20]:
# Count distinct values per training column; a column with exactly one
# distinct value is treated as constant.
unique_cnts = (
    train.nunique()
    .rename_axis('columnName')
    .reset_index(name='uniqueCnt')
)
is_constant = unique_cnts['uniqueCnt'].eq(1)
cols_cnst = unique_cnts.loc[is_constant, 'columnName'].tolist()
print(f'number of constant features : {len(cols_cnst)}')

number of constant features : 256


In [22]:
# Among the non-constant training columns, flag every column whose values
# repeat an earlier column (duplicated() on the transposed frame marks all
# but the first occurrence).
non_const = train.loc[:, ~train.columns.isin(cols_cnst)]
is_dup_t_f = non_const.T.duplicated()
cols_dup = non_const.columns[is_dup_t_f].tolist()
del non_const  # temporary slice only; do not keep an extra copy of the data around
print(f'number of duplicated columns in train set excluding constant features : {len(cols_dup)}')

number of duplicated columns in train set excluding constant features : 5


In [25]:
# Absolute Spearman rank correlation between the training columns that
# survived the constant and duplicate filters.
corr_matrix = (
    train.loc[:, ~train.columns.isin(cols_cnst + cols_dup)]
    .corr(method='spearman')
    .abs()
)
# Select the strict upper triangle of the correlation matrix (k=1 excludes the
# diagonal) so each column pair is looked at once. The mask is built with the
# builtin `bool` dtype: the `np.bool` alias was deprecated in NumPy 1.20 and
# removed in 1.24, where it raises AttributeError.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))
# A column is dropped when it correlates above 0.98 with any column that
# precedes it; masked (NaN) entries compare as False and are ignored.
cols_corr = upper.columns[(upper > 0.98).any(axis=0)].tolist()
print('features drop due to high correlations : {}'.format(len(cols_corr)))

features drop due to high correlations : 40


In [45]:
# Stack train on top of test, release the two source frames, then remove the
# constant, duplicated and highly correlated columns found above.
df_all = pd.concat([train, test], axis=0)
del train, test
gc.collect()
df_all.drop(columns=cols_cnst + cols_dup + cols_corr, inplace=True)
print(f'stacked dataframe shape : {df_all.shape}')

stacked dataframe shape : (53801, 4690)


#### Feature normalization: log1p and min-max transform

In [33]:
# Apply log1p element-wise, then rescale each column (axis=0) with min-max scaling.
# NOTE(review): this rebinds df_all, so re-running the cell transforms the data
# a second time — run it exactly once after df_all is built.
df_all = minmax_scale(np.log1p(df_all), axis=0)

#### Train-Test Split

In [46]:
# Hold out 20% of the stacked rows for validating the auto-encoder; fixed seed for repeatability.
x_tr, x_te = train_test_split(df_all, random_state=8668, test_size=0.2)

### Train Auto-Encoder with h2o

In [48]:
# Connect to a running H2O instance or start a local one; nthreads=-1 and
# disabled assertions are passed through unchanged.
h2o.init(enable_assertions=False, nthreads=-1)

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: java version "1.8.0_162"; Java(TM) SE Runtime Environment (build 1.8.0_162-b12); Java HotSpot(TM) 64-Bit Server VM (build 25.162-b12, mixed mode)
  Starting server from /Users/weixu1/anaconda3/lib/python3.6/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/hs/84cnr0797g76c094s5594s103kgn2m/T/tmpx2772qq9
  JVM stdout: /var/folders/hs/84cnr0797g76c094s5594s103kgn2m/T/tmpx2772qq9/h2o_weixu1_started_from_python.out
  JVM stderr: /var/folders/hs/84cnr0797g76c094s5594s103kgn2m/T/tmpx2772qq9/h2o_weixu1_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321... successful.


0,1
H2O cluster uptime:,02 secs
H2O cluster version:,3.16.0.4
H2O cluster version age:,5 months and 15 days !!!
H2O cluster name:,H2O_from_python_weixu1_ca8nc1
H2O cluster total nodes:,1
H2O cluster free memory:,3.556 Gb
H2O cluster total cores:,8
H2O cluster allowed cores:,8
H2O cluster status:,"accepting new members, healthy"
H2O connection url:,http://127.0.0.1:54321


#### Load data into H2O

In [49]:
# Upload the training split first, then the held-out split, as H2O frames.
x_tr_h2o, x_te_h2o = h2o.H2OFrame(x_tr), h2o.H2OFrame(x_te)

Parse progress: |████████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


#### Set up and train the H2O model

In [54]:
# Width of the bottleneck layer, i.e. the number of encoded features.
encoding_dim = 20

# Symmetric 128 -> encoding_dim -> 128 hidden stack with tanh activations.
aec_params = dict(
    activation="tanh",
    autoencoder=True,
    hidden=[128, encoding_dim, 128],
    sparse=True,
    reproducible=True,
    max_w2=5.0,
    epochs=10,
    seed=0,
)
m_aec = H2OAutoEncoderEstimator(**aec_params)

In [None]:
# Use every column of the training split (addressed by position) as a predictor
# and validate on the held-out split.
predictor_idx = list(range(x_tr.shape[1]))
m_aec.train(
    x=predictor_idx,
    training_frame=x_tr_h2o,
    validation_frame=x_te_h2o,
    verbose=False,
)

deeplearning Model Build progress: |██████████████████████████████████████

In [None]:
# Display the auto-encoder's scoring history (last expression, so the notebook renders it).
m_aec.score_history()

In [47]:
# Shut down the H2O cluster this session is connected to; run this only after
# all work that needs the H2O frames or the model is finished.
h2o.cluster().shutdown()

H2O session _sid_a330 closed.
