## This kernel tries to train an auto-encoder to extract important features from the dataset

In [66]:
import pandas as pd
import numpy as np
import pickle
import gc
from scipy import stats
import tensorflow as tf
from pylab import rcParams
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import minmax_scale
from keras.models import Model, load_model
from keras.layers import Input, Dense
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras import regularizers
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(color_codes=True)

In [34]:
# Path of the text file listing, one per line, the columns to force-remove.
rmv_names_path = '../output/columns/forceremove.column.names'

# Use a context manager so the file handle is closed deterministically, and
# skip blank lines so an empty string can never end up in the drop list.
with open(rmv_names_path, 'r') as rmv_names_file:
    rmv_names = [name for name in (line.strip() for line in rmv_names_file) if name]

print('number of features removed: {}'.format(len(rmv_names)))

number of features removed: 688


In [35]:
# Load both raw CSV files in one pass; the order of the paths fixes which
# frame is bound to `train` and which to `test`.
train, test = map(pd.read_csv, ('../input/train.csv', '../input/test.csv'))

# Report (rows, columns) of each frame as a quick sanity check.
print('train set shape: {}'.format(train.shape))
print('test set shape: {}'.format(test.shape))

train set shape: (4459, 4993)
test set shape: (49342, 4992)


In [36]:
# Keep the label column and the row identifiers before they are dropped
# from the feature frames.
y_train = train['target']
# IDs are stacked train-first, then test.
ids = pd.concat([train['ID'], test['ID']])

In [37]:
# Strip identifier/label columns plus the force-removed columns from each
# frame, then stack train on top of test into a single feature frame.
df_all = pd.concat(
    [
        train.drop(columns=['ID', 'target'] + rmv_names),
        test.drop(columns=['ID'] + rmv_names),
    ],
    axis=0,
)

# The individual frames are no longer needed; release their memory.
del train, test
gc.collect()

print('stacked dataframe shape : {}'.format(df_all.shape))

stacked dataframe shape : (53801, 4303)


### Further drop features with a high missing rate (zero values are treated as missing)

In [38]:
def missingRate(pd_series, na_values=[np.nan]):
    """Return the fraction of entries in `pd_series` that count as missing.

    Parameters
    ----------
    pd_series : pandas object supporting ``.isin``
        The values to inspect.
    na_values : list-like, default ``[np.nan]``
        Values regarded as "missing". The default list is only read,
        never modified.

    Returns
    -------
    The share of entries whose value is contained in `na_values`.
    """
    # The mean of a boolean membership mask is the proportion of True entries.
    is_missing = pd_series.isin(na_values)
    return is_missing.mean()

In [39]:
# Share of zero entries per feature (zeros are treated as "missing" here).
# DataFrame.apply hands each column to missingRate as a Series, so the helper
# reduces one column at a time and yields one scalar rate per column, rather
# than depending on how a whole-frame reduction happens to behave.
misrate = df_all.apply(lambda col: missingRate(col, na_values=[0])).reset_index()

# reset_index() turns the column labels into a regular column; give both
# columns descriptive names.
misrate.columns = ['columnName', 'missingRate']

In [42]:
# Features whose missing rate exceeds 97% are collected for removal.
is_high_misrate = misrate['missingRate'] > 0.97
cols_high_misrate = misrate.loc[is_high_misrate, 'columnName'].tolist()

In [43]:
# Remove the high-missing-rate features and report the remaining shape.
df_all = df_all.drop(columns=cols_high_misrate)
print('stacked dataframe shape after drop high missing rate features : {}'.format(df_all.shape))

stacked dataframe shape after drop high missing rate features : (53801, 528)


### Log transformation of the dataset and min-max transformation

In [44]:
# Element-wise log(1 + x) on every feature.
# NOTE(review): this rebinds df_all, so re-running the cell applies the
# transform a second time — run it exactly once per kernel session.
df_all = df_all.apply(np.log1p)

In [67]:
# Rescale each feature (axis=0 -> column-wise) to the default [0, 1] range.
# NOTE(review): per the scikit-learn docs minmax_scale returns a NumPy array,
# so df_all presumably no longer carries column labels after this cell — confirm.
df_all = minmax_scale(df_all, feature_range=(0, 1), axis=0)

### Train-Validation split

In [74]:
# Hold out 20% of the stacked rows for validation; the fixed random_state
# makes the split repeatable.
x_tr, x_te = train_test_split(
    df_all,
    test_size=0.2,
    random_state=8668,
)

### Train Auto-Encoder

We minimize the mean squared error between each input and its reconstruction.

#### keras auto-encoder

In [75]:
# Width of the input layer = number of feature columns in the training split.
_, input_dim = x_tr.shape
# Size of the bottleneck (encoded) representation.
encoding_dim = 10

In [78]:
input_layer = Input(shape=(input_dim, ))

# encoder: narrow from input_dim -> input_dim/2 -> input_dim/10 -> encoding_dim
encoder = input_layer
for units, act in ((input_dim // 2, "tanh"),
                   (input_dim // 10, "tanh"),
                   (encoding_dim, "sigmoid")):
    encoder = Dense(units, activation=act)(encoder)

# decoder: mirror of the encoder, widening back out to input_dim
decoder = encoder
for units, act in ((input_dim // 10, 'tanh'),
                   (input_dim // 2, 'tanh'),
                   (input_dim, 'sigmoid')):
    decoder = Dense(units, activation=act)(decoder)

# Tie input and reconstruction together into one trainable model and print
# its layer-by-layer summary.
autoencoder = Model(inputs=input_layer, outputs=decoder)
autoencoder.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_13 (InputLayer)        (None, 528)               0         
_________________________________________________________________
dense_61 (Dense)             (None, 264)               139656    
_________________________________________________________________
dense_62 (Dense)             (None, 52)                13780     
_________________________________________________________________
dense_63 (Dense)             (None, 10)                530       
_________________________________________________________________
dense_64 (Dense)             (None, 52)                572       
_________________________________________________________________
dense_65 (Dense)             (None, 264)               13992     
_________________________________________________________________
dense_66 (Dense)             (None, 528)               139920    
Total para

In [79]:
# Training hyper-parameters.
nb_epoch = 10
batch_size = 32

# Callbacks: write the model to model.h5 (save_best_only=True) and log the
# run for TensorBoard under ./logs.
checkpointer = ModelCheckpoint(filepath="model.h5", verbose=0, save_best_only=True)
tensorboard = TensorBoard(log_dir='./logs', histogram_freq=0, write_graph=True, write_images=True)

# NOTE(review): 'accuracy' is tracked next to a mean-squared-error loss on
# continuous reconstruction targets — confirm the metric is meaningful here.
autoencoder.compile(optimizer='adam', loss='mean_squared_error', metrics=['accuracy'])

# The network is trained to reproduce its own input, hence x_tr is passed as
# both features and target (likewise x_te for validation).
fit_result = autoencoder.fit(
    x_tr, x_tr,
    epochs=nb_epoch,
    batch_size=batch_size,
    shuffle=True,
    validation_data=(x_te, x_te),
    verbose=1,
    callbacks=[checkpointer, tensorboard],
)
# Keep only the recorded per-epoch history.
history = fit_result.history

Train on 43040 samples, validate on 10761 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


#### h2o deeplearning module with auto-encoder

In [80]:
import h2o
from h2o.estimators.deeplearning import H2OAutoEncoderEstimator
# Connect to an H2O instance, starting a local server when none is running.
h2o.init(
    nthreads=-1,
    enable_assertions=False,
)

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: java version "1.8.0_162"; Java(TM) SE Runtime Environment (build 1.8.0_162-b12); Java HotSpot(TM) 64-Bit Server VM (build 25.162-b12, mixed mode)
  Starting server from /Users/weixu1/anaconda3/lib/python3.6/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/hs/84cnr0797g76c094s5594s103kgn2m/T/tmpf8ndoo2w
  JVM stdout: /var/folders/hs/84cnr0797g76c094s5594s103kgn2m/T/tmpf8ndoo2w/h2o_weixu1_started_from_python.out
  JVM stderr: /var/folders/hs/84cnr0797g76c094s5594s103kgn2m/T/tmpf8ndoo2w/h2o_weixu1_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321... successful.


0,1
H2O cluster uptime:,02 secs
H2O cluster version:,3.16.0.4
H2O cluster version age:,5 months and 15 days !!!
H2O cluster name:,H2O_from_python_weixu1_zspwhf
H2O cluster total nodes:,1
H2O cluster free memory:,3.556 Gb
H2O cluster total cores:,8
H2O cluster allowed cores:,8
H2O cluster status:,"accepting new members, healthy"
H2O connection url:,http://127.0.0.1:54321


In [81]:
# Upload the training and validation splits to the H2O cluster as H2OFrames.
x_tr_h2o, x_te_h2o = (h2o.H2OFrame(split) for split in (x_tr, x_te))

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [84]:
# Bottleneck width for the H2O auto-encoder.
# NOTE(review): this rebinds the `encoding_dim` used by the Keras model above;
# re-running the Keras cells afterwards would pick up this value.
encoding_dim = 6

# Symmetric hidden layout: 32 -> encoding_dim -> 32.
aec_params = dict(
    activation="tanh",
    autoencoder=True,
    hidden=[32, encoding_dim, 32],
    sparse=True,
    max_w2=5.0,
    epochs=10,
    seed=0,
)
m_aec = H2OAutoEncoderEstimator(**aec_params)

In [86]:
# Predictors are referenced by column position: 0 .. (number of columns - 1).
feature_idx = list(range(x_tr.shape[1]))

# Fit on the training frame and score against the validation frame.
m_aec.train(
    x=feature_idx,
    training_frame=x_tr_h2o,
    validation_frame=x_te_h2o,
    verbose=False,
)

deeplearning Model Build progress: |██████████████████████████████████████| 100%


In [100]:
# Display the scoring history recorded for m_aec (training/validation RMSE and
# MSE per scoring event); left as the cell's last expression so it renders.
m_aec.score_history()

Unnamed: 0,Unnamed: 1,timestamp,duration,training_speed,epochs,iterations,samples,training_rmse,training_mse,validation_rmse,validation_mse
0,,2018-07-01 13:43:29,0.817 sec,0.00000 obs/sec,0.0,0,0.0,0.181611,0.032983,0.180108,0.032439
1,,2018-07-01 13:43:36,8.284 sec,12056 obs/sec,2.0,2,86080.0,0.154677,0.023925,0.154496,0.023869
2,,2018-07-01 13:43:44,15.795 sec,12025 obs/sec,4.0,4,172160.0,0.155117,0.024061,0.154945,0.024008
3,,2018-07-01 13:43:51,23.320 sec,12006 obs/sec,6.0,6,258240.0,0.155098,0.024055,0.154946,0.024008
4,,2018-07-01 13:43:59,30.816 sec,12008 obs/sec,8.0,8,344320.0,0.156313,0.024434,0.15612,0.024374
5,,2018-07-01 13:44:07,38.556 sec,11928 obs/sec,10.0,10,430400.0,0.155743,0.024256,0.155613,0.024216
6,,2018-07-01 13:44:07,38.900 sec,11925 obs/sec,10.0,10,430400.0,0.154677,0.023925,0.154496,0.023869
