In [100]:
from __future__ import absolute_import, division, print_function

import pandas as pd
import numpy as np
import sklearn as sk
import os
import tensorflow as tf
from tensorflow import keras
import seaborn as sns
# %load_ext autoreload
# %autoreload 2

In [3]:
# Load the preprocessed splits from the .npz archive in the working directory.
# Later cells look individual arrays up by key, e.g. data['x_train'].
data = np.load("processed_data.npz")

In [4]:
# List the names of the arrays stored in the archive.
data.files

['x_train', 'y_train', 'x_valid', 'y_valid', 'x_test', 'y_test']

In [127]:
# Shape of the training inputs; the model cell derives input_dim as shape[1] - 1.
data['x_train'].shape

(153582, 47)

In [128]:
# Shape of the training targets, for comparison with the x_train shape.
data['y_train'].shape

(153582, 46)

In [129]:
# Autoencoder dimensions: the network consumes every column of x_train except
# the last one and squeezes it through a bottleneck of `encoding_dim` units.
encoding_dim = 10
input_dim = data['x_train'].shape[1] - 1

compression_factor = input_dim / float(encoding_dim)
print("Compression factor: %s" % compression_factor)

autoencoder = tf.keras.Sequential()

# Encoder: input_dim -> 4x -> 2x -> 1x encoding_dim, ReLU throughout.
autoencoder.add(tf.keras.layers.Dense(4 * encoding_dim, activation='relu',
                                      input_shape=(input_dim,)))
autoencoder.add(tf.keras.layers.Dense(2 * encoding_dim, activation='relu'))
autoencoder.add(tf.keras.layers.Dense(encoding_dim, activation='relu'))

# Decoder: mirrors the encoder widths back up to input_dim.
# NOTE(review): the sigmoid output presumably expects features scaled to
# [0, 1] -- confirm against the preprocessing behind processed_data.npz.
autoencoder.add(tf.keras.layers.Dense(2 * encoding_dim, activation='relu'))
autoencoder.add(tf.keras.layers.Dense(4 * encoding_dim, activation='relu'))
autoencoder.add(tf.keras.layers.Dense(input_dim, activation='sigmoid'))

autoencoder.summary()

Compression factor: 4.6
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_58 (Dense)             (None, 40)                1880      
_________________________________________________________________
dense_59 (Dense)             (None, 20)                820       
_________________________________________________________________
dense_60 (Dense)             (None, 10)                210       
_________________________________________________________________
dense_61 (Dense)             (None, 20)                220       
_________________________________________________________________
dense_62 (Dense)             (None, 40)                840       
_________________________________________________________________
dense_63 (Dense)             (None, 46)                1886      
Total params: 5,856
Trainable params: 5,856
Non-trainable params: 0
__________________________________________________

In [130]:
# Build a stand-alone encoder that reuses (shares weights with) the first
# three layers of the autoencoder, i.e. everything up to the bottleneck.
input_img = tf.keras.layers.Input(shape=(input_dim,))
encoder_layer1, encoder_layer2, encoder_layer3 = autoencoder.layers[:3]

# Chain the shared layers over the fresh input tensor, in order.
latent = input_img
for layer in (encoder_layer1, encoder_layer2, encoder_layer3):
    latent = layer(latent)

encoder = tf.keras.Model(input_img, latent)

encoder.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_9 (InputLayer)         (None, 46)                0         
_________________________________________________________________
dense_58 (Dense)             (None, 40)                1880      
_________________________________________________________________
dense_59 (Dense)             (None, 20)                820       
_________________________________________________________________
dense_60 (Dense)             (None, 10)                210       
Total params: 2,910
Trainable params: 2,910
Non-trainable params: 0
_________________________________________________________________


In [131]:
# Train the autoencoder to reconstruct its own input: the same feature matrix
# is passed as both `x` and `y`. The last column of each split is dropped so
# the width matches `input_dim` (x_train.shape[1] - 1) used to build the model.
train_features = data['x_train'][:, :-1]
valid_features = data['x_valid'][:, :-1]

# Hand the optimizer instance to compile(). It used to be created and then
# ignored in favour of the string 'adam', so any tuning of the instance
# (e.g. a learning rate) would silently have had no effect.
optimizer = keras.optimizers.Adam()
autoencoder.compile(optimizer=optimizer, loss='mean_squared_error')

# fit() stays the last expression so the cell still displays the History object.
autoencoder.fit(train_features, train_features,
                epochs=10,
                shuffle=True,
                validation_data=(valid_features, valid_features))

Train on 153582 samples, validate on 19527 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras._impl.keras.callbacks.History at 0x12cce6748>

In [132]:
def get_embedded(not_embedded_data, model=None):
    """Replace all but the last column of a 2-D array with its encoding.

    Parameters
    ----------
    not_embedded_data : array-like, 2-D
        Rows are samples. Every column except the last is passed to the
        encoder; the last column is carried through unchanged.
    model : object with a ``predict`` method, optional
        Encoder used to produce the embedding. When omitted, the
        notebook-level ``encoder`` is used, so existing one-argument calls
        behave as before.

    Returns
    -------
    numpy.ndarray
        The ``predict`` output with the original last column appended as one
        extra trailing column.
    """
    if model is None:
        # Fall back to the global encoder defined earlier in the notebook.
        model = encoder
    samples = np.asarray(not_embedded_data)
    embedded = model.predict(samples[:, :-1])
    # Reshape to a column so it can be joined to the embedding along axis 1.
    passthrough = samples[:, -1].reshape(embedded.shape[0], 1)
    return np.concatenate((embedded, passthrough), axis=1)

In [141]:
# Training split: x goes through get_embedded (its last column is kept as-is),
# while y is encoded in full.
train_inputs = data['x_train']
train_targets = data['y_train']
x_train_e = get_embedded(train_inputs)
y_train_e = encoder.predict(train_targets)

In [142]:
# Validation split: x goes through get_embedded (its last column is kept
# as-is), while y is encoded in full.
valid_inputs = data['x_valid']
valid_targets = data['y_valid']
x_valid_e = get_embedded(valid_inputs)
y_valid_e = encoder.predict(valid_targets)

In [145]:
# Test split: x goes through get_embedded (its last column is kept as-is),
# while y is encoded in full.
test_inputs = data['x_test']
test_targets = data['y_test']
x_test_e = get_embedded(test_inputs)
y_test_e = encoder.predict(test_targets)

In [146]:
# Persist the encoded splits in one archive; each key matches the name of the
# variable it stores.
embedded_splits = {
    "x_train_e": x_train_e,
    "y_train_e": y_train_e,
    "x_valid_e": x_valid_e,
    "y_valid_e": y_valid_e,
    "x_test_e": x_test_e,
    "y_test_e": y_test_e,
}
np.savez("embedded_data.npz", **embedded_splits)