# Dimensionality reduction with autoencoders
- Autoencoders can be used to reduce the dimensionality of a dataset
- They can be more expressive than linear methods such as PCA because they can learn nonlinear transformations (i.e., through nonlinear activation functions)

In [1]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn import datasets
from keras.layers import Input, Dense
from keras.models import Model

Using TensorFlow backend.


In [2]:
# Load scikit-learn's bundled digits dataset; its `.images` and `.target`
# attributes are read in the following cells.
data = datasets.load_digits()

In [7]:
# Class labels, one per image.
y_data = data.target

# Flatten every image into a single 64-element feature row.
X_data = np.reshape(data.images, (data.images.shape[0], 64))

In [8]:
# Scale pixel intensities into the interval [0, 1] by dividing by 16.
# The scaled matrix is derived from `data.images` instead of rescaling `X_data`
# in place, so re-running this cell cannot divide the data a second time.
X_data = data.images.reshape(data.images.shape[0], 64) / 16.

# Hold out 30% of the instances; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size = 0.3, random_state = 777)

In [9]:
# Show the shape of each split, one per line: features first, then labels.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep = '\n')

(1257, 64)
(540, 64)
(1257,)
(540,)


In [10]:
# Coding dimension: the number of units in the autoencoder's hidden "code" layer,
# i.e. the dimensionality that each 64-feature instance is reduced to.
code_dim = 16

In [11]:
def auto_encoder_model(input_dim = None, coding_dim = None):
    """Build a one-hidden-layer autoencoder plus standalone encoder and decoder models.

    The three returned models share the same two Dense layers, so training the
    autoencoder also trains the encoder and the decoder.

    Parameters
    ----------
    input_dim : int, optional
        Number of input (and reconstructed output) features.
        Defaults to ``X_train.shape[1]``.
    coding_dim : int, optional
        Number of units in the hidden "code" layer, i.e. the reduced dimensionality.
        Defaults to the notebook-level ``code_dim``.

    Returns
    -------
    (encoder, decoder, auto_encoder)
        ``encoder`` maps inputs to codes, ``decoder`` maps codes to reconstructions,
        and ``auto_encoder`` (compiled with Adam and binary cross-entropy) maps
        inputs to reconstructions.
    """
    # Fall back to the notebook globals so the zero-argument call keeps working.
    if input_dim is None:
        input_dim = X_train.shape[1]
    if coding_dim is None:
        coding_dim = code_dim

    inputs = Input(shape = (input_dim,), name = 'input')                      # input layer
    code = Dense(coding_dim, activation = 'relu', name = 'code')(inputs)      # hidden layer => represents "codes"
    # Each output unit reconstructs one feature scaled to [0, 1] independently, which is
    # what binary cross-entropy expects. Softmax would force the outputs to sum to 1
    # across all features, making a faithful reconstruction impossible.
    outputs = Dense(input_dim, activation = 'sigmoid', name = 'output')(code) # output layer

    auto_encoder = Model(inputs = inputs, outputs = outputs)

    encoder = Model(inputs = inputs, outputs = code)

    # Reuse the trained output layer on a fresh input so codes can be decoded directly.
    decoder_input = Input(shape = (coding_dim,))
    output_layer = auto_encoder.layers[-1]
    decoder = Model(inputs = decoder_input, outputs = output_layer(decoder_input))

    auto_encoder.compile(optimizer = 'adam', loss = 'binary_crossentropy')
    return encoder, decoder, auto_encoder

In [12]:
encoder, decoder, auto_encoder = auto_encoder_model()

In [14]:
%%time
# Train the autoencoder to reproduce its own input; the test split only supplies validation loss.
auto_encoder.fit(
    X_train, X_train,
    epochs = 100,
    batch_size = 50,
    validation_data = (X_test, X_test),
    verbose = 0,
)

Wall time: 3.68 s


<keras.callbacks.History at 0x165cf8d0>

In [19]:
# Project both splits into the code space using the encoder half of the model.
training_data_reduced, test_data_reduced = (
    encoder.predict(split) for split in (X_train, X_test)
)

Each data instance is reduced to a dimensionality of 16 (the coding dimension).

In [21]:
# First instance of the reduced training data, then first instance of the reduced test data.
print(training_data_reduced[0], test_data_reduced[0], sep = '\n')

[ 3.80530763  1.40662622  2.70148444  4.4706459   1.58172095  3.037673
  2.62502003  0.76960731  2.0610199   1.76251006  3.91513109  3.0303731
  3.48311281  1.46974468  3.43927789  3.81914759]
[ 2.02643633  2.05360436  1.0553565   3.56827545  3.66347456  3.34595394
  3.43908119  2.14790154  1.23506999  2.21335554  1.63356566  1.7646265
  2.78458714  1.40197086  1.6926899   2.78371859]


In [22]:
# Shapes of the reduced training and test data, one per line.
print(training_data_reduced.shape, test_data_reduced.shape, sep = '\n')

(1257, 16)
(540, 16)
