In [None]:
import copy
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as graph
import seaborn as sns

from multiprocessing import cpu_count

from rosey.helpers import vec_to_array

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

from xgboost import XGBClassifier

import keras
import keras.losses as klosses
import keras.optimizers as kopt
from keras.models import Model
from keras.layers import Input, Dense, Dropout
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from keras_tqdm import TQDMNotebookCallback
# Optional backend/GPU selection, left disabled. Uncomment these IPython magics
# to set the environment variables below.
# NOTE(review): these lines come after `import keras`; the backend is presumably
# chosen when keras is imported, so they may need to move above that import to
# have any effect -- confirm before enabling.
# %env KERAS_BACKEND=theano
# %env THEANO_FLAGS=device=cuda3,floatX=float32,optimizer=fast_run

# Apply the 'fivethirtyeight' matplotlib style to the figures drawn below
graph.style.use('fivethirtyeight')

def plot_number(xi, yi='', shape=(28, 28)):
    """Draw a flattened greyscale image as a small thumbnail without axes.

    Parameters
    ----------
    xi : array-like
        Pixel values. They are reshaped to ``shape`` before plotting, so ``xi``
        must hold exactly ``shape[0] * shape[1]`` values (a ``(1, n)`` row
        works as well as a flat vector).
    yi : object, optional
        Value shown as the figure title (formatted with ``str.format``).
        Defaults to an empty title.
    shape : tuple of int, optional
        ``(rows, cols)`` of the image. Defaults to ``(28, 28)``, the size this
        function always used before the parameter existed.

    Raises
    ------
    ValueError
        If ``xi`` cannot be reshaped to ``shape``.
    """
    graph.figure(figsize=(1, 1))
    graph.title('{}'.format(yi))
    # asarray lets callers pass plain lists as well as ndarrays
    graph.imshow(np.asarray(xi).reshape(shape), cmap='Greys')
    # Ticks and grid lines only add clutter on a thumbnail this small
    graph.xticks([])
    graph.yticks([])
    graph.grid(False)
    graph.show()

In [None]:
# Fixed seed so the train/test split -- and therefore the row positions that
# later cells index into (e.g. x_test[1013]) -- is the same on every run
SEED = 42

# Load data
df = pd.read_csv('data/digit-recognizer/train.csv')

# Separate the target column from the remaining feature columns
y = df.pop('label')
# Rescale every feature column to the [0, 1] range
# NOTE(review): the scaler is fitted on all rows before the split, so test rows
# influence the scaling -- confirm this is acceptable for the experiment.
x = MinMaxScaler().fit_transform(df.values)
print(y.shape, x.shape)

# Hold out 25% of the rows; outputs come back in the order the inputs were passed
y_train, y_test, x_train, x_test = train_test_split(
    y, x, test_size=0.25, random_state=SEED
)
for part in (y_train, y_test, x_train, x_test):
    print(part.shape)

In [None]:
# Preview a few individual digits, picked by their row position in `x`
for row_idx in (0, 39370, 17521):
    plot_number(x[row_idx, :])

# Simple Autoencoder

In [None]:
# Simple Autoencoder
input_layer = Input(shape=(x.shape[1],))

# Bottleneck: 100 ELU units that hold the latent representation
encoder_layer = Dense(100, activation='elu')(input_layer)

# Decoder: one sigmoid unit per input feature
output_layer = Dense(x.shape[1], activation='sigmoid')(encoder_layer)

# Create models. Both are built from the same layer objects, so `encoder`
# exposes the bottleneck of whatever `autoencoder` has learned.
autoencoder = Model(input_layer, output_layer)
encoder = Model(input_layer, encoder_layer)

# summary() prints the table itself and returns None, so it is called directly;
# wrapping it in display() would only add a stray "None" to the cell output
autoencoder.summary()

# Compile
autoencoder.compile(
    loss=klosses.binary_crossentropy,
    optimizer=kopt.RMSprop(),
)

In [None]:
# Baseline before any training: first three training digits and what the
# untrained autoencoder makes of them
for row in range(3):
    original = vec_to_array(x_train[row, :]).T
    plot_number(original)

    reconstruction = autoencoder.predict(original)
    plot_number(reconstruction)

In [None]:
%%time
# Fit model! (n*100) epochs
# The checkpoint callback is built once so that its record of the best
# `val_loss` carries over from one cycle to the next. A fresh ModelCheckpoint
# starts with no best value, so creating one per cycle would let the first
# epoch of every cycle overwrite the saved file, even with a worse model.
checkpoint = ModelCheckpoint('data/model.ckp', monitor='val_loss', save_best_only=True)

for i in range(5):
    print(f'Epoch Cycle {i+1}')

    # Check fit: first five training digits next to their current reconstruction
    for i_img in range(5):
        number_i = vec_to_array(x_train[i_img, :]).T

        plot_number(number_i, 'in')
        plot_number(autoencoder.predict(number_i), 'out')

    # Train: the input is also the target; the test split serves as validation data
    hist = autoencoder.fit(
        x_train, x_train,
        epochs=100, validation_data=(x_test, x_test), batch_size=4096,
        callbacks=[
            # Early stopping and LR scheduling are deliberately rebuilt per cycle
            EarlyStopping(monitor='val_loss', patience=15),
            ReduceLROnPlateau(monitor='val_loss', patience=5, factor=0.5, min_lr=1e-6, verbose=1),
            checkpoint
        ],
        shuffle=True,
        verbose=0
    )

    # Summary of performance: loss curves for this cycle, labelled so the
    # figures of different cycles can be told apart
    graph.plot(hist.history['loss'], label='Train Score')
    graph.plot(hist.history['val_loss'], label='Validation Score')
    graph.title(f'Epoch Cycle {i+1}')
    graph.xlabel('Epoch')
    graph.ylabel('Loss')
    graph.legend()
    graph.show()

In [None]:
# After training: the same first three training digits and their reconstructions
for row in range(3):
    original = vec_to_array(x_train[row, :]).T
    plot_number(original)

    reconstruction = autoencoder.predict(original)
    plot_number(reconstruction)

In [None]:
# Follow one digit from the test split through the network:
# input -> latent representation -> reconstruction
new_number = vec_to_array(x_test[1013, :]).T

plot_number(new_number)

sns.heatmap(new_number.reshape((28, 28)), square=True)
graph.show()

print('Representation')
sns.heatmap(encoder.predict(new_number))
graph.show()

print('')
# Predict once and reuse the result for both the thumbnail and the heatmap
reconstruction = autoencoder.predict(new_number)
plot_number(reconstruction)

sns.heatmap(reconstruction.reshape((28, 28)), square=True)
graph.show()

In [None]:
# Reconstructions for the first ten digits of the test split
for row in range(10):
    original = vec_to_array(x_test[row, :]).T
    plot_number(original, 'Input')

    reconstruction = autoencoder.predict(original)
    plot_number(reconstruction, 'Output')

# Predicting Numbers

I'm going to use the encoder's latent representations as the features for predicting which digit each image shows.

In [None]:
# Encode both splits with the encoder; the "l_" prefix stands for latent representation
l_train = encoder.predict(x_train)
l_test = encoder.predict(x_test)

print(l_train.shape, l_test.shape)

In [None]:
%%time
# Multinomial logistic regression on the latent features; the L2 penalty
# strength is chosen by 2-fold cross-validation over 25 candidate values of C
logit = LogisticRegressionCV(
    Cs=25,
    cv=2,
    penalty='l2',
    multi_class='multinomial',
    n_jobs=-1,
)
logit.fit(l_train, y_train)

In [None]:
%%time
# Boosted trees on the same latent features, with as many parallel jobs as
# cpu_count() reports
n_cores = cpu_count()
gbm = XGBClassifier(n_estimators=2000, n_jobs=n_cores)
gbm.fit(l_train, y_train)

In [None]:
# Scores on the test split's latent features
# ('Raw' is the logistic regression, 'XGB' the boosted trees)
for tag, clf in (('Raw', logit), ('XGB', gbm)):
    print(f'{tag:<6} = {clf.score(l_test, y_test)}')