In [1]:
from keras import backend as K
from keras.objectives import mse
from keras.optimizers import Adam
from keras.layers import Dense, Lambda, Input
from keras.models import Model
import matplotlib.pyplot as plt
import numpy as np
from pyspark.ml.linalg import Vectors
from pyspark.sql.types import FloatType


In [2]:
# Read the Acxiom feature array from DBFS and drop its length-1 axis 1 in one step.
# NOTE(review): despite the .npz suffix the result is used as a single ndarray — confirm the file format.
with open("/dbfs/mnt/ddda/ml-bme-scoring/tmp/learningday_june8/acxiom_features.npz", "rb") as in_file:
  training_data = np.load(in_file).squeeze(1)

In [3]:
def build_autoencoder_model(input_dims, hidden_layer_dims, z_dims, output_activation='linear', hidden_activation='relu'):
  """Build a dense autoencoder as an encoder, a decoder and a compiled trainer.

  # Arguments:
      input_dims (int): width of each input vector
      hidden_layer_dims (sequence of int): encoder hidden-layer widths, in order;
          the decoder uses the same widths in reverse
      z_dims (int): width of the latent code
      output_activation (str): activation of the reconstruction layer
      hidden_activation (str): activation of the hidden layers and of the latent layer
  # Returns:
      (encoder, decoder, trainer): `encoder` maps inputs to z, `decoder` maps z to
      reconstructions, and `trainer` chains the two and is compiled with
      loss='mse' and optimizer='adam'
  """
  x = enc_in = Input(shape=(input_dims,))
  for layer_dims in hidden_layer_dims:
    x = Dense(layer_dims, activation=hidden_activation)(x)
  z = Dense(z_dims, activation=hidden_activation)(x)

  x = dec_in = Input(shape=(z_dims,))
  # Mirror the encoder: every decoder layer takes its width from the reversed
  # list (previously the loop variable was misspelled, so each decoder layer
  # reused the last encoder width instead).
  for layer_dims in reversed(hidden_layer_dims):
    x = Dense(layer_dims, activation=hidden_activation)(x)
  x_recon = Dense(input_dims, activation=output_activation)(x)

  encoder = Model(inputs=[enc_in], outputs=[z])
  decoder = Model(inputs=[dec_in], outputs=[x_recon])

  # The trainer routes a fresh input through both sub-models, so the encoder
  # and decoder share weights with it and remain usable on their own.
  trainer_in = Input(shape=(input_dims,))
  trainer_out = decoder(encoder(trainer_in))
  trainer = Model(inputs=[trainer_in], outputs=[trainer_out])
  trainer.compile(loss='mse', optimizer='adam')

  return encoder, decoder, trainer


def sampling(args):
  """Draw a latent vector via the reparameterization trick.

  Unit-Gaussian noise is scaled by exp(0.5 * log-variance) and added to the
  mean. The noise is multiplied by the Keras learning-phase flag, so when that
  flag is 0 the result is just the mean.

  # Arguments:
      args (tensor): mean and log of variance of Q(z|X)
  # Returns:
      z (tensor): sampled latent vector
  """
  mu, log_var = args
  # Batch size comes from the runtime shape, latent width from the static shape.
  noise_shape = (K.shape(mu)[0], K.int_shape(mu)[1])
  # random_normal defaults to mean=0 and std=1.0
  unit_noise = K.random_normal(shape=noise_shape)
  noise_gate = K.cast(K.learning_phase(), K.floatx())
  gated_noise = unit_noise * noise_gate
  sigma = K.exp(0.5 * log_var)
  return mu + sigma * gated_noise
  

def build_vae_model(input_dims, hidden_layer_dims, z_dims, output_activation='linear', hidden_activation='relu', z_activation='linear'):
  """Build a dense variational autoencoder: encoder, decoder and compiled trainer.

  # Arguments:
      input_dims (int): width of each input vector
      hidden_layer_dims (sequence of int): encoder hidden-layer widths, in order;
          the decoder uses the same widths in reverse
      z_dims (int): width of the latent code
      output_activation (str): activation of the reconstruction layer
      hidden_activation (str): activation of the hidden layers
      z_activation (str): activation of the `z_mu` and `z_log_var` layers
  # Returns:
      (encoder, decoder, trainer): `encoder` maps inputs to a sampled z, `decoder`
      maps z to reconstructions, and `trainer` chains the two and is compiled
      with the VAE loss (reconstruction MSE + KL term) and Adam(lr=0.0001)
  """
  x = enc_in = Input(shape=(input_dims,))
  for layer_dims in hidden_layer_dims:
    x = Dense(layer_dims, activation=hidden_activation)(x)
  z_mu = Dense(z_dims, activation=z_activation, name='z_mu')(x)
  z_log_var = Dense(z_dims, activation=z_activation, name='z_log_var')(x)
  z = Lambda(sampling, output_shape=(z_dims,))([z_mu, z_log_var])

  x = dec_in = Input(shape=(z_dims,))
  # Mirror the encoder: every decoder layer takes its width from the reversed
  # list (previously the loop variable was misspelled, so each decoder layer
  # reused the last encoder width instead).
  for layer_dims in reversed(hidden_layer_dims):
    x = Dense(layer_dims, activation=hidden_activation)(x)
  x_recon = Dense(input_dims, activation=output_activation)(x)

  encoder = Model(inputs=[enc_in], outputs=[z])
  decoder = Model(inputs=[dec_in], outputs=[x_recon])

  # The trainer is built on the encoder's own input (not a fresh Input) because
  # the loss below closes over z_mu / z_log_var, which are computed from enc_in.
  trainer_out = decoder(encoder(enc_in))
  trainer = Model(inputs=[enc_in], outputs=[trainer_out])

  def vae_loss(x, x_recon):
    """Reconstruction MSE plus the KL term built from z_mu and z_log_var."""
    recon_loss = mse(x, x_recon)

    # -0.5 * sum(1 + log_var - mu^2 - exp(log_var)), summed over the latent axis.
    kl_loss = 1 + z_log_var - K.square(z_mu) - K.exp(z_log_var)
    kl_loss = K.sum(kl_loss, axis=-1)
    kl_loss *= -0.5
    return K.mean(recon_loss + kl_loss)

  optimizer = Adam(lr=0.0001)
  trainer.compile(loss=vae_loss, optimizer=optimizer)
  return encoder, decoder, trainer


def build_rel_vae_model(input_dims, hidden_layer_dims, z_dims, output_activation='linear',
                        hidden_activation='relu', z_activation='linear', alpha=0.5):
  """Build a dense VAE whose loss adds a "relational" term on feature co-moments.

  # Arguments:
      input_dims (int): width of each input vector
      hidden_layer_dims (sequence of int): encoder hidden-layer widths, in order;
          the decoder uses the same widths in reverse
      z_dims (int): width of the latent code
      output_activation (str): activation of the reconstruction layer
      hidden_activation (str): activation of the hidden layers
      z_activation (str): activation of the `z_mu` and `z_log_var` layers
      alpha (float): weight of the relational term; the reconstruction MSE is
          weighted by (1 - alpha)
  # Returns:
      (encoder, decoder, trainer): `encoder` maps inputs to a sampled z, `decoder`
      maps z to reconstructions, and `trainer` chains the two and is compiled
      with the relational VAE loss and Adam(lr=0.001)
  """
  x = enc_in = Input(shape=(input_dims,))
  for layer_dims in hidden_layer_dims:
    x = Dense(layer_dims, activation=hidden_activation)(x)
  z_mu = Dense(z_dims, activation=z_activation, name='z_mu')(x)
  z_log_var = Dense(z_dims, activation=z_activation, name='z_log_var')(x)
  z = Lambda(sampling, output_shape=(z_dims,))([z_mu, z_log_var])

  x = dec_in = Input(shape=(z_dims,))
  # Mirror the encoder: every decoder layer takes its width from the reversed
  # list (previously the loop variable was misspelled, so each decoder layer
  # reused the last encoder width instead).
  for layer_dims in reversed(hidden_layer_dims):
    x = Dense(layer_dims, activation=hidden_activation)(x)
  x_recon = Dense(input_dims, activation=output_activation)(x)

  encoder = Model(inputs=[enc_in], outputs=[z])
  decoder = Model(inputs=[dec_in], outputs=[x_recon])

  # The trainer is built on the encoder's own input (not a fresh Input) because
  # the loss below closes over z_mu / z_log_var, which are computed from enc_in.
  trainer_out = decoder(encoder(enc_in))
  trainer = Model(inputs=[enc_in], outputs=[trainer_out])

  def rel_vae_loss(x, x_recon):
    """KL term + (1 - alpha) * reconstruction MSE + alpha * relational term."""
    recon_loss = mse(x, x_recon)

    # Relational term: compare the feature-by-feature product matrices
    # (transpose(x) . x) of the batch and of its reconstruction, each divided
    # by (product of non-batch dims * batch size).
    batch_dims = K.shape(x)[0]
    dims = K.int_shape(x_recon)[1:]
    dims_prod = np.prod(dims) * K.cast(batch_dims, K.floatx())
    r_x = K.dot(K.transpose(x), x) / dims_prod
    r_recon = K.dot(K.transpose(x_recon), x_recon) / dims_prod
    relational_loss = K.mean(K.square(r_x - r_recon))

    # -0.5 * sum(1 + log_var - mu^2 - exp(log_var)), summed over the latent axis.
    kl_loss = 1 + z_log_var - K.square(z_mu) - K.exp(z_log_var)
    kl_loss = K.sum(kl_loss, axis=-1)
    kl_loss *= -0.5

    return K.mean(kl_loss + (1. - alpha) * recon_loss) + alpha * relational_loss

  optimizer = Adam(lr=0.001)
  trainer.compile(loss=rel_vae_loss, optimizer=optimizer)
  return encoder, decoder, trainer


In [4]:
# Configure and build the model; the feature width is read from the loaded data.
input_dims = training_data.shape[-1]
# The next two assignments are a manual toggle: the second one wins.
hidden_layer_dims = ()  # shallow VAE
hidden_layer_dims = (100,)  # deeper VAE
z_dims = 100
alpha = 0.5  # weight of the relational loss term in build_rel_vae_model
encoder, decoder, trainer = build_rel_vae_model(input_dims, hidden_layer_dims, z_dims, alpha=alpha)
# Alternative: plain VAE without the relational term.
#encoder, decoder, trainer = build_vae_model(input_dims, hidden_layer_dims, z_dims)
encoder.summary()
decoder.summary()

In [5]:
# Train on the first 100000 rows as both input and target (reconstruction);
# 20% of them are held out for validation and progress output is suppressed.
x = training_data[:100000]
history = trainer.fit(x, x, validation_split=0.2, verbose=False, epochs=10)

In [6]:
# Loss curves per epoch: validation loss (default colour) and training loss (red).
plt.cla()
plt.ylim(0, 10)  # fixed y-range; values above 10 are cut off
plt.plot(history.history['val_loss'])
plt.plot(history.history['loss'], color='red')
# `display` is not defined in this notebook — presumably the runtime's figure renderer.
display(plt.gcf())

In [7]:
# Show the raw per-epoch metrics recorded by trainer.fit.
history.history

In [8]:
# Encode the training rows; `xt` and `customer_embeddings` are two names for the same array.
xt = customer_embeddings = encoder.predict(x)

In [9]:
# Scatter of the first two embedding columns only; any further columns of `xt`
# are not shown.
plt.clf()
plt.title('2D projection of Acxiom customer data')
# Optional manual axis limits, currently disabled:
# hw = 0.4
# plt.xlim(-0.3, 0.3)
#plt.ylim(-hw, 0.2)
plt.scatter(xt[:, 0], xt[:, 1])

display(plt.gcf())

In [11]:
# Check the embedding dimensions (rows, latent width).
customer_embeddings.shape

In [12]:
from sklearn.decomposition import PCA
# Fit a 2-component PCA on the embeddings so they can be plotted in a plane.
pca = PCA(2)
pca.fit(xt)

In [13]:
# Project the embeddings onto the two fitted principal components and plot them.
x_pca = pca.transform(xt)

plt.clf()
hw = 5  # half-width of the square plot window
plt.xlim(-hw, hw)
plt.ylim(-hw, hw)
plt.scatter(x_pca[:, 0], x_pca[:, 1])
plt.title('2D projection of Acxiom customer data using PCA')

display(plt.gcf())

In [14]:
from pyspark.mllib.clustering import KMeans, KMeansModel

# One RDD element per embedding row. `sc` is not defined in this notebook —
# presumably the SparkContext provided by the runtime.
data = sc.parallelize(list(xt))
# 20 clusters, randomly initialised centres, at most 10 iterations.
clusters = KMeans.train(
  data,
  20,
  maxIterations=10,
  initializationMode="random",
)

In [15]:
# Overlay the k-means centres (yellow) on the embedding scatter, using only the
# first two coordinates of both the points and the centres.
plt.clf()
plt.xlim(-0.5, 0.5)
plt.ylim(-0.5, 0.5)
plt.scatter(xt[:, 0], xt[:, 1])

cs = np.array(clusters.centers)
plt.scatter(cs[:, 0], cs[:, 1], color='yellow')

display(plt.gcf())

In [16]:
from sklearn.svm import OneClassSVM

# Fit a one-class RBF SVM on the full embedding matrix (every column of xt).
svm = OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1)
svm.fit(xt)

In [17]:
hw = 0.5  # half-width of the square plot window
grid_samples = 100
xx, yy = np.meshgrid(np.linspace(-hw, hw, grid_samples), np.linspace(-hw, hw, grid_samples))

# The SVM was fitted on every column of xt, so the 2-D grid has to be embedded
# in that space before scoring: passing bare (x, y) pairs raises a
# feature-count ValueError whenever the embedding has more than two columns.
# The remaining coordinates are held at 0, so the contour is the slice of the
# decision boundary through the plane of the first two embedding dimensions.
# With a 2-column embedding this is exactly the original grid.
# NOTE(review): 0 is used as the reference value for the other dimensions —
# confirm this is the slice you want to visualise.
grid_points = np.zeros((xx.size, xt.shape[1]))
grid_points[:, 0] = xx.ravel()
grid_points[:, 1] = yy.ravel()
Z = svm.decision_function(grid_points)
Z = Z.reshape(xx.shape)

plt.clf()
plt.xlim(-hw, hw)
plt.ylim(-hw, hw)

plt.scatter(xt[:, 0], xt[:, 1])
# Level 0 of the decision function is the boundary of the learned region.
plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors='yellow')

display(plt.gcf())

## ^^ COOL CLUSTERS, FELLOW