# CAE (Concrete Autoencoder) feature selector

In [None]:
import math

from keras import backend as K
from keras import Model
from keras.layers import Layer, Softmax, Input
from keras.callbacks import EarlyStopping
from keras.initializers import Constant, glorot_normal
from tensorflow.keras.optimizers import Adam



class ConcreteSelect(Layer):
    """Keras layer that maps an input to ``output_dim`` (soft) selected features.

    The layer owns a trainable ``logits`` matrix with one row per output and
    one column per input feature, plus a non-trainable scalar temperature.
    In the training phase each row is relaxed into a softmax over
    Gumbel-perturbed logits divided by the temperature; outside the training
    phase each row is the one-hot vector of the row's arg-max logit.
    """
    
    def __init__(self, output_dim, start_temp = 10.0, min_temp = 0.1, alpha = 0.99999, **kwargs):
        """Store the layer hyper-parameters.

        Args:
            output_dim: number of rows of the logits matrix, i.e. the width
                of the layer output.
            start_temp: initial value of the temperature weight.
            min_temp: lower bound applied to the temperature on every update.
            alpha: multiplicative factor applied to the temperature each time
                ``call`` runs the update.
            **kwargs: forwarded to ``Layer.__init__`` (e.g. ``name``).
        """
        self.output_dim = output_dim
        self.start_temp = start_temp
        # Kept as backend constants because they are combined with the
        # temperature weight inside ``call``.
        self.min_temp = K.constant(min_temp)
        self.alpha = K.constant(alpha)
        super(ConcreteSelect, self).__init__(**kwargs)
        
    def build(self, input_shape):
        """Create the scalar temperature and the selection logits.

        ``input_shape[1]`` is used as the number of input features, so the
        input is assumed to be rank 2 (batch, features).
        """
        # Scalar, non-trainable: it is changed by the explicit update in ``call``.
        self.temp = self.add_weight(name = 'temp', shape = [], initializer = Constant(self.start_temp), trainable = False)
        # One row of logits per selected output, one column per input feature.
        self.logits = self.add_weight(name = 'logits', shape = [self.output_dim, input_shape[1]], initializer = glorot_normal(), trainable = True)
        super(ConcreteSelect, self).build(input_shape)
        
    def call(self, X, training=None):
        """Return ``X`` multiplied by the transposed selection matrix.

        Also stores the selection matrix used on this call in
        ``self.selections``.
        """
        # Gumbel noise: -log(-log(u)) with u drawn uniformly from [epsilon, 1).
        # The lower bound keeps the inner log away from log(0).
        uniform = K.random_uniform(self.logits.shape, K.epsilon(), 1.0)
        gumbel = -K.log(-K.log(uniform))
        # Decay the temperature by ``alpha`` and clamp it at ``min_temp``.
        # NOTE(review): the decay is not gated on ``training``, so it may also
        # run on inference calls -- confirm whether that is intended.
        temp = K.update(self.temp, K.maximum(self.min_temp, self.temp * self.alpha))
        noisy_logits = (self.logits + gumbel) / temp
        # Relaxed (soft) selection; K.softmax is called with its default axis.
        samples = K.softmax(noisy_logits)
        
        # Hard selection: one-hot of the arg-max logit, as wide as the number
        # of logit columns.
        discrete_logits = K.one_hot(K.argmax(self.logits), self.logits.shape[1])
        
        # Soft samples in the training phase, hard one-hot rows otherwise.
        self.selections = K.in_train_phase(samples, discrete_logits, training)
        Y = K.dot(X, K.transpose(self.selections))
        
        return Y
    
    def compute_output_shape(self, input_shape):
        """Keep the first input dimension and replace the second by ``output_dim``."""
        return (input_shape[0], self.output_dim)
    
class StopperCallback(EarlyStopping):
    """EarlyStopping variant monitoring how peaked the selection logits are.

    Instead of reading a value from the training logs, the monitored quantity
    is computed from the model's ``'concrete_select'`` layer: the mean, over
    the rows of its logits, of the largest softmax probability in each row.
    ``mean_max_target`` is handed to ``EarlyStopping`` as the ``baseline``.
    """

    def __init__(self, mean_max_target = 0.998):
        """Configure the parent callback with an empty monitor name and
        infinite patience, in ``'max'`` mode."""
        self.mean_max_target = mean_max_target
        super(StopperCallback, self).__init__(
            monitor = '',
            patience = float('inf'),
            verbose = 1,
            mode = 'max',
            baseline = self.mean_max_target,
        )

    def _selection_logits(self):
        """Return the logits weight of the model's ``'concrete_select'`` layer."""
        return self.model.get_layer('concrete_select').logits

    def on_epoch_begin(self, epoch, logs = None):
        """Print the monitored value and the current temperature."""
        mean_max = self.get_monitor_value(logs)
        temperature = K.get_value(self.model.get_layer('concrete_select').temp)
        print('mean max of probabilities:', mean_max, '- temperature', temperature)

    def get_monitor_value(self, logs):
        """Return the mean of the per-row maximum softmax probability.

        ``logs`` is ignored; the value comes from the layer weights.
        """
        row_max = K.max(K.softmax(self._selection_logits()), axis = -1)
        return K.get_value(K.mean(row_max))


class ConcreteAutoencoderFeatureSelector():
    """Select ``K`` input features by training a ``ConcreteSelect`` layer.

    The model is ``inputs -> ConcreteSelect -> output_function`` and is
    trained with mean-squared-error loss and Adam. After ``fit`` the selected
    feature indices are the arg-max of each row of the selector's logits.
    """

    def __init__(self, K, output_function, num_epochs = 300, batch_size = None, learning_rate = 0.001, start_temp = 10.0, min_temp = 0.1, tryout_limit = 5):
        """Store the hyper-parameters; no model is built until ``fit``.

        Args:
            K: number of features to select (output width of the selector).
                The name shadows the Keras backend alias ``K`` inside this
                method only.
            output_function: callable applied to the selector output tensor
                to produce the model output tensor.
            num_epochs: epochs for the first training attempt; doubled after
                every attempt that does not reach the target.
            batch_size: training batch size; when ``None`` it is set in
                ``fit`` to ``max(len(X) // 256, 16)``.
            learning_rate: learning rate passed to Adam.
            start_temp: initial selector temperature.
            min_temp: final (minimum) selector temperature.
            tryout_limit: maximum number of training attempts.
        """
        self.K = K
        self.output_function = output_function
        self.num_epochs = num_epochs
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.start_temp = start_temp
        self.min_temp = min_temp
        self.tryout_limit = tryout_limit

    def _logits(self):
        """Return the logits weight of the fitted model's selector layer."""
        return self.model.get_layer('concrete_select').logits

    def fit(self, X, Y = None, val_X = None, val_Y = None):
        """Train the selector, retrying with more epochs until it converges.

        Args:
            X: training inputs; ``X.shape[1:]`` defines the model input shape.
            Y: training targets; defaults to ``X`` (reconstruction).
            val_X, val_Y: optional validation data, used only when both are
                given.

        Returns:
            ``self``. Also sets ``self.model``, ``self.concrete_select``,
            ``self.probabilities`` and ``self.indices``.
        """
        if Y is None:
            Y = X
        assert len(X) == len(Y)
        validation_data = None
        if val_X is not None and val_Y is not None:
            assert len(val_X) == len(val_Y)
            validation_data = (val_X, val_Y)

        if self.batch_size is None:
            self.batch_size = max(len(X) // 256, 16)

        num_epochs = self.num_epochs
        # Ceiling division: number of optimizer steps in one epoch.
        steps_per_epoch = (len(X) + self.batch_size - 1) // self.batch_size

        for _ in range(self.tryout_limit):

            K.set_learning_phase(1)

            inputs = Input(shape = X.shape[1:])

            # Per-step decay factor chosen so that start_temp * alpha ** steps
            # equals min_temp after num_epochs * steps_per_epoch steps.
            alpha = math.exp(math.log(self.min_temp / self.start_temp) / (num_epochs * steps_per_epoch))

            self.concrete_select = ConcreteSelect(self.K, self.start_temp, self.min_temp, alpha, name = 'concrete_select')

            selected_features = self.concrete_select(inputs)

            outputs = self.output_function(selected_features)

            self.model = Model(inputs, outputs)

            self.model.compile(Adam(self.learning_rate), loss = 'mean_squared_error')

            # summary() prints by itself and returns None, so it is not
            # wrapped in print() (which would emit a stray "None").
            self.model.summary()

            stopper_callback = StopperCallback()

            self.model.fit(X, Y, self.batch_size, num_epochs, verbose = 1, callbacks = [stopper_callback], validation_data = validation_data)

            # Mean over selector rows of the largest softmax probability in
            # each row -- the same statistic StopperCallback monitors. The
            # max must be taken per row (axis=-1); a global max would only
            # test the single most confident row.
            mean_max = K.get_value(K.mean(K.max(K.softmax(self.concrete_select.logits), axis = -1)))
            if mean_max >= stopper_callback.mean_max_target:
                break

            # Not converged: retry from scratch with twice as many epochs.
            num_epochs *= 2

        self.probabilities = K.get_value(K.softmax(self._logits()))
        self.indices = K.get_value(K.argmax(self._logits()))

        return self

    def get_indices(self):
        """Return, for each selector row, the index of its arg-max logit."""
        return K.get_value(K.argmax(self._logits()))

    def get_mask(self):
        """Return a per-feature count vector of the selected indices.

        One-hot rows are summed over the selector rows, so an entry can
        exceed 1 when several rows pick the same feature.
        """
        logits = self._logits()
        return K.get_value(K.sum(K.one_hot(K.argmax(logits), logits.shape[1]), axis = 0))

    def transform(self, X):
        """Return the selected feature columns of a 2-D array ``X``.

        Features live on the second axis (the model input shape is
        ``X.shape[1:]``), so the indices select columns, not rows.
        """
        return X[:, self.get_indices()]

    def fit_transform(self, X, y = None):
        """Fit on ``(X, y)`` (``y`` defaults to ``X`` inside ``fit``) and
        return ``transform(X)``."""
        self.fit(X, y)
        return self.transform(X)

    def get_support(self, indices = False):
        """Return the selected indices if ``indices`` is true, else the mask."""
        return self.get_indices() if indices else self.get_mask()

    def get_params(self):
        """Return the trained Keras model."""
        return self.model

# Experiment

In [None]:
import tensorflow as tf
from tensorflow import keras
from keras.datasets import mnist
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Dense, Softmax
import numpy as np

from functools import reduce


In [None]:
import torch
from torch.distributions.normal import Normal
from torch.distributions.multivariate_normal import MultivariateNormal


def gauss_rule1(x):
    """Rule 1: positive when feature 0 or feature 1 exceeds 0.55."""
    first_high = x[0] > 0.55
    if first_high:
        return first_high
    return x[1] > 0.55

def gauss_rule2(x):
    """Rule 2: positive when x0*x1 or x2*x3 exceeds 0.30."""
    first_pair = x[0]*x[1] > 0.30
    if first_pair:
        return first_pair
    return x[2]*x[3] > 0.30

def gauss_rule3(x):
    """Rule 3: positive when x0*x1 or x0*x2 exceeds 0.30 (feature 0 is shared)."""
    with_first = x[0]*x[1] > 0.30
    if with_first:
        return with_first
    return x[0]*x[2] > 0.30

def gauss_rule4(x):
    """Rule 4: positive when x0*x3 or x6*x9 exceeds 0.30."""
    first_pair = x[0]*x[3] > 0.30
    if first_pair:
        return first_pair
    return x[6]*x[9] > 0.30

def gauss_sample(nfeatures):
    """Draw one sample of ``nfeatures`` independent standard-normal values."""
    sample = torch.randn(nfeatures)
    return sample

# Each correlated group has ``n_correlated`` features with unit variance and
# pairwise covariance ``correlation_value``.
n_correlated = 3
correlation_value = 0.99
cov_correlated = (
    torch.full([n_correlated, n_correlated], correlation_value)
    + (1-correlation_value)*torch.eye(n_correlated)
)
dist_correlated = MultivariateNormal(
    loc=torch.full([n_correlated], 0.0),
    covariance_matrix=cov_correlated,
)


def gauss_sample_correlated(nfeatures):
    """Draw one sample: four correlated groups followed by independent noise.

    The first ``4*n_correlated`` entries are four independent draws from
    ``dist_correlated``; the remaining entries are standard-normal noise.
    """
    # The noise is drawn first, then the four groups, so the random stream is
    # consumed in a fixed order.
    noise = torch.randn((nfeatures-4*n_correlated))
    groups = [dist_correlated.sample() for _ in range(4)]
    return torch.cat(groups + [noise], dim=-1)

# Lookup tables keyed by experiment id (1-4).

# Labelling rule applied to each sample.
gauss_rules = {
    1: gauss_rule1,
    2: gauss_rule2,
    3: gauss_rule3,
    4: gauss_rule4,
}

# Feature-index groups referenced by each rule.
gauss_groups = {
    1: [np.array([0]), np.array([1])],
    2: [np.array([0, 1]), np.array([2, 3])],
    3: [np.array([0, 1]), np.array([0, 2])],
    4: [np.array([0, 3]), np.array([6, 9])],
}

# Sampler used to draw the features of one sample.
gauss_dists = {
    1: gauss_sample,
    2: gauss_sample,
    3: gauss_sample,
    4: gauss_sample_correlated,
}

# All feature indices read by each rule.
gauss_oracle_features = {
    1: np.array([0, 1]),
    2: np.array([0, 1, 2, 3]),
    3: np.array([0, 1, 2]),
    4: np.array([0, 3, 6, 9]),
}



def make_syn_data(rule, num_data, nfeatures=500, train=True):
    """Generate a labelled synthetic data set for experiment ``rule``.

    Each of the ``num_data`` samples is drawn with ``gauss_dists[rule]`` and
    labelled 1 when ``gauss_rules[rule]`` is truthy for it, else 0. The class
    proportions are printed, prefixed by a train/test tag chosen by ``train``.

    Returns:
        ``(x_data, y_data)`` as NumPy arrays of shape ``(num_data, nfeatures)``
        and ``(num_data,)``.
    """
    draw_sample = gauss_dists[rule]
    is_positive = gauss_rules[rule]
    x_data = torch.empty((num_data, nfeatures))
    y_data = torch.empty((num_data))
    class_counts = [0, 0]  # class_counts[label] = number of samples with that label
    for row in range(num_data):
        sample = draw_sample(nfeatures)
        label = 1 if is_positive(sample) else 0
        class_counts[label] += 1
        x_data[row] = sample
        y_data[row] = label

    tag = '\nTrain' if train else 'Test '
    print(tag+' Data Proportions:  0: {:.3f}, 1: {:.3f}'.format(class_counts[0]/num_data, class_counts[1]/num_data))
    return x_data.numpy(), y_data.numpy()


def set_seed(x):
    """Seed NumPy, TensorFlow and PyTorch with ``x`` scaled by 10000.

    Also switches cuDNN to deterministic mode and disables its benchmark
    mode so repeated runs use the same settings.
    """
    x *= 10000
    np.random.seed(x)
    tf.random.set_seed(x)
    # CPU generator, current CUDA device, then all CUDA devices.
    for seed_torch in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        seed_torch(x)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [None]:
def get_jaccard_score(true_groups, predicted_groups):
    """Average best-match Jaccard similarity between true and predicted groups.

    For every true group the largest Jaccard index against any predicted
    group is taken; the score is the mean of those maxima.

    Returns:
        ``(score, n_true, n_predicted)``. ``score`` is ``-1`` when there are
        no true groups (ground truth unknown) and ``0`` when nothing was
        predicted.
    """
    n_true = len(true_groups)
    n_predicted = len(predicted_groups)
    if n_true == 0:
        return -1, n_true, n_predicted
    if n_predicted == 0:
        return 0, n_true, n_predicted

    def best_match(group):
        # Largest |intersection| / |union| over all predicted groups.
        return max(
            np.intersect1d(group, candidate).size / np.union1d(group, candidate).size
            for candidate in predicted_groups
        )

    total = sum(best_match(group) for group in true_groups)
    return total / n_true, n_true, n_predicted


def tpr_fdr(true_groups, predicted_groups):
    """Return the true-positive rate and false-discovery rate, in percent.

    Both group lists are flattened to sets of unique feature indices first.

    Args:
        true_groups: sequence of index arrays holding the ground-truth features.
        predicted_groups: sequence of index arrays holding the selected features.

    Returns:
        ``(tpr, fdr)``:
        * ``(-1, -1)`` when ``true_groups`` is empty (ground truth unknown);
        * ``(0.0, 0.0)`` when ``predicted_groups`` is empty;
        * otherwise ``tpr`` is the share of true features that were
          predicted and ``fdr`` the share of predicted features that are
          not true. ``fdr`` is ``0.0`` when the predicted groups contain no
          features at all, matching the empty-prediction case instead of
          dividing by zero.
    """
    if len(true_groups) == 0:  # ground truth not known
        return -1, -1
    if len(predicted_groups) == 0:
        return 0.0, 0.0

    # np.unique is still needed: with a single group, reduce() returns that
    # group unchanged, so it may be unsorted or contain duplicates.
    predicted_features = np.unique(reduce(np.union1d, predicted_groups))
    true_features = np.unique(reduce(np.union1d, true_groups))

    overlap = np.intersect1d(predicted_features, true_features).size
    n_predicted = len(predicted_features)
    tpr = 100*overlap/len(true_features)
    # Groups that are present but empty yield zero predicted features.
    fdr = 100*(n_predicted-overlap)/n_predicted if n_predicted != 0 else 0.0
    return tpr, fdr

In [None]:
# --- Experiment configuration ---
experiment_no = 10        # multiplied by 10000 inside set_seed
experiment_choice = 4     # key into the gauss_* lookup tables
train_size = 20000
test_size = 200
nfeatures = 500
nhidden = 200             # width of the hidden Dense layers in f and g
batchsize = 250
lr = 0.001
nepochs = 250
tryoutlimit = 1
# Select as many features as the chosen rule actually reads.
nk = len(gauss_oracle_features[experiment_choice])
set_seed(experiment_no)

# --- Data generation (train first, then test) ---
x_train, y_train_ = make_syn_data(experiment_choice, train_size, nfeatures=nfeatures, train=True)
x_test, y_test_ = make_syn_data(experiment_choice, test_size, nfeatures=nfeatures, train=False)

# One-hot encode the integer-valued labels.
y_train = to_categorical(y_train_)
y_test = to_categorical(y_test_)

# One shape per line, in the order train inputs/labels, test inputs/labels.
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape, sep='\n')


Train Data Proportions:  0: 0.521, 1: 0.478
Test  Data Proportions:  0: 0.505, 1: 0.495
(20000, 500)
(20000, 2)
(200, 500)
(200, 2)


In [None]:
def f(x):
    """Decoder head: two ReLU hidden layers, then a linear layer of width ``nfeatures``."""
    for _ in range(2):
        x = Dense(nhidden, activation='relu')(x)
    return Dense(nfeatures)(x)

def g(x):
    """Classifier head: two ReLU hidden layers, a 2-unit linear layer, then softmax."""
    for _ in range(2):
        x = Dense(nhidden, activation='relu')(x)
    class_scores = Dense(2)(x)
    return Softmax()(class_scores)

In [None]:
# Supervised selection: the selected features feed the classifier head g and
# are trained against the one-hot labels; the test split serves as validation data.
supervised_selector = ConcreteAutoencoderFeatureSelector(
    K=nk,
    output_function=g,
    num_epochs=nepochs,
    batch_size=batchsize,
    tryout_limit=tryoutlimit,
)
supervised_selector.fit(x_train, y_train, x_test, y_test)

Model: "model_129"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_131 (InputLayer)      [(None, 500)]             0         
                                                                 
 concrete_select (ConcreteSe  (None, 4)                2001      
 lect)                                                           
                                                                 
 dense_1657 (Dense)          (None, 200)               1000      
                                                                 
 dense_1658 (Dense)          (None, 200)               40200     
                                                                 
 dense_1659 (Dense)          (None, 2)                 402       
                                                                 
 softmax_404 (Softmax)       (None, 2)                 0         
                                                         



mean max of probabilities: 0.002304283 - temperature 9.815204
Epoch 2/250
mean max of probabilities: 0.0023825637 - temperature 9.6338215
Epoch 3/250
mean max of probabilities: 0.002557949 - temperature 9.4557905
Epoch 4/250
mean max of probabilities: 0.0027683375 - temperature 9.281052
Epoch 5/250
mean max of probabilities: 0.0029642715 - temperature 9.10954
Epoch 6/250
mean max of probabilities: 0.0032197807 - temperature 8.941198
Epoch 7/250
mean max of probabilities: 0.0034129159 - temperature 8.775969
Epoch 8/250
mean max of probabilities: 0.003659349 - temperature 8.613791
Epoch 9/250
mean max of probabilities: 0.003990921 - temperature 8.454611
Epoch 10/250
mean max of probabilities: 0.0043614362 - temperature 8.298373
Epoch 11/250
mean max of probabilities: 0.0046702893 - temperature 8.1450205
Epoch 12/250
mean max of probabilities: 0.005000039 - temperature 7.994502
Epoch 13/250
mean max of probabilities: 0.005435537 - temperature 7.8467665
Epoch 14/250
mean max of probabiliti

<__main__.ConcreteAutoencoderFeatureSelector at 0x7fb0303e8c90>

In [None]:
# Unsupervised selection: the selected features feed the decoder head f and
# are trained to reconstruct the inputs; the test split serves as validation data.
unsupervised_selector = ConcreteAutoencoderFeatureSelector(
    K=nk,
    output_function=f,
    num_epochs=nepochs,
    batch_size=batchsize,
    tryout_limit=tryoutlimit,
)
unsupervised_selector.fit(x_train, x_train, x_test, x_test)

Model: "model_130"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_132 (InputLayer)      [(None, 500)]             0         
                                                                 
 concrete_select (ConcreteSe  (None, 4)                2001      
 lect)                                                           
                                                                 
 dense_1660 (Dense)          (None, 200)               1000      
                                                                 
 dense_1661 (Dense)          (None, 200)               40200     
                                                                 
 dense_1662 (Dense)          (None, 500)               100500    
                                                                 
Total params: 143,701
Trainable params: 143,700
Non-trainable params: 1
___________________________________________________



mean max of probabilities: 0.0023053172 - temperature 9.815204
Epoch 2/250
mean max of probabilities: 0.002418059 - temperature 9.6338215
Epoch 3/250
mean max of probabilities: 0.0026144832 - temperature 9.4557905
Epoch 4/250
mean max of probabilities: 0.0028299056 - temperature 9.281052
Epoch 5/250
mean max of probabilities: 0.0030675326 - temperature 9.10954
Epoch 6/250
mean max of probabilities: 0.0033288493 - temperature 8.941198
Epoch 7/250
mean max of probabilities: 0.003618824 - temperature 8.775969
Epoch 8/250
mean max of probabilities: 0.0039321044 - temperature 8.613791
Epoch 9/250
mean max of probabilities: 0.0042811623 - temperature 8.454611
Epoch 10/250
mean max of probabilities: 0.004670392 - temperature 8.298373
Epoch 11/250
mean max of probabilities: 0.005096977 - temperature 8.1450205
Epoch 12/250
mean max of probabilities: 0.005552999 - temperature 7.994502
Epoch 13/250
mean max of probabilities: 0.006066421 - temperature 7.8467665
Epoch 14/250
mean max of probabiliti

<__main__.ConcreteAutoencoderFeatureSelector at 0x7fb03042a3d0>

# Test Models

In [None]:
# Supervised model: class scores for the test inputs (the head g ends in a
# softmax), reduced to a predicted class index per sample.
y_pred = supervised_selector.model(x_test)
y_pred = tf.math.argmax(y_pred, axis=-1)
# Accuracy in percent: share of test samples whose predicted class equals
# the integer-valued label in y_test_.
acc = 100*np.mean((y_pred==y_test_).numpy())

# Unsupervised model: reconstruct the test inputs and average the squared
# error over every entry.
x_pred = unsupervised_selector.model(x_test)
reconstruction_error = np.mean((x_pred.numpy() - x_test)**2)

In [None]:
# Feature indices chosen by each selector (arg-max of every selector row).
supervised_selected = supervised_selector.get_indices()
unsupervised_selected = unsupervised_selector.get_indices()

# The selected indices are scored as one predicted group against the
# ground-truth groups of the chosen experiment.
selected = supervised_selected
tpr, fdr = tpr_fdr(gauss_groups[experiment_choice], [selected])

# Printed in order: (jaccard score, #true groups, #predicted groups),
# accuracy in percent, selected indices, TPR in percent, FDR in percent.
print('Supervised:')
print(get_jaccard_score(gauss_groups[experiment_choice], [selected]))
print(acc)
print(selected)
print(tpr)
print(fdr)


# Same report for the unsupervised selector, with the reconstruction error
# in place of the accuracy. `selected`, `tpr` and `fdr` are overwritten.
selected = unsupervised_selected
tpr, fdr = tpr_fdr(gauss_groups[experiment_choice], [selected])

print('\nUnsupervised:')
print(get_jaccard_score(gauss_groups[experiment_choice], [selected]))
print(reconstruction_error)
print(selected)
print(tpr)
print(fdr)

Supervised:
(0.35, 2, 1)
91.0
[5 9 6 0]
75.0
25.0

Unsupervised:
(0.0, 2, 1)
0.9942412
[  7 222   1   4]
0.0
100.0
