# Concrete Autoencoder (CAE)

In [None]:
import math

from keras import backend as K
from keras import Model
from keras.layers import Layer, Softmax, Input
from keras.callbacks import EarlyStopping
from keras.initializers import Constant, glorot_normal
from tensorflow.keras.optimizers import Adam



class ConcreteSelect(Layer):
    """Keras layer that picks `output_dim` input features through a relaxed one-hot selection.

    The layer owns two weights, both created in `build`:

    * `logits` (trainable), shape `[output_dim, input_shape[1]]`: one row of
      selection scores per output unit.
    * `temp` (non-trainable scalar): the current temperature, initialised to
      `start_temp` and multiplied by `alpha` on each `call`, but never allowed
      below `min_temp`.

    In the training phase the output is `X . softmax((logits + gumbel) / temp)^T`;
    otherwise it is `X . one_hot(argmax(logits))^T`.
    """
    
    def __init__(self, output_dim, start_temp = 10.0, min_temp = 0.1, alpha = 0.99999, **kwargs):
        """Store the selection size and the temperature schedule.

        Args:
            output_dim: number of output units (rows of `logits`).
            start_temp: initial value of the `temp` weight.
            min_temp: lower bound applied to the temperature in `call`.
            alpha: multiplicative decay applied to the temperature in `call`.
            **kwargs: forwarded unchanged to the base `Layer` constructor.
        """
        self.output_dim = output_dim
        self.start_temp = start_temp
        # min_temp and alpha are kept as backend constants because they are
        # combined with the `temp` weight inside `call`.
        self.min_temp = K.constant(min_temp)
        self.alpha = K.constant(alpha)
        super(ConcreteSelect, self).__init__(**kwargs)
        
    def build(self, input_shape):
        """Create the scalar `temp` weight and the `logits` weight matrix."""
        # Scalar (shape []) temperature; not trainable, it is changed only by K.update in `call`.
        self.temp = self.add_weight(name = 'temp', shape = [], initializer = Constant(self.start_temp), trainable = False)
        # One row of logits per selected feature, one column per input feature
        # (input_shape[1] is used as the feature count).
        self.logits = self.add_weight(name = 'logits', shape = [self.output_dim, input_shape[1]], initializer = glorot_normal(), trainable = True)
        super(ConcreteSelect, self).build(input_shape)
        
    def call(self, X, training=None):
        """Return `X` multiplied by the transposed selection matrix.

        Args:
            X: layer input; assumed to be (batch, input_shape[1]) so that the
                dot product with the transposed selections is defined -- TODO confirm.
            training: phase flag handed to `K.in_train_phase`.

        Returns:
            The product `K.dot(X, K.transpose(self.selections))`.
        """
        # Gumbel noise: -log(-log(U)); U is drawn from [K.epsilon(), 1.0) so the inner log stays finite.
        uniform = K.random_uniform(self.logits.shape, K.epsilon(), 1.0)
        gumbel = -K.log(-K.log(uniform))
        # Decay the stored temperature, clamped below by min_temp.
        # NOTE(review): this update is not conditioned on `training`, so the
        # temperature appears to decay on every call of the layer, not only on
        # training steps -- confirm this is intended.
        temp = K.update(self.temp, K.maximum(self.min_temp, self.temp * self.alpha))
        noisy_logits = (self.logits + gumbel) / temp
        samples = K.softmax(noisy_logits)
        
        # Hard selection: a one-hot row at the argmax of each logits row.
        discrete_logits = K.one_hot(K.argmax(self.logits), self.logits.shape[1])
        
        # Relaxed samples in the training phase, hard one-hot rows otherwise.
        self.selections = K.in_train_phase(samples, discrete_logits, training)
        Y = K.dot(X, K.transpose(self.selections))
        
        return Y
    
    def compute_output_shape(self, input_shape):
        """Return `(input_shape[0], output_dim)`."""
        return (input_shape[0], self.output_dim)
    
class StopperCallback(EarlyStopping):
    """EarlyStopping subclass whose monitored value comes from the 'concrete_select' layer.

    The monitored quantity is the mean, over the rows of the layer's logits,
    of the largest softmax probability in each row. The callback is configured
    with an empty `monitor` name, infinite patience, mode 'max' and
    `mean_max_target` as the baseline.
    """

    def __init__(self, mean_max_target = 0.998):
        """Remember the target and configure the parent callback with it as baseline."""
        self.mean_max_target = mean_max_target
        super().__init__(
            monitor = '',
            patience = float('inf'),
            verbose = 1,
            mode = 'max',
            baseline = self.mean_max_target,
        )

    def on_epoch_begin(self, epoch, logs = None):
        """Print the current monitored value and the layer temperature."""
        select_layer = self.model.get_layer('concrete_select')
        mean_max = self.get_monitor_value(logs)
        temperature = K.get_value(select_layer.temp)
        print('mean max of probabilities:', mean_max, '- temperature', temperature)

    def get_monitor_value(self, logs):
        """Return mean over rows of max softmax(logits); `logs` is not used."""
        logits = self.model.get_layer('concrete_select').logits
        per_row_max = K.max(K.softmax(logits), axis = -1)
        return K.get_value(K.mean(per_row_max))


class ConcreteAutoencoderFeatureSelector():
    """Select `K` input features by training a `ConcreteSelect` layer followed by a user-supplied head.

    `fit` builds `Input -> ConcreteSelect -> output_function`, compiles it with
    Adam and mean-squared-error loss and trains it. If the selection has not
    converged (see `_mean_max_probability`) the model is rebuilt and retrained
    with twice as many epochs, up to `tryout_limit` attempts in total.
    """

    def __init__(self, K, output_function, num_epochs = 300, batch_size = None, learning_rate = 0.001, start_temp = 10.0, min_temp = 0.1, tryout_limit = 5):
        """Store the hyper-parameters; no model is built until `fit`.

        Args:
            K: number of features to select (output size of the selection layer).
                The name shadows the module-level backend alias `K` inside this
                method only, which is harmless because the backend is not used here.
            output_function: callable mapping the selected-features tensor to
                the model output tensor.
            num_epochs: epochs for the first training attempt.
            batch_size: training batch size; when None, `fit` sets it to
                `max(len(X) // 256, 16)`.
            learning_rate: learning rate passed to Adam.
            start_temp: initial temperature of the selection layer.
            min_temp: final (minimum) temperature of the selection layer.
            tryout_limit: maximum number of training attempts.
        """
        self.K = K
        self.output_function = output_function
        self.num_epochs = num_epochs
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.start_temp = start_temp
        self.min_temp = min_temp
        self.tryout_limit = tryout_limit

    def _selection_logits(self):
        """Return the logits weight of the model's 'concrete_select' layer."""
        return self.model.get_layer('concrete_select').logits

    def _mean_max_probability(self):
        """Return the mean over selection rows of the largest softmax probability.

        The reduction axis belongs to `K.max` (per-row maximum), which is the
        same quantity `StopperCallback.get_monitor_value` reports.
        """
        per_row_max = K.max(K.softmax(self._selection_logits()), axis = -1)
        return K.get_value(K.mean(per_row_max))

    def fit(self, X, Y = None, val_X = None, val_Y = None):
        """Train the selector and return `self`.

        Args:
            X: training inputs; `X.shape[1:]` defines the model input shape.
            Y: training targets; defaults to `X` when None.
            val_X, val_Y: optional validation data; used only when both are given.

        Returns:
            self, with `model`, `concrete_select`, `probabilities` and
            `indices` set.
        """
        if Y is None:
            Y = X
        assert len(X) == len(Y)
        validation_data = None
        if val_X is not None and val_Y is not None:
            assert len(val_X) == len(val_Y)
            validation_data = (val_X, val_Y)

        if self.batch_size is None:
            self.batch_size = max(len(X) // 256, 16)

        num_epochs = self.num_epochs
        # Ceiling division: number of batches per epoch.
        steps_per_epoch = (len(X) + self.batch_size - 1) // self.batch_size

        for _ in range(self.tryout_limit):

            K.set_learning_phase(1)

            inputs = Input(shape = X.shape[1:])

            # Per-step decay chosen so that start_temp * alpha ** total_steps == min_temp.
            alpha = math.exp(math.log(self.min_temp / self.start_temp) / (num_epochs * steps_per_epoch))

            self.concrete_select = ConcreteSelect(self.K, self.start_temp, self.min_temp, alpha, name = 'concrete_select')

            selected_features = self.concrete_select(inputs)

            outputs = self.output_function(selected_features)

            self.model = Model(inputs, outputs)

            self.model.compile(Adam(self.learning_rate), loss = 'mean_squared_error')

            # summary() prints by itself and returns None, so it is not wrapped in print().
            self.model.summary()

            stopper_callback = StopperCallback()

            self.model.fit(X, Y, self.batch_size, num_epochs, verbose = 1, callbacks = [stopper_callback], validation_data = validation_data)

            # Converged once the average per-row maximum probability reaches the target.
            if self._mean_max_probability() >= stopper_callback.mean_max_target:
                break

            # Not converged: retry from scratch with a longer schedule.
            num_epochs *= 2

        self.probabilities = K.get_value(K.softmax(self._selection_logits()))
        self.indices = K.get_value(K.argmax(self._selection_logits()))

        return self

    def get_indices(self):
        """Return, for each selection row, the index of the input feature with the largest logit."""
        return K.get_value(K.argmax(self._selection_logits()))

    def get_mask(self):
        """Return a per-input-feature count of how many selection rows picked it."""
        logits = self._selection_logits()
        one_hot_rows = K.one_hot(K.argmax(logits), logits.shape[1])
        return K.get_value(K.sum(one_hot_rows, axis = 0))

    def transform(self, X):
        """Return the selected feature columns of `X`.

        `X` is expected to be 2-D (samples x features). Objects exposing
        `.iloc` (e.g. pandas DataFrames) are indexed positionally through it;
        anything else is indexed as `X[:, indices]`.
        """
        indices = self.get_indices()
        if hasattr(X, 'iloc'):
            return X.iloc[:, indices]
        # Features live on axis 1, so select columns rather than rows.
        return X[:, indices]

    def fit_transform(self, X, y = None):
        """Fit on `(X, y)` and return `transform(X)`; `y` defaults to None as in `fit`."""
        self.fit(X, y)
        return self.transform(X)

    def get_support(self, indices = False):
        """Return selected indices when `indices` is true, otherwise the feature mask."""
        return self.get_indices() if indices else self.get_mask()

    def get_params(self):
        """Return the trained Keras model."""
        return self.model

# Experiment

In [None]:
import tensorflow as tf
from tensorflow import keras
from keras.datasets import mnist
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Dense, Softmax
import numpy as np
import os
import os.path as osp

from functools import reduce


In [None]:
import torch
from google.colab import drive

# Mount Google Drive under ./gdrive; the dataset folder below lives inside it.
drive.mount('gdrive')
foldername = 'gdrive/MyDrive/Cambridge Work/group feature selection/group_lasso/chem_data'


# Groups of feature indices per logic rule (used below as the "true groups"
# when scoring the selected features).
chem_data_groups = {
    # logic_4 = ether OR NOT alkyne
    4: [np.array([40]), np.array([1])],
    # logic_10 = (primary amine AND NOT ether) OR (NOT benzene AND NOT ether)
    10: [np.array([56, 18]), np.array([40])],
    # logic_13 = (benzene AND NOT carbonyl) OR (alkyne AND NOT ether)
    13: [np.array([18, 29]), np.array([1, 40])],
}

# Feature indices per logic rule; the number of entries for the chosen rule
# is used below as the number of features to select.
chem_oracle_features = {
    1: np.array([18]),
    4: np.array([1, 40]),
    8: np.array([41, 55, 83]),
    10: np.array([18, 40, 56]),
    12: np.array([8, 41, 55, 56]),
    13: np.array([1, 18, 29, 40]),
}


def make_syn_data(rule, train=True):
    """Load the X and Y arrays of logic rule `rule` from `foldername`.

    Files are named `logic_<rule>_X_<split>.npy` and `logic_<rule>_Y_<split>.npy`,
    where `<split>` is 'train' when `train` is true and 'test' otherwise.

    Returns:
        A tuple `(x_data, y_data)` of the two loaded arrays.
    """
    split = 'train' if train else 'test'

    def _load(which):
        # `which` is 'X' or 'Y'; builds the same file name for either array.
        return np.load(osp.join(foldername, f'logic_{rule}_{which}_{split}.npy'))

    return _load('X'), _load('Y')


def set_seed(x):
    """Seed NumPy, TensorFlow and PyTorch (CPU and CUDA) with `x * 10000`.

    Also switches cuDNN to deterministic mode and disables its benchmark
    mode, so that repeated runs use the same seed and settings.
    """
    seed = x * 10000
    seeders = (
        np.random.seed,
        tf.random.set_seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

Drive already mounted at gdrive; to attempt to forcibly remount, call drive.mount("gdrive", force_remount=True).


In [None]:
def get_jaccard_score(true_groups, predicted_groups):
    """Average, over the true groups, of the best Jaccard similarity to any predicted group.

    Returns:
        A tuple `(score, number of true groups, number of predicted groups)`.
        `score` is -1 when there are no true groups (ground truth unknown)
        and 0 when there are no predicted groups.
    """
    n_true = len(true_groups)
    n_pred = len(predicted_groups)

    if n_true == 0:  # ground truth unknown
        return -1, n_true, n_pred
    if n_pred == 0:  # nothing was found
        return 0, n_true, n_pred

    def _jaccard(a, b):
        # |a intersect b| / |a union b|
        return np.intersect1d(a, b).size / np.union1d(a, b).size

    total = sum(
        max(_jaccard(g, g_hat) for g_hat in predicted_groups)
        for g in true_groups
    )
    return total / n_true, n_true, n_pred


def tpr_fdr(true_groups, predicted_groups):
    """Feature-level true positive rate and false discovery rate, in percent.

    All groups are flattened into sets of unique feature indices before
    comparison, so group structure does not matter here.

    Args:
        true_groups: sequence of index arrays forming the ground truth.
        predicted_groups: sequence of index arrays that were selected.

    Returns:
        A tuple `(tpr, fdr)`:
        * `(-1, -1)` when the ground truth is unknown, i.e. there are no
          true groups or they contain no features;
        * `(0.0, 0.0)` when nothing was predicted, i.e. there are no
          predicted groups or they contain no features;
        * otherwise `tpr = 100 * overlap / #true` and
          `fdr = 100 * (#predicted - overlap) / #predicted`.
    """
    if len(true_groups) == 0:  # ground truth not known
        return -1, -1
    if len(predicted_groups) == 0:
        return 0.0, 0.0

    # axis=None flattens every group before joining them.
    predicted_features = np.unique(np.concatenate(predicted_groups, axis=None))
    true_features = np.unique(np.concatenate(true_groups, axis=None))

    # Groups may be present but empty; avoid dividing by zero below and use
    # the same sentinel values as the "no groups" cases above.
    if true_features.size == 0:
        return -1, -1
    if predicted_features.size == 0:
        return 0.0, 0.0

    overlap = np.intersect1d(predicted_features, true_features).size
    tpr = 100 * overlap / len(true_features)
    fdr = 100 * (len(predicted_features) - overlap) / len(predicted_features)
    return tpr, fdr

In [None]:
# --- Experiment configuration ---
experiment_no = 10           # also determines the random seed via set_seed
rule = 13                    # which logic_<rule> dataset to load
nhidden, batchsize = 200, 250
lr, nepochs = 0.001, 250
tryoutlimit = 1
# Select as many features as are listed for this rule.
nk = len(chem_oracle_features[rule])
set_seed(experiment_no)

# --- Data ---
x_train, y_train_ = make_syn_data(rule, train=True)
x_test, y_test_ = make_syn_data(rule, train=False)
# to_categorical turns the labels into a one-hot encoding
y_train, y_test = to_categorical(y_train_), to_categorical(y_test_)
nfeatures = x_train.shape[-1]
for split_array in (x_train, y_train, x_test, y_test):
    print(split_array.shape)

(14768, 84)
(14768, 2)
(1831, 84)
(1831, 2)


In [None]:
def f(x):
    """Head for the reconstruction model.

    Two Dense layers of `nhidden` ReLU units followed by a Dense layer of
    `nfeatures` sigmoid units.
    """
    hidden = x
    for _ in range(2):
        hidden = Dense(nhidden, activation='relu')(hidden)
    return Dense(nfeatures, activation='sigmoid')(hidden)

def g(x):
    """Head for the classification model.

    Two Dense layers of `nhidden` ReLU units, a Dense layer with 2 outputs,
    then a Softmax layer.
    """
    hidden = x
    for _ in range(2):
        hidden = Dense(nhidden, activation='relu')(hidden)
    scores = Dense(2)(hidden)
    return Softmax()(scores)

In [None]:
# Supervised selector: the head `g` predicts the one-hot labels from the
# selected features. The configured `lr` is forwarded explicitly so that
# changing it actually affects training. The test split is passed as
# validation data.
supervised_selector = ConcreteAutoencoderFeatureSelector(
    K=nk,
    output_function=g,
    num_epochs=nepochs,
    batch_size=batchsize,
    learning_rate=lr,
    tryout_limit=tryoutlimit,
)
supervised_selector.fit(x_train, y_train, x_test, y_test)

Model: "model_63"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_65 (InputLayer)       [(None, 84)]              0         
                                                                 
 concrete_select (ConcreteSe  (None, 4)                337       
 lect)                                                           
                                                                 
 dense_191 (Dense)           (None, 200)               1000      
                                                                 
 dense_192 (Dense)           (None, 200)               40200     
                                                                 
 dense_193 (Dense)           (None, 2)                 402       
                                                                 
 softmax_32 (Softmax)        (None, 2)                 0         
                                                          



mean max of probabilities: 0.016718855 - temperature 9.793396
Epoch 2/250
mean max of probabilities: 0.017873576 - temperature 9.591061
Epoch 3/250
mean max of probabilities: 0.018851046 - temperature 9.392906
Epoch 4/250
mean max of probabilities: 0.019924726 - temperature 9.198845
Epoch 5/250
mean max of probabilities: 0.021004586 - temperature 9.008794
Epoch 6/250
mean max of probabilities: 0.022159971 - temperature 8.822669
Epoch 7/250
mean max of probabilities: 0.023324315 - temperature 8.640389
Epoch 8/250
mean max of probabilities: 0.024732048 - temperature 8.461877
Epoch 9/250
mean max of probabilities: 0.026132058 - temperature 8.287052
Epoch 10/250
mean max of probabilities: 0.027650526 - temperature 8.115838
Epoch 11/250
mean max of probabilities: 0.029208882 - temperature 7.948164
Epoch 12/250
mean max of probabilities: 0.030839983 - temperature 7.7839518
Epoch 13/250
mean max of probabilities: 0.032487124 - temperature 7.6231327
Epoch 14/250
mean max of probabilities: 0.03

<__main__.ConcreteAutoencoderFeatureSelector at 0x7f9b0a68cb10>

In [None]:
# Unsupervised selector: the head `f` reconstructs the full input from the
# selected features. The configured `lr` is forwarded explicitly so that
# changing it actually affects training. The test split is passed as
# validation data.
unsupervised_selector = ConcreteAutoencoderFeatureSelector(
    K=nk,
    output_function=f,
    num_epochs=nepochs,
    batch_size=batchsize,
    learning_rate=lr,
    tryout_limit=tryoutlimit,
)
unsupervised_selector.fit(x_train, x_train, x_test, x_test)

Model: "model_64"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_66 (InputLayer)       [(None, 84)]              0         
                                                                 
 concrete_select (ConcreteSe  (None, 4)                337       
 lect)                                                           
                                                                 
 dense_194 (Dense)           (None, 200)               1000      
                                                                 
 dense_195 (Dense)           (None, 200)               40200     
                                                                 
 dense_196 (Dense)           (None, 84)                16884     
                                                                 
Total params: 58,421
Trainable params: 58,420
Non-trainable params: 1
______________________________________________________



mean max of probabilities: 0.017005224 - temperature 9.793396
Epoch 2/250
mean max of probabilities: 0.017104924 - temperature 9.591061
Epoch 3/250
mean max of probabilities: 0.01708558 - temperature 9.392906
Epoch 4/250
mean max of probabilities: 0.017063752 - temperature 9.198845
Epoch 5/250
mean max of probabilities: 0.017119579 - temperature 9.008794
Epoch 6/250
mean max of probabilities: 0.017933462 - temperature 8.822669
Epoch 7/250
mean max of probabilities: 0.019889345 - temperature 8.640389
Epoch 8/250
mean max of probabilities: 0.0222531 - temperature 8.461877
Epoch 9/250
mean max of probabilities: 0.024587234 - temperature 8.287052
Epoch 10/250
mean max of probabilities: 0.02691557 - temperature 8.115838
Epoch 11/250
mean max of probabilities: 0.029272536 - temperature 7.948164
Epoch 12/250
mean max of probabilities: 0.031743933 - temperature 7.7839518
Epoch 13/250
mean max of probabilities: 0.034266964 - temperature 7.6231327
Epoch 14/250
mean max of probabilities: 0.036846

<__main__.ConcreteAutoencoderFeatureSelector at 0x7f9b0a511f90>

# Test Models

In [None]:
# Evaluate in inference mode. The selector's fit() sets the global learning
# phase to 1, and the selection layer switches between stochastic relaxed
# samples (training) and the hard argmax selection (inference) based on the
# `training` flag. Passing training=False explicitly makes the metrics below
# reflect the hard feature selection that get_indices() reports.
y_pred = supervised_selector.model(x_test, training=False)
y_pred = tf.math.argmax(y_pred, axis=-1)
# Classification accuracy in percent.
acc = 100*np.mean((y_pred==y_test_).numpy())

x_pred = unsupervised_selector.model(x_test, training=False)
# Mean squared reconstruction error over all entries.
reconstruction_error = np.mean((x_pred.numpy() - x_test)**2)

In [None]:
supervised_selected = supervised_selector.get_indices()
unsupervised_selected = unsupervised_selector.get_indices()

# One report per model: (header line, selected indices, model-specific metric).
reports = (
    ('Supervised:', supervised_selected, acc),
    ('\nUnsupervised:', unsupervised_selected, reconstruction_error),
)

for header, selected, metric in reports:
    true_groups = chem_data_groups[rule]
    # All selected indices are scored together as a single predicted group.
    tpr, fdr = tpr_fdr(true_groups, [selected])

    print(header)
    print(get_jaccard_score(true_groups, [np.unique(selected)]))
    print(metric)
    print(selected)
    print(tpr)
    print(fdr)

Supervised:
(0.35, 2, 1)
77.22555980338612
[32 40 18 29]
75.0
25.0

Unsupervised:
(0.125, 2, 1)
0.02903564594223849
[29 32 39 32]
25.0
66.66666666666667
