In [1]:
import sys, os

In [2]:
# Tell XLA where the local CUDA toolkit lives. This cell runs ahead of the
# TensorFlow import in the next cell.
os.environ.update({"XLA_FLAGS": "--xla_gpu_cuda_data_dir=/usr/lib/cuda/"})

In [3]:
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold

import tensorflow as tf
from tensorflow.keras.utils import Sequence
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, Callback, ProgbarLogger
from tensorflow.keras import regularizers as R
from tensorflow.keras.models import Model, load_model, Sequential
from tensorflow.keras import layers as L
from tensorflow.keras import optimizers as O
from tensorflow.keras import constraints as C
from tensorflow.keras import backend as K
from tensorflow.keras.losses import binary_crossentropy, categorical_crossentropy, sparse_categorical_crossentropy, Loss
# Fix the global random seed so runs are repeatable; the same value (722) is
# also used as `random_state` for the cross-validation splitter further down.
tf.keras.utils.set_random_seed(722)

2023-10-05 21:46:43.279248: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
def balanced_log_loss_np(y_true, y_pred):
    """Balanced binary log loss: the mean of the per-class average log losses.

    Each class contributes the average negative log-likelihood of its own
    samples, so the majority class cannot dominate the score.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels. ``0`` is the negative class; any non-zero value
        is treated as the positive class.
    y_pred : array-like
        Predicted probability of the positive class for each sample.

    Returns
    -------
    float
        ``(loss_class0 + loss_class1) / 2``. A class with no samples
        contributes ``0`` instead of raising ``IndexError`` or dividing by
        zero, so folds containing a single class still produce a finite score.
    """
    y_true = np.asarray(y_true)
    # Clip so that log() never sees exactly 0 or 1.
    y_pred = np.clip(np.asarray(y_pred, dtype=float), 1e-15, 1 - 1e-15)

    is_negative = (y_true == 0)
    is_positive = ~is_negative

    # Count samples with the same masks used to select them, so the weights
    # always match the summed terms (also for label values other than 0/1).
    n_negative = np.count_nonzero(is_negative)
    n_positive = np.count_nonzero(is_positive)

    # log(1 - p) is the log-probability assigned to class 0, log(p) to class 1.
    sum_negative = np.sum(np.where(is_negative, np.log(1 - y_pred), 0.0))
    sum_positive = np.sum(np.where(is_positive, np.log(y_pred), 0.0))

    # Scale each class by the inverse of its sample count; skip empty classes.
    loss_negative = -sum_negative / n_negative if n_negative else 0.0
    loss_positive = -sum_positive / n_positive if n_positive else 0.0

    return (loss_negative + loss_positive) / 2

In [5]:
# Load the training table, using the 'Id' column as the row index,
# and show the first three rows as a sanity check.
train_df = pd.read_csv(filepath_or_buffer='data/train.csv', index_col='Id')
print(train_df.head(n=3))


                    AB          AF         AH         AM        AR        AX  \
Id                                                                             
000ff2bfdfe9  0.209377  3109.03329  85.200147  22.394407  8.138688  0.699861   
007255e47698  0.145282   978.76416  85.200147  36.968889  8.138688  3.632190   
013f2bd269f5  0.470030  2635.10654  85.200147  32.360553  8.138688  6.732840   

                    AY         AZ        BC         BD   ...        FL  \
Id                                                       ...             
000ff2bfdfe9  0.025578   9.812214  5.555634  4126.58731  ...  7.298162   
007255e47698  0.025578  13.517790  1.229900  5496.92824  ...  0.173229   
013f2bd269f5  0.025578  12.824570  1.229900  5135.78024  ...  7.709560   

                   FR        FS         GB         GE            GF  \
Id                                                                    
000ff2bfdfe9  1.73855  0.094822  11.339138  72.611063   2003.810319   
007255e47698  0

In [6]:
# Peek at the raw (string-valued) 'EJ' column before it is encoded.
train_df['EJ'].iloc[:3]

Id
000ff2bfdfe9    B
007255e47698    A
013f2bd269f5    B
Name: EJ, dtype: object

In [7]:
# Encode the categorical 'EJ' column numerically: 'A' -> 0, 'B' -> 1.
ej_codes = {'A': 0, 'B': 1}
train_df['EJ'] = train_df['EJ'].replace(ej_codes)

In [8]:
# train_df.isna().any() =>
# ...
# BQ        True
# BR       False
# BZ       False
# CB        True
# CC        True
# CD       False
# ...
# Class    False
# dtype: bool

In [9]:
# Per-column fill value for missing data: columns that contain NaNs get
# (min - max), i.e. a value below the observed range; every other column
# gets 0 here and is handled in a later cell.
has_missing = train_df.isna().any()
column_span = train_df.min() - train_df.max()
nan_fill = has_missing * column_span
# nan_fill =>
# ...
# BP         -0.000000
# BQ       -343.312950
# BR         -0.000000
# BZ         -0.000000
# CB      -2258.936407
# CC         -3.926157
# CD         -0.000000
# ...
# Class      -0.000000
# dtype: float64

In [10]:
# The median must be defined for every column, otherwise the fill values
# built from it would not line up with the frame's columns.
assert len(train_df.median()) == len(train_df.columns)

In [11]:
# Entries still equal to 0 fall back to the column median; then every NaN in
# the frame is replaced by its column's fill value.
zero_entries = (nan_fill == 0)
nan_fill[zero_entries] = train_df.median()
train_df = train_df.fillna(value=nan_fill)

In [12]:
# Feature matrix: every column except the last one; target: the 'Class' column.
X = train_df.iloc[:, :-1].to_numpy()
tgt = train_df['Class'].to_numpy()

In [13]:
# Number of training samples.
tgt.shape[0]

617

In [14]:
# First five target labels.
tgt[0:5]

array([1, 0, 0, 0, 1])

In [15]:
### Hard-coded "difficulty" labels obtained from a baseline DNN model:
### "1" marks a sample that is difficult to predict, "0" an easy one.
### A sample is labelled "1" when
###     (y_true = 1 and y_pred < 0.2) or (y_true = 0 and y_pred > 0.8),
### and "0" otherwise. One entry per row of the training data, in row order.
tgt2 = np.asarray([1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0], dtype=np.int32)

In [16]:
# The hand-entered difficulty labels must cover every training sample.
assert len(tgt2) == len(tgt)

In [17]:
# Mini-batch size, used by model.fit and by the model() plotting helpers of
# the custom layers below. Values tried so far: 32, 8.
batch_size = 8 # 32 # 8 my_

In [18]:
# @tf.keras.utils.register_keras_serializable() => registers an object with the Keras serialization framework.
#        This allows you to use custom layer/class/function in your Keras models and ensure that it can be -
#        - serialized and deserialized correctly when saving and loading models.
#        This decorator injects the decorated class or function into the Keras custom object dictionary, so that -
#        - it can be serialized and deserialized without needing an entry in the user-provided custom object dict.


@tf.keras.utils.register_keras_serializable()
def smish(x):
    """Smish activation: ``smish(x) = x * tanh(ln(1 + sigmoid(x)))``."""
    gate = K.sigmoid(x)
    squashed = K.tanh(K.log(1 + gate))
    return x * squashed


@tf.keras.utils.register_keras_serializable()
class GatedLinearUnit(Model):
    """Gated linear unit: a linear projection multiplied element-wise by a
    sigmoid gate computed from the same input.

    Args:
        units: output width of both the linear projection and the gate.
    """

    def __init__(self, units, **kwargs):
        super().__init__(**kwargs)
        self.linear = L.Dense(units)
        self.sigmoid = L.Dense(units, activation="sigmoid")
        self.units = units

    def get_config(self):
        """Return the base config extended with the constructor argument."""
        return {**super().get_config(), 'units': self.units}

    def call(self, inputs):
        """Return ``linear(inputs) * sigmoid_gate(inputs)``."""
        projected = self.linear(inputs)
        gate = self.sigmoid(inputs)
        return projected * gate

    def model(self):
        """Wrap the layer in a functional Model (used for plotting)."""
        # The Input shape excludes the batch dimension.
        placeholder = tf.keras.Input(shape=self.units, batch_size=batch_size)
        wrapped_output = self.call(placeholder)
        return tf.keras.Model(inputs=placeholder, outputs=wrapped_output)
    

@tf.keras.utils.register_keras_serializable()
class GatedResidualNetwork(Model):
    """Gated residual block: Dense(smish) -> Dense -> Dropout -> GLU, added to
    the (optionally projected) input and layer-normalized.

    Args:
        units: width of every dense layer and of the block output.
        dropout_rate: rate passed to the internal Dropout layer.
        num_features: number of features of the enclosing variable-selection
            block; only used by ``model()`` to size the placeholder input.
        concat: whether this block processes the concatenation of all features
            (True) or a single feature (False); only used by ``model()``.
    """

    def __init__(self, units, dropout_rate, num_features, concat, **kwargs):
        super().__init__(**kwargs)
        self.units = units
        self.dropout_rate = dropout_rate
        self.relu_dense = L.Dense(units, activation=smish)
        self.linear_dense = L.Dense(units)
        self.dropout = L.Dropout(dropout_rate)
        self.gated_linear_unit = GatedLinearUnit(units)
        self.layer_norm = L.LayerNormalization()
        self.project = L.Dense(units)
        self.num_features = num_features
        self.concat = concat

    def get_config(self):
        """Return a config holding every required constructor argument.

        ``num_features`` and ``concat`` are required positional parameters of
        ``__init__``; they must be part of the config, otherwise the layer
        cannot be re-created from it.
        """
        config = super().get_config()
        config['units'] = self.units
        config['dropout_rate'] = self.dropout_rate
        config['num_features'] = self.num_features
        config['concat'] = self.concat
        return config

    def call(self, inputs):
        """Apply the gated residual transformation to ``inputs``."""
        x = self.relu_dense(inputs)
        x = self.linear_dense(x)
        x = self.dropout(x)
        # Project the skip connection only when its width differs from `units`.
        if inputs.shape[-1] != self.units:
            inputs = self.project(inputs)
        x = inputs + self.gated_linear_unit(x)
        x = self.layer_norm(x)
        return x

    def model(self):
        """Wrap the layer in a functional Model (used for plotting)."""
        # Width of the placeholder input (the batch dimension is excluded).
        width = 2 * self.units - self.num_features
        if self.concat and width:
            sh = self.num_features * width
        elif self.concat and width == 0:
            sh = self.num_features
        elif self.concat == False and width:
            sh = width
        else:
            sh = 1
        x = tf.keras.Input(shape=(sh,), batch_size=batch_size)
        return tf.keras.Model(inputs=x, outputs=self.call(x))
    

@tf.keras.utils.register_keras_serializable()
class VariableSelection(Model):
    """Variable-selection block.

    Every feature goes through its own GatedResidualNetwork; a further GRN
    applied to the concatenation of all features, followed by a softmax Dense
    layer, yields per-feature weights that are used to combine the per-feature
    outputs into a single vector.

    Args:
        num_features: number of input tensors (one per feature).
        units: width of every GRN.
        dropout_rate: dropout rate passed to every GRN.
    """

    def __init__(self, num_features, units, dropout_rate, **kwargs):
        super().__init__(**kwargs)

        # GRN applied to the concatenation of all features.
        self.grn_concat = GatedResidualNetwork(units, dropout_rate, num_features, True)

        # One independent GRN per feature.
        self.grns = [
            GatedResidualNetwork(units, dropout_rate, num_features, False)
            for _ in range(num_features)
        ]

        self.softmax = L.Dense(units=num_features, activation="softmax")
        self.num_features = num_features
        self.units = units
        self.dropout_rate = dropout_rate

    def get_config(self):
        """Return the base config extended with the constructor arguments."""
        return {
            **super().get_config(),
            'num_features': self.num_features,
            'units': self.units,
            'dropout_rate': self.dropout_rate,
        }

    def call(self, inputs):
        """Combine the per-feature GRN outputs using softmax weights.

        ``inputs`` is a sequence of tensors, one per feature.
        """
        # Selection weights computed from all features at once.
        joined = L.concatenate(inputs)
        joined = self.grn_concat(joined)
        weights = tf.expand_dims(self.softmax(joined), axis=-1)

        # Per-feature transformations, stacked along a new axis 1.
        transformed = [self.grns[idx](feature) for idx, feature in enumerate(inputs)]
        stacked = tf.stack(transformed, axis=1)

        weighted = tf.matmul(weights, stacked, transpose_a=True)
        return tf.squeeze(weighted, axis=1)

    def model(self):
        """Wrap the layer in a functional Model (used for plotting)."""
        sh = (2 * self.units - self.num_features) or 1
        placeholders = [tf.keras.layers.Input(shape=sh) for _ in range(self.num_features)]
        return tf.keras.Model(inputs=placeholders, outputs=self.call(placeholders))

@tf.keras.utils.register_keras_serializable()
class VariableSelectionFlow(Model):
    """Split a tensor into ``num_features`` pieces along the last axis,
    optionally pass each piece through its own linear Dense layer, and feed
    the pieces to a VariableSelection block.

    Args:
        num_features: number of pieces the input is split into.
        units: width of the VariableSelection block.
        dropout_rate: dropout rate of the VariableSelection block.
        dense_units: when truthy, width of the per-piece linear Dense layers;
            when falsy, the pieces are passed on unchanged.
    """

    def __init__(self, num_features, units, dropout_rate, dense_units=None, **kwargs):
        super().__init__(**kwargs)
        self.variableselection = VariableSelection(num_features, units, dropout_rate)
        self.split = L.Lambda(lambda t: tf.split(t, num_features, axis=-1))
        self.dense = dense_units
        if dense_units:
            # One linear projection per piece.
            self.dense_list = [
                L.Dense(dense_units, activation='linear')
                for _ in range(num_features)
            ]
        self.num_features = num_features
        self.units = units
        self.dropout_rate = dropout_rate
        self.dense_units = dense_units

    def get_config(self):
        """Return the base config extended with the constructor arguments."""
        return {
            **super().get_config(),
            'num_features': self.num_features,
            'units': self.units,
            'dropout_rate': self.dropout_rate,
            'dense_units': self.dense_units,
        }

    def call(self, inputs):
        """Split ``inputs``, optionally project each piece, then select."""
        pieces = self.split(inputs)
        if not self.dense:
            return self.variableselection(pieces)
        projected = [layer(pieces[i]) for i, layer in enumerate(self.dense_list)]
        return self.variableselection(projected)

    def model(self):
        """Wrap the layer in a functional Model (used for plotting)."""
        # The Input shape excludes the batch dimension.
        sh = 2 * self.units - (self.dense_units or 0)
        placeholder = tf.keras.Input(shape=sh, batch_size=batch_size)
        return tf.keras.Model(inputs=placeholder, outputs=self.call(placeholder))

In [19]:
#tf.keras.utils.plot_model(VariableSelectionFlow(units_2, units_3, drop_2).model(), to_file="model.png", expand_nested=True, show_shapes=True)
#tf.keras.utils.plot_model(VariableSelection(units_2, units_3, drop_2).model(), to_file="model.png", expand_nested=True, show_shapes=True)
#tf.keras.utils.plot_model(GatedResidualNetwork(units_3, drop_2, units_2, True).model(), to_file="model.png", expand_nested=True, show_shapes=True)
#tf.keras.utils.plot_model(GatedResidualNetwork(units_3, drop_2, units_2, False).model(), to_file="model.png", expand_nested=True, show_shapes=True)
#tf.keras.utils.plot_model(GatedLinearUnit(units_3).model(), to_file="model.png", expand_nested=True, show_shapes=True)

In [27]:
%%time

# Balanced log loss of every (fold, repeat) run.
blls = []

# Widths / dropout rates of the three stacked VariableSelectionFlow blocks.
units_1 = 32
drop_1 = 0.0   # also tried: 0.75
dense_units = 8

units_2 = 16
drop_2 = 0.0   # also tried: 0.5

units_3 = 8
drop_3 = 0.0   # also tried: 0.25

# Derive the input width from the data instead of hard-coding it.
num_features = X.shape[1]

# Create the output directory up front, so that saving the weights cannot
# fail after a complete training run.
weights_dir = 'ICR_tf_adv_models'
os.makedirs(weights_dir, exist_ok=True)

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=722)

# Stratify on the combination of the class label (tgt) and the baseline
# "difficulty" label (tgt2): (tgt + 1) * (tgt2 - 3) maps the four possible
# (tgt, tgt2) pairs to four distinct values.
for n, (train_idx, val_idx) in enumerate(cv.split(X, (tgt + 1) * (tgt2 - 3))):
    for k in range(2):   # repeats per fold; also tried: 10
        print(f'______fold {n+1}______, ________repeat {k+1}__________')

        inputs_1 = tf.keras.Input(shape=(num_features,))

        features_1 = VariableSelectionFlow(num_features, units_1, drop_1, dense_units=dense_units)(inputs_1)
        features_2 = VariableSelectionFlow(units_1, units_2, drop_2)(features_1)
        features_3 = VariableSelectionFlow(units_2, units_3, drop_3)(features_2)

        outputs = L.Dense(1, activation="sigmoid")(features_3)

        model = Model(inputs=inputs_1, outputs=outputs)

        opt = O.Adam(.001234, epsilon=1e-7)
        loss = binary_crossentropy

        # Shrink the learning rate very slowly (x0.999) after every epoch
        # without a val_loss improvement, down to a tenth of the initial rate.
        lr = ReduceLROnPlateau(monitor="val_loss", mode='min', factor=0.999, patience=1, verbose=1, min_lr=.0001234)
        # Stop after 25 epochs without improvement and restore the best weights.
        es = EarlyStopping(monitor='val_loss', mode='min', patience=25, verbose=1, restore_best_weights=True)

        model.compile(optimizer=opt, loss=loss)

        history = model.fit(x=X[train_idx],
                            y=tgt[train_idx],
                            batch_size=batch_size,
                            epochs=250,   # also tried: 200, 300
                            validation_data=(X[val_idx], tgt[val_idx]),
                            callbacks=[lr, es]
                            )

        # Score the validation fold with the balanced log loss, using the same
        # target array the model was trained and validated on.
        probs = model.predict(X[val_idx])[:, 0]
        bll = balanced_log_loss_np(tgt[val_idx], probs)
        blls.append(bll)

        # Report the training loss at the epoch with the lowest validation loss.
        val_loss = np.asarray(history.history['val_loss'])
        train_loss = np.asarray(history.history['loss'])
        min_val_loss = val_loss.min()
        min_train_loss = train_loss[val_loss.argmin()]
        print(f'{min_train_loss:.4f}, {min_val_loss:.4f}, {bll:.4f}')

        model.save_weights(f'{weights_dir}/mod_f{n}_r{k}_tr{min_train_loss:.4f}_val{min_val_loss:.4f}.h5')

print(np.mean(blls))

______fold 5______, ________repeat 1__________
Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 4: ReduceLROnPlateau reducing learning rate to 0.0012327659666771069.
Epoch 5/250
Epoch 5: ReduceLROnPlateau reducing learning rate to 0.0012315331982681527.
Epoch 6/250
Epoch 6: ReduceLROnPlateau reducing learning rate to 0.0012303017091471703.
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 9: ReduceLROnPlateau reducing learning rate to 0.0012290713830152527.
Epoch 10/250
Epoch 10: ReduceLROnPlateau reducing learning rate to 0.0012278423361713068.
Epoch 11/250
Epoch 11: ReduceLROnPlateau reducing learning rate to 0.0012266144523164256.
Epoch 12/250
Epoch 13/250
Epoch 13: ReduceLROnPlateau reducing learning rate to 0.0012253878477495163.
Epoch 14/250
Epoch 14: ReduceLROnPlateau reducing learning rate to 0.001224162406171672.
Epoch 15/250
Epoch 15: ReduceLROnPlateau reducing learning rate to 0.0012229382438817993.
Epoch 16/250
Epoch 16: ReduceLROnPlateau reducing learning rate to 0.0012217153

KeyboardInterrupt: 