In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
# import matplotlib.pyplot as plt
import copy
import random
from tensorflow.keras import layers
from tensorflow.keras.layers import Dropout, Conv2D, MaxPooling2D, BatchNormalization

2025-04-02 09:00:15.061536: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-02 09:00:15.074015: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1743584415.087449 2984266 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1743584415.091287 2984266 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-02 09:00:15.105893: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [2]:
import tensorflow as tf

# Get the list of all physical GPUs
physical_devices = tf.config.list_physical_devices('GPU')

# Select only the second GPU (index 1); this needs at least two physical GPUs
if len(physical_devices) >= 2:
    try:
        # Make only the GPU at index 1 visible to TensorFlow
        tf.config.set_visible_devices(physical_devices[1], 'GPU')
        # Optional: enable memory growth (prevents TensorFlow from allocating all memory up front)
        tf.config.experimental.set_memory_growth(physical_devices[1], True)
    except RuntimeError as e:
        # NOTE(review): presumably raised when the devices were already initialized — confirm
        print(f"Error while setting GPU configuration: {e}")
else:
    print("Insufficient GPUs available.")



Create custom ViT-style embedding module (TMCViTReplacement)

In [3]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt
import copy
import random
from tensorflow.keras import layers
from tensorflow.keras.layers import Dropout, Conv2D, MaxPooling2D, BatchNormalization
# from keras.activations import silu
class TMCViTReplacement(keras.Model):
    """Convolutional tokenizer followed by a weight-shared transformer encoder.

    The input is expanded to ``(batch, 1, channels, seq_len)`` and passed through
    ``token_emb`` (channels-first convolutions plus one max-pooling step along
    time), producing one ``emb_size``-wide token per pooled time step. Tokens are
    linearly projected, given a learned position embedding and refined by
    ``transformer_layers`` passes through the *same* norm / attention / MLP
    layers (the loop in ``call`` reuses ``norm1``, ``attention_layer``,
    ``norm2`` and ``mlp_layers`` on every iteration), then fed to ``mlp_head``.

    Args:
        config: dict providing ``'Data_shape'`` (index 1 = channels,
            index 2 = sequence length), ``'emb_size'`` and ``'pooling_size'``.
    """

    def __init__(self, config):
        super().__init__()
        channel_size, seq_len = config['Data_shape'][1], config['Data_shape'][2]
        self.emb_size = config['emb_size']  # d_x (input embedding dimension)
        self.patch_size = 4  # retained attribute; it no longer sizes the position table
        # The only length-reducing layer in token_emb is MaxPooling2D((1, pooling_size)),
        # so call() produces seq_len // pooling_size tokens. The position table must
        # cover all of them; sizing it with patch_size made the lookup go out of range.
        self.num_patches = seq_len // config['pooling_size']
        self.projection_dim = self.emb_size
        self.num_heads = 4
        self.transformer_units = [self.projection_dim * 2, self.projection_dim]
        self.transformer_layers = 8
        # All convolutions are channels_first, so batch norm must normalize axis 1
        # (the channel axis) rather than the default last axis (time).
        self.token_emb = keras.Sequential([
            layers.Conv2D(16, (channel_size, 1), activation="relu", padding="valid", data_format='channels_first'),
            layers.BatchNormalization(axis=1),
            layers.MaxPooling2D((1, config['pooling_size']), data_format='channels_first'),
            layers.Conv2D(32, (16, 3), activation="relu", padding="same", data_format='channels_first'),
            layers.BatchNormalization(axis=1),
            layers.Conv2D(1, (32, 3), activation="relu", padding="same", data_format='channels_first'),  # Output 1 channel
            layers.BatchNormalization(axis=1),
            layers.Conv2D(self.emb_size, (1, 1), activation="relu", padding="valid", data_format='channels_first'),  # Expand to emb_size channels
            layers.BatchNormalization(axis=1),
        ])
        self.flatten = layers.Flatten()  # not used by call(); kept so the attribute still exists
        self.patch_encoder = layers.Dense(units=self.projection_dim)
        self.position_embedding = layers.Embedding(input_dim=self.num_patches, output_dim=self.projection_dim)
        self.norm1 = layers.LayerNormalization(epsilon=1e-6)
        self.norm2 = layers.LayerNormalization(epsilon=1e-6)
        self.attention_layer = layers.MultiHeadAttention(
            num_heads=self.num_heads, key_dim=self.projection_dim, dropout=0.1
        )
        self.mlp_layers = [layers.Dense(unit, activation=tf.nn.gelu) for unit in self.transformer_units]
        self.dropout = layers.Dropout(0.1)
        # not used by call(); kept so the attribute still exists
        self.transformer_blocks = [
            layers.LayerNormalization(epsilon=1e-6),
            layers.MultiHeadAttention(num_heads=self.num_heads, key_dim=self.projection_dim, dropout=0.1),
            layers.LayerNormalization(epsilon=1e-6),
            layers.Dense(self.transformer_units[0], activation=tf.nn.gelu),
            layers.Dense(self.transformer_units[1], activation=tf.nn.gelu),
        ]

        self.mlp_head = keras.Sequential([
            layers.LayerNormalization(epsilon=1e-6),
            layers.Dense(2048, activation=tf.nn.gelu),
            layers.Dropout(0.5),
            layers.Dense(self.projection_dim, activation=tf.nn.gelu),
            layers.Dropout(0.5),
        ])

    def mlp(self, x):
        """Apply the shared feed-forward layers followed by dropout."""
        for layer in self.mlp_layers:
            x = layer(x)
        return self.dropout(x)

    def call(self, x):
        """Embed ``x`` of shape (bs, channels, seq_len) into (bs, tokens, emb_size)."""
        x = tf.expand_dims(x, axis=1)  # (bs, 1, channels, seq_len)
        x = self.token_emb(x)  # (bs, emb_size, 1, seq_len // pooling)
        x = tf.transpose(x, [0, 2, 1, 3])  # (bs, 1, emb_size, seq_len // pooling)
        x = tf.squeeze(x, axis=1)  # (bs, emb_size, seq_len // pooling)
        x = tf.transpose(x, perm=[0, 2, 1])  # (bs, seq_len // pooling, emb_size)
        positions = tf.range(x.shape[1])
        encoded_patches = self.patch_encoder(x) + self.position_embedding(positions)
        for _ in range(self.transformer_layers):
            # Pre-norm self-attention with a residual connection.
            x1 = self.norm1(encoded_patches)
            attention_output = self.attention_layer(x1, x1)
            x2 = attention_output + encoded_patches
            # Pre-norm feed-forward with a residual connection.
            x3 = self.mlp(self.norm2(x2))
            encoded_patches = x3 + x2

        return self.mlp_head(encoded_patches)


In [4]:
class PositionalEmbedding(keras.Model):
    """Fixed (non-trainable) sinusoidal positional table.

    The table has shape ``(1, max_len, d_model)``: the first half of the feature
    axis holds the sine terms and the second half the cosine terms (they are
    concatenated, not interleaved).

    Args:
        max_len: number of positions to precompute.
        d_model: width of each positional vector.
    """

    def __init__(self, max_len, d_model):
        super(PositionalEmbedding, self).__init__()

        position = tf.range(0, max_len, dtype=tf.float32)[:, tf.newaxis]  # Shape: (max_len, 1)
        # Frequencies 1 / 10000^(2i / d_model). The exponent must be negative so
        # the frequencies decay; the previous double negation made them grow.
        div_term = tf.exp(tf.range(0, d_model, 2, dtype=tf.float32) *
                          -(np.log(10000.0) / d_model))  # Shape: (ceil(d_model / 2),)

        # Expand div_term to align with position for broadcasting
        div_term = tf.expand_dims(div_term, 0)  # Shape: (1, ceil(d_model / 2))

        angles = position * div_term
        pe = tf.concat([tf.sin(angles), tf.cos(angles)], axis=1)

        # For odd d_model the concatenation is one column too wide; trim it so the
        # table is exactly d_model wide (a no-op when d_model is even).
        pe = pe[:, :d_model]

        pe = tf.expand_dims(pe, axis=0)  # Shape: (1, max_len, d_model)
        self.pe = tf.Variable(pe, trainable=False)

    def call(self, x):
        """Return the table rows for the first ``x.shape[1]`` positions."""
        return self.pe[:, :tf.shape(x)[1], :]

In [5]:
def Semantic_Subsequence_Preserving(time_step_indices, chunk_count, target_percentage):
    """Pick ``chunk_count`` random contiguous chunks of time-step indices.

    Each chunk has length ``int(len(time_step_indices) * target_percentage) // chunk_count``.
    Start positions are drawn independently and uniformly, so the chunks are
    NOT guaranteed to be disjoint.

    Args:
        time_step_indices: sliceable sequence of indices.
        chunk_count: number of chunks to draw.
        target_percentage: fraction of the sequence the chunks should cover in total.

    Returns:
        A list with one slice of ``time_step_indices`` per chunk.
    """
    n_steps = len(time_step_indices)
    chunk_len = int(n_steps * target_percentage) // chunk_count
    last_start = n_steps - chunk_len  # inclusive upper bound for a start index

    # One draw for the first chunk, then one per remaining chunk, in that order.
    starts = [random.randint(0, last_start)]
    starts.extend(random.randint(0, last_start) for _ in range(chunk_count - 1))

    return [time_step_indices[begin:begin + chunk_len] for begin in starts]

Add Attention.py Attention (scaled dot-product attention)

In [6]:
class Attention(keras.Model):
    """Scaled dot-product attention.

    ``call`` returns ``(output, attention_weights)``. When ``mask`` is given it is
    cast to float and multiplied by -1e9 before being added to the scores, so
    positions where the mask is 1 are suppressed. ``dropout``, if given, is a
    callable applied to the attention weights.
    """

    def call(self, query, key, value, mask=None, dropout=None, training=None):
        # Scale by sqrt of the last (feature) dimension of the query.
        depth = tf.cast(tf.shape(query)[-1], tf.float32)
        # transpose_b swaps the last two dims of `key`.
        scores = tf.matmul(query, key, transpose_b=True) / tf.math.sqrt(depth)

        if mask is not None:
            scores += tf.cast(mask, tf.float32) * -1e9

        weights = tf.nn.softmax(scores, axis=-1)
        if dropout is not None:
            weights = dropout(weights, training=training)

        return tf.matmul(weights, value), weights

Add attention.py Multihead Attention

In [7]:
class MultiHeadAttention(keras.Model):
    """Multi-head attention with ``h`` heads over a ``d_model``-wide representation.

    The per-head width is ``d_model // h`` for queries, keys and values alike.
    ``call`` returns only the projected output, not the attention weights.
    """

    def __init__(self, h, d_model, dropout=0.1):
        super(MultiHeadAttention, self).__init__()
        assert d_model % h == 0

        # d_v is assumed to always equal d_k.
        self.d_k = d_model // h
        self.h = h

        self.linear_layers = [keras.layers.Dense(d_model) for _ in range(3)]
        self.output_linear = keras.layers.Dense(d_model)
        self.attention = Attention()
        self.dropout = keras.layers.Dropout(rate=dropout)

    def call(self, query, key, value, mask=None, training=None):
        batch_size = tf.shape(query)[0]

        def split_heads(projection, tensor):
            # (batch, seq, d_model) -> (batch, h, seq, d_k)
            projected = tf.reshape(projection(tensor), (batch_size, -1, self.h, self.d_k))
            return tf.transpose(projected, perm=[0, 2, 1, 3])

        # 1) Project query, key and value (in that order) and split into heads.
        q_proj, k_proj, v_proj = self.linear_layers
        query = split_heads(q_proj, query)
        key = split_heads(k_proj, key)
        value = split_heads(v_proj, value)

        # 2) Attend over all heads at once; the weights are discarded.
        context, _ = self.attention(query, key, value, mask=mask, dropout=self.dropout, training=training)

        # 3) Merge the heads back together and apply the output projection.
        context = tf.transpose(context, (0, 2, 1, 3))
        merged = tf.reshape(context, (batch_size, -1, self.h * self.d_k))

        return self.output_linear(merged)

Add Attention.py PointWiseFeedForward



In [8]:
class PointWiseFeedForward(keras.Model):
    """Position-wise feed-forward network: Dense(d_ffn) -> GELU -> Dense(d_model) -> Dropout.

    Args:
        d_model: output width (and the expected input width).
        d_ffn: hidden width.
        dropout: dropout rate applied to the output.
    """

    def __init__(self, d_model, d_ffn, dropout=0.1):
        super(PointWiseFeedForward, self).__init__()
        # The Dense layers infer their input width on first call. Passing
        # `input_shape=` to a layer is deprecated and only triggers a warning.
        self.linear1 = keras.layers.Dense(d_ffn)
        self.linear2 = keras.layers.Dense(d_model)
        self.activation = keras.activations.gelu
        self.dropout = keras.layers.Dropout(rate=dropout)

    def call(self, x, training=None):
        hidden = self.activation(self.linear1(x))
        return self.dropout(self.linear2(hidden), training=training)

Add Attention.py SublayerConnection


In [9]:
class SublayerConnection(keras.Model):
    """
    A residual connection followed by a layer norm.

    Computes ``norm(residual + dropout(scale * sublayer(x)))`` where ``scale`` is
    the learnable scalar ``a`` (initialised to 1e-8) when ``enable_res_parameter``
    is truthy, and is omitted otherwise.

    ``x`` may be a list ``[context, target]``: the whole list is handed to
    ``sublayer`` and ``x[1]`` is used as the residual branch.
    """

    def __init__(self, size, enable_res_parameter, dropout=0.1):
        super(SublayerConnection, self).__init__()
        self.norm = keras.layers.LayerNormalization(axis=-1, epsilon=1e-6)
        self.dropout = keras.layers.Dropout(rate=dropout)
        self.enable = enable_res_parameter
        if enable_res_parameter:
            # Register through add_weight so the scalar is tracked (trained and
            # saved) by Keras; a bare tf.Variable attribute is not tracked by Keras 3.
            self.a = self.add_weight(
                name="a",
                shape=(),
                initializer=keras.initializers.Constant(1e-8),
                trainable=True,
                dtype="float32",
            )

    def call(self, x, sublayer, training=None):
        residual = x[1] if isinstance(x, list) else x
        update = sublayer(x, training=training)
        if self.enable:
            # Previously the list branch scaled by `self.a` unconditionally and
            # failed with AttributeError when the parameter was disabled.
            update = self.a * update
        return self.norm(residual + self.dropout(update, training=training))

In [10]:
class TransformerBlock(keras.Model):
    """
    Transformer (TRM) layer: self-attention then a feed-forward network, each
    wrapped in a SublayerConnection (residual + layer norm).
    """

    def __init__(self, d_model, attn_heads, d_ffn, enable_res_parameter, dropout=0.1):
        super(TransformerBlock, self).__init__()
        self.attn = MultiHeadAttention(attn_heads, d_model, dropout)
        self.ffn = PointWiseFeedForward(d_model, d_ffn, dropout)
        self.skipconnect1 = SublayerConnection(d_model, enable_res_parameter, dropout)
        self.skipconnect2 = SublayerConnection(d_model, enable_res_parameter, dropout)

    def call(self, x, mask, training=None):
        # SublayerConnection invokes the sublayer as sublayer(x, training=...),
        # so bind `mask` here and use the input as query, key and value.
        self_attention = lambda inputs, training: self.attn(
            inputs, inputs, inputs, mask=mask, training=training
        )
        attended = self.skipconnect1(x, sublayer=self_attention, training=training)
        return self.skipconnect2(attended, sublayer=self.ffn, training=training)

Encoder class

In [11]:
@tf.keras.utils.register_keras_serializable(package="Custom")
class Encoder(keras.Model):
    """Stack of ``config['layers']`` TransformerBlocks applied in sequence.

    Reads ``'emb_size'``, ``'num_heads'``, ``'dim_ff'``, ``'layers'`` and
    ``'dropout'`` from ``config``. The residual scaling parameter is always enabled.
    """

    def __init__(self, config):
        super(Encoder, self).__init__()
        self.config = config  # kept so get_config() can return it
        model_width = config['emb_size']
        head_count = config['num_heads']
        ffn_width = config['dim_ff']
        block_count = config['layers']
        dropout_rate = config['dropout']
        use_res_parameter = True
        # TRMs
        self.TRMs = []
        for _ in range(block_count):
            self.TRMs.append(
                TransformerBlock(model_width, head_count, ffn_width, use_res_parameter, dropout_rate)
            )

    def call(self, x, training=None):
        """Run ``x`` through every block without an attention mask."""
        hidden = x
        for block in self.TRMs:
            hidden = block(hidden, mask=None, training=training)
        return hidden

    def get_config(self):
        """Return the constructor argument needed to rebuild this encoder."""
        return {"config": self.config}

    @classmethod
    def from_config(cls, config):
        """Rebuild an Encoder from the dict produced by ``get_config``."""
        return cls(**config)

Add attention.py CrossAttnTRMBlock

In [12]:
class CrossAttnTRMBlock(keras.Model):
    """Cross-attention transformer block.

    The mask-token representations act as queries over the visible
    representations (keys and values); a feed-forward sublayer follows. Both
    sublayers are wrapped in a SublayerConnection.
    """

    def __init__(self, d_model, attn_heads, d_ffn, enable_res_parameter, dropout=0.1):
        super(CrossAttnTRMBlock, self).__init__()
        self.attn = MultiHeadAttention(attn_heads, d_model, dropout)
        self.ffn = PointWiseFeedForward(d_model, d_ffn, dropout)
        self.skipconnect1 = SublayerConnection(d_model, enable_res_parameter, dropout)
        self.skipconnect2 = SublayerConnection(d_model, enable_res_parameter, dropout)

    def call(self, rep_visible, rep_mask_token, mask=None, training=None):
        # SublayerConnection receives the pair as a list: it passes the whole list
        # to the sublayer and uses element 1 (the mask tokens) as the residual.
        pair = [rep_visible, rep_mask_token]
        cross_attention = lambda inputs, training: self.attn(
            inputs[1], inputs[0], inputs[0], mask=mask, training=training
        )
        attended = self.skipconnect1(pair, sublayer=cross_attention, training=training)
        return self.skipconnect2(attended, sublayer=self.ffn, training=training)

In [13]:
class Predictor(keras.Model):
    """Stack of cross-attention blocks that refines the mask-token representations.

    Args:
        d_model, attn_heads, d_ffn, enable_res_parameter: forwarded to every
            CrossAttnTRMBlock (which uses its own default dropout rate).
        num_layers: number of blocks in the stack.
    """

    def __init__(self, d_model, attn_heads, d_ffn, enable_res_parameter, num_layers):
        super(Predictor, self).__init__()
        # Despite its name, this attribute holds the list of blocks; the name is
        # kept so existing references to it keep working.
        self.num_layers = [
            CrossAttnTRMBlock(d_model, attn_heads, d_ffn, enable_res_parameter,)
            for _ in range(num_layers)
        ]

    def call(self, rep_visible, rep_mask_token, training=None):
        for TRM in self.num_layers:
            # Forward `training` explicitly (as Encoder does) instead of relying
            # on implicit propagation, so dropout follows the caller's flag.
            rep_mask_token = TRM(rep_visible, rep_mask_token, training=training)
        return rep_mask_token

In [14]:

from fastdtw import fastdtw

def dtw_metric(y_true, y_pred):
    """Return the fastdtw distance between ``y_true`` and ``y_pred`` as a float32 tensor.

    The distance is computed in Python on the tensors' NumPy values and wrapped
    with ``tf.py_function``.
    """
    def _fastdtw_distance(true_tensor, pred_tensor):
        dist, _path = fastdtw(true_tensor.numpy(), pred_tensor.numpy())
        return np.float32(dist)

    return tf.py_function(_fastdtw_distance, [y_true, y_pred], tf.float32)

def pcc_metric(y_true, y_pred):
    """Pearson correlation coefficient computed over all elements of both tensors.

    Keras' epsilon is added to the denominator to avoid division by zero.
    """
    true_centered = y_true - tf.reduce_mean(y_true)
    pred_centered = y_pred - tf.reduce_mean(y_pred)
    numerator = tf.reduce_sum(true_centered * pred_centered)
    sum_sq_true = tf.reduce_sum(tf.square(true_centered))
    sum_sq_pred = tf.reduce_sum(tf.square(pred_centered))
    denominator = tf.sqrt(sum_sq_true * sum_sq_pred)
    return numerator / (denominator + tf.keras.backend.epsilon())

def cosine_similarity_metric(y_true, y_pred):
    """Mean cosine similarity between the last-axis vectors of the two tensors."""
    unit_true = tf.nn.l2_normalize(y_true, axis=-1)
    unit_pred = tf.nn.l2_normalize(y_pred, axis=-1)
    per_vector_similarity = tf.reduce_sum(unit_true * unit_pred, axis=-1)
    return tf.reduce_mean(per_vector_similarity)


In [15]:
def make_label(arr):
    """Return, for every entry along axis 0, the value at row 71, column 1.

    # presumably row 71 of each trial stores the sentence label — TODO confirm
    """
    return arr[:, 71, 1]

In [16]:

# Load the stacked training array from disk (path is relative to the notebook's working directory).
train_data = np.load('stacked_train.npy')

In [17]:
# Show the distinct label values, then the shape of the label vector.
print(np.unique(make_label(train_data)))
make_label(train_data).shape

[ 1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12. 13. 14. 15. 16. 17. 18.
 19. 20. 21. 22. 23. 24. 25. 26. 27. 28. 29. 30.]


(8020,)

In [19]:
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModel
from tensorflow.keras.layers import Input, Dense, LayerNormalization, Reshape
from tensorflow.keras.models import Model

# Enable mixed precision training
# tf.keras.mixed_precision.set_global_policy('mixed_float32')

# Load and preprocess data
# val_repr = np.load("train_repr_VIT.npy")
# Despite the `val_` prefix, these labels are extracted from the training array.
val_repr_labels = make_label(train_data)
sentence_data = pd.read_csv('Spanish.csv')
#                             , encoding='latin-1')
# Map each sentence ID to its Spanish text, then look up one sentence per trial label.
sentence_dict = dict(zip(sentence_data['ID'], sentence_data['Sentence (Spanish)']))
y_sentences = [sentence_dict[sid] for sid in val_repr_labels]

# Tokenization setup with Spanish BERT
MODEL_NAME = "dccuchile/bert-base-spanish-wwm-cased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# NOTE(review): padding='max_length' with no explicit max_length presumably pads to the
# tokenizer's model maximum, which then sets max_seq_len below — confirm this is intended.
tokenized = tokenizer(y_sentences, padding='max_length', truncation=True, return_tensors="tf")

# Convert the tokenizer outputs to int32 tensors
# X_eeg = tf.convert_to_tensor(val_repr, dtype=tf.float32)
y_input_ids = tf.convert_to_tensor(tokenized["input_ids"], dtype=tf.int32)
y_attention_mask = tf.convert_to_tensor(tokenized["attention_mask"], dtype=tf.int32)

# Get sequence parameters (padded token length and vocabulary size)
max_seq_len = y_input_ids.shape[1]
vocab_size = tokenizer.vocab_size

def build_eeg_to_text_model(input_dim=1375):
    """Build and compile the EEG-vector -> token-logits model on top of BERT.

    A flat EEG feature vector is layer-normalised, projected with a tanh Dense
    layer to ``max_seq_len * hidden_size`` values and reshaped to
    ``(max_seq_len, hidden_size)``. That sequence is fed to the pretrained BERT
    encoder as ``inputs_embeds`` and a Dense head maps every position to
    ``vocab_size`` logits.

    Args:
        input_dim: length of the EEG feature vector (defaults to 1375, the value
            that was previously hard-coded).

    Returns:
        A compiled Keras ``Model`` mapping ``(batch, input_dim)`` to
        ``(batch, max_seq_len, vocab_size)`` logits.
    """
    # Load BERT first so the projection width follows the encoder's hidden size
    # instead of a hard-coded 768.
    bert = TFAutoModel.from_pretrained(
        MODEL_NAME,
        output_loading_info=False  # return only the model, not a (model, loading_info) pair
    )
    hidden_size = bert.config.hidden_size

    eeg_input = Input(shape=(input_dim,), name="eeg_input")

    # Projection to BERT's embedding space
    x = LayerNormalization()(eeg_input)
    x = Dense(hidden_size * max_seq_len, activation='tanh')(x)
    x = Reshape((max_seq_len, hidden_size))(x)

    # Encoder outputs; BERT is always called in inference mode here.
    encoder_outputs = bert(input_ids=None, inputs_embeds=tf.cast(x, tf.float32), training=False).last_hidden_state

    # Task-specific head: per-position vocabulary logits
    logits = Dense(vocab_size)(encoder_outputs)
    model = Model(inputs=eeg_input, outputs=logits)
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5, clipnorm=1.0)
    model.compile(optimizer=optimizer, loss=masked_loss, weighted_metrics=['accuracy'])

    return model

# Masked loss function that ignores the tokenizer's padding token
def masked_loss(y_true, y_pred):
    """Sparse categorical cross-entropy averaged over non-padding positions.

    Args:
        y_true: integer token ids.
        y_pred: logits over the vocabulary for every position.

    Returns:
        Scalar mean loss over positions whose target is not
        ``tokenizer.pad_token_id``; 0 when every target is padding.
    """
    loss = tf.keras.losses.sparse_categorical_crossentropy(y_true, y_pred, from_logits=True)
    mask = tf.cast(y_true != tokenizer.pad_token_id, tf.float32)
    # divide_no_nan returns 0 for an all-padding batch instead of NaN, in line
    # with the guarded denominator used by masked_accuracy.
    return tf.math.divide_no_nan(tf.reduce_sum(loss * mask), tf.reduce_sum(mask))
def masked_accuracy(y_true, y_pred):
    """Fraction of non-padding positions whose argmax prediction equals the target.

    The denominator is clamped to at least 1, so an all-padding batch yields 0.
    """
    predicted_ids = tf.argmax(y_pred, axis=-1)
    target_ids = tf.cast(y_true, predicted_ids.dtype)
    keep = tf.cast(target_ids != tokenizer.pad_token_id, predicted_ids.dtype)
    hits = tf.cast(predicted_ids == target_ids, predicted_ids.dtype) * keep
    return tf.reduce_sum(hits) / tf.maximum(tf.reduce_sum(keep), 1)
# Build and compile the model


# Training callbacks for downstream fine-tuning: halve the learning rate when
# 'val_accuracy' has not improved for 2 epochs. The other callbacks are disabled.
callbacks = [
#     tf.keras.callbacks.TerminateOnNaN(),
    tf.keras.callbacks.ReduceLROnPlateau(monitor='val_accuracy', factor=0.5, patience=2),
#     tf.keras.callbacks.ModelCheckpoint('ViT_bert_emb1024_tuned.keras', save_best_only=True, save_weights_only=True)
]


tokenizer_config.json:   0%|          | 0.00/364 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/648 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/242k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/480k [00:00<?, ?B/s]

I0000 00:00:1743584523.402211 2984266 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 77635 MB memory:  -> device: 1, name: NVIDIA H100 80GB HBM3, pci bus id: 0000:43:00.0, compute capability: 9.0


EEG2Rep

In [20]:
class EEG2Rep(keras.Model):
    """EEG representation model with a self-supervised pretraining mode and a BERT-based downstream mode.

    ``self.mode`` selects the behaviour of ``train_step`` and ``call``:

    * ``"pretrain"``: a context encoder sees a random subset of patches, a
      momentum-updated target encoder sees all patches, and a predictor
      reconstructs the target representations of the hidden patches.
    * ``"downstream"``: the context-encoder output is flattened, projected to
      1375 features and decoded to token logits by the model returned from
      ``build_eeg_to_text_model``.

    When called with ``training=False`` the model returns the frozen
    representation produced by ``linear_prob``.
    """
    def __init__(self, config, num_classes):
        super().__init__()
        """
         channel_size: number of EEG channels
         seq_len: number of timepoints in a window
        """
        # NOTE: `num_classes` is accepted but not used anywhere in this constructor.
        # Parameters Initialization -----------------------------------------------
        channel_size, seq_len = config['Data_shape'][1], config['Data_shape'][2]
        emb_size = config['emb_size']  # d_x
        # Embedding Layer -----------------------------------------------------------
        seq_len = int(seq_len / config['pooling_size'])  # Number of patches (l)
        self.ViT = TMCViTReplacement(config)  # input (Batch,Channel, length) -> output (Batch, l, d_x)
        self.PositionalEncoding = PositionalEmbedding(seq_len, emb_size)
        # -------------------------------------------------------------------------
        # TensorBoard writer used by train_step / test_step
        self.summary_writer = tf.summary.create_file_writer('logs/ViT_bert_emb1024_pre')
        self.momentum = config['momentum']  # EMA coefficient used by momentum_update
        self.device = config['device']  # stored only; not read by the methods of this class
        self.mask_ratio = config['mask_ratio']
        self.mask_len = int(config['mask_ratio'] * seq_len)
        # Learnable vector that stands in for every hidden patch in the predictor
        self.mask_token = self.add_weight(
            shape=(emb_size,), initializer="random_normal", trainable=True, name="mask_token"
        )
        self.contex_encoder = Encoder(config)
        # The target encoder starts as a deep copy of the (not yet called) context encoder
        self.target_encoder = copy.deepcopy(self.contex_encoder)
        self.Predictor = Predictor(emb_size, config['num_heads'], config['dim_ff'], 1, config['pre_layers'])
        self.predict_head = keras.layers.Dense(config['num_labels'])  # created but not used by the methods below
        self.Norm = keras.layers.LayerNormalization()
        self.Norm2 = keras.layers.LayerNormalization()
        self.gap = keras.layers.GlobalAveragePooling1D()
        # Weights of the reconstruction, variance and covariance terms of the pretraining loss
        self.lambda_ = config['lambda']
        self.mu = config['mu']
        self.gamma = config['gamma']
        self.mae_metric = tf.keras.metrics.MeanAbsoluteError()
        self.dtw = dtw_metric
        self.pcc_met = pcc_metric
        self.cosine_sim = cosine_similarity_metric
        self.bert = build_eeg_to_text_model()
        # Create a projection layer to map our EEG representation to the input shape expected by BERT.
        # (The BERT branch expects inputs of shape (1375,), so we project to that dimension.)
        self.downstream_projection = keras.layers.Dense(1375)
        # Mode: set externally to either "pretrain" or "downstream".
        self.mode = "downstream"
        # Add accuracy metrics
        self.train_acc = tf.keras.metrics.Mean(name="train_accuracy")
        self.val_acc = tf.keras.metrics.Mean(name="val_accuracy")
        
    @property
    def metrics(self):
        """Metrics Keras tracks for this model; the list depends on ``self.mode``."""
        # Include the compiled metrics and the custom metrics
        if self.mode=="pretrain":
            return super().metrics + [self.mae_metric]
        else:
            return [self.train_acc, self.val_acc]
    
    def make_representation(self, val_data, val_label, device='gpu'):
        """Compute the frozen representation of every sample, one sample at a time.

        Returns:
            ``(out, labels)`` as NumPy arrays: the stacked ``linear_prob`` outputs
            and the labels in the same order.
        """
        out = []
        labels = []
        for x, targets in zip(val_data,val_label): # presumably x is (channels, timepoints) — TODO confirm
            x = tf.convert_to_tensor(x)
            x = tf.expand_dims(x, axis=0)  # add a batch dimension of 1

            with tf.device(device):
                rep = self.linear_prob(x, training=False)

            out.append(rep.numpy())
            labels.append(targets)

        out = np.array(out)
        labels = np.array(labels)
        return out, labels
    
    def test_step(self, data):
        """Validation step for downstream mode: loss and masked token accuracy."""

        x, y = data
        predictions = self.downstream(x, training=False)
        loss = self.compiled_loss(y, predictions)
        acc = masked_accuracy(y, predictions)
        with self.summary_writer.as_default():
            # Validation summaries are logged against the optimizer's step counter
            step = tf.cast(self.optimizer.iterations, tf.int64)
            tf.summary.scalar('val_accuracy', acc, step=step)
        self.val_acc.update_state(acc)
        return {"loss": loss, "accuracy": self.val_acc.result()}

    def train_step(self, data):
        """Run one optimisation step; the branch taken depends on ``self.mode``.

        In pretrain mode ``data`` is the input batch alone; otherwise it is an
        ``(inputs, token_ids)`` pair.
        """
        if self.mode=="pretrain":
            # Unpack the data
            x = data

            # Ensure the copy_weight function is called only at the start of training
            # NOTE(review): this is a Python-level flag; if train_step is traced into a graph
            # the check presumably runs only while tracing, and the encoders may not have
            # created their variables yet at that point — confirm the copy really happens.
            if not hasattr(self, "_weights_copied") or not self._weights_copied:
                self.copy_weight()  # Call the copy_weight function
                self._weights_copied = True

            with tf.GradientTape() as tape:
                # Forward pass
                rep_mask, rep_mask_prediction, _, _ = self(x, training=True)

                # Compute the reconstruction loss
                reconstruction_loss = self.compute_loss(y=rep_mask, y_pred=rep_mask_prediction)

                # Compute the variance and covariance regularization terms using VICReg
                representations = tf.transpose(rep_mask_prediction, perm=[0, 2, 1])
                # Average over the last axis of the transposed tensor, then centre over the batch
                y = tf.reduce_mean(representations, axis=2)
                y = y - tf.reduce_mean(y, axis=0, keepdims=True)

                std_y = tf.sqrt(tf.math.reduce_variance(y, axis=0) + 0.0001)
                variance_loss = tf.reduce_mean(tf.nn.relu(1 - std_y))  # Hinge loss

                cov_matrix = tf.matmul(y, y, transpose_a=True) / tf.cast(tf.shape(y)[0] - 1, tf.float32)
                # Zero the diagonal so only off-diagonal covariances are penalised
                cov_matrix = cov_matrix - tf.linalg.diag(tf.linalg.diag_part(cov_matrix))
                covariance_loss = tf.reduce_sum(tf.square(cov_matrix)) / tf.cast(tf.shape(y)[-1], tf.float32)

                # Total loss
                total_loss = (
                    self.lambda_ * reconstruction_loss
                    + self.mu * variance_loss
                    + self.gamma * covariance_loss
                )

            # Compute gradients
            trainable_vars = self.trainable_variables
            gradients = tape.gradient(total_loss, trainable_vars)

            # Update weights
            self.optimizer.apply_gradients(zip(gradients, trainable_vars))
            # Update built-in metrics
            self.mae_metric.update_state(rep_mask, rep_mask_prediction)

            # Calculate and log custom metrics
            dtw_value = self.dtw(rep_mask, rep_mask_prediction)
            pcc_value = self.pcc_met(rep_mask, rep_mask_prediction)
            cosine_similarity_value = self.cosine_sim(rep_mask, rep_mask_prediction)

            with self.summary_writer.as_default():
                tf.summary.scalar('loss', total_loss, step=self.optimizer.iterations)
                tf.summary.scalar('mae', self.mae_metric.result(), step=self.optimizer.iterations)
    #             tf.summary.scalar('dtw', dtw_value, step=self.optimizer.iterations)
                tf.summary.scalar('PCC', pcc_value, step=self.optimizer.iterations)
                tf.summary.scalar('Cosine Similarity', cosine_similarity_value, step=self.optimizer.iterations)

            # Move the target encoder towards the context encoder (EMA)
            self.momentum_update()

            # Return a dictionary of metric results
            return {
                "loss": total_loss,
                "mae": self.mae_metric.result(),
                "dtw": dtw_value,
                "pcc": pcc_value,
                "cosine_sim": cosine_similarity_value,
            }

        else:
            # Downstream mode using BERT.
            x, y = data[0], data[1]
            with tf.GradientTape() as tape:
                predictions = self.downstream(x, training=True)
                loss = self.compiled_loss(y, predictions)
                acc = masked_accuracy(y, predictions)
                
            gradients = tape.gradient(loss, self.trainable_variables)
            self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
            
            with self.summary_writer.as_default():
                step = tf.cast(self.optimizer.iterations, tf.int64)
                tf.summary.scalar('downstream_loss', loss, step=step)
                tf.summary.scalar('train_mask_acc', acc, step=step)
#                 results = {m.name: m.result() for m in self.metrics}
#                 results.update({"loss": loss})
            self.train_acc.update_state(acc)
            return {"loss": loss, "accuracy": self.train_acc.result()}
    


    def copy_weight(self):
        """Overwrite every target-encoder variable with its context-encoder counterpart."""
        for target_var, context_var in zip(self.target_encoder.trainable_variables, self.contex_encoder.trainable_variables):
            target_var.assign(context_var)

    def momentum_update(self):
        """Exponential moving average: target = momentum * target + (1 - momentum) * context."""
        for target_var, context_var in zip(self.target_encoder.trainable_variables, self.contex_encoder.trainable_variables):
            target_var.assign(self.momentum * target_var + (1 - self.momentum) * context_var)

    def linear_prob(self, x, training=False): #Used for evaluation of learnt embeddings frozen weights
        """Return the context-encoder representation of ``x`` with gradients stopped at every stage."""
        patches = tf.stop_gradient(self.ViT(x))
        patches = tf.stop_gradient(self.Norm(patches))
        patches = tf.stop_gradient(patches + self.PositionalEncoding(patches))
        patches = tf.stop_gradient(self.Norm2(patches))
        out = tf.stop_gradient(self.contex_encoder(patches,training=training))
        # Swap the last two axes before pooling
        # NOTE(review): this presumably makes the pooling average over the embedding axis,
        # leaving one value per patch position — confirm that is intended.
        out = tf.transpose(out, perm=[0, 2, 1])
        out = self.gap(out)
        # squeeze() without an axis drops every size-1 dimension, including a batch of 1
        return tf.squeeze(out)

    def pretrain_forward(self, x, training=True):
        """Forward pass for self-supervised pretraining.

        Returns:
            ``[rep_mask, rep_mask_prediction, rep_contex, rep_target]``: target
            representations of the hidden patches, the predictor's reconstruction
            of them, the context-encoder output for the visible patches and the
            target-encoder output for all patches.
        """
        patches = self.ViT(x)  # (Batch, l//m, d_x)
        patches = self.Norm(patches)
        patches = patches + self.PositionalEncoding(patches)
        patches = self.Norm2(patches) # (Batch, l//m, d_x)

        batch_size = tf.shape(patches)[0]
        seq_len = tf.shape(patches)[1]
        # One mask token per position, each with its positional encoding added
        rep_mask_token = tf.tile(self.mask_token[tf.newaxis, tf.newaxis, :], [batch_size, seq_len, 1])
        rep_mask_token = rep_mask_token + self.PositionalEncoding(rep_mask_token)
        # Choose the visible positions (two random chunks); every other position is hidden.
        # NOTE(review): the indices come from Python `random` / NumPy and are embedded with
        # tf.constant; under graph tracing they would presumably be fixed at trace time
        # rather than resampled on every step — confirm.
        index = np.arange(patches.shape[1])
        index_chunk = Semantic_Subsequence_Preserving(index, 2, self.mask_ratio)
        v_index = np.ravel(index_chunk)
        m_index = np.setdiff1d(index, v_index)
        
        # visible = patches[:, v_index, :]
        # Ensure v_index is a TensorFlow tensor of integer type
        v_index = tf.constant(v_index, dtype=tf.int32)

        # Use tf.gather to index along the second dimension
        visible = tf.gather(patches, v_index, axis=1)
        m_index = tf.constant(m_index, dtype=tf.int32)
        rep_mask_token = tf.gather(rep_mask_token, m_index, axis=1)
        rep_contex = self.contex_encoder(visible, training=training)

        # The target encoder sees every patch and receives no gradient
        rep_target = tf.stop_gradient(self.target_encoder(patches, training=training))
        rep_mask = tf.gather(rep_target, m_index, axis=1)
        rep_mask_prediction = self.Predictor(rep_contex, rep_mask_token, training=training)
        return [rep_mask, rep_mask_prediction, rep_contex, rep_target]

    def downstream(self, x, training=True):
        """Map EEG input to token logits via the context encoder and the BERT branch."""
        patches = self.ViT(x)
        patches = self.Norm(patches)
        patches = patches + self.PositionalEncoding(patches)
        patches = self.Norm2(patches)
        rep = self.contex_encoder(patches, training=training)

        # Ensure rep has a well-defined shape
        rep_shape = tf.shape(rep)
        batch_size = rep_shape[0]
        feature_size = rep_shape[1] * rep_shape[2]  # Flatten the last two dimensions
        rep = tf.reshape(rep, [batch_size, feature_size])

        # Apply projection to match BERT's expected input dimension
        projected = self.downstream_projection(rep)
        bert_logits = self.bert(projected, training=training)
        return bert_logits

    def call(self, x, training=False):
        """Dispatch on ``training`` and ``self.mode``; inference always returns ``linear_prob``."""
        if training:
            if self.mode == "pretrain":
                return self.pretrain_forward(x, training=training)
            else:
                return self.downstream(x, training=training)
        else:
            return self.linear_prob(x, training=training)


In [21]:

# Define the configuration
# NOTE: several keys below are not read by the classes defined in this notebook (for
# example 'patch_size': TMCViTReplacement hard-codes its own value).
config={
    'Data_shape': (59360, 64, 2750),
    'dropout': 0.1,
    'Norm': False,
    'val_interval': 2,
    'key_metric': 'loss',
    'Training_mode': 'Rep-Learning',
    'Pre_Training': 'In-domain',
    'Input_Embedding': ['C'],
    'Pos_Embedding': ['Sin'],
    'Encoder': ['T'],
    'layers': 4,
    'pre_layers': 2,
    'mask_ratio': 0.5,
    'momentum': 0.99,
    'patch_size': 8,
    'emb_size': 1024,
    'dim_ff': 256,
    'num_heads': 8,
    'device': 'cuda',
    'num_labels' : 10,
    'lambda': 1.0,
    'mu': 0.1,
    'gamma': 0.005,
    'pooling_size': 2
}


num_classes = 30  # Example number of output classes

# Initialize the model
model = EEG2Rep(config, num_classes)
# Pretraining callbacks. Pretraining runs without validation data, so the checkpoint
# must monitor the training 'loss'; with the default monitor ('val_loss') the
# save_best_only checkpoint would never be written.
# NOTE(review): newer Keras releases may require a '.weights.h5' filename when
# save_weights_only=True — confirm against the installed version.
callbacks_pre = [
    tf.keras.callbacks.ModelCheckpoint(
        'ViT_bert_emb1024_pre.keras',
        monitor='loss',
        save_best_only=True,
        save_weights_only=True,
    )
]
# Compile for self-supervised pretraining and switch the model into pretrain mode
model.compile(optimizer='AdamW', loss='MSE')
model.mode = "pretrain"

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


RuntimeError: Failed to import transformers.models.bert.modeling_tf_bert because of the following error (look up to see its traceback):
Your currently installed version of Keras is Keras 3, but this is not yet supported in Transformers. Please install the backwards-compatible tf-keras package with `pip install tf-keras`.

In [None]:
# Keep only the first 64 rows along axis 1 (the model config expects 64 channels).
# After this cell make_label(train_data) no longer works, because row 71 is gone.
train_data = train_data[:,:64,:]
train_data.shape

In [None]:
# Self-supervised pretraining: inputs only, no targets and no validation data.
history = model.fit(train_data, batch_size=16, epochs=100, callbacks=callbacks_pre)

Epoch 1/100


I0000 00:00:1739686541.435310  166078 service.cc:145] XLA service 0x7fecf806bf10 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1739686541.435335  166078 service.cc:153]   StreamExecutor device (0): NVIDIA H100 80GB HBM3, Compute Capability 9.0
I0000 00:00:1739686541.532100  166078 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/100
Epoch 3/100

In [None]:
# Load the validation array, extract its labels first (make_label reads row 71),
# then keep only the first 64 rows along axis 1, as was done for the training data.
val_data = np.load("stacked_val.npy")
val_labels = make_label(val_data)
val_data = val_data[:,:64,:]
val_data.shape

In [None]:

# Look up and tokenize the target sentence of every validation trial.
# NOTE: this rebinds `y_sentences`, which previously held the training sentences.
y_sentences = [sentence_dict[sid] for sid in val_labels]

tokenized1 = tokenizer(y_sentences, padding='max_length', truncation=True, return_tensors="tf")

y_val_input_ids = tf.convert_to_tensor(tokenized1["input_ids"], dtype=tf.int32)

In [None]:
print("Starting Downstream Fine-Tuning...")
model.mode = "downstream"  # Switch mode to downstream.
# For downstream, compile with the BERT masked loss and appropriate metrics.
# from sklearn.model_selection import train_test_split

# # Split data (assuming X_eeg and y_input_ids are your features and labels)
# X_train, X_val, y_train, y_val = train_test_split(
#     train_data, 
#     y_input_ids.numpy(),
#     test_size=0.2,
#     random_state=42
# )


# Convert the EEG arrays to tensors; the targets are the token ids computed earlier
X_train = tf.convert_to_tensor(train_data)
X_val = tf.convert_to_tensor(val_data)
y_train = y_input_ids
y_val = y_val_input_ids
# Recompile for the downstream task with a fresh Adam optimizer
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-5),
    loss=masked_loss,
    metrics=[masked_accuracy]
)
# Fine-tune with validation; `callbacks` reduces the learning rate on a 'val_accuracy' plateau
history1 = model.fit(
    x=X_train,
    y=y_train,
    epochs=100,
    batch_size=16,
    validation_data=(X_val, y_val),
    callbacks=callbacks
)

In [27]:
# Save the current model weights.
# NOTE(review): the path says "pre" but this cell sits after the downstream
# fine-tuning cell — confirm which weights are meant to be stored here.
model.save_weights("saved_models/ViT_emb1024_pre/ViT_emb1024_pre.ckpt")

In [29]:
def generate_from_eeg(eeg_embedding, model, tokenizer, max_length=30):
    """Greedily read a token sequence off the model's logits for one EEG input.

    The model is called on the EEG input only; the ids decoded so far are never
    fed back into it, so a single forward pass serves every decoding step.

    Args:
        eeg_embedding: Array-like EEG input. A rank-1 input gets a leading batch
            axis; inputs of any other rank are passed to the model unchanged.
        model: Callable whose output is indexed as ``[batch, position, vocab]``.
            A batch of one is assumed (each step is reshaped to ``(1, 1)``).
        tokenizer: Object exposing ``cls_token_id``, ``sep_token_id`` and
            ``decode``.
        max_length: Number of decoding steps at most; decoding also stops as
            soon as the id sequence (including the leading [CLS]) reaches this
            length.

    Returns:
        The decoded text with special tokens skipped.
    """
    eeg_input = tf.convert_to_tensor(eeg_embedding, dtype=tf.float32)
    if len(eeg_input.shape) == 1:
        # A single flat vector: add the batch axis in front.
        eeg_input = tf.expand_dims(eeg_input, axis=0)

    # The sequence starts with [CLS]; shape [1, 1].
    generated_ids = tf.constant([[tokenizer.cls_token_id]], dtype=tf.int32)

    # The logits depend only on `eeg_input`, which never changes while decoding,
    # so run the forward pass once instead of once per generated token.
    logits = model(eeg_input)
    num_positions = logits.shape[1]

    for _ in range(max_length):
        # NOTE(review): `pos` is the index of the last id already in the
        # sequence, so the first step reads the logits at position 0. If the
        # logits are aligned position-for-position with the target ids, that
        # slot corresponds to [CLS] and `pos` should probably be the current
        # length instead — confirm against the training targets.
        pos = generated_ids.shape[-1] - 1

        # Stop cleanly rather than index past the model's position axis.
        if num_positions is not None and pos >= num_positions:
            break

        # Greedy choice at this position, reshaped to [1, 1] for concatenation.
        next_token = tf.argmax(logits[:, pos, :], axis=-1, output_type=tf.int32)
        next_token = tf.reshape(next_token, (1, 1))

        # [SEP] ends the sentence and is not appended.
        if next_token.numpy()[0][0] == tokenizer.sep_token_id:
            break

        generated_ids = tf.concat([generated_ids, next_token], axis=-1)

        if generated_ids.shape[-1] >= max_length:
            break

    decoded = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    return decoded

In [32]:
# Smoke test: decode one stored representation and print it next to its reference sentence.
# NOTE(review): despite the `val_` prefix, this loads 'train_repr_VIT.npy'.
val_repr = np.load('train_repr_VIT.npy')
X_eeg = tf.convert_to_tensor(val_repr, dtype=tf.float32)
# NOTE(review): the recorded run of this cell printed shape (1375,) and then failed
# inside the model with a rank error, while X_train elsewhere in the notebook is shown
# as [1024, 64, 2750] — this vector is likely not the input the current `model`
# expects; confirm which model/data pair this check is meant for.
sample_eeg = X_eeg[200]  # Should be shape (1375,)
print("Input shape:", sample_eeg.shape)

# Generate text
# Any exception raised below (including a NameError) is reported as "Generation failed".
try:
    generated = generate_from_eeg(sample_eeg, model, tokenizer)
    print("Generated:", generated)
    # NOTE(review): `val_repr_labels` is not assigned in this cell — confirm it is
    # defined in a live (non-commented) cell before relying on this line.
    print("Original:", sentence_dict[val_repr_labels[200]])  # Use same index
except Exception as e:
    print("Generation failed:", str(e))

Input shape: (1375,)
Generation failed: Exception encountered when calling layer 'sequential' (type Sequential).

Input 0 of layer "conv2d" is incompatible with the layer: expected min_ndim=4, found ndim=3. Full shape received: (1, 1, 1375)

Call arguments received by layer 'sequential' (type Sequential):
  • inputs=tf.Tensor(shape=(1, 1, 1375), dtype=float32)
  • training=False
  • mask=None


In [30]:
# X_train[:1024].shape

TensorShape([1024, 64, 2750])

In [51]:
# import pandas as pd
# import numpy as np
# import tensorflow as tf
# from transformers import AutoTokenizer, TFAutoModel
# from tensorflow.keras.layers import Input, Dense, LayerNormalization, Reshape
# from tensorflow.keras.models import Model

# # Enable mixed precision training
# tf.keras.mixed_precision.set_global_policy('mixed_float16')

# # Load and preprocess data
# val_repr = np.load("train_repr_VIT.npy")
# val_repr_labels = np.load("train_repr_labels_VIT.npy")
# sentence_data = pd.read_csv('/workspace/Spanish.csv', encoding='latin-1')
# sentence_dict = dict(zip(sentence_data['ID'], sentence_data['Sentence (Spanish)']))
# y_sentences = [sentence_dict[sid] for sid in val_repr_labels]

# # Tokenization setup with Spanish BERT
# MODEL_NAME = "dccuchile/bert-base-spanish-wwm-cased"
# tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# tokenized = tokenizer(y_sentences, padding='max_length', truncation=True, return_tensors="tf")

# # Convert to TensorFlow datasets
# X_eeg = tf.convert_to_tensor(val_repr, dtype=tf.float32)
# y_input_ids = tf.convert_to_tensor(tokenized["input_ids"], dtype=tf.int32)
# y_attention_mask = tf.convert_to_tensor(tokenized["attention_mask"], dtype=tf.int32)

# # Get sequence parameters
# max_seq_len = y_input_ids.shape[1]
# vocab_size = tokenizer.vocab_size

# def build_eeg_to_text_model():
#     eeg_input = Input(shape=(1375,), name="eeg_input")
    
#     # Projection to BERT's space
#     x = LayerNormalization()(eeg_input)
#     x = Dense(768 * max_seq_len, activation='tanh')(x)
#     x = Reshape((max_seq_len, 768))(x)
    
#     # Load BERT with correct expectations
#     bert = TFAutoModel.from_pretrained(
#         MODEL_NAME,
#         output_loading_info=False  # Suppress warnings
#           # Sometimes helps with compatibility
#     )
    
#     # Get encoder outputs
#     encoder_outputs = bert(input_ids = None, inputs_embeds=tf.cast(x, tf.float16), training=False).last_hidden_state
    
#     # Add task-specific head
#     logits = Dense(vocab_size)(encoder_outputs)
    
#     return Model(inputs=eeg_input, outputs=logits)

# # Masked loss function handling BERT's padding token (0)
# def masked_loss(y_true, y_pred):
#     loss = tf.keras.losses.sparse_categorical_crossentropy(y_true, y_pred, from_logits=True)
#     mask = tf.cast(y_true != tokenizer.pad_token_id, tf.float16)
#     return tf.reduce_sum(loss * mask) / tf.reduce_sum(mask)

# # Build and compile the model
# model = build_eeg_to_text_model()
# optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5, clipnorm=1.0)
# model.compile(optimizer=optimizer, loss=masked_loss, weighted_metrics=['accuracy'])

# # Training callbacks
# callbacks = [
#     tf.keras.callbacks.TerminateOnNaN(),
#     tf.keras.callbacks.ReduceLROnPlateau(monitor='loss', factor=0.5, patience=2),
#     tf.keras.callbacks.EarlyStopping(monitor='loss', patience=5, restore_best_weights=True)
# ]


INFO:tensorflow:Mixed precision compatibility check (mixed_float16): OK
Your GPUs will likely run quickly with dtype policy mixed_float16 as they all have compute capability of at least 7.0


Some layers from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased were not used when initializing TFBertModel: ['mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFBertModel were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['bert/pooler/dense/kernel:0', 'bert/pooler/dense/bias:0']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
# model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 eeg_input (InputLayer)      [(None, 1375)]            0         
                                                                 
 layer_normalization (Layer  (None, 1375)              2750      
 Normalization)                                                  
                                                                 
 dense (Dense)               (None, 393216)            541065216 
                                                                 
 reshape (Reshape)           (None, 512, 768)          0         
                                                                 
 tf.cast (TFOpLambda)        (None, 512, 768)          0         
                                                                 
 tf_bert_model (TFBertModel  TFBaseModelOutputWithPo   109850880 
 )                           olingAndCrossAttentions         

In [3]:
# history = model.fit(
#     X_eeg,
#     y_input_ids,
#     verbose = 1,
#     sample_weight=y_attention_mask.numpy(),
#     epochs=20,
#     batch_size=32,
#     validation_split=0.25,
#     callbacks=callbacks
# )

Epoch 1/20
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: <gast.gast.Expr object at 0x7ff8f06d2020>
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: <gast.gast.Expr object at 0x7ff8f06d2020>


2025-02-04 18:55:21.448042: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:465] Loaded cuDNN version 90100
I0000 00:00:1738695323.406425 3125250 service.cc:145] XLA service 0x7fef5f8e9910 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1738695323.406457 3125250 service.cc:153]   StreamExecutor device (0): NVIDIA A100-SXM4-80GB, Compute Capability 8.0
I0000 00:00:1738695323.406463 3125250 service.cc:153]   StreamExecutor device (1): NVIDIA A100-SXM4-80GB, Compute Capability 8.0
I0000 00:00:1738695323.406467 3125250 service.cc:153]   StreamExecutor device (2): NVIDIA A100-SXM4-80GB, Compute Capability 8.0
I0000 00:00:1738695323.406471 3125250 service.cc:153]   StreamExecutor device (3): NVIDIA A100-SXM4-80GB, Compute Capability 8.0
I0000 00:00:1738695323.406474 3125250 service.cc:153]   StreamExecutor device (4): NVIDIA A100-SXM4-80GB, Compute Capability 8.0
I0000 00:00:1738695323.406479 3125250 service.cc:153]   StreamExecut

Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
