In [None]:
import tensorflow as tf

class MusicHighlighter(tf.keras.Model):
    def __init__(self):
        super(MusicHighlighter, self).__init__()
        self.dim_feature = 64
        
        # Convolutional layers
        self.conv1 = tf.keras.layers.Conv2D(self.dim_feature, (3, 129), strides=(2, 1), activation='relu')
        self.conv2 = tf.keras.layers.Conv2D(self.dim_feature * 2, (4, 1), strides=(2, 1), activation='relu')
        self.conv3 = tf.keras.layers.Conv2D(self.dim_feature * 4, (4, 1), strides=(2, 1), activation='relu')
        
        # Fully connected layers
        self.fc1 = tf.keras.layers.Dense(self.dim_feature * 4, activation='relu')
        self.fc2 = tf.keras.layers.Dense(self.dim_feature * 4, activation='relu')
        self.fc3 = tf.keras.layers.Dense(1024, activation='relu')
        self.fc4 = tf.keras.layers.Dense(190, activation='softmax')
        
        # Attention mechanism layers
        self.attn_fc1 = tf.keras.layers.Dense(self.dim_feature * 4, activation='tanh')
        self.attn_fc2 = tf.keras.layers.Dense(1)

    def call(self, x, pos_enc, num_chunk):
        # Expand dimensions to create [batch, time, frequency, channels]
        net = tf.expand_dims(x, axis=-1)
        
        # 2D Conv. feature extraction
        net = self.conv1(net)
        net = self.conv2(net)
        net = self.conv3(net)
        
        # Global max-pooling
        net = tf.reduce_max(net, axis=1)
        net = tf.squeeze(net, axis=1)
        
        # Restore shape [batch_size, num_chunk, dim_feature * 4]
        net = tf.reshape(net, [-1, num_chunk, self.dim_feature * 4])

        # Apply positional encoding
        attn_net = net + pos_enc
        
        # Attention mechanism
        attn_net = self.fc1(attn_net)
        attn_net = self.fc2(attn_net)
        attn_score = self.attention(attn_net, self.dim_feature * 4)

        # Compute final predictions
        net = self.fc3(net)
        chunk_predictions = self.fc4(net)
        
        # Compute weighted sum with attention scores
        overall_predictions = tf.squeeze(tf.matmul(attn_score, chunk_predictions, transpose_a=True), axis=1)
        return overall_predictions, attn_score

    def attention(self, inputs, dim):
        outputs = self.attn_fc1(inputs)
        outputs = self.attn_fc2(outputs)
        attn_score = tf.nn.softmax(outputs, axis=1)
        return attn_score

    def compute_loss(self, y_true, y_pred):
        return tf.reduce_mean(-tf.reduce_sum(y_true * tf.math.log(y_pred + 1e-10), axis=1))

# Example usage
def train_step(model, x, pos_enc, num_chunk, y_true, optimizer):
    with tf.GradientTape() as tape:
        y_pred, attn_score = model(x, pos_enc, num_chunk)
        loss = model.compute_loss(y_true, y_pred)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss




InvalidArgumentError: Exception encountered when calling layer 'music_highlighter' (type MusicHighlighter).

{{function_node __wrapped__Squeeze_device_/job:localhost/replica:0/task:0/device:CPU:0}} Can not squeeze dim[1], expected a dimension of 1, got 0 [Op:Squeeze] name: 

Call arguments received by layer 'music_highlighter' (type MusicHighlighter):
  • x=tf.Tensor(shape=(1, 100, 128), dtype=float32)
  • pos_enc=tf.Tensor(shape=(1, 100, 256), dtype=float32)
  • num_chunk=100

In [9]:
# Sau khi khởi tạo mô hình

model = MusicHighlighter()

model.summary()

ValueError: This model has not yet been built. Build the model first by calling `build()` or by calling the model on a batch of data.

In [None]:
class MusicHighlighter(object):
    def __init__(self):
        self.dim_feature = 64

        # During training or testing, we both use batch normalization
        self.is_training = tf.compat.v1.placeholder_with_default(False, shape=(), name='is_training')
        self.bn_params = {'is_training': self.is_training,
                          'center': True, 'scale': True,
                          'updates_collections': None}

        # Placeholders for input data
        self.x = tf.compat.v1.placeholder(tf.float32, shape=[None, None, 128], name='x')
        self.pos_enc =  tf.compat.v1.placeholder(tf.float32, shape=[None, None, self.dim_feature * 4], name='pos_enc')
        self.num_chunk =  tf.compat.v1.placeholder(tf.int32, name='num_chunk')

        # Build the model
        self.build_model()
    def conv(self, inputs, filters, kernel, stride):
        dim = inputs.get_shape().as_list()[-2]
        return slim.conv2d(inputs, filters,
                           [kernel, dim], [stride, dim],
                           activation_fn=tf.nn.relu,
                           normalizer_fn=slim.batch_norm,
                           normalizer_params=self.bn_params)

    def fc(self, inputs, num_units, act=tf.nn.relu):
        return slim.fully_connected(inputs, num_units,
                                    activation_fn=act,
                                    normalizer_fn=slim.batch_norm,
                                    normalizer_params=self.bn_params)

    def attention(self, inputs, dim):
        outputs = self.fc(inputs, dim, act=tf.nn.tanh)
        outputs = self.fc(outputs, 1, act=None)
        attn_score = tf.nn.softmax(outputs, axis=1)
        return attn_score

    def build_model(self):
        # 2D Conv. feature extraction
        net = tf.expand_dims(self.x, axis=3)  # [batch_size, time, 128, 1]
        net = self.conv(net, self.dim_feature, 3, 2)
        net = self.conv(net, self.dim_feature * 2, 4, 2)
        net = self.conv(net, self.dim_feature * 4, 4, 2)

        # Global max-pooling
        net = tf.squeeze(tf.reduce_max(net, axis=1), axis=1)

        # Restore shape [batch_size, num_chunk, dim_feature*4]
        net = tf.reshape(net, [1, self.num_chunk, self.dim_feature * 4])

        # Attention mechanism
        attn_net = net + self.pos_enc
        attn_net = self.fc(attn_net, self.dim_feature * 4)
        attn_net = self.fc(attn_net, self.dim_feature * 4)
        self.attn_score = self.attention(attn_net, self.dim_feature * 4)
        # This part is only used in training ##
        # net = self.fc(net, 1024)
        # chunk_predictions = self.fc(net, 18, act=tf.nn.sigmoid)
        # overall_predictions = tf.squeeze(tf.matmul(self.attn_score, chunk_predictions, transpose_a=True), axis=1)
        # loss = tf.reduce_mean(-tf.reduce_sum(self.y * tf.log(overall_predictions), axis=1))
        # Initialize the Saver to restore the model later
        self.saver = tf.compat.v1.train.Saver(tf.compat.v1.global_variables())

    def calculate(self, sess, x, pos_enc, num_chunk):
        # Feed input data and calculate attention score
        feed_dict = {self.x: x, self.pos_enc: pos_enc, self.num_chunk: num_chunk, self.is_training: False}
        attn_score = sess.run(self.attn_score, feed_dict=feed_dict)
        return attn_score