In [29]:
import tensorflow as tf
import numpy as np
from tensorflow.keras import Input, Model
from tensorflow.keras.layers import Embedding, Lambda, LSTM, Dense, Dropout, Concatenate, Attention

class TextFeature:
    """Builder for the text-modality Keras model.

    The model embeds token ids with a frozen embedding matrix, appends a
    constant fusion vector to every time step, runs a forward and a backward
    LSTM over the result, applies dot-product self-attention and mean-pools
    the attended sequence.
    """

    def __init__(self, vocab_size, embedding_dim, seq_len, hidden_size, fusion_dim, dropout_rate, embedding_matrix):
        """Store the hyper-parameters used by ``model_create``.

        Args:
            vocab_size: ``input_dim`` of the Embedding layer.
            embedding_dim: ``output_dim`` of the Embedding layer.
            seq_len: Length of the token-id input sequence.
            hidden_size: Units of each LSTM and of the final Dense layer.
            fusion_dim: Stored for reference only; ``model_create`` takes the
                fusion width from the ``fusion_vector`` it is given.
            dropout_rate: Passed as the ``dropout`` argument of both LSTMs.
            embedding_matrix: Initial (frozen) weights of the Embedding layer.
        """
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.seq_len = seq_len
        self.hidden_size = hidden_size
        self.fusion_dim = fusion_dim
        self.dropout_rate = dropout_rate
        self.embedding_matrix = embedding_matrix

    def model_create(self, fusion_vector):
        """Build the functional Keras model around a fixed fusion vector.

        Args:
            fusion_vector: Rank-2 value convertible to a float32 constant.
                A single row ``(1, fusion_width)`` is broadcast to every
                sample of the batch; any other leading size is used as-is and
                must then equal the batch size fed to the model.

        Returns:
            A ``Model`` with input ``text_input`` of shape ``(batch, seq_len)``
            and three outputs:
            ``lstm_out``  - ``(batch, seq_len, 2 * hidden_size)``,
            ``mean_vec``  - ``(batch, 2 * hidden_size)``,
            ``fused_vec`` - ``(batch, hidden_size)``.
        """
        text_input = Input(shape=(self.seq_len,), name='text_input')
        embedding_layer = Embedding(
            input_dim=self.vocab_size,
            output_dim=self.embedding_dim,
            weights=[self.embedding_matrix],
            input_length=self.seq_len,
            trainable=False,
        )(text_input)

        fusion_input = tf.constant(fusion_vector, dtype=tf.float32, name='fusion_vector')
        seq_len = self.seq_len

        def _fusion_per_timestep(embedded):
            # (rows, fusion_width) -> (rows, seq_len, fusion_width)
            fusion_seq = tf.tile(tf.expand_dims(fusion_input, axis=1), [1, seq_len, 1])
            if fusion_input.shape[0] == 1:
                # A single fusion row is shared by all samples: repeat it along
                # the (dynamic) batch axis so the concat below works for any
                # batch size, not only for batch size 1.
                fusion_seq = tf.tile(fusion_seq, [tf.shape(embedded)[0], 1, 1])
            return fusion_seq

        fusion_tiled = Lambda(_fusion_per_timestep)(embedding_layer)
        lstm_input = Concatenate(axis=-1)([embedding_layer, fusion_tiled])

        lstm_forward = LSTM(self.hidden_size, return_sequences=True, dropout=self.dropout_rate)
        lstm_backward = LSTM(self.hidden_size, return_sequences=True, dropout=self.dropout_rate, go_backwards=True)
        out_fw = lstm_forward(lstm_input)
        out_bw = lstm_backward(lstm_input)
        # With go_backwards=True the returned sequence is in reversed time
        # order; flip it back so step t of both directions refers to the same
        # token before concatenating.
        out_bw = Lambda(lambda x: tf.reverse(x, axis=[1]))(out_bw)
        lstm_out = Concatenate(axis=-1)([out_fw, out_bw])

        # Self-attention: the BiLSTM sequence is used as both query and value.
        attention = Attention()([lstm_out, lstm_out])
        mean_vec = Lambda(lambda x: tf.reduce_mean(x, axis=1))(attention)
        fused_vec = Dense(self.hidden_size, activation='relu')(mean_vec)
        return Model(inputs=[text_input], outputs=[lstm_out, mean_vec, fused_vec])

In [30]:
class DataLoader:
    """Load a pickled vocabulary and build an embedding matrix from a vector file.

    NOTE(review): a later cell defines another ``DataLoader`` that reads a
    plain-text vocabulary; once that cell runs it replaces this class.
    """

    def __init__(self, vocab_file, vector_file):
        """Remember the paths of the vocabulary pickle and the vector text file."""
        self.vocab_file = vocab_file
        self.vector_file = vector_file

    def get_vocab(self):
        """Return the object unpickled from ``vocab_file``.

        ``load_embedding_matrix`` uses it as a mapping from word to row index.
        """
        # Local import: pickle is not among the imports visible in this notebook.
        import pickle

        # SECURITY: pickle.load can execute arbitrary code - only open files
        # that come from a trusted source.
        # Pickle data is binary, so the file must be opened in 'rb' mode.
        with open(self.vocab_file, 'rb') as f:
            vocab = pickle.load(f)
        return vocab

    def load_embedding_matrix(self, vocab):
        """Build a ``(len(vocab), dim)`` matrix from whitespace-separated vectors.

        Each line of ``vector_file`` is ``word v1 v2 ... v_dim``. Rows of words
        missing from the file stay zero.

        Args:
            vocab: Mapping from word to its row index in the matrix.

        Returns:
            A float ``numpy`` array of shape ``(len(vocab), dim)``.
        """
        with open(self.vector_file, 'r', encoding='utf-8') as f:
            first_line = f.readline()
            # The first token of a line is the word itself, so it does not
            # count towards the vector dimension.
            embedding_dimension = max(len(first_line.split()) - 1, 0)
            embedding_matrix = np.zeros((len(vocab), embedding_dimension))
            f.seek(0)  # rewind so the first line is processed as well
            for line in f:
                values = line.split()
                if not values:
                    continue  # skip blank lines
                word = values[0]
                if word in vocab:
                    embedding_matrix[vocab[word]] = np.asarray(values[1:], dtype='float32')
        return embedding_matrix

In [31]:
class DataLoader:
    """Load a plain-text vocabulary and build an embedding matrix from a vector file."""

    def __init__(self, vocab_file, vector_file):
        """Remember the paths of the vocabulary file and the vector file."""
        self.vocab_file = vocab_file
        self.vector_file = vector_file

    def get_vocab(self):
        """Parse ``vocab_file`` into a ``{word: index}`` dict.

        Every line is expected to hold exactly two whitespace-separated
        fields, ``word index``; lines with any other field count are ignored.
        """
        vocab = {}
        with open(self.vocab_file, 'r', encoding='utf-8') as f:
            for line in f:
                parts = line.split()
                if len(parts) == 2:
                    word, index = parts
                    vocab[word] = int(index)
        return vocab

    def load_embedding_matrix(self, vocab):
        """Build a ``(len(vocab), dim)`` matrix from whitespace-separated vectors.

        Each line of ``vector_file`` is ``word v1 v2 ... v_dim``. Rows of words
        missing from the file stay zero.

        Args:
            vocab: Mapping from word to its row index in the matrix.

        Returns:
            A float ``numpy`` array of shape ``(len(vocab), dim)``.
        """
        with open(self.vector_file, 'r', encoding='utf-8') as f:
            first_line = f.readline()
            # The first token of a line is the word itself, so it does not
            # count towards the vector dimension.
            embedding_dimension = max(len(first_line.split()) - 1, 0)
            embedding_matrix = np.zeros((len(vocab), embedding_dimension))
            f.seek(0)  # rewind so the first line is processed as well
            for line in f:
                values = line.split()
                if not values:
                    continue  # skip blank lines
                word = values[0]
                if word in vocab:
                    embedding_matrix[vocab[word]] = np.asarray(values[1:], dtype='float32')
        return embedding_matrix



The cell below shows an example of how to build the model and print its summary.

NOTE: For the fusion vector, refer to the flowchart shared in the group. The attribute-modality part of the model produces the final fusion vector by concatenating the vectors obtained in that part.

In [32]:
# NOTE(review): machine-specific absolute paths - point these at your local
# copies of the vocabulary and vector files before running.
vocab_file_path = '/Users/akshatsrivastava/Downloads/vocab.txt'
vector_file_path = '/Users/akshatsrivastava/Downloads/vector.txt'

# Load the vocabulary and the pre-trained vectors that initialise the Embedding layer.
data_loader = DataLoader(vocab_file=vocab_file_path, vector_file=vector_file_path)
vocab = data_loader.get_vocab()
embedding_matrix = data_loader.load_embedding_matrix(vocab)

# Model hyper-parameters.
vocab_size = len(vocab)
# Read the embedding width from the loaded matrix instead of hard-coding it,
# so the frozen Embedding weights always match the layer's output_dim.
embedding_dim = embedding_matrix.shape[1]
seq_len = 75
hidden_size = 256
fusion_dim = 128
dropout_rate = 0.2

# Random stand-in for the fusion vector produced by the attribute modality.
dummy_fusion_vector = tf.random.uniform((1, fusion_dim))

text_feature = TextFeature(vocab_size, embedding_dim, seq_len, hidden_size, fusion_dim, dropout_rate, embedding_matrix)
model_with_input = text_feature.model_create(fusion_vector=dummy_fusion_vector)
model_with_input.summary()

In [33]:
# Display the dummy fusion vector created with tf.random.uniform((1, fusion_dim)).
dummy_fusion_vector

<tf.Tensor: shape=(1, 128), dtype=float32, numpy=
array([[0.15271997, 0.43502796, 0.3780625 , 0.47857165, 0.4456017 ,
        0.56018305, 0.58017147, 0.6445726 , 0.48586547, 0.62983954,
        0.4075606 , 0.6657312 , 0.3207897 , 0.6257962 , 0.61284983,
        0.68612003, 0.6839267 , 0.8543855 , 0.46688068, 0.9506557 ,
        0.6016748 , 0.70467365, 0.251804  , 0.27099264, 0.1252563 ,
        0.11441779, 0.2842698 , 0.01917946, 0.9990982 , 0.3985033 ,
        0.2563609 , 0.9770006 , 0.9612322 , 0.79477966, 0.84978235,
        0.93681073, 0.03004217, 0.97958815, 0.88094854, 0.8607477 ,
        0.5723047 , 0.5640781 , 0.07415664, 0.9083953 , 0.60345066,
        0.9472817 , 0.88929844, 0.31198025, 0.20118284, 0.7715012 ,
        0.14483213, 0.02880359, 0.9133599 , 0.325096  , 0.5438969 ,
        0.70141673, 0.38908815, 0.95163894, 0.22169352, 0.01865411,
        0.55895126, 0.95328   , 0.4788072 , 0.35873008, 0.55451953,
        0.5652684 , 0.35448194, 0.1492188 , 0.07788527, 0.9523579 