In [7]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras import layers

In [9]:
# C-LSTM with the embedding layer with the pre-trained glove embeddings
class CLSTMClassifier(tf.keras.Model):
    def __init__(self, config, embedding_matrix):
        super(CLSTMClassifier, self).__init__()
        self.max_length = config.max_length
        self.num_classes = config.num_classes
        self.vocab_size = config.vocab_size
        self.embedding_size = config.embedding_size
        self.filter_sizes = list(map(int, config.filter_sizes.split(",")))
        self.num_filters = config.num_filters
        self.num_layers = config.num_layers
        self.hidden_size = len(self.filter_sizes) * self.num_filters
        self.l2_reg_lambda = config.l2_reg_lambda

        # embedding layer initialized with glove embeddings
        self.embedding = layers.Embedding(input_dim=self.vocab_size,
                                          output_dim=self.embedding_size,
                                          input_length=self.max_length,
                                          weights=[embedding_matrix],
                                          trainable=True)

        # conv layers for different filter sizes
        self.conv_layers = [
            layers.Conv2D(filters=self.num_filters,
                          kernel_size=(filter_size, self.embedding_size),
                          activation='relu', padding='valid')
            for filter_size in self.filter_sizes
        ]

        # lstm layer
        self.lstm = layers.LSTM(self.hidden_size, return_sequences=False)

        self.dropout = layers.Dropout(rate=config.keep_prob)

        self.fc = layers.Dense(self.num_classes, activation='softmax', kernel_regularizer=tf.keras.regularizers.L2(self.l2_reg_lambda))

    def call(self, inputs, training=False):
        input_x = inputs
        x = self.embedding(input_x)  # [batch_size, max_length, embedding_size]
        x = tf.expand_dims(x, -1)    # [batch_size, max_length, embedding_size, 1]

        conv_outputs = []
        for conv_layer in self.conv_layers:
            conv = conv_layer(x)
            conv = tf.squeeze(conv, 2)  # squeezing out the 'channels' dimension
            conv_outputs.append(conv)

        # the minimum sequence length across all convolution outputs
        min_length = min([conv.shape[1] for conv in conv_outputs])

        # trimmikng all convolution outputs to the same sequence length
        conv_outputs = [conv[:, :min_length, :] for conv in conv_outputs]

        if len(conv_outputs) > 1:
            rnn_inputs = tf.concat(conv_outputs, -1)  # concat along the last dimension
        else:
            rnn_inputs = conv_outputs[0]

        # feed it to the LSTM
        rnn_outputs = self.lstm(rnn_inputs)

        rnn_outputs = self.dropout(rnn_outputs, training=training)

        # final output layer
        logits = self.fc(rnn_outputs)
        return logits

In [11]:
# hyperparameters
BATCH_SIZE = 64
EPOCHS = 10
MAX_LEN = 500  # max length of sequences (padded or truncated)
VOCAB_SIZE = 5000  # the vocabulary
EMBEDDING_DIM = 300  # glove embedding dimensions

(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=VOCAB_SIZE)

# padding the sequences to ensure uniform input size
x_train = sequence.pad_sequences(x_train, maxlen=MAX_LEN)
x_test = sequence.pad_sequences(x_test, maxlen=MAX_LEN)


def load_glove_embeddings(glove_file_path, embedding_dim):
    embeddings_index = {}
    with open(glove_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            embedding_vector = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = embedding_vector
    return embeddings_index

def create_embedding_matrix(word_index, glove_embeddings, vocab_size, embedding_dim):
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for word, i in word_index.items():
        if i < vocab_size:  # Only consider the top 'vocab_size' words
            embedding_vector = glove_embeddings.get(word)
            if embedding_vector is not None:
                # Words not found in the embedding index will be all zeros.
                embedding_matrix[i] = embedding_vector
    return embedding_matrix

glove_file_path = '/content/drive/MyDrive/glove/glove.6B.300d.txt'
glove_embeddings = load_glove_embeddings(glove_file_path, EMBEDDING_DIM)

word_index = imdb.get_word_index()

embedding_matrix = create_embedding_matrix(word_index, glove_embeddings, VOCAB_SIZE, EMBEDDING_DIM)

class Config:
    max_length = MAX_LEN
    num_classes = 2  # imdb is binary classification (positive/negative)
    vocab_size = VOCAB_SIZE
    embedding_size = EMBEDDING_DIM  # glove embedding dimension
    filter_sizes = "3,4,5"
    num_filters = 64
    num_layers = 1
    l2_reg_lambda = 0.1
    keep_prob = 0.5

config = Config()
model = CLSTMClassifier(config, embedding_matrix)

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

history = model.fit(
    x_train, y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(x_test, y_test),
    verbose=1
)

test_loss, test_acc = model.evaluate(x_test, y_test)
print(f'Test Loss: {test_loss}, Test Accuracy: {test_acc}')




Epoch 1/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m92s[0m 112ms/step - accuracy: 0.5708 - loss: 0.8184 - val_accuracy: 0.6430 - val_loss: 0.6244
Epoch 2/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 40ms/step - accuracy: 0.7935 - loss: 0.4546 - val_accuracy: 0.8782 - val_loss: 0.3180
Epoch 3/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 40ms/step - accuracy: 0.8969 - loss: 0.2759 - val_accuracy: 0.8878 - val_loss: 0.2804
Epoch 4/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 40ms/step - accuracy: 0.9213 - loss: 0.2275 - val_accuracy: 0.8899 - val_loss: 0.2889
Epoch 5/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 40ms/step - accuracy: 0.9408 - loss: 0.1839 - val_accuracy: 0.8842 - val_loss: 0.2919
Epoch 6/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 40ms/step - accuracy: 0.9517 - loss: 0.1545 - val_accuracy: 0.8916 - val_loss: 0.3061
Epoch 7/10
[1m

RUNNING WITH SST (Stanford Sentiment Treebank)

In [12]:
from datasets import load_dataset
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

dataset = load_dataset("glue", "sst2")

train_texts = dataset['train']['sentence']
train_labels = dataset['train']['label']
val_texts = dataset['validation']['sentence']
val_labels = dataset['validation']['label']
test_texts = dataset['test']['sentence']
test_labels = dataset['test']['label']

VOCAB_SIZE = 5000
MAX_LEN = 50

tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(train_texts)

train_sequences = tokenizer.texts_to_sequences(train_texts)
train_padded = pad_sequences(train_sequences, maxlen=MAX_LEN, padding='post')

val_sequences = tokenizer.texts_to_sequences(val_texts)
val_padded = pad_sequences(val_sequences, maxlen=MAX_LEN, padding='post')

test_sequences = tokenizer.texts_to_sequences(test_texts)
test_padded = pad_sequences(test_sequences, maxlen=MAX_LEN, padding='post')

train_labels = np.array(train_labels)
val_labels = np.array(val_labels)
test_labels = np.array(test_labels)
