In [2]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras import layers

In [3]:
# C-LSTM with the embedding layer with the pre-trained glove embeddings
class CLSTMBinaryClassifierIMDB(tf.keras.Model):
    def __init__(self, config, embedding_matrix):
        super(CLSTMBinaryClassifierIMDB, self).__init__()
        self.max_length = config.max_length
        self.num_classes = config.num_classes
        self.vocab_size = config.vocab_size
        self.embedding_size = config.embedding_size
        self.filter_sizes = list(map(int, config.filter_sizes.split(",")))
        self.num_filters = config.num_filters
        self.num_layers = config.num_layers
        self.hidden_size = len(self.filter_sizes) * self.num_filters
        self.l2_reg_lambda = config.l2_reg_lambda

        # embedding layer initialized with glove embeddings
        self.embedding = layers.Embedding(input_dim=self.vocab_size,
                                          output_dim=self.embedding_size,
                                          input_length=self.max_length,
                                          weights=[embedding_matrix],
                                          trainable=True)

        # conv layers for different filter sizes
        self.conv_layers = [
            layers.Conv2D(filters=self.num_filters,
                          kernel_size=(filter_size, self.embedding_size),
                          activation='relu', padding='valid')
            for filter_size in self.filter_sizes
        ]

        # lstm layer
        self.lstm = layers.LSTM(self.hidden_size, return_sequences=False)

        self.dropout = layers.Dropout(rate=config.keep_prob)

        self.fc = layers.Dense(self.num_classes, activation='softmax', kernel_regularizer=tf.keras.regularizers.L2(self.l2_reg_lambda))

    def call(self, inputs, training=False):
        input_x = inputs
        x = self.embedding(input_x)  # [batch_size, max_length, embedding_size]
        x = tf.expand_dims(x, -1)    # [batch_size, max_length, embedding_size, 1]

        conv_outputs = []
        for conv_layer in self.conv_layers:
            conv = conv_layer(x)
            conv = tf.squeeze(conv, 2)  # squeezing out the 'channels' dimension
            conv_outputs.append(conv)

        # the minimum sequence length across all convolution outputs
        min_length = min([conv.shape[1] for conv in conv_outputs])

        # trimmikng all convolution outputs to the same sequence length
        conv_outputs = [conv[:, :min_length, :] for conv in conv_outputs]

        if len(conv_outputs) > 1:
            rnn_inputs = tf.concat(conv_outputs, -1)  # concat along the last dimension
        else:
            rnn_inputs = conv_outputs[0]

        # feed it to the LSTM
        rnn_outputs = self.lstm(rnn_inputs)

        rnn_outputs = self.dropout(rnn_outputs, training=training)

        # final output layer
        logits = self.fc(rnn_outputs)
        return logits

In [5]:
# hyperparameters
BATCH_SIZE = 64
EPOCHS = 10
MAX_LEN = 500  # max length of sequences (padded or truncated)
VOCAB_SIZE = 5000  # the vocabulary
EMBEDDING_DIM = 300  # glove embedding dimensions

BINARY C-LSTM ON IMDB

In [8]:
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=VOCAB_SIZE)

# padding the sequences to ensure uniform input size
x_train = sequence.pad_sequences(x_train, maxlen=MAX_LEN)
x_test = sequence.pad_sequences(x_test, maxlen=MAX_LEN)


def load_glove_embeddings(glove_file_path, embedding_dim):
    embeddings_index = {}
    with open(glove_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            embedding_vector = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = embedding_vector
    return embeddings_index

def create_embedding_matrix(word_index, glove_embeddings, vocab_size, embedding_dim):
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for word, i in word_index.items():
        if i < vocab_size:  # Only consider the top 'vocab_size' words
            embedding_vector = glove_embeddings.get(word)
            if embedding_vector is not None:
                # Words not found in the embedding index will be all zeros.
                embedding_matrix[i] = embedding_vector
    return embedding_matrix

glove_file_path = '/content/drive/MyDrive/glove/glove.6B.300d.txt'
glove_embeddings = load_glove_embeddings(glove_file_path, EMBEDDING_DIM)

word_index = imdb.get_word_index()

embedding_matrix = create_embedding_matrix(word_index, glove_embeddings, VOCAB_SIZE, EMBEDDING_DIM)

class Config:
    max_length = MAX_LEN
    num_classes = 2  # imdb is binary classification (positive/negative)
    vocab_size = VOCAB_SIZE
    embedding_size = EMBEDDING_DIM  # glove embedding dimension
    filter_sizes = "3,4,5"
    num_filters = 64
    num_layers = 1
    l2_reg_lambda = 0.1
    keep_prob = 0.5

config = Config()
model = CLSTMBinaryClassifierIMDB(config, embedding_matrix)

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

history = model.fit(
    x_train, y_train,
    batch_size=BATCH_SIZE,
    epochs=10,
    validation_data=(x_test, y_test),
    verbose=1
)

test_loss, test_acc = model.evaluate(x_test, y_test)
print(f'Test Loss: {test_loss}, Test Accuracy: {test_acc}')



Epoch 1/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 70ms/step - accuracy: 0.5652 - loss: 0.8189 - val_accuracy: 0.6956 - val_loss: 0.6310
Epoch 2/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 35ms/step - accuracy: 0.7890 - loss: 0.4771 - val_accuracy: 0.8730 - val_loss: 0.3244
Epoch 3/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 35ms/step - accuracy: 0.8893 - loss: 0.2974 - val_accuracy: 0.8896 - val_loss: 0.2933
Epoch 4/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 35ms/step - accuracy: 0.9200 - loss: 0.2335 - val_accuracy: 0.8934 - val_loss: 0.2822
Epoch 5/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 35ms/step - accuracy: 0.9342 - loss: 0.1941 - val_accuracy: 0.8850 - val_loss: 0.2927
Epoch 6/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 35ms/step - accuracy: 0.9543 - loss: 0.1529 - val_accuracy: 0.8897 - val_loss: 0.3024
Epoch 7/10
[1m3

FINE-GRAINED C-LSTM ON IMDB

In [10]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras import layers
from sklearn.preprocessing import KBinsDiscretizer

# Function to simulate fine-grained labels (Here, we use a placeholder for fine-grained sentiment)
# You need a dataset with true fine-grained labels, but for simulation, we'll bin binary labels into 5 classes
def convert_to_fine_grained_labels(binary_labels, n_classes=5):
    est = KBinsDiscretizer(n_bins=n_classes, encode='ordinal', strategy='uniform')
    fine_grained_labels = est.fit_transform(binary_labels.reshape(-1, 1))
    return fine_grained_labels.astype(int).reshape(-1)

# C-LSTM with the embedding layer and pre-trained glove embeddings
class CLSTMFineGrainedClassifier(tf.keras.Model):
    def __init__(self, config, embedding_matrix):
        super(CLSTMFineGrainedClassifier, self).__init__()
        self.max_length = config.max_length
        self.num_classes = config.num_classes
        self.vocab_size = config.vocab_size
        self.embedding_size = config.embedding_size
        self.filter_sizes = list(map(int, config.filter_sizes.split(",")))
        self.num_filters = config.num_filters
        self.num_layers = config.num_layers
        self.hidden_size = len(self.filter_sizes) * self.num_filters
        self.l2_reg_lambda = config.l2_reg_lambda

        # Embedding layer initialized with glove embeddings
        self.embedding = layers.Embedding(input_dim=self.vocab_size,
                                          output_dim=self.embedding_size,
                                          input_length=self.max_length,
                                          weights=[embedding_matrix],
                                          trainable=True)

        # Conv layers for different filter sizes
        self.conv_layers = [
            layers.Conv2D(filters=self.num_filters,
                          kernel_size=(filter_size, self.embedding_size),
                          activation='relu', padding='valid')
            for filter_size in self.filter_sizes
        ]

        # LSTM layer
        self.lstm = layers.LSTM(self.hidden_size, return_sequences=False)

        self.dropout = layers.Dropout(rate=config.keep_prob)

        # Final output layer for fine-grained classification (5 classes)
        self.fc = layers.Dense(self.num_classes, activation='softmax', kernel_regularizer=tf.keras.regularizers.L2(self.l2_reg_lambda))

    def call(self, inputs, training=False):
        input_x = inputs
        x = self.embedding(input_x)  # [batch_size, max_length, embedding_size]
        x = tf.expand_dims(x, -1)    # [batch_size, max_length, embedding_size, 1]

        conv_outputs = []
        for conv_layer in self.conv_layers:
            conv = conv_layer(x)
            conv = tf.squeeze(conv, 2)  # squeezing out the 'channels' dimension
            conv_outputs.append(conv)

        # The minimum sequence length across all convolution outputs
        min_length = min([conv.shape[1] for conv in conv_outputs])

        # Trimming all convolution outputs to the same sequence length
        conv_outputs = [conv[:, :min_length, :] for conv in conv_outputs]

        if len(conv_outputs) > 1:
            rnn_inputs = tf.concat(conv_outputs, -1)  # Concatenate along the last dimension
        else:
            rnn_inputs = conv_outputs[0]

        # Feed to the LSTM
        rnn_outputs = self.lstm(rnn_inputs)

        rnn_outputs = self.dropout(rnn_outputs, training=training)

        # Final output layer for fine-grained classification
        logits = self.fc(rnn_outputs)
        return logits

# Hyperparameters
BATCH_SIZE = 64
EPOCHS = 10
MAX_LEN = 500  # Max length of sequences (padded or truncated)
VOCAB_SIZE = 5000  # The vocabulary
EMBEDDING_DIM = 300  # GloVe embedding dimensions
NUM_CLASSES = 5  # Number of fine-grained sentiment classes (Very Negative, Negative, Neutral, Positive, Very Positive)

# Load the IMDb dataset
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=VOCAB_SIZE)

# Simulate fine-grained labels (binned into 5 classes)
y_train_fine = convert_to_fine_grained_labels(y_train)
y_test_fine = convert_to_fine_grained_labels(y_test)

# Padding the sequences to ensure uniform input size
x_train = sequence.pad_sequences(x_train, maxlen=MAX_LEN)
x_test = sequence.pad_sequences(x_test, maxlen=MAX_LEN)

def load_glove_embeddings(glove_file_path, embedding_dim):
    embeddings_index = {}
    with open(glove_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            embedding_vector = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = embedding_vector
    return embeddings_index

def create_embedding_matrix(word_index, glove_embeddings, vocab_size, embedding_dim):
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for word, i in word_index.items():
        if i < vocab_size:  # Only consider the top 'vocab_size' words
            embedding_vector = glove_embeddings.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
    return embedding_matrix

glove_file_path = '/content/drive/MyDrive/glove/glove.6B.300d.txt'
glove_embeddings = load_glove_embeddings(glove_file_path, EMBEDDING_DIM)

word_index = imdb.get_word_index()

embedding_matrix = create_embedding_matrix(word_index, glove_embeddings, VOCAB_SIZE, EMBEDDING_DIM)

# Config for fine-grained classification
class Config:
    max_length = MAX_LEN
    num_classes = NUM_CLASSES  # Fine-grained classification (5 classes)
    vocab_size = VOCAB_SIZE
    embedding_size = EMBEDDING_DIM  # GloVe embedding dimension
    filter_sizes = "3,4,5"
    num_filters = 64
    num_layers = 1
    l2_reg_lambda = 0.1
    keep_prob = 0.5

config = Config()
model = CLSTMFineGrainedClassifier(config, embedding_matrix)

# Compile model for fine-grained classification
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(
    x_train, y_train_fine,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(x_test, y_test_fine),
    verbose=1
)

# Evaluate on test data
test_loss, test_acc = model.evaluate(x_test, y_test_fine)
print(f'Test Loss: {test_loss}, Test Accuracy: {test_acc}')


Epoch 1/10




[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 35ms/step - accuracy: 0.5528 - loss: 1.1647 - val_accuracy: 0.7341 - val_loss: 0.6335
Epoch 2/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 35ms/step - accuracy: 0.7707 - loss: 0.5450 - val_accuracy: 0.8513 - val_loss: 0.4060
Epoch 3/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 35ms/step - accuracy: 0.8844 - loss: 0.3435 - val_accuracy: 0.8850 - val_loss: 0.3313
Epoch 4/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 35ms/step - accuracy: 0.9202 - loss: 0.2629 - val_accuracy: 0.8920 - val_loss: 0.3226
Epoch 5/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 35ms/step - accuracy: 0.9398 - loss: 0.2177 - val_accuracy: 0.8846 - val_loss: 0.3395
Epoch 6/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 35ms/step - accuracy: 0.9568 - loss: 0.1816 - val_accuracy: 0.8898 - val_loss: 0.3456
Epoch 7/10
[1m391/391[0m 

BINARY C-LSTM ON SST

In [13]:
# !pip install datasets

In [15]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers
from datasets import load_dataset

# Load SST dataset from Hugging Face
dataset = load_dataset("sst", trust_remote_code=True)

# Extract sentences and fine-grained labels
train_sentences = dataset['train']['sentence']
train_fine_labels = dataset['train']['label']

valid_sentences = dataset['validation']['sentence']
valid_fine_labels = dataset['validation']['label']

test_sentences = dataset['test']['sentence']
test_fine_labels = dataset['test']['label']

# Convert fine-grained labels to binary labels for SST-2 (0: negative, 1: positive)
def convert_to_binary_labels(labels):
    binary_labels = [0 if label < 2 else 1 for label in labels]
    return binary_labels

# Convert labels for binary classification
train_binary_labels = convert_to_binary_labels(train_fine_labels)
valid_binary_labels = convert_to_binary_labels(valid_fine_labels)
test_binary_labels = convert_to_binary_labels(test_fine_labels)

# Tokenization and padding
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_sentences)

VOCAB_SIZE = len(tokenizer.word_index) + 1
MAX_LEN = 50  # Max sequence length

# Updated Config with dynamic variables
class Config:
    def __init__(self, max_length, vocab_size, embedding_size=300, l2_reg_lambda=0.0015, keep_prob=0.6):
        self.max_length = max_length
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size  # GloVe embedding dimension
        self.l2_reg_lambda = l2_reg_lambda
        self.keep_prob = keep_prob
        self.num_classes = 2  # Binary classification (SST-2)

config = Config(max_length=MAX_LEN, vocab_size=VOCAB_SIZE)

# Convert sentences to sequences and pad them
train_sequences = pad_sequences(tokenizer.texts_to_sequences(train_sentences), maxlen=MAX_LEN)
valid_sequences = pad_sequences(tokenizer.texts_to_sequences(valid_sentences), maxlen=MAX_LEN)
test_sequences = pad_sequences(tokenizer.texts_to_sequences(test_sentences), maxlen=MAX_LEN)

# C-LSTM Model for Binary Classification (SST-2)
class CLSTMBinaryClassifierSST(tf.keras.Model):
    def __init__(self, config, embedding_matrix):
        super(CLSTMBinaryClassifierSST, self).__init__()
        self.max_length = config.max_length
        self.embedding_size = config.embedding_size
        self.num_filters = 150  # As per the paper
        self.hidden_size = 150  # LSTM hidden units
        self.l2_reg_lambda = config.l2_reg_lambda

        # Embedding layer initialized with GloVe embeddings (trainable)
        self.embedding = layers.Embedding(input_dim=config.vocab_size,
                                          output_dim=config.embedding_size,
                                          input_length=config.max_length,
                                          weights=[embedding_matrix],
                                          trainable=True)

        # Dropout for embedding layer
        self.embedding_dropout = layers.Dropout(rate=config.keep_prob)

        # Single conv layer with filter size 3 + Batch Norm (No pooling as per the paper)
        self.conv_layer = layers.Conv2D(filters=self.num_filters,
                                        kernel_size=(3, config.embedding_size),
                                        activation='relu', padding='valid')
        self.batch_norm = layers.BatchNormalization()

        # LSTM layer to capture long-term dependencies
        self.lstm = layers.LSTM(self.hidden_size, return_sequences=False)

        # Dropout after LSTM
        self.dropout = layers.Dropout(rate=config.keep_prob)

        # Output layer for binary classification (SST-2)
        self.fc_binary = layers.Dense(2, activation='softmax', kernel_regularizer=tf.keras.regularizers.L2(self.l2_reg_lambda))

    def call(self, inputs, training=False):
        x = self.embedding(inputs)
        x = self.embedding_dropout(x, training=training)
        x = tf.expand_dims(x, -1)

        # Apply convolution layer and batch normalization
        conv = self.conv_layer(x)
        conv = self.batch_norm(conv, training=training)
        conv = tf.squeeze(conv, 2)

        # Feed the convolution output to LSTM
        rnn_outputs = self.lstm(conv)

        # Apply dropout
        rnn_outputs = self.dropout(rnn_outputs, training=training)

        # Output for binary classification (SST-2)
        binary_output = self.fc_binary(rnn_outputs)

        return binary_output

# Load GloVe embeddings
def load_glove_embeddings(glove_file_path, embedding_dim):
    embeddings_index = {}
    with open(glove_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            embedding_vector = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = embedding_vector
    return embeddings_index

# Create embedding matrix
def create_embedding_matrix(word_index, glove_embeddings, vocab_size, embedding_dim):
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for word, i in word_index.items():
        if i < vocab_size:  # Only consider words in vocab
            embedding_vector = glove_embeddings.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
    return embedding_matrix

# Load pre-trained GloVe embeddings
glove_file_path = "/content/drive/MyDrive/glove/glove.6B.300d.txt"  # Update this path
glove_embeddings = load_glove_embeddings(glove_file_path, embedding_dim=300)

# Initialize the embedding matrix using the tokenizer word index
embedding_matrix = create_embedding_matrix(tokenizer.word_index, glove_embeddings, VOCAB_SIZE, embedding_dim=300)

# Train the model on SST (Binary classification: SST-2)
def compile_and_train_model(config):
    # Pass the pre-loaded embedding matrix
    model = CLSTMBinaryClassifierSST(config, embedding_matrix=embedding_matrix)

    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)

    # Compile the model for binary classification
    model.compile(
        optimizer=optimizer,
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )

    # Convert labels to numpy arrays
    train_binary_labels_np = np.array(train_binary_labels)
    valid_binary_labels_np = np.array(valid_binary_labels)

    # Train the model
    history = model.fit(
        np.array(train_sequences),
        train_binary_labels_np,
        batch_size=64,
        epochs=20,
        validation_data=(np.array(valid_sequences), valid_binary_labels_np),
        verbose=1
    )

    return model

# Train and evaluate the model
model = compile_and_train_model(config)

# Evaluate on test data
test_binary_labels_np = np.array(test_binary_labels)
test_loss, test_acc = model.evaluate(np.array(test_sequences), test_binary_labels_np)

print(f'Test Loss: {test_loss}, Test Accuracy: {test_acc}')


Downloading data:   0%|          | 0.00/6.37M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/790k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8544 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1101 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2210 [00:00<?, ? examples/s]

Epoch 1/20




[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 15ms/step - accuracy: 0.9411 - loss: 0.1497 - val_accuracy: 1.0000 - val_loss: 0.3405
Epoch 2/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.9986 - loss: 0.0116 - val_accuracy: 1.0000 - val_loss: 0.1571
Epoch 3/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 1.0000 - loss: 0.0070 - val_accuracy: 1.0000 - val_loss: 0.0148
Epoch 4/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 1.0000 - loss: 0.0063 - val_accuracy: 1.0000 - val_loss: 0.0060
Epoch 5/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 1.0000 - loss: 0.0059 - val_accuracy: 1.0000 - val_loss: 0.0056
Epoch 6/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 1.0000 - loss: 0.0058 - val_accuracy: 1.0000 - val_loss: 0.0055
Epoch 7/20
[1m134/134[0m [32m━━━━━━

BINARY C-LSTM ON SST

In [19]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers
from datasets import load_dataset

# Load SST dataset from Hugging Face
dataset = load_dataset("sst")

# Extract sentences and fine-grained labels for SST-5
train_sentences = dataset['train']['sentence']
train_fine_labels = dataset['train']['label']

valid_sentences = dataset['validation']['sentence']
valid_fine_labels = dataset['validation']['label']

test_sentences = dataset['test']['sentence']
test_fine_labels = dataset['test']['label']

# Tokenization and padding
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_sentences)

VOCAB_SIZE = len(tokenizer.word_index) + 1
MAX_LEN = 50  # Max sequence length

# Updated Config with dynamic variables for fine-grained classification
class Config:
    def __init__(self, max_length, vocab_size, embedding_size=300, l2_reg_lambda=0.0015, keep_prob=0.5):
        self.max_length = max_length
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size  # GloVe embedding dimension
        self.l2_reg_lambda = l2_reg_lambda
        self.keep_prob = keep_prob
        self.num_classes = 5  # Fine-grained classification (SST-5)

config = Config(max_length=MAX_LEN, vocab_size=VOCAB_SIZE)

# Convert sentences to sequences and pad them
train_sequences = pad_sequences(tokenizer.texts_to_sequences(train_sentences), maxlen=MAX_LEN)
valid_sequences = pad_sequences(tokenizer.texts_to_sequences(valid_sentences), maxlen=MAX_LEN)
test_sequences = pad_sequences(tokenizer.texts_to_sequences(test_sentences), maxlen=MAX_LEN)

# Load GloVe embeddings
def load_glove_embeddings(glove_file_path, embedding_dim):
    embeddings_index = {}
    with open(glove_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            embedding_vector = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = embedding_vector
    return embeddings_index

# Create embedding matrix
def create_embedding_matrix(word_index, glove_embeddings, vocab_size, embedding_dim):
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for word, i in word_index.items():
        if i < vocab_size:  # Only consider words in vocab
            embedding_vector = glove_embeddings.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
    return embedding_matrix

# Load pre-trained GloVe embeddings
glove_file_path = "/content/drive/MyDrive/glove/glove.6B.300d.txt"  # Update the path
glove_embeddings = load_glove_embeddings(glove_file_path, embedding_dim=300)

# Initialize the embedding matrix using the tokenizer word index
embedding_matrix = create_embedding_matrix(tokenizer.word_index, glove_embeddings, VOCAB_SIZE, embedding_dim=300)

# C-LSTM Model for Fine-Grained Classification (SST-5)
class CLSTMFineGrainedClassifierSST(tf.keras.Model):
    def __init__(self, config, embedding_matrix):
        super(CLSTMFineGrainedClassifierSST, self).__init__()
        self.max_length = config.max_length
        self.embedding_size = config.embedding_size
        self.num_filters = 150  # As per the paper
        self.hidden_size = 150  # LSTM hidden units
        self.l2_reg_lambda = config.l2_reg_lambda

        # Embedding layer initialized with GloVe embeddings (trainable)
        self.embedding = layers.Embedding(input_dim=config.vocab_size,
                                          output_dim=config.embedding_size,
                                          input_length=config.max_length,
                                          weights=[embedding_matrix],
                                          trainable=True)

        # Dropout for embedding layer
        self.embedding_dropout = layers.Dropout(rate=config.keep_prob)

        # Single conv layer with filter size 3 + Batch Norm (No pooling as per the paper)
        self.conv_layer = layers.Conv2D(filters=self.num_filters,
                                        kernel_size=(3, config.embedding_size),
                                        activation='relu', padding='valid')
        self.batch_norm = layers.BatchNormalization()

        # LSTM layer to capture long-term dependencies
        self.lstm = layers.LSTM(self.hidden_size, return_sequences=False)

        # Dropout after LSTM
        self.dropout = layers.Dropout(rate=config.keep_prob)

        # Output layer for fine-grained classification (SST-5)
        self.fc_fine = layers.Dense(5, activation='softmax', kernel_regularizer=tf.keras.regularizers.L2(self.l2_reg_lambda))

    def call(self, inputs, training=False):
        x = self.embedding(inputs)
        x = self.embedding_dropout(x, training=training)
        x = tf.expand_dims(x, -1)

        # Apply convolution layer and batch normalization
        conv = self.conv_layer(x)
        conv = self.batch_norm(conv, training=training)
        conv = tf.squeeze(conv, 2)

        # Feed the convolution output to LSTM
        rnn_outputs = self.lstm(conv)

        # Apply dropout
        rnn_outputs = self.dropout(rnn_outputs, training=training)

        # Output for fine-grained classification (SST-5)
        fine_output = self.fc_fine(rnn_outputs)

        return fine_output

# Train the model on SST (Fine-Grained Classification: SST-5)
def compile_and_train_model(config):
    # Pass the pre-loaded embedding matrix
    model = CLSTMFineGrainedClassifierSST(config, embedding_matrix=embedding_matrix)

    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)

    # Compile the model for fine-grained classification (5 classes)
    model.compile(
        optimizer=optimizer,
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )

    # Convert labels to numpy arrays
    train_fine_labels_np = np.array(train_fine_labels)
    valid_fine_labels_np = np.array(valid_fine_labels)

    # Train the model
    history = model.fit(
        np.array(train_sequences),
        train_fine_labels_np,
        batch_size=64,
        epochs=20,
        validation_data=(np.array(valid_sequences), valid_fine_labels_np),
        verbose=1
    )

    return model

# Train and evaluate the model
model = compile_and_train_model(config)

# Evaluate on test data
test_fine_labels_np = np.array(test_fine_labels)
test_loss, test_acc = model.evaluate(np.array(test_sequences), test_fine_labels_np)

print(f'Test Loss: {test_loss}, Test Accuracy: {test_acc}')


Epoch 1/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 10ms/step - accuracy: 0.0016 - loss: 0.3100 - val_accuracy: 0.0036 - val_loss: 0.8444
Epoch 2/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.0012 - loss: 0.0353 - val_accuracy: 0.0036 - val_loss: 0.5418
Epoch 3/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.0023 - loss: 0.0296 - val_accuracy: 0.0036 - val_loss: 0.0826
Epoch 4/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.0019 - loss: 0.0342 - val_accuracy: 0.0036 - val_loss: 0.0442
Epoch 5/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.0023 - loss: 0.0249 - val_accuracy: 0.0036 - val_loss: 0.0418
Epoch 6/20
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.0021 - loss: 0.0258 - val_accuracy: 0.0036 - val_loss: 0.0420
Epoch 7/20
[1m134/134[0m 


BINARY C-LSTM ON SST AMAZON REVIEWS DATASET

In [28]:

# C-LSTM with the embedding layer with the pre-trained glove embeddings
class CLSTMBinaryAmazonReviewsClassifier(tf.keras.Model):
    def __init__(self, config, embedding_matrix):
        super(CLSTMBinaryAmazonReviewsClassifier, self).__init__()
        self.max_length = config.max_length
        self.num_classes = config.num_classes
        self.vocab_size = config.vocab_size
        self.embedding_size = config.embedding_size
        self.filter_sizes = list(map(int, config.filter_sizes.split(",")))
        self.num_filters = config.num_filters
        self.num_layers = config.num_layers
        self.hidden_size = len(self.filter_sizes) * self.num_filters
        self.l2_reg_lambda = config.l2_reg_lambda

        # embedding layer initialized with glove embeddings
        self.embedding = layers.Embedding(input_dim=self.vocab_size,
                                          output_dim=self.embedding_size,
                                          input_length=self.max_length,
                                          weights=[embedding_matrix],
                                          trainable=True)

        # conv layers for different filter sizes
        self.conv_layers = [
            layers.Conv2D(filters=self.num_filters,
                          kernel_size=(filter_size, self.embedding_size),
                          activation='relu', padding='valid')
            for filter_size in self.filter_sizes
        ]

        # lstm layer
        self.lstm = layers.LSTM(self.hidden_size, return_sequences=False)

        self.dropout = layers.Dropout(rate=config.keep_prob)

        self.fc = layers.Dense(self.num_classes, activation='softmax', kernel_regularizer=tf.keras.regularizers.L2(self.l2_reg_lambda))

    def call(self, inputs, training=False):
        input_x = inputs
        x = self.embedding(input_x)  # [batch_size, max_length, embedding_size]
        x = tf.expand_dims(x, -1)    # [batch_size, max_length, embedding_size, 1]

        conv_outputs = []
        for conv_layer in self.conv_layers:
            conv = conv_layer(x)
            conv = tf.squeeze(conv, 2)  # squeezing out the 'channels' dimension
            conv_outputs.append(conv)

        # the minimum sequence length across all convolution outputs
        min_length = min([conv.shape[1] for conv in conv_outputs])

        # trimmikng all convolution outputs to the same sequence length
        conv_outputs = [conv[:, :min_length, :] for conv in conv_outputs]

        if len(conv_outputs) > 1:
            rnn_inputs = tf.concat(conv_outputs, -1)  # concat along the last dimension
        else:
            rnn_inputs = conv_outputs[0]

        # feed it to the LSTM
        rnn_outputs = self.lstm(rnn_inputs)

        rnn_outputs = self.dropout(rnn_outputs, training=training)

        # final output layer
        logits = self.fc(rnn_outputs)
        return logits

In [29]:
# hyperparameters
BATCH_SIZE = 64
EPOCHS = 10
MAX_LEN = 500  # max length of sequences (padded or truncated)
VOCAB_SIZE = 5000  # the vocabulary
EMBEDDING_DIM = 300  # glove embedding dimensions

In [24]:
from datasets import load_dataset
import numpy as np

dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_review_All_Beauty", trust_remote_code=True)

reviews = [item['text'] for item in dataset['full']]
labels = [1 if item['rating'] >= 3 else 0 for item in dataset['full']]

print(f"First review: {reviews[0]}")
print(f"First label: {labels[0]}")

Amazon-Reviews-2023.py:   0%|          | 0.00/39.6k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/19.7k [00:00<?, ?B/s]

All_Beauty.jsonl:   0%|          | 0.00/327M [00:00<?, ?B/s]

Generating full split: 0 examples [00:00, ? examples/s]

First review: This spray is really nice. It smells really good, goes on really fine, and does the trick. I will say it feels like you need a lot of it though to get the texture I want. I have a lot of hair, medium thickness. I am comparing to other brands with yucky chemicals so I'm gonna stick with this. Try it!
First label: 1


In [None]:

review_lengths = [len(review.split()) for review in reviews]
avg_review_length = np.mean(review_lengths)
max_review_length = np.max(review_lengths)
print(f"Average review length: {avg_review_length}")
print(f"Maximum review length: {max_review_length}")

tokeniser = Tokenizer()
tokeniser.fit_on_texts(reviews)
total_unique_words = len(tokeniser.word_index)
print(f"Total unique words in the dataset: {total_unique_words}")


import collections
rating_distribution = collections.Counter([item['rating'] for item in dataset['full']])
print(f"Rating distribution: {rating_distribution}")

In [31]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

tokeniser = Tokenizer(num_words=10000)  # setting vocab_size to 10,000 as per the updated config
tokeniser.fit_on_texts(reviews)  # fitting the tokeniser on the Amazon reviews dataset
sequences = tokeniser.texts_to_sequences(reviews)

# pad the sequences to the max length of 300
x_data = pad_sequences(sequences, maxlen=300)

# labels to numpy array
y_data = np.array(labels)

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=42)

def load_glove_embeddings(glove_file_path, embedding_dim):
    embeddings_index = {}
    with open(glove_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            embedding_vector = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = embedding_vector
    return embeddings_index

def create_embedding_matrix(word_index, glove_embeddings, vocab_size, embedding_dim):
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for word, i in word_index.items():
        if i < vocab_size:  # Only consider the top 'vocab_size' words
            embedding_vector = glove_embeddings.get(word)
            if embedding_vector is not None:
                # Words not found in the embedding index will be all zeros.
                embedding_matrix[i] = embedding_vector
    return embedding_matrix

glove_file_path = '/content/drive/MyDrive/glove/glove.6B.300d.txt'
glove_embeddings = load_glove_embeddings(glove_file_path, 300)

# embedding matrix
word_index = tokenizer.word_index
embedding_matrix = create_embedding_matrix(word_index, glove_embeddings, 10000, 300)

class Config:
    max_length = 300  # based on the average review length analysis
    num_classes = 2  # binary classification (positive/negative)
    vocab_size = 10000  # limit vocabulary size to top 10,000 words
    embedding_size = 300  #  300-dimensional GloVe embeddings
    filter_sizes = "3,4,5"  # convolution filter sizes
    num_filters = 64  # num of filters for each filter size
    num_layers = 1  #  lstm layer
    l2_reg_lambda = 0.1  # L2 regularisation to prevent overfitting
    keep_prob = 0.5  # dropout probability

# init the C-LSTM model with the updated configuration and embedding matrix
config = Config()
model = CLSTMBinaryAmazonReviewsClassifier(config, embedding_matrix)

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

BATCH_SIZE = 64
EPOCHS = 10

history = model.fit(
    x_train, y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(x_test, y_test),
    verbose=1
)



Epoch 1/10
[1m8770/8770[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m169s[0m 18ms/step - accuracy: 0.8900 - loss: 0.2832 - val_accuracy: 0.9227 - val_loss: 0.1947
Epoch 2/10
[1m8770/8770[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m153s[0m 17ms/step - accuracy: 0.9297 - loss: 0.1810 - val_accuracy: 0.9247 - val_loss: 0.1909
Epoch 3/10
[1m8770/8770[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m153s[0m 17ms/step - accuracy: 0.9368 - loss: 0.1654 - val_accuracy: 0.9267 - val_loss: 0.1940
Epoch 4/10
[1m8770/8770[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m152s[0m 17ms/step - accuracy: 0.9435 - loss: 0.1517 - val_accuracy: 0.9260 - val_loss: 0.1932
Epoch 5/10
[1m8770/8770[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m152s[0m 17ms/step - accuracy: 0.9496 - loss: 0.1399 - val_accuracy: 0.9229 - val_loss: 0.1973
Epoch 6/10
[1m8770/8770[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m153s[0m 17ms/step - accuracy: 0.9546 - loss: 0.1287 - val_accuracy: 0.9229 - val_loss: 0.206

FINE-GRAINED C-LSTM ON AMAZON REVIEWS DATASET

In [33]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

# C-LSTM for Fine-Grained Classification with GloVe embeddings
class CLSTMFineGrainedAmazonReviewsClassifier(tf.keras.Model):
    def __init__(self, config, embedding_matrix):
        super(CLSTMFineGrainedAmazonReviewsClassifier, self).__init__()
        self.max_length = config.max_length
        self.num_classes = config.num_classes
        self.vocab_size = config.vocab_size
        self.embedding_size = config.embedding_size
        self.filter_sizes = list(map(int, config.filter_sizes.split(",")))
        self.num_filters = config.num_filters
        self.num_layers = config.num_layers
        self.hidden_size = len(self.filter_sizes) * self.num_filters
        self.l2_reg_lambda = config.l2_reg_lambda

        # embedding layer initialized with GloVe embeddings
        self.embedding = layers.Embedding(input_dim=self.vocab_size,
                                          output_dim=self.embedding_size,
                                          input_length=self.max_length,
                                          weights=[embedding_matrix],
                                          trainable=True)

        # conv layers for different filter sizes
        self.conv_layers = [
            layers.Conv2D(filters=self.num_filters,
                          kernel_size=(filter_size, self.embedding_size),
                          activation='relu', padding='valid')
            for filter_size in self.filter_sizes
        ]

        # lstm layer
        self.lstm = layers.LSTM(self.hidden_size, return_sequences=False)

        self.dropout = layers.Dropout(rate=config.keep_prob)

        self.fc = layers.Dense(self.num_classes, activation='softmax', kernel_regularizer=tf.keras.regularizers.L2(self.l2_reg_lambda))

    def call(self, inputs, training=False):
        input_x = inputs
        x = self.embedding(input_x)  # [batch_size, max_length, embedding_size]
        x = tf.expand_dims(x, -1)    # [batch_size, max_length, embedding_size, 1]

        conv_outputs = []
        for conv_layer in self.conv_layers:
            conv = conv_layer(x)
            conv = tf.squeeze(conv, 2)  # squeezing out the 'channels' dimension
            conv_outputs.append(conv)

        # the minimum sequence length across all convolution outputs
        min_length = min([conv.shape[1] for conv in conv_outputs])

        # trimming all convolution outputs to the same sequence length
        conv_outputs = [conv[:, :min_length, :] for conv in conv_outputs]

        if len(conv_outputs) > 1:
            rnn_inputs = tf.concat(conv_outputs, -1)  # concat along the last dimension
        else:
            rnn_inputs = conv_outputs[0]

        # feed it to the LSTM
        rnn_outputs = self.lstm(rnn_inputs)

        rnn_outputs = self.dropout(rnn_outputs, training=training)

        # final output layer
        logits = self.fc(rnn_outputs)
        return logits

# Hyperparameters
BATCH_SIZE = 64
EPOCHS = 10
MAX_LEN = 300  # max length of sequences (padded or truncated)
VOCAB_SIZE = 10000  # vocabulary size
EMBEDDING_DIM = 300  # GloVe embedding dimensions

# Tokenizer and padding
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(reviews)  # assuming `reviews` contains the Amazon reviews text

sequences = tokenizer.texts_to_sequences(reviews)
x_data = pad_sequences(sequences, maxlen=MAX_LEN)

# Labels for fine-grained classification (assuming labels contain ratings 1 to 5)
y_data = np.array(labels)

# Splitting the data
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=42)

# Load GloVe embeddings
def load_glove_embeddings(glove_file_path, embedding_dim):
    embeddings_index = {}
    with open(glove_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            embedding_vector = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = embedding_vector
    return embeddings_index

def create_embedding_matrix(word_index, glove_embeddings, vocab_size, embedding_dim):
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for word, i in word_index.items():
        if i < vocab_size:  # Only consider the top 'vocab_size' words
            embedding_vector = glove_embeddings.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
    return embedding_matrix

# Path to the GloVe embeddings
glove_file_path = '/content/drive/MyDrive/glove/glove.6B.300d.txt'
glove_embeddings = load_glove_embeddings(glove_file_path, EMBEDDING_DIM)

# Create embedding matrix
word_index = tokenizer.word_index
embedding_matrix = create_embedding_matrix(word_index, glove_embeddings, VOCAB_SIZE, EMBEDDING_DIM)

# Configuration for fine-grained classification
class Config:
    max_length = MAX_LEN
    num_classes = 5  # Fine-grained classification (1 to 5 stars)
    vocab_size = VOCAB_SIZE
    embedding_size = EMBEDDING_DIM  # GloVe embedding dimensions
    filter_sizes = "3,4,5"
    num_filters = 64
    num_layers = 1
    l2_reg_lambda = 0.1
    keep_prob = 0.5

# Initialize the model
config = Config()
model = CLSTMFineGrainedAmazonReviewsClassifier(config, embedding_matrix)

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Training the model
history = model.fit(
    x_train, y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(x_test, y_test),
    verbose=1
)

# Evaluate the model
test_loss, test_acc = model.evaluate(x_test, y_test)
print(f'Test Loss: {test_loss}, Test Accuracy: {test_acc}')


Epoch 1/10
[1m8770/8770[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m156s[0m 17ms/step - accuracy: 0.8956 - loss: 0.3258 - val_accuracy: 0.9239 - val_loss: 0.1949
Epoch 2/10
[1m8770/8770[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m152s[0m 17ms/step - accuracy: 0.9295 - loss: 0.1848 - val_accuracy: 0.9265 - val_loss: 0.1906
Epoch 3/10
[1m8770/8770[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m152s[0m 17ms/step - accuracy: 0.9372 - loss: 0.1673 - val_accuracy: 0.9275 - val_loss: 0.1888
Epoch 4/10
[1m8770/8770[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m153s[0m 17ms/step - accuracy: 0.9436 - loss: 0.1545 - val_accuracy: 0.9272 - val_loss: 0.1918
Epoch 5/10
[1m8770/8770[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m152s[0m 17ms/step - accuracy: 0.9501 - loss: 0.1410 - val_accuracy: 0.9258 - val_loss: 0.1973
Epoch 6/10
[1m8770/8770[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m152s[0m 17ms/step - accuracy: 0.9557 - loss: 0.1291 - val_accuracy: 0.9241 - val_loss: 0.207


BINARY C-LSTM ON YELP DATASET

In [36]:
import numpy as np
import tensorflow as tf
from datasets import load_dataset
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Load Yelp dataset from Hugging Face
ds = load_dataset("yelp_review_full", split='train')

# Sample of the dataset
reviews = ds['text']  # Extract reviews
labels = ds['label']  # Extract ratings (0-4)

# Convert ratings to binary labels (e.g., 1-3 stars = negative, 4-5 stars = positive)
def convert_to_binary_labels(labels):
    binary_labels = [1 if label >= 3 else 0 for label in labels]
    return binary_labels

binary_labels = convert_to_binary_labels(labels)

# Tokenization and padding
VOCAB_SIZE = 10000
MAX_LEN = 300  # Max sequence length based on Yelp reviews

tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(reviews)

sequences = tokenizer.texts_to_sequences(reviews)
x_data = pad_sequences(sequences, maxlen=MAX_LEN)
y_data = np.array(binary_labels)

# Split data into training and test sets
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=42)

# Load GloVe embeddings
def load_glove_embeddings(glove_file_path, embedding_dim):
    embeddings_index = {}
    with open(glove_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            embedding_vector = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = embedding_vector
    return embeddings_index

# Create embedding matrix
def create_embedding_matrix(word_index, glove_embeddings, vocab_size, embedding_dim):
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for word, i in word_index.items():
        if i < vocab_size:
            embedding_vector = glove_embeddings.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
    return embedding_matrix

# Path to GloVe embeddings file
glove_file_path = '/content/drive/MyDrive/glove/glove.6B.300d.txt'
glove_embeddings = load_glove_embeddings(glove_file_path, 300)
embedding_matrix = create_embedding_matrix(tokenizer.word_index, glove_embeddings, VOCAB_SIZE, 300)

# C-LSTM Model for Binary Classification
class CLSTMBinaryYelpClassifier(tf.keras.Model):
    def __init__(self, config, embedding_matrix):
        super(CLSTMBinaryYelpClassifier, self).__init__()
        self.max_length = config.max_length
        self.num_classes = config.num_classes
        self.vocab_size = config.vocab_size
        self.embedding_size = config.embedding_size
        self.filter_sizes = list(map(int, config.filter_sizes.split(",")))
        self.num_filters = config.num_filters
        self.hidden_size = len(self.filter_sizes) * self.num_filters
        self.l2_reg_lambda = config.l2_reg_lambda

        # Embedding layer initialized with GloVe embeddings
        self.embedding = layers.Embedding(input_dim=self.vocab_size,
                                          output_dim=self.embedding_size,
                                          input_length=self.max_length,
                                          weights=[embedding_matrix],
                                          trainable=True)

        # Convolutional layers for different filter sizes
        self.conv_layers = [
            layers.Conv2D(filters=self.num_filters,
                          kernel_size=(filter_size, self.embedding_size),
                          activation='relu', padding='valid')
            for filter_size in self.filter_sizes
        ]

        # LSTM layer to capture dependencies
        self.lstm = layers.LSTM(self.hidden_size, return_sequences=False)

        self.dropout = layers.Dropout(rate=config.keep_prob)

        # Fully connected layer for binary classification (2 classes)
        self.fc = layers.Dense(self.num_classes, activation='softmax', kernel_regularizer=tf.keras.regularizers.L2(self.l2_reg_lambda))

    def call(self, inputs, training=False):
        x = self.embedding(inputs)  # [batch_size, max_length, embedding_size]
        x = tf.expand_dims(x, -1)   # [batch_size, max_length, embedding_size, 1]

        conv_outputs = []
        for conv_layer in self.conv_layers:
            conv = conv_layer(x)
            conv = tf.squeeze(conv, 2)  # Squeeze channels dimension
            conv_outputs.append(conv)

        # Trimming all conv outputs to the same sequence length
        min_length = min([conv.shape[1] for conv in conv_outputs])
        conv_outputs = [conv[:, :min_length, :] for conv in conv_outputs]

        # Concatenate conv outputs if more than one
        if len(conv_outputs) > 1:
            rnn_inputs = tf.concat(conv_outputs, -1)
        else:
            rnn_inputs = conv_outputs[0]

        # LSTM layer
        rnn_outputs = self.lstm(rnn_inputs)

        # Apply dropout
        rnn_outputs = self.dropout(rnn_outputs, training=training)

        # Output layer (binary classification)
        logits = self.fc(rnn_outputs)
        return logits

# Configuration for the model
class Config:
    max_length = 300
    num_classes = 2  # Binary classification
    vocab_size = VOCAB_SIZE
    embedding_size = 300  # GloVe embeddings size
    filter_sizes = "3,4,5"  # Convolution filter sizes
    num_filters = 64  # Number of filters for each filter size
    l2_reg_lambda = 0.1  # Regularization
    keep_prob = 0.5  # Dropout probability

# Instantiate config and the model
config = Config()
model = CLSTMBinaryYelpClassifier(config, embedding_matrix)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(
    x_train, y_train,
    batch_size=64,
    epochs=10,
    validation_data=(x_test, y_test),
    verbose=1
)

# Evaluate the model
test_loss, test_acc = model.evaluate(x_test, y_test)
print(f'Test Loss: {test_loss}, Test Accuracy: {test_acc}')


Epoch 1/10
[1m8125/8125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 17ms/step - accuracy: 0.8496 - loss: 0.3712 - val_accuracy: 0.8927 - val_loss: 0.2601
Epoch 2/10
[1m8125/8125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m137s[0m 17ms/step - accuracy: 0.9029 - loss: 0.2446 - val_accuracy: 0.8999 - val_loss: 0.2451
Epoch 3/10
[1m8125/8125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m137s[0m 17ms/step - accuracy: 0.9147 - loss: 0.2197 - val_accuracy: 0.9021 - val_loss: 0.2462
Epoch 4/10
[1m8125/8125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m137s[0m 17ms/step - accuracy: 0.9242 - loss: 0.1992 - val_accuracy: 0.9016 - val_loss: 0.2445
Epoch 5/10
[1m8125/8125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m137s[0m 17ms/step - accuracy: 0.9335 - loss: 0.1802 - val_accuracy: 0.8997 - val_loss: 0.2502
Epoch 6/10
[1m8125/8125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m137s[0m 17ms/step - accuracy: 0.9421 - loss: 0.1621 - val_accuracy: 0.8986 - val_loss: 0.258

FINE-GRAINED C-LSTM ON YELP

In [38]:
import numpy as np
import tensorflow as tf
from datasets import load_dataset
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Load Yelp dataset from Hugging Face
ds = load_dataset("yelp_review_full", split='train')

# Sample of the dataset
reviews = ds['text']  # Extract reviews
labels = ds['label']  # Extract ratings (0-4)

# Convert ratings to binary labels (e.g., 1-3 stars = negative, 4-5 stars = positive)
def convert_to_binary_labels(labels):
    binary_labels = [1 if label >= 3 else 0 for label in labels]
    return binary_labels

binary_labels = convert_to_binary_labels(labels)

# Tokenization and padding
VOCAB_SIZE = 10000
MAX_LEN = 300  # Max sequence length based on Yelp reviews

tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(reviews)

sequences = tokenizer.texts_to_sequences(reviews)
x_data = pad_sequences(sequences, maxlen=MAX_LEN)
y_data = np.array(binary_labels)

# Split data into training and test sets
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=42)

# Load GloVe embeddings
def load_glove_embeddings(glove_file_path, embedding_dim):
    embeddings_index = {}
    with open(glove_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            embedding_vector = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = embedding_vector
    return embeddings_index

# Create embedding matrix
def create_embedding_matrix(word_index, glove_embeddings, vocab_size, embedding_dim):
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for word, i in word_index.items():
        if i < vocab_size:
            embedding_vector = glove_embeddings.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
    return embedding_matrix

# Path to GloVe embeddings file
glove_file_path = '/content/drive/MyDrive/glove/glove.6B.300d.txt'
glove_embeddings = load_glove_embeddings(glove_file_path, 300)
embedding_matrix = create_embedding_matrix(tokenizer.word_index, glove_embeddings, VOCAB_SIZE, 300)

# C-LSTM Model for Binary Classification
class CLSTMBinaryYelpClassifier(tf.keras.Model):
    def __init__(self, config, embedding_matrix):
        super(CLSTMBinaryYelpClassifier, self).__init__()
        self.max_length = config.max_length
        self.num_classes = config.num_classes
        self.vocab_size = config.vocab_size
        self.embedding_size = config.embedding_size
        self.filter_sizes = list(map(int, config.filter_sizes.split(",")))
        self.num_filters = config.num_filters
        self.hidden_size = len(self.filter_sizes) * self.num_filters
        self.l2_reg_lambda = config.l2_reg_lambda

        # Embedding layer initialized with GloVe embeddings
        self.embedding = layers.Embedding(input_dim=self.vocab_size,
                                          output_dim=self.embedding_size,
                                          input_length=self.max_length,
                                          weights=[embedding_matrix],
                                          trainable=True)

        # Convolutional layers for different filter sizes
        self.conv_layers = [
            layers.Conv2D(filters=self.num_filters,
                          kernel_size=(filter_size, self.embedding_size),
                          activation='relu', padding='valid')
            for filter_size in self.filter_sizes
        ]

        # LSTM layer to capture dependencies
        self.lstm = layers.LSTM(self.hidden_size, return_sequences=False)

        self.dropout = layers.Dropout(rate=config.keep_prob)

        # Fully connected layer for binary classification (2 classes)
        self.fc = layers.Dense(self.num_classes, activation='softmax', kernel_regularizer=tf.keras.regularizers.L2(self.l2_reg_lambda))

    def call(self, inputs, training=False):
        x = self.embedding(inputs)  # [batch_size, max_length, embedding_size]
        x = tf.expand_dims(x, -1)   # [batch_size, max_length, embedding_size, 1]

        conv_outputs = []
        for conv_layer in self.conv_layers:
            conv = conv_layer(x)
            conv = tf.squeeze(conv, 2)  # Squeeze channels dimension
            conv_outputs.append(conv)

        # Trimming all conv outputs to the same sequence length
        min_length = min([conv.shape[1] for conv in conv_outputs])
        conv_outputs = [conv[:, :min_length, :] for conv in conv_outputs]

        # Concatenate conv outputs if more than one
        if len(conv_outputs) > 1:
            rnn_inputs = tf.concat(conv_outputs, -1)
        else:
            rnn_inputs = conv_outputs[0]

        # LSTM layer
        rnn_outputs = self.lstm(rnn_inputs)

        # Apply dropout
        rnn_outputs = self.dropout(rnn_outputs, training=training)

        # Output layer (binary classification)
        logits = self.fc(rnn_outputs)
        return logits

# Configuration for the model
class Config:
    max_length = 300
    num_classes = 2  # Binary classification
    vocab_size = VOCAB_SIZE
    embedding_size = 300  # GloVe embeddings size
    filter_sizes = "3,4,5"  # Convolution filter sizes
    num_filters = 64  # Number of filters for each filter size
    l2_reg_lambda = 0.1  # Regularization
    keep_prob = 0.5  # Dropout probability

# Instantiate config and the model
config = Config()
model = CLSTMBinaryYelpClassifier(config, embedding_matrix)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(
    x_train, y_train,
    batch_size=64,
    epochs=10,
    validation_data=(x_test, y_test),
    verbose=1
)

# Evaluate the model
test_loss, test_acc = model.evaluate(x_test, y_test)
print(f'Test Loss: {test_loss}, Test Accuracy: {test_acc}')


Epoch 1/10
[1m8125/8125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 17ms/step - accuracy: 0.8440 - loss: 0.3785 - val_accuracy: 0.8952 - val_loss: 0.2576
Epoch 2/10
[1m8125/8125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m137s[0m 17ms/step - accuracy: 0.9034 - loss: 0.2436 - val_accuracy: 0.9011 - val_loss: 0.2483
Epoch 3/10
[1m8125/8125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 17ms/step - accuracy: 0.9144 - loss: 0.2204 - val_accuracy: 0.9028 - val_loss: 0.2461
Epoch 4/10
[1m8125/8125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m137s[0m 17ms/step - accuracy: 0.9250 - loss: 0.1985 - val_accuracy: 0.9004 - val_loss: 0.2637
Epoch 5/10
[1m8125/8125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m137s[0m 17ms/step - accuracy: 0.9340 - loss: 0.1796 - val_accuracy: 0.8995 - val_loss: 0.2583
Epoch 6/10
[1m8125/8125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m137s[0m 17ms/step - accuracy: 0.9421 - loss: 0.1639 - val_accuracy: 0.8956 - val_loss: 0.276

BINARY C-LSTM on TREC



In [2]:
!pip install tensorflow scikit-learn datasets




In [8]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers  # This is the missing import

from datasets import load_dataset

# Load the TREC dataset
dataset = load_dataset("trec", trust_remote_code=True)

# Extract training and test data
train_sentences = dataset['train']['text']
train_labels = dataset['train']['coarse_label']  # Corrected: 'coarse_label'
test_sentences = dataset['test']['text']
test_labels = dataset['test']['coarse_label']  # Corrected: 'coarse_label'

# Convert labels to binary classification problem
# For example, let's group labels into 0-2 -> Class 0, 3-5 -> Class 1 (binary classification)
def convert_to_binary_labels(labels):
    return [0 if label < 3 else 1 for label in labels]

train_binary_labels = convert_to_binary_labels(train_labels)
test_binary_labels = convert_to_binary_labels(test_labels)

# Tokenization and padding
VOCAB_SIZE = 10000
MAX_LEN = 100  # Max sequence length for questions

tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(train_sentences)
train_sequences = tokenizer.texts_to_sequences(train_sentences)
test_sequences = tokenizer.texts_to_sequences(test_sentences)

# Padding sequences
x_train = pad_sequences(train_sequences, maxlen=MAX_LEN)
x_test = pad_sequences(test_sequences, maxlen=MAX_LEN)
y_train = np.array(train_binary_labels)
y_test = np.array(test_binary_labels)

# Load GloVe embeddings
def load_glove_embeddings(glove_file_path, embedding_dim):
    embeddings_index = {}
    with open(glove_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            embedding_vector = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = embedding_vector
    return embeddings_index

def create_embedding_matrix(word_index, glove_embeddings, vocab_size, embedding_dim):
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for word, i in word_index.items():
        if i < vocab_size:
            embedding_vector = glove_embeddings.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
    return embedding_matrix

# Load GloVe embeddings
glove_file_path = '/content/drive/MyDrive/glove/glove.6B.300d.txt'  # Update this path
glove_embeddings = load_glove_embeddings(glove_file_path, 300)

# Create embedding matrix for the vocabulary
embedding_matrix = create_embedding_matrix(tokenizer.word_index, glove_embeddings, VOCAB_SIZE, 300)

# C-LSTM Model for Binary Classification
class CLSTMBinaryTRECClassifier(tf.keras.Model):
    def __init__(self, config, embedding_matrix):
        super(CLSTMBinaryTRECClassifier, self).__init__()
        self.max_length = config.max_length
        self.num_classes = config.num_classes
        self.vocab_size = config.vocab_size
        self.embedding_size = config.embedding_size
        self.filter_sizes = list(map(int, config.filter_sizes.split(",")))
        self.num_filters = config.num_filters
        self.hidden_size = len(self.filter_sizes) * self.num_filters
        self.l2_reg_lambda = config.l2_reg_lambda

        # Embedding layer initialized with GloVe embeddings
        self.embedding = layers.Embedding(input_dim=self.vocab_size,
                                          output_dim=self.embedding_size,
                                          input_length=self.max_length,
                                          weights=[embedding_matrix],
                                          trainable=True)

        # Convolution layers for different filter sizes
        self.conv_layers = [
            layers.Conv2D(filters=self.num_filters,
                          kernel_size=(filter_size, self.embedding_size),
                          activation='relu', padding='valid')
            for filter_size in self.filter_sizes
        ]

        # LSTM layer to capture long-term dependencies
        self.lstm = layers.LSTM(self.hidden_size, return_sequences=False)

        self.dropout = layers.Dropout(rate=config.keep_prob)

        # Fully connected layer for binary classification
        self.fc = layers.Dense(self.num_classes, activation='softmax', kernel_regularizer=tf.keras.regularizers.L2(self.l2_reg_lambda))

    def call(self, inputs, training=False):
        x = self.embedding(inputs)  # [batch_size, max_length, embedding_size]
        x = tf.expand_dims(x, -1)   # [batch_size, max_length, embedding_size, 1]

        conv_outputs = []
        for conv_layer in self.conv_layers:
            conv = conv_layer(x)
            conv = tf.squeeze(conv, 2)  # Squeeze channels dimension
            conv_outputs.append(conv)

        # Trimming all conv outputs to the same sequence length
        min_length = min([conv.shape[1] for conv in conv_outputs])
        conv_outputs = [conv[:, :min_length, :] for conv in conv_outputs]

        # Concatenate conv outputs if more than one
        if len(conv_outputs) > 1:
            rnn_inputs = tf.concat(conv_outputs, -1)
        else:
            rnn_inputs = conv_outputs[0]

        # LSTM layer
        rnn_outputs = self.lstm(rnn_inputs)

        # Apply dropout
        rnn_outputs = self.dropout(rnn_outputs, training=training)

        # Output layer (binary classification)
        logits = self.fc(rnn_outputs)
        return logits

# Configuration for the model
class Config:
    max_length = MAX_LEN
    num_classes = 2  # Binary classification
    vocab_size = VOCAB_SIZE
    embedding_size = 300  # GloVe embeddings size
    filter_sizes = "3,4,5"  # Convolution filter sizes
    num_filters = 64  # Number of filters for each filter size
    l2_reg_lambda = 0.1  # Regularization
    keep_prob = 0.5  # Dropout probability

# Instantiate config and the model
config = Config()
model = CLSTMBinaryTRECClassifier(config, embedding_matrix)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(
    x_train, y_train,
    batch_size=64,
    epochs=10,
    validation_data=(x_test, y_test),
    verbose=1
)

# Evaluate the model
test_loss, test_acc = model.evaluate(x_test, y_test)
print(f'Test Loss: {test_loss}, Test Accuracy: {test_acc}')



Epoch 1/10
[1m86/86[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 25ms/step - accuracy: 0.7325 - loss: 0.7947 - val_accuracy: 0.8660 - val_loss: 0.4028
Epoch 2/10
[1m86/86[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9205 - loss: 0.2747 - val_accuracy: 0.9280 - val_loss: 0.2676
Epoch 3/10
[1m86/86[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9634 - loss: 0.1372 - val_accuracy: 0.9220 - val_loss: 0.2656
Epoch 4/10
[1m86/86[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9848 - loss: 0.0722 - val_accuracy: 0.9360 - val_loss: 0.2631
Epoch 5/10
[1m86/86[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9897 - loss: 0.0579 - val_accuracy: 0.9200 - val_loss: 0.3197
Epoch 6/10
[1m86/86[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9931 - loss: 0.0454 - val_accuracy: 0.9360 - val_loss: 0.2897
Epoch 7/10
[1m86/86[0m [32m━━━━━━━━

FINE-GRAINED C-LSTM on TREC


In [9]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers
from datasets import load_dataset

# Load the TREC dataset
dataset = load_dataset("trec")

# Extract training and test data
train_sentences = dataset['train']['text']
train_labels = dataset['train']['fine_label']  # Fine labels: 50 categories
test_sentences = dataset['test']['text']
test_labels = dataset['test']['fine_label']  # Fine labels: 50 categories

# Tokenization and padding
VOCAB_SIZE = 10000
MAX_LEN = 100  # Max sequence length for questions

tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(train_sentences)
train_sequences = tokenizer.texts_to_sequences(train_sentences)
test_sequences = tokenizer.texts_to_sequences(test_sentences)

# Padding sequences
x_train = pad_sequences(train_sequences, maxlen=MAX_LEN)
x_test = pad_sequences(test_sequences, maxlen=MAX_LEN)
y_train = np.array(train_labels)
y_test = np.array(test_labels)

# Load GloVe embeddings
def load_glove_embeddings(glove_file_path, embedding_dim):
    embeddings_index = {}
    with open(glove_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            embedding_vector = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = embedding_vector
    return embeddings_index

def create_embedding_matrix(word_index, glove_embeddings, vocab_size, embedding_dim):
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for word, i in word_index.items():
        if i < vocab_size:
            embedding_vector = glove_embeddings.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
    return embedding_matrix

# Load GloVe embeddings
glove_file_path = '/content/drive/MyDrive/glove/glove.6B.300d.txt'  # Update this path
glove_embeddings = load_glove_embeddings(glove_file_path, 300)

# Create embedding matrix for the vocabulary
embedding_matrix = create_embedding_matrix(tokenizer.word_index, glove_embeddings, VOCAB_SIZE, 300)

# C-LSTM Model for Fine-Grained Classification
class CLSTMFineGrainedTRECClassifier(tf.keras.Model):
    def __init__(self, config, embedding_matrix):
        super(CLSTMFineGrainedTRECClassifier, self).__init__()
        self.max_length = config.max_length
        self.num_classes = config.num_classes  # Now it's fine-grained classification
        self.vocab_size = config.vocab_size
        self.embedding_size = config.embedding_size
        self.filter_sizes = list(map(int, config.filter_sizes.split(",")))
        self.num_filters = config.num_filters
        self.hidden_size = len(self.filter_sizes) * self.num_filters
        self.l2_reg_lambda = config.l2_reg_lambda

        # Embedding layer initialized with GloVe embeddings
        self.embedding = layers.Embedding(input_dim=self.vocab_size,
                                          output_dim=self.embedding_size,
                                          input_length=self.max_length,
                                          weights=[embedding_matrix],
                                          trainable=True)

        # Convolution layers for different filter sizes
        self.conv_layers = [
            layers.Conv2D(filters=self.num_filters,
                          kernel_size=(filter_size, self.embedding_size),
                          activation='relu', padding='valid')
            for filter_size in self.filter_sizes
        ]

        # LSTM layer to capture long-term dependencies
        self.lstm = layers.LSTM(self.hidden_size, return_sequences=False)

        self.dropout = layers.Dropout(rate=config.keep_prob)

        # Fully connected layer for fine-grained classification (50 classes)
        self.fc = layers.Dense(self.num_classes, activation='softmax', kernel_regularizer=tf.keras.regularizers.L2(self.l2_reg_lambda))

    def call(self, inputs, training=False):
        x = self.embedding(inputs)  # [batch_size, max_length, embedding_size]
        x = tf.expand_dims(x, -1)   # [batch_size, max_length, embedding_size, 1]

        conv_outputs = []
        for conv_layer in self.conv_layers:
            conv = conv_layer(x)
            conv = tf.squeeze(conv, 2)  # Squeeze channels dimension
            conv_outputs.append(conv)

        # Trimming all conv outputs to the same sequence length
        min_length = min([conv.shape[1] for conv in conv_outputs])
        conv_outputs = [conv[:, :min_length, :] for conv in conv_outputs]

        # Concatenate conv outputs if more than one
        if len(conv_outputs) > 1:
            rnn_inputs = tf.concat(conv_outputs, -1)
        else:
            rnn_inputs = conv_outputs[0]

        # LSTM layer
        rnn_outputs = self.lstm(rnn_inputs)

        # Apply dropout
        rnn_outputs = self.dropout(rnn_outputs, training=training)

        # Output layer (fine-grained classification with 50 classes)
        logits = self.fc(rnn_outputs)
        return logits

# Configuration for the model
class Config:
    max_length = MAX_LEN
    num_classes = 50  # Fine-grained classification (50 classes)
    vocab_size = VOCAB_SIZE
    embedding_size = 300  # GloVe embeddings size
    filter_sizes = "3,4,5"  # Convolution filter sizes
    num_filters = 64  # Number of filters for each filter size
    l2_reg_lambda = 0.1  # Regularization
    keep_prob = 0.5  # Dropout probability

# Instantiate config and the model
config = Config()
model = CLSTMFineGrainedTRECClassifier(config, embedding_matrix)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(
    x_train, y_train,
    batch_size=64,
    epochs=10,
    validation_data=(x_test, y_test),
    verbose=1
)

# Evaluate the model
test_loss, test_acc = model.evaluate(x_test, y_test)
print(f'Test Loss: {test_loss}, Test Accuracy: {test_acc}')


Epoch 1/10
[1m86/86[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - accuracy: 0.2365 - loss: 8.7466 - val_accuracy: 0.5220 - val_loss: 3.9876
Epoch 2/10
[1m86/86[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.4983 - loss: 3.1919 - val_accuracy: 0.5820 - val_loss: 2.6429
Epoch 3/10
[1m86/86[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.6129 - loss: 2.0846 - val_accuracy: 0.5980 - val_loss: 2.3345
Epoch 4/10
[1m86/86[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.6986 - loss: 1.7205 - val_accuracy: 0.6220 - val_loss: 2.2469
Epoch 5/10
[1m86/86[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.7484 - loss: 1.5472 - val_accuracy: 0.6440 - val_loss: 2.1575
Epoch 6/10
[1m86/86[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.7945 - loss: 1.3928 - val_accuracy: 0.6780 - val_loss: 2.1095
Epoch 7/10
[1m86/86[0m [32m━━━━━━━━━

BINARY CLASSIFICATION C-LSTM on 20 NEWSGROUP


In [10]:
!pip install tensorflow scikit-learn datasets




In [13]:
import numpy as np
import tensorflow as tf
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the 20 Newsgroups dataset
newsgroups = fetch_20newsgroups(subset='all', categories=None, remove=('headers', 'footers', 'quotes'))

# Texts and labels
texts = newsgroups.data
labels = newsgroups.target

# Define a binary classification scenario (e.g., 'sci.space' vs 'rec.sport.baseball')
categories = ['sci.space', 'rec.sport.baseball']
newsgroups_binary = fetch_20newsgroups(subset='all', categories=categories, remove=('headers', 'footers', 'quotes'))

# Binary texts and labels
texts_binary = newsgroups_binary.data
labels_binary = newsgroups_binary.target  # 0 for 'rec.sport.baseball', 1 for 'sci.space'

# Tokenization and padding
VOCAB_SIZE = 10000
MAX_LEN = 300  # Maximum sequence length

tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(texts_binary)
sequences = tokenizer.texts_to_sequences(texts_binary)
x_data = pad_sequences(sequences, maxlen=MAX_LEN)
y_data = np.array(labels_binary)

# Split data into training and test sets
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=42)

# Load GloVe embeddings
def load_glove_embeddings(glove_file_path, embedding_dim):
    embeddings_index = {}
    with open(glove_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            embedding_vector = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = embedding_vector
    return embeddings_index

# Create embedding matrix
def create_embedding_matrix(word_index, glove_embeddings, vocab_size, embedding_dim):
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for word, i in word_index.items():
        if i < vocab_size:
            embedding_vector = glove_embeddings.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
    return embedding_matrix

# Load GloVe embeddings
glove_file_path = '/content/drive/MyDrive/glove/glove.6B.300d.txt'  # Update with your GloVe path
glove_embeddings = load_glove_embeddings(glove_file_path, 300)
embedding_matrix = create_embedding_matrix(tokenizer.word_index, glove_embeddings, VOCAB_SIZE, 300)

# C-LSTM Model for Binary Classification
class CLSTMBinaryNewsgroupsClassifier(tf.keras.Model):
    def __init__(self, config, embedding_matrix):
        super(CLSTMBinaryNewsgroupsClassifier, self).__init__()
        self.max_length = config.max_length
        self.num_classes = config.num_classes
        self.vocab_size = config.vocab_size
        self.embedding_size = config.embedding_size
        self.filter_sizes = list(map(int, config.filter_sizes.split(",")))
        self.num_filters = config.num_filters
        self.hidden_size = len(self.filter_sizes) * self.num_filters
        self.l2_reg_lambda = config.l2_reg_lambda

        # Embedding layer initialized with GloVe embeddings
        self.embedding = layers.Embedding(input_dim=self.vocab_size,
                                          output_dim=self.embedding_size,
                                          input_length=self.max_length,
                                          weights=[embedding_matrix],
                                          trainable=True)

        # Convolutional layers for different filter sizes
        self.conv_layers = [
            layers.Conv2D(filters=self.num_filters,
                          kernel_size=(filter_size, self.embedding_size),
                          activation='relu', padding='valid')
            for filter_size in self.filter_sizes
        ]

        # LSTM layer to capture dependencies
        self.lstm = layers.LSTM(self.hidden_size, return_sequences=False)

        self.dropout = layers.Dropout(rate=config.keep_prob)

        # Fully connected layer for binary classification
        self.fc = layers.Dense(self.num_classes, activation='softmax', kernel_regularizer=tf.keras.regularizers.L2(self.l2_reg_lambda))

    def call(self, inputs, training=False):
        x = self.embedding(inputs)  # [batch_size, max_length, embedding_size]
        x = tf.expand_dims(x, -1)   # [batch_size, max_length, embedding_size, 1]

        conv_outputs = []
        for conv_layer in self.conv_layers:
            conv = conv_layer(x)
            conv = tf.squeeze(conv, 2)  # Squeeze channels dimension
            conv_outputs.append(conv)

        # Trimming all conv outputs to the same sequence length
        min_length = min([conv.shape[1] for conv in conv_outputs])
        conv_outputs = [conv[:, :min_length, :] for conv in conv_outputs]

        # Concatenate conv outputs if more than one
        if len(conv_outputs) > 1:
            rnn_inputs = tf.concat(conv_outputs, -1)
        else:
            rnn_inputs = conv_outputs[0]

        # LSTM layer
        rnn_outputs = self.lstm(rnn_inputs)

        # Apply dropout
        rnn_outputs = self.dropout(rnn_outputs, training=training)

        # Output layer (binary classification)
        logits = self.fc(rnn_outputs)
        return logits

# Configuration for the model
class Config:
    max_length = 300
    num_classes = 2  # Binary classification
    vocab_size = VOCAB_SIZE
    embedding_size = 300  # GloVe embeddings size
    filter_sizes = "3,4,5"  # Convolution filter sizes
    num_filters = 64  # Number of filters for each filter size
    l2_reg_lambda = 0.1  # Regularization
    keep_prob = 0.5  # Dropout probability

# Instantiate config and the model
config = Config()
model = CLSTMBinaryNewsgroupsClassifier(config, embedding_matrix)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(
    x_train, y_train,
    batch_size=64,
    epochs=10,
    validation_data=(x_test, y_test),
    verbose=1
)

# Evaluate the model
test_loss, test_acc = model.evaluate(x_test, y_test)
print(f'Test Loss: {test_loss}, Test Accuracy: {test_acc}')


Epoch 1/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 432ms/step - accuracy: 0.6966 - loss: 0.9041 - val_accuracy: 0.8917 - val_loss: 0.5623
Epoch 2/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step - accuracy: 0.9016 - loss: 0.5027 - val_accuracy: 0.9169 - val_loss: 0.3685
Epoch 3/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - accuracy: 0.9306 - loss: 0.3255 - val_accuracy: 0.9169 - val_loss: 0.2832
Epoch 4/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - accuracy: 0.9619 - loss: 0.2000 - val_accuracy: 0.9295 - val_loss: 0.2148
Epoch 5/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - accuracy: 0.9716 - loss: 0.1384 - val_accuracy: 0.9320 - val_loss: 0.2062
Epoch 6/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - accuracy: 0.9528 - loss: 0.1857 - val_accuracy: 0.9370 - val_loss: 0.1656
Epoch 7/10
[1m25/25[0m [32m━━

Fine-grained on 20Newsgroup

In [14]:
import numpy as np
import tensorflow as tf
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the 20 Newsgroups dataset with all categories
newsgroups = fetch_20newsgroups(subset='all', categories=None, remove=('headers', 'footers', 'quotes'))

# Texts and labels
texts = newsgroups.data
labels = newsgroups.target

# Tokenization and padding
VOCAB_SIZE = 10000
MAX_LEN = 300  # Maximum sequence length

tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
x_data = pad_sequences(sequences, maxlen=MAX_LEN)

# Encode labels into 0-19 for 20 newsgroups categories
y_data = np.array(labels)

# Split data into training and test sets
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=42)

# Load GloVe embeddings
def load_glove_embeddings(glove_file_path, embedding_dim):
    embeddings_index = {}
    with open(glove_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            embedding_vector = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = embedding_vector
    return embeddings_index

# Create embedding matrix
def create_embedding_matrix(word_index, glove_embeddings, vocab_size, embedding_dim):
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for word, i in word_index.items():
        if i < vocab_size:
            embedding_vector = glove_embeddings.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
    return embedding_matrix

# Load GloVe embeddings
glove_file_path = '/content/drive/MyDrive/glove/glove.6B.300d.txt'  # Update with your GloVe path
glove_embeddings = load_glove_embeddings(glove_file_path, 300)
embedding_matrix = create_embedding_matrix(tokenizer.word_index, glove_embeddings, VOCAB_SIZE, 300)

# C-LSTM Model for Fine-Grained Classification (20 categories)
class CLSTMFineGrainedNewsgroupsClassifier(tf.keras.Model):
    def __init__(self, config, embedding_matrix):
        super(CLSTMFineGrainedNewsgroupsClassifier, self).__init__()
        self.max_length = config.max_length
        self.num_classes = config.num_classes
        self.vocab_size = config.vocab_size
        self.embedding_size = config.embedding_size
        self.filter_sizes = list(map(int, config.filter_sizes.split(",")))
        self.num_filters = config.num_filters
        self.hidden_size = len(self.filter_sizes) * self.num_filters
        self.l2_reg_lambda = config.l2_reg_lambda

        # Embedding layer initialized with GloVe embeddings
        self.embedding = layers.Embedding(input_dim=self.vocab_size,
                                          output_dim=self.embedding_size,
                                          input_length=self.max_length,
                                          weights=[embedding_matrix],
                                          trainable=True)

        # Convolutional layers for different filter sizes
        self.conv_layers = [
            layers.Conv2D(filters=self.num_filters,
                          kernel_size=(filter_size, self.embedding_size),
                          activation='relu', padding='valid')
            for filter_size in self.filter_sizes
        ]

        # LSTM layer to capture dependencies
        self.lstm = layers.LSTM(self.hidden_size, return_sequences=False)

        self.dropout = layers.Dropout(rate=config.keep_prob)

        # Fully connected layer for fine-grained classification (20 categories)
        self.fc = layers.Dense(self.num_classes, activation='softmax', kernel_regularizer=tf.keras.regularizers.L2(self.l2_reg_lambda))

    def call(self, inputs, training=False):
        x = self.embedding(inputs)  # [batch_size, max_length, embedding_size]
        x = tf.expand_dims(x, -1)   # [batch_size, max_length, embedding_size, 1]

        conv_outputs = []
        for conv_layer in self.conv_layers:
            conv = conv_layer(x)
            conv = tf.squeeze(conv, 2)  # Squeeze channels dimension
            conv_outputs.append(conv)

        # Trimming all conv outputs to the same sequence length
        min_length = min([conv.shape[1] for conv in conv_outputs])
        conv_outputs = [conv[:, :min_length, :] for conv in conv_outputs]

        # Concatenate conv outputs if more than one
        if len(conv_outputs) > 1:
            rnn_inputs = tf.concat(conv_outputs, -1)
        else:
            rnn_inputs = conv_outputs[0]

        # LSTM layer
        rnn_outputs = self.lstm(rnn_inputs)

        # Apply dropout
        rnn_outputs = self.dropout(rnn_outputs, training=training)

        # Output layer (fine-grained classification)
        logits = self.fc(rnn_outputs)
        return logits

# Configuration for the model
class Config:
    max_length = 300
    num_classes = 20  # Fine-grained classification: 20 categories
    vocab_size = VOCAB_SIZE
    embedding_size = 300  # GloVe embeddings size
    filter_sizes = "3,4,5"  # Convolution filter sizes
    num_filters = 64  # Number of filters for each filter size
    l2_reg_lambda = 0.1  # Regularization
    keep_prob = 0.5  # Dropout probability

# Instantiate config and the model
config = Config()
model = CLSTMFineGrainedNewsgroupsClassifier(config, embedding_matrix)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(
    x_train, y_train,
    batch_size=64,
    epochs=10,
    validation_data=(x_test, y_test),
    verbose=1
)

# Evaluate the model
test_loss, test_acc = model.evaluate(x_test, y_test)
print(f'Test Loss: {test_loss}, Test Accuracy: {test_acc}')


Epoch 1/10
[1m236/236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 51ms/step - accuracy: 0.1420 - loss: 4.3812 - val_accuracy: 0.3170 - val_loss: 2.2588
Epoch 2/10
[1m236/236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 18ms/step - accuracy: 0.3324 - loss: 2.1876 - val_accuracy: 0.3785 - val_loss: 2.0136
Epoch 3/10
[1m236/236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 18ms/step - accuracy: 0.4528 - loss: 1.8125 - val_accuracy: 0.4838 - val_loss: 1.7625
Epoch 4/10
[1m236/236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 18ms/step - accuracy: 0.5373 - loss: 1.6040 - val_accuracy: 0.4987 - val_loss: 1.6743
Epoch 5/10
[1m236/236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 18ms/step - accuracy: 0.5744 - loss: 1.4780 - val_accuracy: 0.5353 - val_loss: 1.6419
Epoch 6/10
[1m236/236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 18ms/step - accuracy: 0.6263 - loss: 1.3773 - val_accuracy: 0.5798 - val_loss: 1.6189
Epoch 7/10
[1m236/23

In [2]:
!pip install datasets



Binary classification C-LSTM on AG News Dataset

In [5]:
import numpy as np
import tensorflow as tf
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers

# Load AG News Dataset from Hugging Face
dataset = load_dataset("ag_news")

# Select only two categories for binary classification (e.g., World and Sports)
def filter_categories(dataset, categories):
    filtered_texts = []
    filtered_labels = []
    for i in range(len(dataset['text'])):
        if dataset['label'][i] in categories:
            filtered_texts.append(dataset['text'][i])
            filtered_labels.append(categories.index(dataset['label'][i]))
    return filtered_texts, filtered_labels

# Use label 0 (World) and label 1 (Sports) for binary classification
categories = [0, 1]  # World and Sports categories
train_texts, train_labels = filter_categories(dataset['train'], categories)
test_texts, test_labels = filter_categories(dataset['test'], categories)

# Tokenization and padding
VOCAB_SIZE = 10000
MAX_LEN = 300  # Max sequence length

tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(train_texts)
train_sequences = tokenizer.texts_to_sequences(train_texts)
test_sequences = tokenizer.texts_to_sequences(test_texts)

# Pad the sequences
x_train = pad_sequences(train_sequences, maxlen=MAX_LEN)
x_test = pad_sequences(test_sequences, maxlen=MAX_LEN)

# Convert labels to numpy arrays
y_train = np.array(train_labels)
y_test = np.array(test_labels)

# Load GloVe embeddings
def load_glove_embeddings(glove_file_path, embedding_dim):
    embeddings_index = {}
    with open(glove_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            embedding_vector = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = embedding_vector
    return embeddings_index

# Create embedding matrix
def create_embedding_matrix(word_index, glove_embeddings, vocab_size, embedding_dim):
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for word, i in word_index.items():
        if i < vocab_size:
            embedding_vector = glove_embeddings.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
    return embedding_matrix

# Load pre-trained GloVe embeddings (adjust the path)
glove_file_path = '/content/drive/MyDrive/glove/glove.6B.300d.txt'
glove_embeddings = load_glove_embeddings(glove_file_path, 300)
embedding_matrix = create_embedding_matrix(tokenizer.word_index, glove_embeddings, VOCAB_SIZE, 300)

# C-LSTM Model for Binary Classification
class CLSTMBinaryAGNewsClassifier(tf.keras.Model):
    def __init__(self, config, embedding_matrix):
        super(CLSTMBinaryAGNewsClassifier, self).__init__()
        self.max_length = config.max_length
        self.num_classes = config.num_classes
        self.vocab_size = config.vocab_size
        self.embedding_size = config.embedding_size
        self.filter_sizes = list(map(int, config.filter_sizes.split(",")))
        self.num_filters = config.num_filters
        self.hidden_size = len(self.filter_sizes) * self.num_filters
        self.l2_reg_lambda = config.l2_reg_lambda

        # Embedding layer initialized with GloVe embeddings
        self.embedding = layers.Embedding(input_dim=self.vocab_size,
                                          output_dim=self.embedding_size,
                                          input_length=self.max_length,
                                          weights=[embedding_matrix],
                                          trainable=True)

        # Convolutional layers for different filter sizes
        self.conv_layers = [
            layers.Conv2D(filters=self.num_filters,
                          kernel_size=(filter_size, self.embedding_size),
                          activation='relu', padding='valid')
            for filter_size in self.filter_sizes
        ]

        # LSTM layer to capture dependencies
        self.lstm = layers.LSTM(self.hidden_size, return_sequences=False)

        self.dropout = layers.Dropout(rate=config.keep_prob)

        # Fully connected layer for binary classification
        self.fc = layers.Dense(self.num_classes, activation='softmax', kernel_regularizer=tf.keras.regularizers.L2(self.l2_reg_lambda))

    def call(self, inputs, training=False):
        x = self.embedding(inputs)  # [batch_size, max_length, embedding_size]
        x = tf.expand_dims(x, -1)   # [batch_size, max_length, embedding_size, 1]

        conv_outputs = []
        for conv_layer in self.conv_layers:
            conv = conv_layer(x)
            conv = tf.squeeze(conv, 2)  # Squeeze channels dimension
            conv_outputs.append(conv)

        # Trimming all conv outputs to the same sequence length
        min_length = min([conv.shape[1] for conv in conv_outputs])
        conv_outputs = [conv[:, :min_length, :] for conv in conv_outputs]

        # Concatenate conv outputs if more than one
        if len(conv_outputs) > 1:
            rnn_inputs = tf.concat(conv_outputs, -1)
        else:
            rnn_inputs = conv_outputs[0]

        # LSTM layer
        rnn_outputs = self.lstm(rnn_inputs)

        # Apply dropout
        rnn_outputs = self.dropout(rnn_outputs, training=training)

        # Output layer (binary classification)
        logits = self.fc(rnn_outputs)
        return logits

# Configuration for the model
class Config:
    max_length = MAX_LEN
    num_classes = 2  # Binary classification
    vocab_size = VOCAB_SIZE
    embedding_size = 300  # GloVe embeddings size
    filter_sizes = "3,4,5"  # Convolution filter sizes
    num_filters = 64  # Number of filters for each filter size
    l2_reg_lambda = 0.1  # Regularization
    keep_prob = 0.5  # Dropout probability

# Instantiate config and the model
config = Config()
model = CLSTMBinaryAGNewsClassifier(config, embedding_matrix)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(
    x_train, y_train,
    batch_size=64,
    epochs=10,
    validation_data=(x_test, y_test),
    verbose=1
)

# Evaluate the model
test_loss, test_acc = model.evaluate(x_test, y_test)
print(f'Test Loss: {test_loss}, Test Accuracy: {test_acc}')


KeyboardInterrupt: 