In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras import layers

In [None]:
# C-LSTM with the embedding layer with the pre-trained glove embeddings
class CLSTMClassifier(tf.keras.Model):
    """C-LSTM text classifier whose embedding layer starts from pre-trained vectors.

    Forward pass (see ``call``): embedding lookup -> one 2-D convolution per
    filter size, each spanning the full embedding width -> feature maps trimmed
    to a common length and concatenated -> a single LSTM -> dropout -> dense
    softmax layer.

    Args:
        config: Object exposing the attributes ``max_length``, ``num_classes``,
            ``vocab_size``, ``embedding_size``, ``filter_sizes`` (comma-separated
            string such as ``"3,4,5"``), ``num_filters``, ``num_layers``,
            ``l2_reg_lambda`` and ``keep_prob``.
        embedding_matrix: Array of shape ``(vocab_size, embedding_size)`` used as
            the initial (trainable) embedding weights.

    Raises:
        ValueError: If ``embedding_matrix`` does not have shape
            ``(vocab_size, embedding_size)``.
    """

    def __init__(self, config, embedding_matrix):
        super().__init__()
        self.max_length = config.max_length
        self.num_classes = config.num_classes
        self.vocab_size = config.vocab_size
        self.embedding_size = config.embedding_size
        self.filter_sizes = list(map(int, config.filter_sizes.split(",")))
        self.num_filters = config.num_filters
        # NOTE(review): num_layers is stored but not used below; exactly one
        # LSTM layer is built regardless of its value.
        self.num_layers = config.num_layers
        self.hidden_size = len(self.filter_sizes) * self.num_filters
        self.l2_reg_lambda = config.l2_reg_lambda

        expected_shape = (self.vocab_size, self.embedding_size)
        if tuple(np.shape(embedding_matrix)) != expected_shape:
            raise ValueError(
                f"embedding_matrix has shape {tuple(np.shape(embedding_matrix))}, "
                f"expected {expected_shape}"
            )

        # Embedding layer initialised with the pre-trained vectors. A Constant
        # initializer is used instead of the `weights=` / `input_length=`
        # constructor arguments: `input_length` is deprecated and `weights=` is
        # not honoured by every Keras version, whereas the initializer always
        # takes effect when the layer is built.
        self.embedding = layers.Embedding(
            input_dim=self.vocab_size,
            output_dim=self.embedding_size,
            embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
            trainable=True,
        )

        # One convolution per filter size; each kernel covers the whole
        # embedding width, so it slides along the sequence axis only.
        self.conv_layers = [
            layers.Conv2D(filters=self.num_filters,
                          kernel_size=(filter_size, self.embedding_size),
                          activation='relu', padding='valid')
            for filter_size in self.filter_sizes
        ]

        # LSTM over the concatenated convolution features; only the final
        # state is returned.
        self.lstm = layers.LSTM(self.hidden_size, return_sequences=False)

        # `keep_prob` is the probability of KEEPING a unit, while Keras'
        # Dropout expects the fraction to DROP, hence the complement.
        self.dropout = layers.Dropout(rate=1.0 - config.keep_prob)

        self.fc = layers.Dense(
            self.num_classes,
            activation='softmax',
            kernel_regularizer=tf.keras.regularizers.L2(self.l2_reg_lambda),
        )

    def call(self, inputs, training=False):
        """Run the forward pass.

        Args:
            inputs: Integer token ids, [batch_size, max_length].
            training: Whether dropout is active.

        Returns:
            Class probabilities (softmax output), [batch_size, num_classes].
        """
        x = self.embedding(inputs)   # [batch_size, max_length, embedding_size]
        x = tf.expand_dims(x, -1)    # [batch_size, max_length, embedding_size, 1]

        conv_outputs = []
        for conv_layer in self.conv_layers:
            conv = conv_layer(x)     # [batch_size, length', 1, num_filters]
            # Axis 2 is the embedding-width axis, collapsed to size 1 by the
            # full-width 'valid' kernel; the filter (channel) axis is kept.
            conv = tf.squeeze(conv, 2)
            conv_outputs.append(conv)

        # Larger filters yield shorter feature maps, so trim every map to the
        # shortest one before concatenating along the feature axis.
        min_length = min(conv.shape[1] for conv in conv_outputs)
        conv_outputs = [conv[:, :min_length, :] for conv in conv_outputs]

        if len(conv_outputs) > 1:
            rnn_inputs = tf.concat(conv_outputs, -1)
        else:
            rnn_inputs = conv_outputs[0]

        rnn_outputs = self.lstm(rnn_inputs)
        rnn_outputs = self.dropout(rnn_outputs, training=training)

        # The dense layer applies softmax, so these are probabilities rather
        # than raw logits.
        probabilities = self.fc(rnn_outputs)
        return probabilities

In [None]:
# training schedule
BATCH_SIZE, EPOCHS = 64, 10

# input representation
MAX_LEN = 500        # length every sequence is padded or truncated to
VOCAB_SIZE = 5000    # size of the vocabulary
EMBEDDING_DIM = 300  # dimensionality of the GloVe vectors

RUNNING THE MODEL ON THE IMDB DATASET

In [None]:
# IMDB reviews as lists of word ids, limited to VOCAB_SIZE distinct ids.
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=VOCAB_SIZE)

# Bring both splits to a fixed length of MAX_LEN so they can be batched.
x_train, x_test = (
    sequence.pad_sequences(split, maxlen=MAX_LEN) for split in (x_train, x_test)
)


def load_glove_embeddings(glove_file_path, embedding_dim):
    """Parse a GloVe text file into a ``{word: vector}`` dictionary.

    Every non-blank line must consist of a token followed by exactly
    ``embedding_dim`` whitespace-separated numbers.

    Args:
        glove_file_path: Path to the UTF-8 encoded GloVe text file.
        embedding_dim: Number of coefficients expected on every line.

    Returns:
        dict mapping each token to a float32 NumPy array of length
        ``embedding_dim``.

    Raises:
        ValueError: If a line does not carry exactly ``embedding_dim``
            coefficients, or a coefficient is not a valid number.
    """
    embeddings_index = {}
    with open(glove_file_path, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            values = line.split()
            if not values:
                # Blank line (e.g. trailing newline): nothing to parse.
                continue
            word, coefficients = values[0], values[1:]
            # Fail early with a clear message instead of letting a
            # wrong-sized vector surface later as a broadcasting error.
            if len(coefficients) != embedding_dim:
                raise ValueError(
                    f"{glove_file_path}, line {line_number}: expected "
                    f"{embedding_dim} coefficients, found {len(coefficients)}"
                )
            embeddings_index[word] = np.asarray(coefficients, dtype='float32')
    return embeddings_index

def create_embedding_matrix(word_index, glove_embeddings, vocab_size, embedding_dim):
    """Build a ``(vocab_size, embedding_dim)`` matrix of pre-trained vectors.

    Row ``i`` receives the vector of the word that ``word_index`` maps to ``i``.
    Rows stay all-zero when the index has no word, or when its word is missing
    from ``glove_embeddings``. Words whose index is ``>= vocab_size`` are ignored.
    """
    matrix = np.zeros((vocab_size, embedding_dim))
    in_vocabulary = (
        (token, row) for token, row in word_index.items() if row < vocab_size
    )
    for token, row in in_vocabulary:
        vector = glove_embeddings.get(token)
        if vector is None:
            continue  # no pre-trained vector: keep the zero row
        matrix[row] = vector
    return matrix

glove_file_path = '/content/drive/MyDrive/glove/glove.6B.300d.txt'
glove_embeddings = load_glove_embeddings(glove_file_path, EMBEDDING_DIM)

# imdb.get_word_index() returns frequency ranks starting at 1, whereas the ids
# produced by imdb.load_data() are shifted by its default `index_from=3`
# (0 = padding, 1 = start-of-sequence, 2 = out-of-vocabulary). Apply the same
# shift here so that row i of the matrix really is the vector of the word
# encoded as i in x_train / x_test.
IMDB_INDEX_FROM = 3
word_index = {
    word: rank + IMDB_INDEX_FROM for word, rank in imdb.get_word_index().items()
}

embedding_matrix = create_embedding_matrix(word_index, glove_embeddings, VOCAB_SIZE, EMBEDDING_DIM)

class Config:
    """Settings read by CLSTMClassifier for the IMDB run."""

    # input / output dimensions
    vocab_size = VOCAB_SIZE
    max_length = MAX_LEN
    embedding_size = EMBEDDING_DIM  # glove embedding dimension
    num_classes = 2                 # imdb is binary classification (positive/negative)

    # convolution + recurrent stack
    filter_sizes = "3,4,5"          # comma-separated, parsed by the model
    num_filters = 64
    num_layers = 1

    # regularisation
    l2_reg_lambda = 0.1
    keep_prob = 0.5

# Build the classifier from the IMDB settings and the GloVe-based matrix.
config = Config()
model = CLSTMClassifier(config, embedding_matrix)

# Integer class labels with a softmax output layer.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# NOTE(review): the test split is also used as validation data here, so the
# final evaluate() call below repeats the last epoch's validation pass.
history = model.fit(
    x=x_train,
    y=y_train,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_data=(x_test, y_test),
    verbose=1,
)

test_loss, test_acc = model.evaluate(x=x_test, y=y_test)
print(f'Test Loss: {test_loss}, Test Accuracy: {test_acc}')


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1m17464789/17464789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
[1m1641221/1641221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Epoch 1/10




[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 100ms/step - accuracy: 0.5800 - loss: 0.8195 - val_accuracy: 0.7029 - val_loss: 0.5979
Epoch 2/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 90ms/step - accuracy: 0.8033 - loss: 0.4441 - val_accuracy: 0.8528 - val_loss: 0.3574
Epoch 3/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 92ms/step - accuracy: 0.8873 - loss: 0.2962 - val_accuracy: 0.7836 - val_loss: 0.5183
Epoch 4/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 93ms/step - accuracy: 0.9085 - loss: 0.2516 - val_accuracy: 0.8848 - val_loss: 0.3031
Epoch 5/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 95ms/step - accuracy: 0.9355 - loss: 0.2020 - val_accuracy: 0.8888 - val_loss: 0.3031
Epoch 6/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 96ms/step - accuracy: 0.9487 - loss: 0.1719 - val_accuracy: 0.8908 - val_loss: 0.2935
Epoch 7/10
[1m391/391[0m

In [None]:
!pip install datasets



RUNNING THE MODEL ON THE AMAZON REVIEWS DATASET



In [None]:
from datasets import load_dataset
import numpy as np

dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_review_All_Beauty", trust_remote_code=True)

# Read the two needed columns directly instead of iterating over row
# dictionaries twice: row iteration materialises every field of every review,
# which is far slower on the full split.
full_split = dataset['full']
reviews = list(full_split['text'])
ratings = list(full_split['rating'])

# Binary sentiment target: ratings of 3 and above count as positive (1).
labels = [1 if rating >= 3 else 0 for rating in ratings]

print(f"First review: {reviews[0]}")
print(f"First label: {labels[0]}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


First review: This spray is really nice. It smells really good, goes on really fine, and does the trick. I will say it feels like you need a lot of it though to get the texture I want. I have a lot of hair, medium thickness. I am comparing to other brands with yucky chemicals so I'm gonna stick with this. Try it!
First label: 1


In [None]:
import collections

# Review length measured in whitespace-separated words.
review_lengths = [len(review.split()) for review in reviews]
avg_review_length = np.mean(review_lengths)
max_review_length = np.max(review_lengths)
print(f"Average review length: {avg_review_length}")
print(f"Maximum review length: {max_review_length}")

# Unrestricted tokeniser, fitted only to count the distinct words.
tokeniser = Tokenizer()
tokeniser.fit_on_texts(reviews)
total_unique_words = len(tokeniser.word_index)
print(f"Total unique words in the dataset: {total_unique_words}")

# Read the rating column in one go rather than iterating over full row
# dictionaries, which materialises every field of every review.
rating_distribution = collections.Counter(dataset['full']['rating'])
print(f"Rating distribution: {rating_distribution}")

In [20]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
from sklearn.model_selection import train_test_split

# Tokeniser limited to num_words=10000, the vocab_size used by the Amazon config.
tokeniser = Tokenizer(num_words=10000)
tokeniser.fit_on_texts(reviews)
sequences = tokeniser.texts_to_sequences(reviews)

# Fixed-length inputs (300 tokens) and labels as a NumPy array.
x_data = pad_sequences(sequences, maxlen=300)
y_data = np.array(labels)

# 80/20 train/test split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x_data,
    y_data,
    test_size=0.2,
    random_state=42,
)


In [21]:
def load_glove_embeddings(glove_file_path, embedding_dim):
    """Parse a GloVe text file into a ``{word: vector}`` dictionary.

    Every non-blank line must consist of a token followed by exactly
    ``embedding_dim`` whitespace-separated numbers.

    Args:
        glove_file_path: Path to the UTF-8 encoded GloVe text file.
        embedding_dim: Number of coefficients expected on every line.

    Returns:
        dict mapping each token to a float32 NumPy array of length
        ``embedding_dim``.

    Raises:
        ValueError: If a line does not carry exactly ``embedding_dim``
            coefficients, or a coefficient is not a valid number.
    """
    embeddings_index = {}
    with open(glove_file_path, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            values = line.split()
            if not values:
                # Blank line (e.g. trailing newline): nothing to parse.
                continue
            word, coefficients = values[0], values[1:]
            # Fail early with a clear message instead of letting a
            # wrong-sized vector surface later as a broadcasting error.
            if len(coefficients) != embedding_dim:
                raise ValueError(
                    f"{glove_file_path}, line {line_number}: expected "
                    f"{embedding_dim} coefficients, found {len(coefficients)}"
                )
            embeddings_index[word] = np.asarray(coefficients, dtype='float32')
    return embeddings_index

def create_embedding_matrix(word_index, glove_embeddings, vocab_size, embedding_dim):
    """Build a ``(vocab_size, embedding_dim)`` matrix of pre-trained vectors.

    Row ``i`` receives the vector of the word that ``word_index`` maps to ``i``.
    Rows stay all-zero when the index has no word, or when its word is missing
    from ``glove_embeddings``. Words whose index is ``>= vocab_size`` are ignored.
    """
    matrix = np.zeros((vocab_size, embedding_dim))
    in_vocabulary = (
        (token, row) for token, row in word_index.items() if row < vocab_size
    )
    for token, row in in_vocabulary:
        vector = glove_embeddings.get(token)
        if vector is None:
            continue  # no pre-trained vector: keep the zero row
        matrix[row] = vector
    return matrix

glove_file_path = '/content/drive/MyDrive/glove/glove.6B.300d.txt'
glove_embeddings = load_glove_embeddings(glove_file_path, 300)

# Embedding matrix aligned with the ids of the tokeniser fitted on the Amazon
# reviews. The variable is spelled `tokeniser` in this notebook; the previous
# `tokenizer` was undefined and raised NameError.
word_index = tokeniser.word_index
embedding_matrix = create_embedding_matrix(word_index, glove_embeddings, 10000, 300)


In [22]:
class Config:
    """Settings read by CLSTMClassifier for the Amazon reviews run."""

    # input / output dimensions
    vocab_size = 10000      # limit vocabulary size to top 10,000 words
    max_length = 300        # based on the average review length analysis
    embedding_size = 300    # 300-dimensional GloVe embeddings
    num_classes = 2         # binary classification (positive/negative)

    # convolution + recurrent stack
    filter_sizes = "3,4,5"  # convolution filter sizes
    num_filters = 64        # num of filters for each filter size
    num_layers = 1          # LSTM layer

    # regularisation
    l2_reg_lambda = 0.1     # L2 regularisation to prevent overfitting
    keep_prob = 0.5         # value passed to the model's dropout layer

# Instantiate the C-LSTM model from the updated configuration and the
# embedding matrix, then compile it for integer class labels.
config = Config()
model = CLSTMClassifier(config, embedding_matrix)

model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)




In [None]:
BATCH_SIZE, EPOCHS = 64, 10

# Train on the Amazon split, reporting metrics on the held-out 20% after
# every epoch.
history = model.fit(
    x=x_train,
    y=y_train,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_data=(x_test, y_test),
    verbose=1,
)


Epoch 1/10
[1m8770/8770[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m420s[0m 47ms/step - accuracy: 0.8978 - loss: 0.2671 - val_accuracy: 0.9256 - val_loss: 0.1901
Epoch 2/10
[1m8770/8770[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m420s[0m 48ms/step - accuracy: 0.9300 - loss: 0.1800 - val_accuracy: 0.9276 - val_loss: 0.1838
Epoch 3/10
[1m4132/8770[0m [32m━━━━━━━━━[0m[37m━━━━━━━━━━━[0m [1m3:21[0m 43ms/step - accuracy: 0.9386 - loss: 0.1619

In [None]:
# Loss and accuracy of the trained model on the held-out split.
test_loss, test_acc = model.evaluate(x=x_test, y=y_test)
print(f'Test Loss: {test_loss}, Test Accuracy: {test_acc}')


RUNNING THE MODEL ON THE YELP REVIEWS DATASET