In [2]:
import numpy as np

import tensorflow_datasets as tfds
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, GlobalAveragePooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [3]:
def load_glove_embeddings(file_path):
    embeddings_index = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            try:
                coefs = np.asarray(values[1:], dtype='float32')
                embeddings_index[word] = coefs
            except:
                pass
    return embeddings_index

In [4]:
glove_path = '/projects/elopez22/AAW/glove/glove.840B.300d.txt'

In [5]:
glove_embeddings = load_glove_embeddings(glove_path)

In [6]:
list(glove_embeddings.keys())[100:110]

['than', 'into', 'only', '3', 'how', 'its', 'first', 'said', 'i', 'If']

In [7]:
glove_embeddings['than']

array([-3.9611e-01,  1.8991e-01, -2.0033e-02, -3.9995e-01,  1.9013e-01,
       -1.9520e-01,  1.7977e-01,  1.8693e-01,  9.7576e-02,  2.6032e+00,
       -9.8194e-02,  1.3384e-01,  5.9189e-02, -1.0991e-01, -1.4721e-01,
        1.8844e-02,  1.4202e-01,  1.0236e+00, -2.9939e-01, -1.8452e-01,
       -2.3512e-01,  1.4060e-01, -1.4630e-02, -2.9891e-01, -1.4654e-01,
       -2.0150e-01, -2.4721e-01, -1.2147e-01,  4.0998e-01,  1.7997e-01,
        2.2555e-02,  1.3101e-01,  2.5632e-02, -2.0560e-01,  2.7974e-01,
       -1.6087e-01, -4.5666e-01,  3.5454e-02,  2.8487e-01, -2.5985e-02,
        5.9670e-02,  4.2833e-01,  1.8573e-01,  3.1357e-01,  6.6385e-02,
       -1.6109e-01, -2.5460e-01, -1.7968e-01,  2.5063e-01, -7.5260e-02,
       -4.9286e-01, -3.9815e-02,  7.8590e-02, -6.3624e-01,  2.9654e-01,
       -4.1694e-02, -1.9570e-01, -1.3311e-01,  8.3527e-03,  1.4585e-01,
        1.6051e-01, -2.1103e-01, -2.5591e-01,  2.6947e-01,  1.4012e-01,
       -3.2852e-01, -2.0914e-02,  4.4786e-01, -1.7867e-01, -1.43

In [8]:
embedding_dim = 300 # based on your glove file choice

In [11]:
(train_data, train_labels), (test_data, test_labels) = tf.keras.datasets.imdb.load_data()

# Load the word index mapping from IMDB
word_index = tf.keras.datasets.imdb.get_word_index()
reverse_word_index = {value: key for key, value in word_index.items()}

# Decode integer-encoded reviews into strings
def decode_review(encoded_review):
    return " ".join([reverse_word_index.get(i, "?") for i in encoded_review])

# Convert all reviews to text
train_texts = [decode_review(review) for review in train_data]
test_texts = [decode_review(review) for review in test_data]

In [12]:
train_texts[0]

"the as you with out themselves powerful lets loves their becomes reaching had journalist of lot from anyone to have after out atmosphere never more room titillate it so heart shows to years of every never going villaronga help moments or of every chest visual movie except her was several of enough more with is now current film as you of mine potentially unfortunately of you than him that with out themselves her get for was camp of you movie sometimes movie that with scary but pratfalls to story wonderful that in seeing in character to of 70s musicians with heart had shadows they of here that with her serious to have does when from why what have critics they is you that isn't one will very to as itself with other tricky in of seen over landed for anyone of gilmore's br show's to whether from than out themselves history he name half some br of 'n odd was two most of mean for 1 any an boat she he should is thought frog but of script you not while history he heart to real at barrel but wh

In [49]:
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(train_texts)
vocab_size = len(tokenizer.word_index) + 1

In [50]:
vocab_size

88582

In [51]:
# Convert texts to sequences
x_train = tokenizer.texts_to_sequences(train_texts)
x_test = tokenizer.texts_to_sequences(test_texts)

# Pad sequences to ensure uniform input length
maxlen = 100
x_train = pad_sequences(x_train, maxlen=maxlen)
x_test = pad_sequences(x_test, maxlen=maxlen)

In [53]:
# Create an embedding matrix
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, index in tokenizer.word_index.items():
    if index < vocab_size:
        embedding_vector = glove_embeddings.get(word)
        if embedding_vector is not None:
            embedding_matrix[index] = embedding_vector


In [55]:
embedding_matrix[1]

array([ 6.02159984e-02,  2.17989996e-01, -4.24900018e-02, -3.86180013e-01,
       -1.53880000e-01,  3.46349999e-02,  2.22430006e-01,  2.17179999e-01,
        6.84829988e-03,  2.43750000e+00, -2.74179995e-01,  1.35720000e-01,
        3.10860008e-01, -6.32060021e-02,  3.82250000e-04, -1.85969993e-01,
       -1.93330005e-01,  1.44470000e+00, -3.85410011e-01, -2.85490006e-01,
        7.56269991e-02, -3.67989987e-02, -4.60680008e-01, -1.68350004e-02,
        1.98210001e-01, -9.27459970e-02,  1.89539999e-01, -3.26479989e-04,
       -1.70809999e-01,  5.03589988e-01,  4.62559998e-01,  2.69010007e-01,
       -1.22560002e-01,  2.47130007e-01,  6.93050027e-02, -2.07770005e-01,
       -4.45600003e-01,  3.02230000e-01, -9.83440038e-03,  3.27719986e-01,
        1.10380001e-01,  4.12710011e-01, -1.58539996e-01, -5.69830015e-02,
        3.89180005e-01, -2.11579993e-01, -1.33070007e-01,  4.04060006e-01,
        1.74899995e-01,  5.39489985e-02,  1.09839998e-01, -1.84760004e-01,
       -5.40140010e-02,  

In [81]:
model = Sequential([
    Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        input_length=maxlen,
        trainable=False  # Freeze embeddings, or set to True to fine-tune
    ),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    Dense(2, activation="sigmoid")
])

model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

In [80]:
train_labels_cat = to_categorical(train_labels, num_classes=2) 

In [79]:
train_labels_cat

array([[0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       ...,
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.]])

In [82]:
history = model.fit(
    x_train, train_labels_cat,
    validation_split=0.2,
    epochs=10,
    batch_size=32
)

Epoch 1/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 8ms/step - accuracy: 0.5553 - loss: 0.6718 - val_accuracy: 0.6796 - val_loss: 0.5736
Epoch 2/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - accuracy: 0.7160 - loss: 0.5428 - val_accuracy: 0.7296 - val_loss: 0.5322
Epoch 3/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - accuracy: 0.7918 - loss: 0.4404 - val_accuracy: 0.7472 - val_loss: 0.4992
Epoch 4/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - accuracy: 0.8438 - loss: 0.3575 - val_accuracy: 0.7836 - val_loss: 0.4658
Epoch 5/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - accuracy: 0.8948 - loss: 0.2539 - val_accuracy: 0.7634 - val_loss: 0.6330
Epoch 6/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - accuracy: 0.9321 - loss: 0.1753 - val_accuracy: 0.7746 - val_loss: 0.6379
Epoch 7/10
[1m625/625[0m 

In [83]:
# New input text
input_text = "This movie was really really great"

# Tokenize the input text
input_sequence = tokenizer.texts_to_sequences([input_text])

# Pad the sequence to match the model's input length
input_padded = pad_sequences(input_sequence, maxlen=maxlen)

In [84]:
predicted_score = model.predict(input_padded)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 131ms/step


In [85]:
predicted_score

array([[0.00471607, 0.99438053]], dtype=float32)

In [86]:
# New input text
input_text = 'The movie was terrible, worst movie ever!'

# Tokenize the input text
input_sequence = tokenizer.texts_to_sequences([input_text])

# Pad the sequence to match the model's input length
input_padded = pad_sequences(input_sequence, maxlen=maxlen)

predicted_score = model.predict(input_padded)
predicted_score

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step


array([[0.80786264, 0.21712498]], dtype=float32)

In [88]:
# New input text
input_text = 'The movie was terrible, worst movie ever! It was also a really good movie!'

# Tokenize the input text
input_sequence = tokenizer.texts_to_sequences([input_text])

# Pad the sequence to match the model's input length
input_padded = pad_sequences(input_sequence, maxlen=maxlen)

predicted_score = model.predict(input_padded)
predicted_score

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step


array([[0.00942444, 0.9873355 ]], dtype=float32)

In [89]:
# New input text
input_text = 'It was a really good movie! The movie was terrible, worst movie ever! '

# Tokenize the input text
input_sequence = tokenizer.texts_to_sequences([input_text])

# Pad the sequence to match the model's input length
input_padded = pad_sequences(input_sequence, maxlen=maxlen)

predicted_score = model.predict(input_padded)
predicted_score

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step


array([[0.21898414, 0.7889716 ]], dtype=float32)

In [3]:
BUFFER_SIZE = 10000
BATCH_SIZE = 64

In [4]:
train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
test_dataset = test_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

In [5]:
VOCAB_SIZE = 1000
encoder = tf.keras.layers.TextVectorization(
    max_tokens=VOCAB_SIZE)
encoder.adapt(train_dataset.map(lambda text, label: text))

2024-11-25 10:10:39.371693: I tensorflow/core/kernels/data/tf_record_dataset_op.cc:376] The default buffer size is 262144, which is overridden by the user specified `buffer_size` of 8388608
2024-11-25 10:10:41.654700: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [6]:
vocab = np.array(encoder.get_vocabulary())
vocab[:20]

array(['', '[UNK]', 'the', 'and', 'a', 'of', 'to', 'is', 'in', 'it', 'i',
       'this', 'that', 'br', 'was', 'as', 'for', 'with', 'movie', 'but'],
      dtype='<U14')

In [7]:
model = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=64,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)
])

In [8]:
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])

In [9]:
history = model.fit(train_dataset, epochs=10,
                    validation_data=test_dataset,
                    validation_steps=30)

Epoch 1/10


I0000 00:00:1732547519.610913 2384320 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 44ms/step - accuracy: 0.5169 - loss: 0.6794 - val_accuracy: 0.7568 - val_loss: 0.4556
Epoch 2/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 43ms/step - accuracy: 0.8007 - loss: 0.4255 - val_accuracy: 0.8339 - val_loss: 0.3497
Epoch 3/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 43ms/step - accuracy: 0.8483 - loss: 0.3458 - val_accuracy: 0.8573 - val_loss: 0.3158
Epoch 4/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 43ms/step - accuracy: 0.8599 - loss: 0.3248 - val_accuracy: 0.8609 - val_loss: 0.3433
Epoch 5/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 43ms/step - accuracy: 0.8596 - loss: 0.3174 - val_accuracy: 0.8594 - val_loss: 0.3127
Epoch 6/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 43ms/step - accuracy: 0.8680 - loss: 0.3035 - val_accuracy: 0.8323 - val_loss: 0.3480
Epoch 7/10
[1m391/391[0m 

2024-11-25 10:14:50.600052: W tensorflow/core/kernels/data/cache_dataset_ops.cc:914] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [13]:
sample_text = ('The movie was cool. The animation and the graphics '
               'were out of this world. I would recommend this movie.')

# Ensure the input is in a format the model expects (batch of strings)
prediction = model.predict(tf.constant([sample_text]))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step


In [14]:
prediction

array([[0.68256414]], dtype=float32)

In [15]:
sample_text = ('The movie was terrible, worst movie ever!')

# Ensure the input is in a format the model expects (batch of strings)
prediction = model.predict(tf.constant([sample_text]))

ValueError: Unrecognized data type: x=The movie was terrible, worst movie ever! (of type <class 'str'>)

In [16]:
prediction

array([[-3.8897943]], dtype=float32)