In [1]:
import tensorflow as tf
import numpy as np
import tensorflow_datasets as tfds
from helper import build_LSTM_model, compile_LSTM_model, train_LSTM_model, create_input_output

## Data Loading

In [2]:
# Load the IMDB reviews dataset together with its metadata object.
# as_supervised=True makes every element a 2-tuple; only the first item (the
# review text) is kept below, the second item is dropped.
dataset, info = tfds.load(
    'imdb_reviews',
    with_info=True,
    as_supervised=True,
)
train_data = dataset['train'].map(lambda review, _label: review)
test_data = dataset['test'].map(lambda review, _label: review)

In [3]:
# Keep the experiment small: materialize only the first N_TRAIN_TEXTS training
# reviews and the first N_TEST_TEXTS test reviews as Python lists of tensors.
N_TRAIN_TEXTS = 1000
N_TEST_TEXTS = 250

train_texts = list(train_data.take(N_TRAIN_TEXTS))
test_texts = list(test_data.take(N_TEST_TEXTS))

2024-11-21 19:38:19.475321: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.
2024-11-21 19:38:19.504921: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


### Create Bigrams

In [4]:
def tokenize(text):
    """Split one review into whitespace-separated tokens.

    Args:
        text: Object whose ``numpy()`` method returns the review as UTF-8
            encoded bytes (the elements of ``train_texts`` / ``test_texts``).

    Returns:
        List of ``str`` tokens; runs of whitespace act as a single separator,
        so no empty tokens are produced.
    """
    raw_bytes = text.numpy()
    decoded = raw_bytes.decode('utf-8')
    return decoded.split()

In [5]:
def extract_bigrams(text):
    """Return every pair of adjacent tokens in ``text``.

    Args:
        text: A review in the form accepted by ``tokenize``.

    Returns:
        List of ``(token, next_token)`` tuples in reading order. A review with
        fewer than two tokens yields an empty list.
    """
    tokens = tokenize(text)
    # Zipping the tokens with themselves shifted by one position produces each
    # adjacent pair exactly once and stops at the last full pair.
    return list(zip(tokens, tokens[1:]))

### Extract bigrams

In [6]:
# Collect the bigrams of every review into one flat list per split; reviews are
# processed in order, so the list keeps the reviews' original sequence.
train_bigrams = [
    pair
    for review in train_texts
    for pair in extract_bigrams(review)
]
test_bigrams = [
    pair
    for review in test_texts
    for pair in extract_bigrams(review)
]

### Building vocabulary

In [7]:
# Flatten the (first, second) pairs into one flat list of words per split.
# NOTE(review): consecutive bigrams share a word, so every interior word of a
# review shows up twice in these lists — confirm that the downstream sequence
# builder expects this.
train_words = [token for pair in train_bigrams for token in pair]
test_words = [token for pair in test_bigrams for token in pair]

In [8]:
# Sort the unique training words so that every word receives the same index on
# every run; iterating a bare set of strings can produce a different order in
# each interpreter session because string hashing is randomized.
vocab = sorted(set(train_words))
word_to_idx = {word: idx for idx, word in enumerate(vocab)}

# Reserve the next free index for out-of-vocabulary words. setdefault keeps the
# existing entry if the literal token "<UNK>" already occurs in the training
# text, so no index is left without a word.
word_to_idx.setdefault("<UNK>", len(word_to_idx))  # Unknown token

# Count <UNK> in vocab_size so that every index stored in word_to_idx lies in
# range(vocab_size), the size that is later handed to the model builder.
vocab_size = len(word_to_idx)
idx_to_word = {idx: word for word, idx in word_to_idx.items()}

### Convert Words to Indices

In [9]:
# Map every word to its integer index; words missing from the training
# vocabulary fall back to the index of the "<UNK>" entry.
unk_index = word_to_idx["<UNK>"]
train_sequences = [word_to_idx.get(token, unk_index) for token in train_words]
test_sequences = [word_to_idx.get(token, unk_index) for token in test_words]

## Create input-output sequences

In [10]:
# Turn each index stream into model inputs (X) and targets (y).
# NOTE(review): create_input_output is imported from helper.py, which is not
# shown in this notebook; the window length and the shapes of X and y are
# whatever that helper defines — confirm there.
X_train, y_train = create_input_output(train_sequences)
X_test, y_test = create_input_output(test_sequences)

## Model building, compiling, and training

In [11]:
# Build the network from the vocabulary size and the second dimension of
# X_train, then compile it with the project helper.
model = build_LSTM_model(vocab_size, X_train.shape[1])
compile_LSTM_model(model)
# Request the first GPU for the training run. The test split is passed to the
# training helper as well.
# NOTE(review): presumably it serves as validation data — confirm in helper.py.
with tf.device('/GPU:0'):
    train_LSTM_model(model, X_train, y_train, X_test, y_test)

Epoch 1/3
Epoch 2/3
Epoch 3/3


## Calculate cross-entropy loss and perplexity

In [12]:
def calculate_perplexity(model, X, y):
    """Calculate perplexity of the model.

    Perplexity is ``exp`` of the mean negative log-probability that the model
    assigns to the true class of each sample.

    Args:
        model: Object with a ``predict(X)`` method returning a 2-D array of
            per-class probabilities, one row per sample.
        X: Inputs, forwarded unchanged to ``model.predict``.
        y: Integer class index of the true target for each row of the
            predictions. A column vector is flattened to one dimension.

    Returns:
        The perplexity as a NumPy ``float64`` scalar.

    Raises:
        ValueError: If ``y`` is empty, or if its length differs from the
            number of prediction rows.
    """
    # No device is pinned here: TensorFlow already prefers an available GPU for
    # predict, the rest is plain NumPy, and a hard-coded '/GPU:0' scope would
    # only get in the way on CPU-only machines.
    predictions = np.asarray(model.predict(X))
    targets = np.asarray(y).reshape(-1).astype(np.int64)

    n_samples = targets.shape[0]
    if n_samples == 0:
        raise ValueError("Cannot compute perplexity: y contains no samples.")
    if predictions.shape[0] != n_samples:
        raise ValueError(
            f"Got {predictions.shape[0]} prediction rows for {n_samples} targets."
        )

    # Pick, for every row, the probability assigned to its true class. The
    # values are widened to float64 so the small smoothing constant is not
    # lost to float32 rounding.
    true_class_probs = predictions[np.arange(n_samples), targets].astype(np.float64)
    log_probs = np.log(true_class_probs + 1e-10)  # Smoothing to avoid log(0)

    return np.exp(-log_probs.mean())

# Score the trained model on the test split and report the result.
perplexity = calculate_perplexity(model, X_test, y_test)
print(f'Perplexity: {perplexity}')

