In [22]:
import os

os.environ["KERAS_BACKEND"] = "torch"  # or jax, or tensorflow

import keras_hub

import keras
from keras import layers
from keras.layers import TextVectorization

from dataclasses import dataclass
import pandas as pd
import numpy as np
import glob
import re
from pprint import pprint

In [23]:
@dataclass
class Config:
    MAX_LEN = 256
    BATCH_SIZE = 32
    LR = 0.001
    VOCAB_SIZE = 30000
    EMBED_DIM = 128
    NUM_HEAD = 8  # used in bert model
    FF_DIM = 128  # used in bert model
    NUM_LAYERS = 1
    path=r"C:/Users/a3318/Downloads/aclImdb/"


config = Config()

In [24]:
def get_text_list_from_files(files):
    text_list=[]
    for name in files:
        with open(name,encoding='utf-8') as f:
            for line in f:
                text_list.append(line)

    return text_list

def get_data_from_text_files(folder_name):
    pos_files = glob.glob(config.path + folder_name + "/pos/*.txt")
    pos_texts = get_text_list_from_files(pos_files)
    neg_files = glob.glob(config.path + folder_name + "/neg/*.txt")
    neg_texts = get_text_list_from_files(neg_files)
    df = pd.DataFrame(
        {
            "review": pos_texts + neg_texts,
            "sentiment": [0] * len(pos_texts) + [1] * len(neg_texts),
        }
    )
    df = df.sample(len(df)).reset_index(drop=True)
    return df

In [25]:
train_df = get_data_from_text_files("train")
test_df = get_data_from_text_files("test")

In [53]:
train_df['review']

0        I rented this by mistake. I thought, after a c...
1        TESS OF THE STORM COUNTRY is possibly the best...
2        This movie must be in line for the most boring...
3        This movie had it all,action,comedy,heroics,an...
4        This miserable film is a remake of a 1927 film...
                               ...                        
24995    This film is the worst excuse for a motion pic...
24996    I'm going to review the 2 films as a whole bec...
24997    Kirk and crew land on a lonely planet where th...
24998    A very strange and compelling movie. It's abou...
24999    There's a theory of time that posits that all ...
Name: review, Length: 25000, dtype: object

In [27]:
all_data = pd.concat([train_df, test_df], ignore_index=True)

In [28]:
import tensorflow as tf

def custom_standardization(input_data):
    lowercase = tf.strings.lower(input_data)
    stripped_html = tf.strings.regex_replace(lowercase, "<br />", " ")
    return tf.strings.regex_replace(
        stripped_html, "[%s]" % re.escape("!#$%&'()*+,-./:;<=>?@\^_`{|}~"), ""
    )

In [29]:
def get_vectorize_layer(texts, vocab_size, max_seq, special_tokens=["[MASK]"]):
    """Build Text vectorization layer

    Args:
      texts (list): List of string i.e input texts
      vocab_size (int): vocab size
      max_seq (int): Maximum sequence length.
      special_tokens (list, optional): List of special tokens. Defaults to ['[MASK]'].

    Returns:
        layers.Layer: Return TextVectorization Keras Layer
    """
    vectorize_layer = TextVectorization(
        max_tokens=vocab_size,
        output_mode="int",
        standardize=custom_standardization,
        output_sequence_length=max_seq,
    )
    vectorize_layer.adapt(texts)

    # Insert mask token in vocabulary
    vocab = vectorize_layer.get_vocabulary()
    vocab = vocab[2 : vocab_size - len(special_tokens)] + ["[mask]"]
    vectorize_layer.set_vocabulary(vocab)
    return vectorize_layer


vectorize_layer = get_vectorize_layer(
    all_data.review.values.tolist(),
    config.VOCAB_SIZE,
    config.MAX_LEN,
    special_tokens=["[mask]"],
)

In [30]:
def encode(texts):
    encoded_texts = vectorize_layer(texts)
    return encoded_texts.numpy()

In [46]:
inp_mask = np.random.rand(25000,256) < 0.15
mask_token_id = vectorize_layer(["[mask]"]).numpy()[0][0]

In [47]:
def get_masked_input_and_labels(encoded_texts):
    # 15% BERT masking
    inp_mask = np.random.rand(*encoded_texts.shape) < 0.15
    # Do not mask special tokens
    inp_mask[encoded_texts <= 2] = False
    # Set targets to -1 by default, it means ignore
    labels = -1 * np.ones(encoded_texts.shape, dtype=int)
    # Set labels for masked tokens
    labels[inp_mask] = encoded_texts[inp_mask]

    # Prepare input
    encoded_texts_masked = np.copy(encoded_texts)
    # Set input to [MASK] which is the last token for the 90% of tokens
    # This means leaving 10% unchanged
    inp_mask_2mask = inp_mask & (np.random.rand(*encoded_texts.shape) < 0.90)
    encoded_texts_masked[inp_mask_2mask] = (
        mask_token_id  # mask token is the last in the dict
    )

    # Set 10% to a random token
    inp_mask_2random = inp_mask_2mask & (np.random.rand(*encoded_texts.shape) < 1 / 9)
    encoded_texts_masked[inp_mask_2random] = np.random.randint(
        3, mask_token_id, inp_mask_2random.sum()
    )

    # Prepare sample_weights to pass to .fit() method
    sample_weights = np.ones(labels.shape)
    sample_weights[labels == -1] = 0

    # y_labels would be same as encoded_texts i.e input tokens
    y_labels = np.copy(encoded_texts)

    return encoded_texts_masked, y_labels, sample_weights

In [48]:
x_train = encode(train_df.review.values)

In [57]:
train_classifier_ds

<_BatchDataset element_spec=(TensorSpec(shape=(None, 256), dtype=tf.int64, name=None), TensorSpec(shape=(None,), dtype=tf.int64, name=None))>

In [34]:
y_train = train_df.sentiment.values
train_classifier_ds = (
    tf.data.Dataset.from_tensor_slices((x_train, y_train))
    .shuffle(1000)
    .batch(config.BATCH_SIZE)
)

In [35]:
x_test = encode(test_df.review.values)
y_test = test_df.sentiment.values
test_classifier_ds = tf.data.Dataset.from_tensor_slices((x_test, y_test)).batch(
    config.BATCH_SIZE
)

# Dataset for end to end model input (will be used at the end)
test_raw_classifier_ds = test_df

In [58]:
x_all_review = encode(all_data.review.values)
x_masked_train, y_masked_labels, sample_weights = get_masked_input_and_labels(
    x_all_review
)

mlm_ds = tf.data.Dataset.from_tensor_slices(
    (x_masked_train, y_masked_labels, sample_weights)
)
mlm_ds = mlm_ds.shuffle(1000).batch(config.BATCH_SIZE)

In [161]:
y_masked_labels.shape

(50000, 256)

In [298]:
def bert_module(query, key, value, i):
    attention_output=keras.layers.MultiHeadAttention(num_heads=config.NUM_HEAD,key_dim=config.EMBED_DIM//config.NUM_HEAD,
                                                     name="encoder_{}_multiheadattention".format(i))(key,query,value)
    attention_output = layers.Dropout(0.1, name="encoder_{}_att_dropout".format(i))(
        attention_output
    )
    attention_output = layers.LayerNormalization(
        epsilon=1e-6, name="encoder_{}_att_layernormalization".format(i)
    )(query + attention_output)

    ffn= keras.Sequential([
        layers.Dense(config.FF_DIM,activation='relu'),
        layers.Dense(config.EMBED_DIM),
        
    ],name="encoder_{}_ffn".format(i))

    ffn_output = ffn(attention_output)
    ffn_output = layers.Dropout(0.1, name="encoder_{}_ffn_dropout".format(i))(
        ffn_output
    )
    sequence_output = layers.LayerNormalization(
        epsilon=1e-6, name="encoder_{}_ffn_layernormalization".format(i)
    )(attention_output + ffn_output)
    return sequence_output



In [286]:
loss_fn = keras.losses.SparseCategoricalCrossentropy(reduction=None)
loss_tracker = keras.metrics.Mean(name="loss")

In [287]:
class MaskedLanguageModel(keras.Model):

    def compute_loss(self, x=None, y=None, y_pred=None, sample_weight=None):

        loss = loss_fn(y, y_pred, sample_weight)
        loss_tracker.update_state(loss, sample_weight=sample_weight)
        return keras.ops.sum(loss)

    def compute_metrics(self, x, y, y_pred, sample_weight):

        # Return a dict mapping metric names to current value
        return {"loss": loss_tracker.result()}

    @property
    def metrics(self):
        # We list our `Metric` objects here so that `reset_states()` can be
        # called automatically at the start of each epoch
        # or at the start of `evaluate()`.
        # If you don't implement this property, you have to call
        # `reset_states()` yourself at the time of your choosing.
        return [loss_tracker]

In [295]:
def create_masked_language_bert_model():
    inputs=layers.Input(shape=(config.MAX_LEN,),dtype='int64')
    word_embeddings = layers.Embedding(
        config.VOCAB_SIZE, config.EMBED_DIM, name="word_embedding"
    )(inputs)
    position_embeddings = keras_hub.layers.PositionEmbedding(
        sequence_length=config.MAX_LEN
    )(word_embeddings)
    embeddings = word_embeddings + position_embeddings

    encoder_output = embeddings

    for i in range(config.NUM_LAYERS):
        encoder_output = bert_module(encoder_output, encoder_output, encoder_output, i)

    mlm_output = layers.Dense(config.VOCAB_SIZE, name="mlm_cls", activation="softmax")(
        encoder_output
    )
    mlm_model = MaskedLanguageModel(inputs, mlm_output, name="masked_bert_model")

    optimizer = keras.optimizers.Adam(learning_rate=config.LR)
    mlm_model.compile(optimizer=optimizer)
    return mlm_model

id2token = dict(enumerate(vectorize_layer.get_vocabulary()))
token2id = {y: x for x, y in id2token.items()}

In [299]:
class MaskedTextGenerator(keras.callbacks.Callback):
    def __init__(self, sample_tokens, top_k=5):
        self.sample_tokens = sample_tokens
        self.k = top_k

    def decode(self, tokens):
        return " ".join([id2token[t] for t in tokens if t != 0])

    def convert_ids_to_tokens(self, id):
        return id2token[id]

    def on_epoch_end(self, epoch, logs=None):
        prediction = self.model.predict(self.sample_tokens)

        masked_index = np.where(self.sample_tokens == mask_token_id)
        masked_index = masked_index[1]
        mask_prediction = prediction[0][masked_index]

        top_indices = mask_prediction[0].argsort()[-self.k :][::-1]
        values = mask_prediction[0][top_indices]

        for i in range(len(top_indices)):
            p = top_indices[i]
            v = values[i]
            tokens = np.copy(sample_tokens[0])
            tokens[masked_index[0]] = p
            result = {
                "input_text": self.decode(sample_tokens[0].numpy()),
                "prediction": self.decode(tokens),
                "probability": v,
                "predicted mask token": self.convert_ids_to_tokens(p),
            }
            pprint(result)


sample_tokens = vectorize_layer(["I have watched this [mask] and it was awesome"])
generator_callback = MaskedTextGenerator(sample_tokens.numpy())

bert_masked_model = create_masked_language_bert_model()
bert_masked_model.summary()

In [None]:
bert_masked_model.fit(mlm_ds, epochs=5, callbacks=[generator_callback])

Epoch 1/5
[1m 203/1563[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m3:21:50[0m 9s/step - loss: 7.9634

In [162]:
from tensorflow.keras.losses import SparseCategoricalCrossentropy

y_true = [2, 0]                     # Class IDs
y_pred = [[0.1, 0.3, 0.6], [0.7, 0.2, 0.1]]

loss = SparseCategoricalCrossentropy()(y_true, y_pred)
print("SparseCategoricalCrossentropy loss:", loss.numpy())


ValueError: `a` and `b` don't have the same structure. Received: structure of a=PyTreeSpec([*, *], NoneIsLeaf), structure of b=PyTreeSpec([[*, *, *], [*, *, *]], NoneIsLeaf)

In [166]:
import numpy as np

x=np.random.random((5,256,1000))
y=np.random.randint(0, 256, size=(5, 256), dtype=np.uint8)

In [173]:
loss = SparseCategoricalCrossentropy(reduction='none')(y, x)

In [174]:
loss

tensor([[ 8.4878,  6.2419, 10.1478,  ...,  9.9982,  6.7376,  6.2814],
        [ 6.2541,  6.7639,  7.3305,  ...,  6.6423,  6.5763,  7.0525],
        [ 6.8582, 10.1261,  7.9103,  ...,  7.4049,  6.7026,  6.8705],
        [ 6.5660,  6.1991,  6.4111,  ...,  6.5698,  9.2167,  7.7192],
        [ 8.3977,  7.8481,  8.1594,  ...,  6.2281,  6.2781,  6.8752]])

In [144]:
xx=x1(x)

In [152]:
xy = np.random.randint(0, 256, size=(5, 256), dtype=np.uint8)

In [133]:
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(reduction='none')

In [130]:
t1,t2=np.array([[0.51455494, 0.53035161, 0.64331915, 0.59038161],[0.51455494, 0.53035161, 0.64331915, 0.59038161]]),np.array([3,1])

In [112]:
t1.shape

(4,)

In [155]:
loss_fn(xy,xx)

tensor([[10.0939, 10.2668, 10.7452,  ..., 10.0202, 10.7854, 10.3159],
        [11.1202, 10.1544, 10.1013,  ..., 10.0233, 10.1789, 10.4505],
        [11.5520, 10.4220, 11.9781,  ..., 10.8833, 10.7110, 10.2547],
        [ 9.7282, 10.1410,  9.8785,  ...,  9.6930, 10.9449, 11.6794],
        [ 9.9433, 11.0040, 11.4024,  ..., 10.8093, 11.7266,  9.8967]],
       grad_fn=<NegBackward0>)