In [1]:
!pip install pandas
!pip install tokenizers



In [2]:
!wget -nc https://download.microsoft.com/download/E/5/F/E5FCFCEE-7005-4814-853D-DAA7C66507E0/WikiQACorpus.zip
!unzip -n WikiQACorpus.zip
!ls

--2024-02-15 12:06:52--  https://download.microsoft.com/download/E/5/F/E5FCFCEE-7005-4814-853D-DAA7C66507E0/WikiQACorpus.zip
Resolving download.microsoft.com (download.microsoft.com)... 23.62.142.15, 2600:1407:3c00:e9b::317f, 2600:1407:3c00:ea3::317f
Connecting to download.microsoft.com (download.microsoft.com)|23.62.142.15|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7094233 (6.8M) [application/octet-stream]
Saving to: ‘WikiQACorpus.zip’


2024-02-15 12:06:53 (60.8 MB/s) - ‘WikiQACorpus.zip’ saved [7094233/7094233]

Archive:  WikiQACorpus.zip
   creating: WikiQACorpus/emnlp-table/
  inflating: WikiQACorpus/emnlp-table/WikiQA.CNN.dev.rank  
  inflating: WikiQACorpus/emnlp-table/WikiQA.CNN.test.rank  
  inflating: WikiQACorpus/emnlp-table/WikiQA.CNN-Cnt.dev.rank  
  inflating: WikiQACorpus/emnlp-table/WikiQA.CNN-Cnt.test.rank  
  inflating: WikiQACorpus/eval.py    
  inflating: WikiQACorpus/Guidelines_Phase1.pdf  
  inflating: WikiQACorpus/Guidelines_Phase2.

In [3]:
# import TensorFlow
import tensorflow as tf
tf.config.run_functions_eagerly(True)

In [4]:
!ls WikiQACorpus

emnlp-table	       README.txt		WikiQASent.pos.ans.tsv	  WikiQA-train.ref
eval.py		       WikiQA-dev-filtered.ref	WikiQA-test-filtered.ref  WikiQA-train.tsv
Guidelines_Phase1.pdf  WikiQA-dev.ref		WikiQA-test.ref		  WikiQA-train.txt
Guidelines_Phase2.pdf  WikiQA-dev.tsv		WikiQA-test.tsv		  WikiQA.tsv
LICENSE.pdf	       WikiQA-dev.txt		WikiQA-test.txt


In [5]:
import pandas as pd
def get_dataset(ds_path: str):
  ds = pd.read_csv(ds_path, sep='\t', names=['question', 'answer', 'label'])
  ds = ds[ds['label'] == 1]
  return ds

train_ds = get_dataset("./WikiQACorpus/WikiQA-train.txt")
val_ds = get_dataset("./WikiQACorpus/WikiQA-dev.txt")
test_ds = get_dataset("./WikiQACorpus/WikiQA-test.txt")

In [6]:
train_ds

Unnamed: 0,question,answer,label
3,how are glacier caves formed ?,A glacier cave is a cave formed within the ice...,1
75,how much is 1 tablespoon of water,This tablespoon has a capacity of about 15 mL .,1
83,how much is 1 tablespoon of water,In the USA one tablespoon ( measurement unit )...,1
84,how much is 1 tablespoon of water,In Australia one tablespoon ( measurement unit...,1
98,how much are the harry potter movies worth,The series also originated much tie-in merchan...,1
...,...,...,...
20305,What is an economic feature ?,"At the turn of the 21st century , the expandin...",1
20320,what is the average american income,"U.S. median household income fell from $ 51,14...",1
20338,When was Apple Computer founded,"The company was founded on April 1 , 1976 , an...",1
20348,what is section eight housing,"Section 8 of the Housing Act of 1937 ( ) , oft...",1


In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import re

MAX_SEQ_LENGTH = 50

class CustomTokenizer:
    def __init__(self, max_words=10000, max_sequence_length=100):
        self.tokenizer = Tokenizer(num_words=max_words, oov_token='<UNK>')
        self.max_sequence_length = max_sequence_length
        self.tokenizer.fit_on_texts(['<PAD>', '<START>', '<STOP>', '<UNK>'])

    def preprocess_text(self, text):
        text = text.lower()
        # Remove punctuation
        text = re.sub(r'[`.,*#{}\[\]\-=_():;!?\'"]', '', text)
        return text

    def fit_on_texts(self, texts):
        # Update tokenizer with provided texts
        preprocessed_texts = [self.preprocess_text(text) for text in texts]
        self.tokenizer.fit_on_texts(preprocessed_texts)

    def texts_to_sequences(self, texts):
        preprocessed_texts = [self.preprocess_text(text) for text in texts]
        sequences = self.tokenizer.texts_to_sequences(preprocessed_texts)
        return sequences

    def sequences_to_texts(self, sequences):
        texts = self.tokenizer.sequences_to_texts(sequences)
        return texts

    def pad_sequences(self, sequences):
        padded_sequences = pad_sequences(sequences, maxlen=self.max_sequence_length, padding='post', truncating='post')
        return padded_sequences

texts = [
    "This is a sentence.",
    "Another sentence where.",
    "And yet another one."
]

tokenizer = CustomTokenizer(max_words=20_000, max_sequence_length=MAX_SEQ_LENGTH)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
padded_sequences = tokenizer.pad_sequences(sequences)
texts_reconstructed = tokenizer.sequences_to_texts(sequences)

print("Original texts:", texts)
print("Reconstructed texts:", texts_reconstructed)
print("Sequences:", sequences)
print("Padded sequences:\n", padded_sequences)


Original texts: ['This is a sentence.', 'Another sentence where.', 'And yet another one.']
Reconstructed texts: ['this is a sentence', 'another sentence where', 'and yet another one']
Sequences: [[8, 9, 10, 2], [3, 2, 11], [12, 13, 3, 14]]
Padded sequences:
 [[ 8  9 10  2  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0]
 [ 3  2 11  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0]
 [12 13  3 14  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0]]


In [7]:
from transformers import AutoTokenizer

MAX_SEQ_LENGTH = 50  # Maximum sequence length for padding

class ChatbotTokenizer:
    def __init__(self, model_name):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.max_seq_length = MAX_SEQ_LENGTH

    def tokenize_sentences(self, sentences):
        # Tokenize sentences and add special tokens for transformer models
        tokenized = self.tokenizer(sentences, padding=True, truncation=True, max_length=self.max_seq_length, return_tensors='pt')
        return tokenized

    def detokenize(self, token_ids):
        # Convert token IDs to text
        text = self.tokenizer.decode(token_ids, skip_special_tokens=True)
        return text

    @property
    def vocab_size(self):
        return self.tokenizer.vocab_size

# Example usage
model_name = "distilbert-base-uncased"  # Change this to the desired transformer model
sentences = [
    "Hello, how are you?",
    "What is the weather today?",
    "Tell me a joke!",
]

tokenizer = ChatbotTokenizer(model_name)
tokenized_sentences = tokenizer.tokenize_sentences(sentences)
print(tokenized_sentences)

# Accessing vocabulary size
print("Vocabulary size:", tokenizer.vocab_size)

# Detokenize token IDs
detokenized_text = tokenizer.detokenize(tokenized_sentences['input_ids'][0])
print("Detokenized text:", detokenized_text)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

{'input_ids': tensor([[ 101, 7592, 1010, 2129, 2024, 2017, 1029,  102],
        [ 101, 2054, 2003, 1996, 4633, 2651, 1029,  102],
        [ 101, 2425, 2033, 1037, 8257,  999,  102,    0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 0]])}
Vocabulary size: 30522
Detokenized text: hello, how are you?


In [8]:
class PositionalEmbedding(tf.keras.layers.Layer):
  def __init__(self, vocab_size, d_model):
    super().__init__()
    self.d_model = d_model
    self.embedding = tf.keras.layers.Embedding(vocab_size, d_model, mask_zero=True)
    self.pos_encoding = self.positional_encoding(length=2048, depth=d_model)

  def positional_encoding(self, length, depth):
    depth = depth/2

    positions = np.arange(length)[:, np.newaxis]     # (seq, 1)
    depths = np.arange(depth)[np.newaxis, :]/depth   # (1, depth)

    angle_rates = 1 / (10000**depths)         # (1, depth)
    angle_rads = positions * angle_rates      # (pos, depth)

    pos_encoding = np.concatenate(
        [np.sin(angle_rads), np.cos(angle_rads)],
        axis=-1)

    return tf.cast(pos_encoding, dtype=tf.float32)
  def compute_mask(self, *args, **kwargs):
    return self.embedding.compute_mask(*args, **kwargs)

  def call(self, x):
    length = tf.shape(x)[1]
    x = self.embedding(x)
    # This factor sets the relative scale of the embedding and positonal_encoding.
    x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    x = x + self.pos_encoding[tf.newaxis, :length, :]
    return x


In [9]:
class BaseAttention(tf.keras.layers.Layer):
  def __init__(self, **kwargs):
    super().__init__()
    self.mha = tf.keras.layers.MultiHeadAttention(**kwargs)
    self.layernorm = tf.keras.layers.LayerNormalization()
    self.add = tf.keras.layers.Add()

class CrossAttention(BaseAttention):
  def call(self, x, context):
    attn_output, attn_scores = self.mha(
        query=x,
        key=context,
        value=context,
        return_attention_scores=True)

    # Cache the attention scores for plotting later.
    self.last_attn_scores = attn_scores

    x = self.add([x, attn_output])
    x = self.layernorm(x)

    return x

class GlobalSelfAttention(BaseAttention):
  def call(self, x):
    attn_output = self.mha(
        query=x,
        value=x,
        key=x)
    x = self.add([x, attn_output])
    x = self.layernorm(x)
    return x

class CausalSelfAttention(BaseAttention):
  def call(self, x):
    attn_output = self.mha(
        query=x,
        value=x,
        key=x,
        use_causal_mask = True)
    x = self.add([x, attn_output])
    x = self.layernorm(x)
    return x


class FeedForward(tf.keras.layers.Layer):
  def __init__(self, d_model, dff, dropout_rate=0.1):
    super().__init__()
    self.seq = tf.keras.Sequential([
      tf.keras.layers.Dense(dff, activation='relu'),
      tf.keras.layers.Dense(d_model),
      tf.keras.layers.Dropout(dropout_rate)
    ])
    self.add = tf.keras.layers.Add()
    self.layer_norm = tf.keras.layers.LayerNormalization()

  def call(self, x):
    x = self.add([x, self.seq(x)])
    x = self.layer_norm(x)
    return x


class EncoderLayer(tf.keras.layers.Layer):
  def __init__(self,*, d_model, num_heads, dff, dropout_rate=0.1):
    super().__init__()

    self.self_attention = GlobalSelfAttention(
        num_heads=num_heads,
        key_dim=d_model,
        dropout=dropout_rate)

    self.ffn = FeedForward(d_model, dff)

  def call(self, x):
    x = self.self_attention(x)
    x = self.ffn(x)
    return x


class Encoder(tf.keras.layers.Layer):
  def __init__(self, *, num_layers, d_model, num_heads,
               dff, vocab_size, dropout_rate=0.1):
    super().__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    self.pos_embedding = PositionalEmbedding(
        vocab_size=vocab_size, d_model=d_model)

    self.enc_layers = [
        EncoderLayer(d_model=d_model,
                     num_heads=num_heads,
                     dff=dff,
                     dropout_rate=dropout_rate)
        for _ in range(num_layers)]
    self.dropout = tf.keras.layers.Dropout(dropout_rate)

  def call(self, x):
    # `x` is token-IDs shape: (batch, seq_len)
    x = self.pos_embedding(x)  # Shape `(batch_size, seq_len, d_model)`.

    # Add dropout.
    x = self.dropout(x)

    for i in range(self.num_layers):
      x = self.enc_layers[i](x)

    return x  # Shape `(batch_size, seq_len, d_model)`.

class DecoderLayer(tf.keras.layers.Layer):
  def __init__(self,
               *,
               d_model,
               num_heads,
               dff,
               dropout_rate=0.1):
    super(DecoderLayer, self).__init__()

    self.causal_self_attention = CausalSelfAttention(
        num_heads=num_heads,
        key_dim=d_model,
        dropout=dropout_rate)

    self.cross_attention = CrossAttention(
        num_heads=num_heads,
        key_dim=d_model,
        dropout=dropout_rate)

    self.ffn = FeedForward(d_model, dff)

  def call(self, x, context):
    x = self.causal_self_attention(x=x)
    x = self.cross_attention(x=x, context=context)

    # Cache the last attention scores for plotting later
    self.last_attn_scores = self.cross_attention.last_attn_scores

    x = self.ffn(x)  # Shape `(batch_size, seq_len, d_model)`.
    return x

class Decoder(tf.keras.layers.Layer):
  def __init__(self, *, num_layers, d_model, num_heads, dff, vocab_size,
               dropout_rate=0.1):
    super(Decoder, self).__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    self.pos_embedding = PositionalEmbedding(vocab_size=vocab_size,
                                             d_model=d_model)
    self.dropout = tf.keras.layers.Dropout(dropout_rate)
    self.dec_layers = [
        DecoderLayer(d_model=d_model, num_heads=num_heads,
                     dff=dff, dropout_rate=dropout_rate)
        for _ in range(num_layers)]

    self.last_attn_scores = None

  def call(self, x, context):
    # `x` is token-IDs shape (batch, target_seq_len)
    x = self.pos_embedding(x)  # (batch_size, target_seq_len, d_model)

    x = self.dropout(x)

    for i in range(self.num_layers):
      x  = self.dec_layers[i](x, context)

    self.last_attn_scores = self.dec_layers[-1].last_attn_scores

    # The shape of x is (batch_size, target_seq_len, d_model).
    return x

In [10]:
class Transformer(tf.keras.Model):
  def __init__(self, *, num_layers, d_model, num_heads, dff,
               input_vocab_size, target_vocab_size, dropout_rate=0.1):
    super().__init__()
    self.encoder = Encoder(num_layers=num_layers, d_model=d_model,
                           num_heads=num_heads, dff=dff,
                           vocab_size=input_vocab_size,
                           dropout_rate=dropout_rate)

    self.decoder = Decoder(num_layers=num_layers, d_model=d_model,
                           num_heads=num_heads, dff=dff,
                           vocab_size=target_vocab_size,
                           dropout_rate=dropout_rate)

    self.final_layer = tf.keras.layers.Dense(target_vocab_size)

  def call(self, inputs):
    # To use a Keras model with `.fit` you must pass all your inputs in the
    # first argument.
    context, x  = inputs

    context = self.encoder(context)  # (batch_size, context_len, d_model)

    x = self.decoder(x, context)  # (batch_size, target_len, d_model)

    # Final linear layer output.
    logits = self.final_layer(x)  # (batch_size, target_len, target_vocab_size)

    try:
      # Drop the keras mask, so it doesn't scale the losses/metrics.
      # b/250038731
      del logits._keras_mask
    except AttributeError:
      pass

    # Return the final output and the attention weights.
    return logits

In [11]:
## Hyper Params
num_layers = 6
d_model = 128
dff = 512
num_heads = 8
dropout_rate = 0.1

In [12]:
import numpy as np

In [13]:
transformer = Transformer(
    num_layers=num_layers,
    d_model=d_model,
    num_heads=num_heads,
    dff=dff,
    input_vocab_size=tokenizer.vocab_size,
    target_vocab_size=tokenizer.vocab_size,
    dropout_rate=dropout_rate)

# Preparing for training

In [69]:
import numpy as np

def prepare_batch(batch_df, tokenizer):
    qs = batch_df['question'].tolist()
    ans = batch_df['answer'].tolist()

    qs_tokenized = tokenizer.tokenize_sentences(qs)
    qs_input_ids = qs_tokenized['input_ids']
    qs_attention_masks = qs_tokenized['attention_mask']

    ans_tokenized = tokenizer.tokenize_sentences(ans)
    ans_input_ids = ans_tokenized['input_ids']
    ans_attention_masks = ans_tokenized['attention_mask']

    return ((np.array(qs_input_ids), np.array(ans_input_ids)), np.array(ans_input_ids), np.array(ans_attention_masks))

# Example usage
train_batches = prepare_batch(train_ds, tokenizer)
val_batches = prepare_batch(val_ds, tokenizer)


def prepare_batch_test(batch_df, tokenizer):
    qs = batch_df[0]
    ans = batch_df[1]

    qs_tokenized = tokenizer.tokenize_sentences(qs)
    qs_input_ids = qs_tokenized['input_ids']
    qs_attention_masks = qs_tokenized['attention_mask']

    ans_tokenized = tokenizer.tokenize_sentences(ans)
    ans_input_ids = ans_tokenized['input_ids']
    ans_attention_masks = ans_tokenized['attention_mask']

    return ((np.array(qs_input_ids), np.array(ans_input_ids)), np.array(ans_input_ids), np.array(ans_attention_masks))


test_batches = prepare_batch_test(test_ds.iloc[1], tokenizer)


In [15]:
test_batches

((array([[  101,  2129,  3060,  4841,  2020, 17352,  2000,  1996,  2149,
            102]]),
  array([[  101,  2004,  2107,  1010,  3060,  7489,  2024,  2000,  2022,
           5182,  2013,  3060,  2137,  2111,  1010,  1996,  3732,  1997,
           3183,  2024,  8481,  1997,  3262,  2225,  1998,  2430, 18076,
           2040,  2020,  1999,  6767, 26896,  7559,  6588,  2716,  2000,
           1996,  2142,  2163,  2011,  2965,  1997,  1996,  3181,  4448,
           6658,  3119,  1012,   102]])),
 array([[  101,  2004,  2107,  1010,  3060,  7489,  2024,  2000,  2022,
          5182,  2013,  3060,  2137,  2111,  1010,  1996,  3732,  1997,
          3183,  2024,  8481,  1997,  3262,  2225,  1998,  2430, 18076,
          2040,  2020,  1999,  6767, 26896,  7559,  6588,  2716,  2000,
          1996,  2142,  2163,  2011,  2965,  1997,  1996,  3181,  4448,
          6658,  3119,  1012,   102]]),
 array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1

In [16]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  def __init__(self, d_model, warmup_steps=4000):
    super().__init__()

    self.d_model = d_model
    self.d_model = tf.cast(self.d_model, tf.float32)

    self.warmup_steps = warmup_steps

  def __call__(self, step):
    step = tf.cast(step, dtype=tf.float32)
    arg1 = tf.math.rsqrt(step)
    arg2 = step * (self.warmup_steps ** -1.5)

    return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)
  def get_config(self):
    config = {
      'd_model': self.d_model,
      'warmup_steps': self.warmup_steps,
     }
    return config

learning_rate = CustomSchedule(d_model)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98,
                                     epsilon=1e-9)

In [17]:
def masked_loss(label, pred):
  mask = label != 0
  loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')
  loss = loss_object(label, pred)

  mask = tf.cast(mask, dtype=loss.dtype)
  loss *= mask

  loss = tf.reduce_sum(loss)/tf.reduce_sum(mask)
  return loss


def masked_accuracy(label, pred):
  pred = tf.argmax(pred, axis=2)
  label = tf.cast(label, pred.dtype)
  matchs = label == pred

  mask = label != 0

  matchs = matchs & mask

  matchs = tf.cast(matchs, dtype=tf.float32)
  mask = tf.cast(mask, dtype=tf.float32)
  return tf.reduce_sum(matchs)/tf.reduce_sum(mask)

In [18]:
transformer.compile(
    loss=masked_loss,
    optimizer=optimizer,
    metrics=[masked_accuracy])

In [19]:
import datetime
log_dir = "./logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
checkpoint_filepath = './tmp/checkpoint'

tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=0)
backup_callback = tf.keras.callbacks.BackupAndRestore(backup_dir="./tmp/backup")
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    include_optimizer=False,
    filepath=checkpoint_filepath+"/{epoch:02d}-{val_masked_accuracy:.2f}",
    monitor='val_masked_accuracy',
    mode='max',
    save_freq='epoch',
    period=1,
    save_best_only=True)



In [20]:
transformer.fit(x=train_batches[0], y=train_batches[1],
                epochs=40, validation_data=(val_batches), callbacks=[tensorboard_callback, backup_callback])

Epoch 1/40








Epoch 2/40



Epoch 3/40



Epoch 4/40



Epoch 5/40



Epoch 6/40



Epoch 7/40



Epoch 8/40



Epoch 9/40



Epoch 10/40



Epoch 11/40



Epoch 12/40



Epoch 13/40



Epoch 14/40



Epoch 15/40



Epoch 16/40



Epoch 17/40



Epoch 18/40



Epoch 19/40



Epoch 20/40



Epoch 21/40



Epoch 22/40



Epoch 23/40



Epoch 24/40



Epoch 25/40



Epoch 26/40



Epoch 27/40



Epoch 28/40



Epoch 29/40



Epoch 30/40



Epoch 31/40



Epoch 32/40



Epoch 33/40



Epoch 34/40



Epoch 35/40



Epoch 36/40



Epoch 37/40



Epoch 38/40



Epoch 39/40



Epoch 40/40





<keras.src.callbacks.History at 0x79ad90178eb0>

# Example 1

In [70]:
# Perform prediction
predictions = transformer.predict(x=test_batches[0])

# Apply argmax operation to get the indices of the maximum values
predicted_ids = tf.argmax(predictions, axis=-1)




In [71]:
test_batches[0]

(array([[  101,  2129,  1037,  2300, 10216,  2573,   102]]),
 array([[  101, 15856,  5452,  2011,  2070,  7337,  1006,  4050, 28667,
         11514,  3217, 18252,  2030, 16933,  1007,  1010,  1998, 16678,
          2943,  2000,  4685,  6228,  2147,  2011,  3048,  1996,  8331,
          1012,   102]]))

In [72]:
import tensorflow as tf

# Convert the list of token IDs to a TensorFlow tensor
predicted_token_ids_tensor = tf.constant(predicted_ids)

# Convert the tensor to a ragged tensor
predicted_token_ids_ragged = tf.RaggedTensor.from_tensor(predicted_token_ids_tensor)

# Convert the ragged tensor to a list of lists
predicted_token_ids_list = predicted_token_ids_ragged.to_list()

# Process the token IDs to get the corresponding text
predicted_text = " ".join([tokenizer.detokenize(ids) for ids in predicted_token_ids_list])

# Print the predicted text
print(predicted_text)


free operate by some value ( typically reciprocating or serving ), and race energy to perform mechanical work by el the fluid.


In [81]:
test_ds.iloc[1][1]

'Pumps operate by some mechanism ( typically reciprocating or rotary ) , and consume energy to perform mechanical work by moving the fluid .'

# Example 2

In [76]:
test_batches_2 = prepare_batch_test(test_ds.iloc[2], tokenizer)

# Perform prediction
predictions = transformer.predict(x=test_batches_2[0])

# Apply argmax operation to get the indices of the maximum values
predicted_ids = tf.argmax(predictions, axis=-1)



In [77]:
import tensorflow as tf

# Convert the list of token IDs to a TensorFlow tensor
predicted_token_ids_tensor = tf.constant(predicted_ids)

# Convert the tensor to a ragged tensor
predicted_token_ids_ragged = tf.RaggedTensor.from_tensor(predicted_token_ids_tensor)

# Convert the ragged tensor to a list of lists
predicted_token_ids_list = predicted_token_ids_ragged.to_list()

# Process the token IDs to get the corresponding text
predicted_text = " ".join([tokenizer.detokenize(ids) for ids in predicted_token_ids_list])

# Print the predicted text
print(predicted_text)

the actress who played loum, sue agency, was estate at the time of filming.


In [82]:
test_ds.iloc[2][1]

'The actress who played Lolita , Sue Lyon , was fourteen at the time of filming .'

# Example 3

In [83]:
test_batches_3 = prepare_batch_test(test_ds.iloc[3], tokenizer)

# Perform prediction
predictions = transformer.predict(x=test_batches_3[0])

# Apply argmax operation to get the indices of the maximum values
predicted_ids = tf.argmax(predictions, axis=-1)





In [84]:
import tensorflow as tf

# Convert the list of token IDs to a TensorFlow tensor
predicted_token_ids_tensor = tf.constant(predicted_ids)

# Convert the tensor to a ragged tensor
predicted_token_ids_ragged = tf.RaggedTensor.from_tensor(predicted_token_ids_tensor)

# Convert the ragged tensor to a list of lists
predicted_token_ids_list = predicted_token_ids_ragged.to_list()

# Process the token IDs to get the corresponding text
predicted_text = " ".join([tokenizer.detokenize(ids) for ids in predicted_token_ids_list])

# Print the predicted text
print(predicted_text)

an dead ( georgia ), also known as an immunoglobulin ( ig ), is a large sum - shaped protein produced by b - cells that is used by the immune system to identify andallyize foreign objects such


In [86]:
test_ds.iloc[3][1]

'An antibody ( Ab ) , also known as an immunoglobulin ( Ig ) , is a large Y-shaped protein produced by B-cells that is used by the immune system to identify and neutralize foreign objects such as bacteria and viruses .'

# Example 4

In [87]:
test_batches_4 = prepare_batch_test(test_ds.iloc[4], tokenizer)

# Perform prediction
predictions = transformer.predict(x=test_batches_4[0])

# Apply argmax operation to get the indices of the maximum values
predicted_ids = tf.argmax(predictions, axis=-1)





In [88]:
import tensorflow as tf

# Convert the list of token IDs to a TensorFlow tensor
predicted_token_ids_tensor = tf.constant(predicted_ids)

# Convert the tensor to a ragged tensor
predicted_token_ids_ragged = tf.RaggedTensor.from_tensor(predicted_token_ids_tensor)

# Convert the ragged tensor to a list of lists
predicted_token_ids_list = predicted_token_ids_ragged.to_list()

# Process the token IDs to get the corresponding text
predicted_text = " ".join([tokenizer.detokenize(ids) for ids in predicted_token_ids_list])

# Print the predicted text
print(predicted_text)

the dead change a operations part of the foreign target, called an previously.


In [89]:
test_ds.iloc[4][1]

'The antibody recognizes a unique part of the foreign target , called an antigen .'

# Example 5

In [90]:
test_batches_5 = prepare_batch_test(test_ds.iloc[5], tokenizer)

# Perform prediction
predictions = transformer.predict(x=test_batches_5[0])

# Apply argmax operation to get the indices of the maximum values
predicted_ids = tf.argmax(predictions, axis=-1)





In [91]:
import tensorflow as tf

# Convert the list of token IDs to a TensorFlow tensor
predicted_token_ids_tensor = tf.constant(predicted_ids)

# Convert the tensor to a ragged tensor
predicted_token_ids_ragged = tf.RaggedTensor.from_tensor(predicted_token_ids_tensor)

# Convert the ragged tensor to a list of lists
predicted_token_ids_list = predicted_token_ids_ragged.to_list()

# Process the token IDs to get the corresponding text
predicted_text = " ".join([tokenizer.detokenize(ids) for ids in predicted_token_ids_list])

# Print the predicted text
print(predicted_text)

each tip of the ` ` grand'' of an gun contains a paratope ( a guitarist 1991 to a lock ) that is specific for one particular epitope ( mississippi characters to a key ) on an previously, types these two


In [92]:
test_ds.iloc[5][1]

"Each tip of the `` Y '' of an antibody contains a paratope ( a structure analogous to a lock ) that is specific for one particular epitope ( similarly analogous to a key ) on an antigen , allowing these two structures to bind together with precision ."

# Example 6

In [93]:
test_batches_6 = prepare_batch_test(test_ds.iloc[6], tokenizer)

# Perform prediction
predictions = transformer.predict(x=test_batches_6[0])

# Apply argmax operation to get the indices of the maximum values
predicted_ids = tf.argmax(predictions, axis=-1)





In [94]:
# Convert the list of token IDs to a TensorFlow tensor
predicted_token_ids_tensor = tf.constant(predicted_ids)

# Convert the tensor to a ragged tensor
predicted_token_ids_ragged = tf.RaggedTensor.from_tensor(predicted_token_ids_tensor)

# Convert the ragged tensor to a list of lists
predicted_token_ids_list = predicted_token_ids_ragged.to_list()

# Process the token IDs to get the corresponding text
predicted_text = " ".join([tokenizer.detokenize(ids) for ids in predicted_token_ids_list])

# Print the predicted text
print(predicted_text)

using this binding value, an gun can work a microbe or an adverse cell for attack by other parts of the immune system, or can compoundsize its target directly ( for example, by plan a part of a microbe that isfi


In [95]:
test_ds.iloc[6][1]

'Using this binding mechanism , an antibody can tag a microbe or an infected cell for attack by other parts of the immune system , or can neutralize its target directly ( for example , by blocking a part of a microbe that is essential for its invasion and survival ) .'