<a href="https://colab.research.google.com/github/arko-14/attention/blob/main/model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
try:
    with open("/content/drive/MyDrive/wmt14_translate_de-en_train.csv", 'r', encoding='utf-8', errors='ignore') as file:
        text_sample = file.read(100000)
    print(f"Successfully loaded {len(text_sample)} characters")
except Exception as e:
    print(f"Error loading file: {e}")

Successfully loaded 100000 characters


In [4]:
words = text_sample.lower().split()[:100000]
print(f"Working with {len(words)} words")


word_freq = {}
for word in words:
    word = word.strip(".,!?:;\"'()[]{}")
    if word:
        word_freq[word] = word_freq.get(word, 0) + 1


top_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)[:100]
print("\nTop 10 words:")
for word, count in top_words[:10]:
    print(f"{word}: {count}")

Working with 14877 words

Top 10 words:
the: 562
in: 301
die: 297
of: 259
and: 253
der: 252
to: 245
und: 229
a: 164
is: 139


In [None]:

words = text_sample.lower().split()[:30000]
print(f"Working with {len(words)} words")

word_freq = {}
for word in words:
    word = word.strip(".,!?:;\"'()[]{}")
    if word:
        word_freq[word] = word_freq.get(word, 0) + 1


top_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)[:100]
print("\nTop 10 words:")
for word, count in top_words[:10]:
    print(f"{word}: {count}")

In [5]:

input_file = "/content/drive/MyDrive/wmt14_translate_de-en_train.csv"
output_file = "/content/drive/MyDrive/wmt14_30k_sample.csv"
word_limit = 30000
word_count = 0
collected_sentences = []

try:
    import pandas as pd
    try:
        df = pd.read_csv(input_file)
        if 'translation' in df.columns:
            text_column = 'translation'
        elif 'text' in df.columns:
            text_column = 'text'
        else:
            text_column = df.columns[0]


        for idx, row in df.iterrows():
            sentence = str(row[text_column])
            words_in_sentence = sentence.split()

            if word_count + len(words_in_sentence) <= word_limit:
                collected_sentences.append(sentence)
                word_count += len(words_in_sentence)
            else:
                words_needed = word_limit - word_count
                if words_needed > 0:
                    partial_sentence = ' '.join(words_in_sentence[:words_needed])
                    collected_sentences.append(partial_sentence)
                word_count = word_limit
                break

    except Exception as e:
        print(f"CSV reading failed: {e}")
        with open(input_file, 'r', encoding='utf-8', errors='ignore') as file:
            for line in file:
                words_in_line = line.split()
                if word_count + len(words_in_line) <= word_limit:
                    collected_sentences.append(line.strip())
                    word_count += len(words_in_line)
                else:
                    words_needed = word_limit - word_count
                    if words_needed > 0:
                        partial_line = ' '.join(words_in_line[:words_needed])
                        collected_sentences.append(partial_line)
                    word_count = word_limit
                    break


    with open(output_file, 'w', encoding='utf-8') as out_file:
        for sentence in collected_sentences:
            out_file.write(sentence + '\n')

    print(f"Successfully created a dataset with {word_count} words")
    print(f"Saved {len(collected_sentences)} sentences to {output_file}")

except Exception as e:
    print(f"Error in dataset creation: {e}")

CSV reading failed: Error tokenizing data. C error: Buffer overflow caught - possible malformed input file.

Successfully created a dataset with 30000 words
Saved 684 sentences to /content/drive/MyDrive/wmt14_30k_sample.csv


In [6]:
import numpy as np
from collections import Counter


with open("/content/drive/MyDrive/wmt14_30k_sample.csv", 'r', encoding='utf-8') as file:
    lines = file.readlines()

# Preprocess the text
sentences = [line.strip() for line in lines if line.strip()]
all_words = []
for sentence in sentences:
    words = sentence.lower().split()
    all_words.extend(words)

print(f"Loaded {len(sentences)} sentences with {len(all_words)} words")

# Build vocabulary
word_counts = Counter(all_words)
print(f"Unique words: {len(word_counts)}")

# Create vocabulary mapping
vocab = ["<PAD>", "<UNK>"] + [word for word, _ in word_counts.most_common(10000)]
word_to_idx = {word: idx for idx, word in enumerate(vocab)}

# Tokenize sentences
tokenized_sentences = []
for sentence in sentences:
    words = sentence.lower().split()
    tokens = [word_to_idx.get(word, 1) for word in words]
    tokenized_sentences.append(tokens)

# Print some statistics
max_len = max(len(tokens) for tokens in tokenized_sentences)
avg_len = sum(len(tokens) for tokens in tokenized_sentences) / len(tokenized_sentences)
print(f"Max sentence length: {max_len} tokens")
print(f"Average sentence length: {avg_len:.2f} tokens")

# Display a few tokenized sentences
print("\nSample tokenized sentences:")
for i, tokens in enumerate(tokenized_sentences[:3]):
    print(f"Sentence {i+1}: {tokens}")


import pickle
with open("/content/drive/MyDrive/tokenized_data_30k.pkl", 'wb') as f:
    pickle.dump({
        'tokenized_sentences': tokenized_sentences,
        'word_to_idx': word_to_idx,
        'idx_to_word': {idx: word for word, idx in word_to_idx.items()}
    }, f)

print("\nTokenized data saved to 'tokenized_data_30k.pkl'")

Loaded 684 sentences with 30000 words
Unique words: 10858
Max sentence length: 188 tokens
Average sentence length: 43.86 tokens

Sample tokenized sentences:
Sentence 1: [2994]
Sentence 2: [2995, 7, 1441, 2996, 437, 37, 3, 2997, 7, 65, 2998, 2999, 7, 29, 3000, 12, 530, 438, 1442, 3001, 2, 677, 3002, 5, 2, 1443, 11, 3003, 3004, 100, 3005, 1444, 8, 3006, 54, 2, 3007, 3008]
Sentence 3: [127, 3009, 4, 1445, 31, 274, 531, 9, 4, 439, 31, 1446, 4, 933, 532, 1447, 3010, 3011, 3012, 47, 26, 148, 533, 275, 1448, 166, 16, 1449, 3013, 123, 934, 3, 3014, 8, 534, 1450, 8, 2, 3015, 1451, 3016, 6, 8, 56, 2, 203, 15, 204, 52, 1452, 935, 49, 678, 15, 42, 1453, 187, 1454, 8, 92, 936, 310, 3, 2, 361, 3017]

Tokenized data saved to 'tokenized_data_30k.pkl'


In [8]:


import tensorflow as tf
import numpy as np


class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()
        self.d_model = d_model
        self.d_model = tf.cast(self.d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        step = tf.cast(step, tf.float32)
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)
        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

max_length = min(100, max(len(tokens) for tokens in tokenized_sentences))
vocab_size = len(word_to_idx)

# Pad sequences to the same length
def pad_sequences(sequences, max_len):
    return [seq + [0] * (max_len - len(seq)) if len(seq) < max_len else seq[:max_len]
            for seq in sequences]

padded_sequences = pad_sequences(tokenized_sentences, max_length)
padded_sequences = np.array(padded_sequences)

# Create input and target data for language modeling
input_sequences = padded_sequences[:, :-1]
target_sequences = padded_sequences[:, 1:]

# Convert to TensorFlow dataset
dataset = tf.data.Dataset.from_tensor_slices((input_sequences, target_sequences))
dataset = dataset.shuffle(len(tokenized_sentences))
dataset = dataset.batch(8)


num_layers = 2  # Reduced from 6
d_model = 64    # Reduced from 512
num_heads = 2   # Reduced from 8
dff = 256       # Reduced from 2048



In [31]:
import tensorflow as tf
import numpy as np

def scaled_dot_product_attention(query, key, value, mask=None):
    """Calculate the attention weights."""
    matmul_qk = tf.matmul(query, key, transpose_b=True)

    # Scale matmul_qk
    dk = tf.cast(tf.shape(key)[-1], tf.float32)
    scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

    # Add the mask to the scaled tensor
    if mask is not None:
        scaled_attention_logits += (mask * -1e9)

    # Softmax is normalized on the last axis (seq_len_k)
    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)
    output = tf.matmul(attention_weights, value)

    return output, attention_weights

class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model

        assert d_model % self.num_heads == 0

        self.depth = d_model // self.num_heads

        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)

        self.dense = tf.keras.layers.Dense(d_model)

    def split_heads(self, x, batch_size):
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, v, k, q, mask=None):
        batch_size = tf.shape(q)[0]

        q = self.wq(q)
        k = self.wk(k)
        v = self.wv(v)

        q = self.split_heads(q, batch_size)
        k = self.split_heads(k, batch_size)
        v = self.split_heads(v, batch_size)

        scaled_attention, attention_weights = scaled_dot_product_attention(
            q, k, v, mask)

        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])
        concat_attention = tf.reshape(scaled_attention,
                                    (batch_size, -1, self.d_model))

        output = self.dense(concat_attention)
        return output, attention_weights

def point_wise_feed_forward_network(d_model, dff):
    return tf.keras.Sequential([
        tf.keras.layers.Dense(dff, activation='relu'),
        tf.keras.layers.Dense(d_model)
    ])

class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(EncoderLayer, self).__init__()

        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = point_wise_feed_forward_network(d_model, dff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, x, training, mask=None):
        attn_output, _ = self.mha(x, x, x, mask)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(x + attn_output)

        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        out2 = self.layernorm2(out1 + ffn_output)

        return out2

class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(DecoderLayer, self).__init__()

        self.mha = MultiHeadAttention(d_model, num_heads) # This line was missing, causing the error
        #self.mha1 = MultiHeadAttention(d_model, num_heads)
        #self.mha2 = MultiHeadAttention(d_model, num_heads)

        self.ffn = point_wise_feed_forward_network(d_model, dff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)
        self.dropout3 = tf.keras.layers.Dropout(rate)

    def call(self, x, training, mask=None):
        attn_output, _ = self.mha(x, x, x, mask)  # Use self.mha here as well
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(x + attn_output)

        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        out2 = self.layernorm2(out1 + ffn_output)

        return out2 # Return out2, which is the final output of the DecoderLayer

class Transformer(tf.keras.Model):
    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size,
                 target_vocab_size, pe_input, pe_target, rate=0.1):
        super(Transformer, self).__init__()

        self.encoder = Encoder(num_layers, d_model, num_heads, dff,
                             input_vocab_size, pe_input, rate)

        self.decoder = Decoder(num_layers, d_model, num_heads, dff,
                             target_vocab_size, pe_target, rate)

        self.final_layer = tf.keras.layers.Dense(target_vocab_size)

    def call(self, inp, tar, training, enc_padding_mask,
             look_ahead_mask, dec_padding_mask): # Remove default values here
        enc_output = self.encoder(inp, training=training, mask=enc_padding_mask) # Pass 'training' as keyword argument

        dec_output, attention_weights = self.decoder(
            tar, enc_output, training=training, look_ahead_mask=look_ahead_mask, padding_mask=dec_padding_mask)

        final_output = self.final_layer(dec_output)

        return final_output, attention_weights

class Encoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size,
                 maximum_position_encoding, rate=0.1):
        super(Encoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
        self.pos_encoding = positional_encoding(maximum_position_encoding,
                                              self.d_model)

        self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate)
                          for _ in range(num_layers)]

        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, training, mask=None):
        seq_len = tf.shape(x)[1]

        # adding embedding and position encoding.
        x = self.embedding(x)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x, training=training)

        for i in range(self.num_layers):
            x = self.enc_layers[i](x, training=training, mask=mask)

        return x
class DecoderLayer(tf.keras.layers.Layer):
    def __init__(…):
        super().__init__()
        self.self_attn   = MultiHeadAttention(d_model, num_heads)
        self.cross_attn  = MultiHeadAttention(d_model, num_heads)
        # …
    def call(self, x, enc_output, training, look_ahead_mask=None, padding_mask=None):
        # 1) masked self-attn
        attn1, _ = self.self_attn(x, x, x, look_ahead_mask)
        out1 = self.layernorm1(x + self.dropout1(attn1, training=training))

        # 2) encoder-decoder cross-attn
        attn2, _ = self.cross_attn(enc_output, enc_output, out1, padding_mask)
        out2 = self.layernorm2(out1 + self.dropout2(attn2, training=training))

        # 3) FFN
        ffn_out = self.ffn(out2)
        out3 = self.layernorm3(out2 + self.dropout3(ffn_out, training=training))
        return out3

class Decoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size,
                 maximum_position_encoding, rate=0.1):
        super(Decoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
        self.pos_encoding = positional_encoding(maximum_position_encoding, d_model)

        self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate)
                          for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, enc_output, training,
             look_ahead_mask=None, padding_mask=None):
        seq_len = tf.shape(x)[1]
        attention_weights = {}

        x = self.embedding(x)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x, training=training)

        for i in range(self.num_layers):
           x = self.dec_layers[i](x, training=training, mask=look_ahead_mask)

            #attention_weights[f'decoder_layer{i+1}_block1'] = block1
            #attention_weights[f'decoder_layer{i+1}_block2'] = block2

        return x, attention_weights

def get_angles(pos, i, d_model):
    angle_rates = 1 / np.power(10000, (2 * (i//2)) / np.float32(d_model))
    return pos * angle_rates

def positional_encoding(position, d_model):
    angle_rads = get_angles(np.arange(position)[:, np.newaxis],
                          np.arange(d_model)[np.newaxis, :],
                          d_model)

    # apply sin to even indices in the array; 2i
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])

    # apply cos to odd indices in the array; 2i+1
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])

    pos_encoding = angle_rads[np.newaxis, ...]

    return tf.cast(pos_encoding, dtype=tf.float32)

def create_masks(inp, tar):
    # Encoder padding mask
    enc_padding_mask = create_padding_mask(inp)


    dec_padding_mask = create_padding_mask(inp)


    look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
    dec_target_padding_mask = create_padding_mask(tar)
    combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)

    return enc_padding_mask, combined_mask, dec_padding_mask

def create_padding_mask(seq):
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
    return seq[:, tf.newaxis, tf.newaxis, :]

def create_look_ahead_mask(size):
    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return mask

def create_model(vocab_size, num_layers=6, d_model=512, num_heads=8,
                dff=2048, maximum_position_encoding=10000, rate=0.1):  # Added rate
    """Create the Transformer model"""

    model = Transformer(
        num_layers=num_layers,
        d_model=d_model,
        num_heads=num_heads,
        dff=dff,
        input_vocab_size=vocab_size,
        target_vocab_size=vocab_size,
        pe_input=maximum_position_encoding,
        pe_target=maximum_position_encoding,
        rate=rate  # Pass rate to Transformer
    )

    return model

In [32]:
# Create the model
model = create_model(vocab_size, num_layers, d_model, num_heads, dff, max_length)

# Define loss and optimizer
learning_rate = CustomSchedule(d_model)
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    return tf.reduce_sum(loss_)/tf.reduce_sum(mask)

# Define training metrics
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')

# Define the training step
@tf.function
def train_step(inp, tar):
    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, inp)

    with tf.GradientTape() as tape:
        predictions, _ = model(inp, tar, training=True, enc_padding_mask=enc_padding_mask,
                               look_ahead_mask=combined_mask, dec_padding_mask=dec_padding_mask)
        loss = loss_function(tar, predictions)

    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    train_loss(loss)
    train_accuracy(tar, predictions)

EPOCHS = 5
for epoch in range(EPOCHS):
    train_loss.reset_state()
    train_accuracy.reset_state()

    for (batch, (inp, tar)) in enumerate(dataset):
        train_step(inp, tar)

        if batch % 10 == 0:
            print(f'Epoch {epoch + 1} Batch {batch} Loss {train_loss.result():.4f} Accuracy {train_accuracy.result():.4f}')

    print(f'Epoch {epoch + 1} Loss {train_loss.result():.4f} Accuracy {train_accuracy.result():.4f}')



Epoch 1 Batch 0 Loss 9.2261 Accuracy 0.0000
Epoch 1 Batch 10 Loss 9.2197 Accuracy 0.0001
Epoch 1 Batch 20 Loss 9.2172 Accuracy 0.0001
Epoch 1 Batch 30 Loss 9.2166 Accuracy 0.0001
Epoch 1 Batch 40 Loss 9.2147 Accuracy 0.0001
Epoch 1 Batch 50 Loss 9.2113 Accuracy 0.0001
Epoch 1 Batch 60 Loss 9.2083 Accuracy 0.0002
Epoch 1 Batch 70 Loss 9.2047 Accuracy 0.0003
Epoch 1 Batch 80 Loss 9.2016 Accuracy 0.0005
Epoch 1 Loss 9.2001 Accuracy 0.0006
Epoch 2 Batch 0 Loss 9.1424 Accuracy 0.0114
Epoch 2 Batch 10 Loss 9.1371 Accuracy 0.0061
Epoch 2 Batch 20 Loss 9.1316 Accuracy 0.0063
Epoch 2 Batch 30 Loss 9.1239 Accuracy 0.0071
Epoch 2 Batch 40 Loss 9.1161 Accuracy 0.0072
Epoch 2 Batch 50 Loss 9.1076 Accuracy 0.0076
Epoch 2 Batch 60 Loss 9.0980 Accuracy 0.0082
Epoch 2 Batch 70 Loss 9.0876 Accuracy 0.0098
Epoch 2 Batch 80 Loss 9.0781 Accuracy 0.0115
Epoch 2 Loss 9.0734 Accuracy 0.0119
Epoch 3 Batch 0 Loss 8.9543 Accuracy 0.0278
Epoch 3 Batch 10 Loss 8.9542 Accuracy 0.0217
Epoch 3 Batch 20 Loss 8.9413 Ac

In [33]:

INITIAL_EPOCH = 5  # Start from epoch 5
EPOCHS = 10  # Train until epoch 15

for epoch in range(INITIAL_EPOCH, EPOCHS +1):
    train_loss.reset_state()
    train_accuracy.reset_state()

    for (batch, (inp, tar)) in enumerate(dataset):
        train_step(inp, tar)

        if batch % 10 == 0:
            print(f'Epoch {epoch + 1} Batch {batch} Loss {train_loss.result():.4f} Accuracy {train_accuracy.result():.4f}')

    print(f'Epoch {epoch + 1} Loss {train_loss.result():.4f} Accuracy {train_accuracy.result():.4f}')


Epoch 6 Batch 0 Loss 7.8929 Accuracy 0.0316
Epoch 6 Batch 10 Loss 7.7945 Accuracy 0.0482
Epoch 6 Batch 20 Loss 7.7959 Accuracy 0.0450
Epoch 6 Batch 30 Loss 7.7999 Accuracy 0.0421
Epoch 6 Batch 40 Loss 7.7855 Accuracy 0.0431
Epoch 6 Batch 50 Loss 7.7841 Accuracy 0.0455
Epoch 6 Batch 60 Loss 7.7551 Accuracy 0.0467
Epoch 6 Batch 70 Loss 7.7356 Accuracy 0.0468
Epoch 6 Batch 80 Loss 7.7258 Accuracy 0.0466
Epoch 6 Loss 7.7089 Accuracy 0.0472
Epoch 7 Batch 0 Loss 7.5351 Accuracy 0.0265
Epoch 7 Batch 10 Loss 7.4764 Accuracy 0.0354
Epoch 7 Batch 20 Loss 7.4177 Accuracy 0.0346
Epoch 7 Batch 30 Loss 7.3872 Accuracy 0.0340
Epoch 7 Batch 40 Loss 7.3495 Accuracy 0.0327
Epoch 7 Batch 50 Loss 7.3122 Accuracy 0.0326
Epoch 7 Batch 60 Loss 7.2809 Accuracy 0.0324
Epoch 7 Batch 70 Loss 7.2436 Accuracy 0.0317
Epoch 7 Batch 80 Loss 7.2188 Accuracy 0.0319
Epoch 7 Loss 7.2068 Accuracy 0.0317
Epoch 8 Batch 0 Loss 7.0999 Accuracy 0.0366
Epoch 8 Batch 10 Loss 6.8857 Accuracy 0.0351
Epoch 8 Batch 20 Loss 6.9095 Ac

In [44]:
INITIAL_EPOCH = 11  # Start from epoch 11
EPOCHS = 19  # Train until epoch 20

for epoch in range(INITIAL_EPOCH, EPOCHS +1):
    train_loss.reset_state()
    train_accuracy.reset_state()

    for (batch, (inp, tar)) in enumerate(dataset):
        train_step(inp, tar)

        if batch % 10 == 0:
            print(f'Epoch {epoch + 1} Batch {batch} Loss {train_loss.result():.4f} Accuracy {train_accuracy.result():.4f}')

    print(f'Epoch {epoch + 1} Loss {train_loss.result():.4f} Accuracy {train_accuracy.result():.4f}')


Epoch 12 Batch 0 Loss 5.1073 Accuracy 0.1894
Epoch 12 Batch 10 Loss 4.9211 Accuracy 0.1494
Epoch 12 Batch 20 Loss 4.8270 Accuracy 0.1589
Epoch 12 Batch 30 Loss 4.7913 Accuracy 0.1638
Epoch 12 Batch 40 Loss 4.7983 Accuracy 0.1628
Epoch 12 Batch 50 Loss 4.7528 Accuracy 0.1662
Epoch 12 Batch 60 Loss 4.7393 Accuracy 0.1702
Epoch 12 Batch 70 Loss 4.7395 Accuracy 0.1699
Epoch 12 Batch 80 Loss 4.7406 Accuracy 0.1702
Epoch 12 Loss 4.7207 Accuracy 0.1709
Epoch 13 Batch 0 Loss 4.5191 Accuracy 0.1755
Epoch 13 Batch 10 Loss 4.4222 Accuracy 0.1876
Epoch 13 Batch 20 Loss 4.4041 Accuracy 0.1846
Epoch 13 Batch 30 Loss 4.3746 Accuracy 0.1922
Epoch 13 Batch 40 Loss 4.4131 Accuracy 0.1887
Epoch 13 Batch 50 Loss 4.3604 Accuracy 0.1888
Epoch 13 Batch 60 Loss 4.3461 Accuracy 0.1890
Epoch 13 Batch 70 Loss 4.3386 Accuracy 0.1877
Epoch 13 Batch 80 Loss 4.3293 Accuracy 0.1876
Epoch 13 Loss 4.3466 Accuracy 0.1872
Epoch 14 Batch 0 Loss 4.1985 Accuracy 0.2298
Epoch 14 Batch 10 Loss 4.1677 Accuracy 0.2017
Epoch 14 

In [50]:
INITIAL_EPOCH = 20  # Start from epoch 5
EPOCHS = 40  # Train until epoch 15

for epoch in range(INITIAL_EPOCH, EPOCHS +1):
    train_loss.reset_state()
    train_accuracy.reset_state()

    for (batch, (inp, tar)) in enumerate(dataset):
        train_step(inp, tar)

        if batch % 10 == 0:
            print(f'Epoch {epoch + 1} Batch {batch} Loss {train_loss.result():.4f} Accuracy {train_accuracy.result():.4f}')

    print(f'Epoch {epoch + 1} Loss {train_loss.result():.4f} Accuracy {train_accuracy.result():.4f}')

Epoch 21 Batch 0 Loss 1.6351 Accuracy 0.5126
Epoch 21 Batch 10 Loss 1.8771 Accuracy 0.3527
Epoch 21 Batch 20 Loss 1.9104 Accuracy 0.3715
Epoch 21 Batch 30 Loss 1.8778 Accuracy 0.3721
Epoch 21 Batch 40 Loss 1.8661 Accuracy 0.3712
Epoch 21 Batch 50 Loss 1.8645 Accuracy 0.3635
Epoch 21 Batch 60 Loss 1.8742 Accuracy 0.3579
Epoch 21 Batch 70 Loss 1.8858 Accuracy 0.3565
Epoch 21 Batch 80 Loss 1.8977 Accuracy 0.3560
Epoch 21 Loss 1.8864 Accuracy 0.3559
Epoch 22 Batch 0 Loss 1.5140 Accuracy 0.3775
Epoch 22 Batch 10 Loss 1.6284 Accuracy 0.4292
Epoch 22 Batch 20 Loss 1.6730 Accuracy 0.3888
Epoch 22 Batch 30 Loss 1.6527 Accuracy 0.3857
Epoch 22 Batch 40 Loss 1.6599 Accuracy 0.3803
Epoch 22 Batch 50 Loss 1.6525 Accuracy 0.3847
Epoch 22 Batch 60 Loss 1.6375 Accuracy 0.3822
Epoch 22 Batch 70 Loss 1.6347 Accuracy 0.3786
Epoch 22 Batch 80 Loss 1.6230 Accuracy 0.3848
Epoch 22 Loss 1.6296 Accuracy 0.3832
Epoch 23 Batch 0 Loss 1.5544 Accuracy 0.3939
Epoch 23 Batch 10 Loss 1.3502 Accuracy 0.3811
Epoch 23 

In [54]:
INITIAL_EPOCH = 40  # Start from epoch 40
EPOCHS = 60  # Train until epoch 61

for epoch in range(INITIAL_EPOCH, EPOCHS +1):
    train_loss.reset_state()
    train_accuracy.reset_state()

    for (batch, (inp, tar)) in enumerate(dataset):
        train_step(inp, tar)

        if batch % 10 == 0:
            print(f'Epoch {epoch + 1} Batch {batch} Loss {train_loss.result():.4f} Accuracy {train_accuracy.result():.4f}')

    print(f'Epoch {epoch + 1} Loss {train_loss.result():.4f} Accuracy {train_accuracy.result():.4f}')

Epoch 41 Batch 0 Loss 0.0001 Accuracy 0.4091
Epoch 41 Batch 10 Loss 0.0001 Accuracy 0.3838
Epoch 41 Batch 20 Loss 0.0001 Accuracy 0.3783
Epoch 41 Batch 30 Loss 0.0001 Accuracy 0.4022
Epoch 41 Batch 40 Loss 0.0001 Accuracy 0.4155
Epoch 41 Batch 50 Loss 0.0001 Accuracy 0.4170
Epoch 41 Batch 60 Loss 0.0001 Accuracy 0.4181
Epoch 41 Batch 70 Loss 0.0001 Accuracy 0.4228
Epoch 41 Batch 80 Loss 0.0001 Accuracy 0.4245
Epoch 41 Loss 0.0001 Accuracy 0.4262
Epoch 42 Batch 0 Loss 0.0001 Accuracy 0.4558
Epoch 42 Batch 10 Loss 0.0001 Accuracy 0.4622
Epoch 42 Batch 20 Loss 0.0001 Accuracy 0.4293
Epoch 42 Batch 30 Loss 0.0001 Accuracy 0.4227
Epoch 42 Batch 40 Loss 0.0001 Accuracy 0.4275
Epoch 42 Batch 50 Loss 0.0001 Accuracy 0.4286
Epoch 42 Batch 60 Loss 0.0001 Accuracy 0.4292
Epoch 42 Batch 70 Loss 0.0001 Accuracy 0.4323
Epoch 42 Batch 80 Loss 0.0001 Accuracy 0.4277
Epoch 42 Loss 0.0001 Accuracy 0.4262
Epoch 43 Batch 0 Loss 0.0000 Accuracy 0.3611
Epoch 43 Batch 10 Loss 0.0000 Accuracy 0.4100
Epoch 43 

In [60]:
# prompt: code to save the model

model.save('my_model.keras') # Or 'my_model.h5'

In [47]:
# Assuming 'word_to_idx' is your target vocabulary mapping:
idx_to_word_tgt = {idx: word for word, idx in word_to_idx.items()}

def decode_tokens(tokens):
    words = [idx_to_word_tgt.get(token, "<UNK>") for token in tokens]
    return " ".join(words)

# Get the actual token corresponding to 500
print(decode_tokens([500]))

network
