<a href="https://colab.research.google.com/github/arvishcdoshi/NN-Transformer/blob/main/Transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Embedding, Dense
import numpy as np

# Load and preprocess text
def load_data(file_path):
    """Return the full contents of the UTF-8 text file at ``file_path``.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The entire file as a single string.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return handle.read()

# Relative path of the plain-text training corpus; the file must be present in
# the current working directory (Colab session or local folder) before running.
file_path = "hp_1.txt"  # Ensure you have this file in your Colab or local directory
# Read the whole corpus and lower-case it so that word matching is case-insensitive.
text = load_data(file_path).lower()

# Tokenization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences

# Word-level tokenizer. `oov_token` is the Out-Of-Vocabulary placeholder: a word
# that was not seen when the vocabulary was built is replaced with this token,
# so unknown words are handled instead of being ignored.
# NOTE(review): the token is an empty string here; it looks like an
# angle-bracket placeholder (e.g. "<OOV>") may have been stripped from the
# notebook export -- confirm against the original notebook.
tokenizer = Tokenizer(oov_token='')

# Analyse the corpus and build the word index (word -> unique integer).
tokenizer.fit_on_texts([text])

# +1 because index 0 is usually reserved for padding and is not given to a word.
total_words = 1 + len(tokenizer.word_index)

# Each training sample uses 50 words of context.
seq_length = 50

# Encode the whole corpus as one flat list of integers using the word index.
tokens = tokenizer.texts_to_sequences([text])[0]

# Slide a window of (seq_length + 1) tokens over the corpus, one token at a time.
#   - first seq_length tokens of a window: the model input
#   - last token of a window: the label the model tries to predict
input_sequences = [
    tokens[end - seq_length:end + 1]
    for end in range(seq_length, len(tokens))
]

# Convert the windows to a 2-D array (left-padded to seq_length + 1 columns),
# then split every row into its inputs (X) and its label (y).
input_sequences = np.array(
    pad_sequences(input_sequences, maxlen=seq_length + 1, padding='pre')
)
X = input_sequences[:, :-1]
y = input_sequences[:, -1]

# One-hot encode the labels over the vocabulary. Other encodings are possible,
# e.g. pre-trained word2vec embeddings.
y = tf.keras.utils.to_categorical(y, num_classes=total_words)

# Transformer Model

Multi-Head Attention

In [5]:
from tensorflow.keras.layers import Layer, Embedding, Dense, LayerNormalization, Dropout

# We'll consider MultiHead attention like a layer.
class MultiHeadAttention(Layer):
  """Multi-head scaled dot-product attention implemented as a Keras layer.

  The layer projects query, key and value with three Dense layers (Wq, Wk, Wv),
  splits each projection into `num_heads` heads of size
  `embed_dim // num_heads`, runs scaled dot-product attention in every head,
  concatenates the heads again and applies a final Dense projection.

  Args:
    embed_dim: Total embedding dimension (e.g. 512 = 8 heads * 64, as in the
      "Attention Is All You Need" paper).
    num_heads: Number of attention heads (e.g. 8). Must be positive and must
      divide `embed_dim` evenly.
    **kwargs: Forwarded unchanged to the base `Layer` constructor
      (e.g. `name`).

  Raises:
    ValueError: If `num_heads` is not positive or does not divide `embed_dim`.
  """

  def __init__(self, embed_dim, num_heads, **kwargs):
    # Fail early: if embed_dim is not a multiple of num_heads, the reshapes in
    # split_heads()/call() would either raise an opaque shape error or silently
    # mix values across sequence positions.
    if num_heads <= 0 or embed_dim % num_heads != 0:
      raise ValueError(
          f"embed_dim ({embed_dim}) must be divisible by num_heads "
          f"({num_heads}), and num_heads must be positive"
      )
    super().__init__(**kwargs)
    self.embed_dim = embed_dim  # total embedding dimension, e.g. 512
    self.num_heads = num_heads  # e.g. 8
    self.projection_dim = embed_dim // num_heads  # per-head size, e.g. 64

    self.query_dense = Dense(embed_dim)  # Wq
    self.key_dense = Dense(embed_dim)  # Wk
    self.value_dense = Dense(embed_dim)  # Wv

    # Final projection applied after the heads are concatenated.
    self.combine_dense = Dense(embed_dim)

  def attention(self, query, key, value):
    """Return softmax(Q @ K^T / sqrt(projection_dim)) @ V."""
    scores = tf.matmul(query, key, transpose_b=True)  # Q * K transpose
    # Cast the scale to the dtype of the scores so the division also works
    # when the layer does not compute in float32.
    scale = tf.math.sqrt(tf.cast(self.projection_dim, scores.dtype))
    scores = scores / scale

    attention_probs = tf.nn.softmax(scores, axis=-1)  # normalise over the keys
    return tf.matmul(attention_probs, value)  # probabilities * V

  def split_heads(self, x, batch_size):
    """Split the last axis of `x` (query, key or value) into heads.

    The input is reshaped to (batch_size, seq_len, num_heads, projection_dim)
    and then transposed to (batch_size, num_heads, seq_len, projection_dim),
    e.g. per batch item: 8 heads of (4 words * 64 dimensions).
    """
    x = tf.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim))
    return tf.transpose(x, perm=[0, 2, 1, 3])

  def call(self, inputs):
    """Apply multi-head attention.

    Args:
      inputs: A sequence of three tensors `(query, key, value)`, each expected
        to have shape (batch_size, seq_len, embed_dim).

    Returns:
      The attention output after the heads are concatenated and passed through
      the final Dense projection; its last axis has size `embed_dim`.
    """
    query, key, value = inputs
    batch_size = tf.shape(query)[0]

    query = self.split_heads(self.query_dense(query), batch_size)
    key = self.split_heads(self.key_dense(key), batch_size)
    value = self.split_heads(self.value_dense(value), batch_size)

    attention = self.attention(query, key, value)
    # Shape here:  (batch_size, num_heads, seq_len, projection_dim)
    # Shape wanted: (batch_size, seq_len, num_heads, projection_dim)
    attention = tf.transpose(attention, perm=[0, 2, 1, 3])

    # For every position, merge all heads back into one embed_dim-sized vector.
    concat_attention = tf.reshape(attention, (batch_size, -1, self.embed_dim))

    # Mix the concatenated heads with the output projection.
    return self.combine_dense(concat_attention)

  def get_config(self):
    """Return the constructor arguments so the layer can be re-created."""
    config = super().get_config()
    config.update({
        "embed_dim": self.embed_dim,
        "num_heads": self.num_heads,
    })
    return config

#

Transformer

In [None]:
class TransformerBlock(Layer):

  def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
    """Create the sub-layers of one Transformer block.

    Args:
      embed_dim: Embedding dimension; passed to the attention layer and used
        as the output size of the feed-forward network.
      num_heads: Number of attention heads passed to the attention layer.
      ff_dim: Hidden dimension of the feed-forward network.
      rate: Dropout rate used by both dropout layers, as a fraction between
        0 and 1 (default 0.1, i.e. 10%).
      **kwargs: Forwarded unchanged to the base `Layer` constructor
        (e.g. `name`).
    """
    super().__init__(**kwargs)
    self.att = MultiHeadAttention(embed_dim, num_heads)
    # Position-wise feed-forward network: expand to ff_dim with ReLU, then
    # project back to embed_dim.
    self.ffn = tf.keras.Sequential([
        Dense(ff_dim, activation="relu"),
        Dense(embed_dim),
    ])
    # Layer normalisation: y = (x - mean) / sqrt(variance + epsilon).
    # epsilon ensures we never divide by zero; it is small enough not to affect
    # the result but large enough to prevent numerical instability.
    self.layernorm1 = LayerNormalization(epsilon=1e-6)
    self.layernorm2 = LayerNormalization(epsilon=1e-6)
    # Two dropout layers to reduce overfitting. `rate` is the fraction of
    # units dropped (0.1 means 10%), not a percentage value.
    self.dropout1 = Dropout(rate)
    self.dropout2 = Dropout(rate)


