## **Imports and Data Setup**

In [None]:
import tensorflow as tf         # For building and training deep learning models
import numpy as np
from collections import Counter # Helps count word frequencies to build vocabulary.
import random                   # Used to decide whether to apply teacher forcing during training.

In [None]:
# Small parallel corpus of (English, Hindi) sentence pairs used for training.
# Each entry is a 2-tuple: index 0 is the English source sentence, index 1 is
# the Hindi target sentence.
# NOTE(review): "You are sad" is capitalised while "you are late" is not. The
# sentences are later tokenised with a plain str.split() and no lower-casing,
# so "You" and "you" end up as two different vocabulary entries — confirm this
# is intended.
data = [ ("i am happy","मैं खुश हूँ"),
         ("You are sad","आप दुखी हैं"),
         ("she is tired", "वह थक गया है"),
         ("we are hungry","हम भूखें है"),
         ("they are busy","वे व्यस्त हैं"),
         ("i am cold","मुझे ठंड लग रही है"),
         ("you are late","तुम देरी से आए हो"),
         ("she is happy", "वह खुश है"),
         ("we are ready","हम तैयार हैं")]

In [None]:
data[0][0]

'i am happy'

In [None]:
data[0][1]

'मैं खुश हूँ'

## **Vocabulary Builder**

- Converts list of sentences into a vocabulary dictionary.

- Adds special tokens:
  - `<PAD>`: for padding sequences to equal length.
  - `<SOS>`: start of sentence.
  - `<EOS>`: end of sentence.

In [None]:
# Build a token -> index vocabulary from whitespace-tokenised sentences
def build_vocab(sentences, lang):
  """Map every distinct whitespace-separated token in `sentences` to an integer id.

  Ids 0, 1 and 2 are reserved for the special tokens <PAD>, <SOS> and <EOS>.
  All other tokens receive consecutive ids starting at 3, in order of first
  appearance.

  sentences : iterable of strings
  lang      : accepted but not used by this function
  returns   : dict mapping token string -> integer id
  """
  counts = Counter()
  for text in sentences:
    for word in text.split():
      counts[word] += 1  # frequencies are collected, though only the keys are used below

  vocab = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2}
  next_id = 3  # first id after the three special tokens
  for word in counts:
    vocab[word] = next_id
    next_id += 1
  return vocab

## **Build English and Hindi Vocabularies**
- Splits the parallel data into separate English and Hindi sentences and builds their respective vocabularies.

In [None]:
# Split the parallel corpus into its source (English) and target (Hindi) sides
eng_sents = [english for english, _ in data]
hin_sent = [hindi for _, hindi in data]

# One vocabulary per language
eng_vocab = build_vocab(eng_sents, "eng")
hin_vocab = build_vocab(hin_sent, "hin")

# Sizes include the three special tokens
print(f"English Vocabulary size : {len(eng_vocab)}")
print(f"Hindi Vocabulary size : {len(hin_vocab)}")

English Vocabulary size : 20
Hindi Vocabulary size : 27


## **Convert Sentences to Indices**
- Converts sentence into list of token IDs.
- Surrounds sentence with `<SOS>` and `<EOS>`.
- The EOS token signals to the encoder when it has processed the entire input sequence, and it allows the decoder to know when to stop generating output. The SOS token provides a starting point for the decoder, initiating the output generation process.

In [None]:
# Encode one sentence as <SOS> + token ids + <EOS>
def sentence_to_indices(sent, vocab):
  """Convert a whitespace-tokenised sentence into a list of vocabulary ids.

  Each token is looked up in `vocab`. A token that is missing maps to the id
  of "<UNK>" when the vocabulary has one, otherwise to 0. The result is
  wrapped with the ids of "<SOS>" and "<EOS>".

  sent    : sentence string
  vocab   : dict mapping token -> id; must contain "<SOS>" and "<EOS>"
  returns : list of integer ids
  """
  fallback = vocab.get("<UNK>", 0)  # id used for out-of-vocabulary tokens

  encoded = [vocab['<SOS>']]
  for word in sent.split():
    encoded.append(vocab.get(word, fallback))
  encoded.append(vocab['<EOS>'])
  return encoded

## **Padding Sequences**
- Converts all English (source) and Hindi (target) sentences into padded tensors

In [None]:
# Encode every sentence pair and pad each side to a common length
def prepare_data(data, eng_vocab, hin_vocab):
  """Turn (English, Hindi) sentence pairs into two padded id arrays.

  Every sentence is encoded with sentence_to_indices (so it carries <SOS> and
  <EOS>), then each side is padded at the end with that language's <PAD> id.

  data      : sequence of (english_sentence, hindi_sentence) pairs
  eng_vocab : token -> id mapping for the source language
  hin_vocab : token -> id mapping for the target language
  returns   : (src_padded, tgt_padded)
  """
  pad = tf.keras.preprocessing.sequence.pad_sequences

  src_ids, tgt_ids = [], []
  for english, hindi in data:
    src_ids.append(sentence_to_indices(english, eng_vocab))
    tgt_ids.append(sentence_to_indices(hindi, hin_vocab))

  # padding='post' appends the <PAD> id after the real tokens
  src_padded = pad(src_ids, padding='post', value=eng_vocab['<PAD>'])
  tgt_padded = pad(tgt_ids, padding='post', value=hin_vocab['<PAD>'])
  return src_padded, tgt_padded

# Prepare padded data: encode the whole corpus once; these two arrays are the
# full training set used by the training cell further down.
src_data, tgt_data = prepare_data(data, eng_vocab, hin_vocab)

# Shapes are (number of sentence pairs, padded sequence length)
print(f"Source data shape : {src_data.shape}")
print(f"Target data shape : {tgt_data.shape}")

Source data shape : (9, 5)
Target data shape : (9, 7)


## **ENCODER: Converts input sentence to context vector**
- Encodes input sentence into a context vector (hidden & cell state).

**Encoder**

* Reads the **input sequence** word by word.
* Produces a **context vector** (i.e., final hidden + cell state).
* Think of it like compressing all the meaning of the input into a fixed-size vector.

In [None]:
class Encoder(tf.keras.Model):
    """LSTM encoder that summarises a source sentence as a context vector.

    The source token ids are embedded and run through a single LSTM layer.
    Only the final hidden and cell states are returned; together they form the
    context vector used to initialise the decoder.

    input_size     : size of the source vocabulary (number of embedding rows)
    embedding_size : dimensionality of each token embedding
    hidden_size    : number of LSTM units (size of the hidden and cell states)
    """

    def __init__(self, input_size, embedding_size, hidden_size):
        super().__init__()
        # Embedding layer for source (input) tokens
        self.embedding = tf.keras.layers.Embedding(input_size, embedding_size)

        # return_state=True yields the final hidden and cell state.
        # The per-time-step outputs are never used by this encoder, so
        # return_sequences is left at its default (False) rather than
        # building a full output sequence only to discard it.
        self.lstm = tf.keras.layers.LSTM(hidden_size, return_state=True)

    def call(self, x):
        """Encode a batch of source sentences.

        x       : token ids of the source sentences, [batch_size, seq_len]
        returns : (hidden, cell) — the LSTM's final hidden and cell states
        """
        # [batch_size, seq_len] -> [batch_size, seq_len, embedding_size]
        embedded = self.embedding(x)

        # The first return value is the LSTM output, which is not needed here:
        # the decoder is initialised from the states alone.
        _, hidden, cell = self.lstm(embedded)

        return hidden, cell

## **DECODER: Generates target sentence from context vector**
- Takes in the previous token and generates the next token probability distribution.

**Decoder**

* Takes the **context vector** and generates the **output sequence**, one word at a time.
* It uses **previous outputs** to predict the **next word**.

In [None]:
class Decoder(tf.keras.Model):
    """LSTM decoder that scores the next target token given the previous one.

    output_size    : size of the target vocabulary
    embedding_size : dimensionality of each token embedding
    hidden_size    : number of LSTM units; the states passed to call() are
                     used as the LSTM's initial state
    """

    def __init__(self, output_size, embedding_size, hidden_size):
        super().__init__()

        # Target token ids -> dense vectors of size embedding_size
        self.embedding = tf.keras.layers.Embedding(output_size, embedding_size)

        # return_sequences=True keeps the time axis on the output;
        # return_state=True also hands back the updated hidden/cell state so
        # the caller can feed it into the next decoding step.
        self.lstm = tf.keras.layers.LSTM(
            hidden_size,
            return_sequences=True,
            return_state=True,
        )

        # Projects each LSTM output onto the target vocabulary (raw logits)
        self.fc = tf.keras.layers.Dense(output_size)

    def call(self, x, hidden, cell):
        """Run the decoder on the given input token(s).

        x       : decoder input token ids, [batch_size, 1] when decoding step by step
        hidden  : hidden state from the previous step (or from the encoder)
        cell    : cell state from the previous step (or from the encoder)
        returns : (logits, hidden, cell) — vocabulary scores for each input
                  position plus the updated LSTM states
        """
        token_vectors = self.embedding(x)

        # Continue from the supplied state rather than from zeros
        step_out, next_hidden, next_cell = self.lstm(
            token_vectors, initial_state=[hidden, cell]
        )

        logits = self.fc(step_out)
        return logits, next_hidden, next_cell

**Summary till now**

| Component                 | Purpose                                                                                   |
| ------------------------- | ----------------------------------------------------------------------------------------- |
| **Encoder**               | Converts the entire source sentence into a context vector (final hidden and cell states). |
| **Decoder**               | Generates the output sentence one token at a time, using the context from the encoder.    |
| **Embedding layers**      | Convert word indices to dense vector representations.                                     |
| **LSTM**                  | Captures temporal dependencies and remembers long-term patterns.                          |
| **Dense layer (Decoder)** | Maps LSTM output to vocabulary distribution for next-token prediction.                    |


---
## **SEQ2SEQ WRAPPER: Connects Encoder and Decoder**
- Implements auto-regressive decoding with optional teacher forcing.
- During inference, it feeds predicted token back to decoder.

🔍 What is a **Seq2Seq (Sequence-to-Sequence) Model**?

---

A **Seq2Seq model** is a type of neural network architecture **designed to transform one sequence into another**. It's widely used in tasks where **input and output are both sequences**, but not necessarily of the same length.

---

🔄 **Basic Architecture: Encoder–Decoder**

```text
INPUT SEQUENCE              OUTPUT SEQUENCE
-----------------          -------------------
"I am happy"    ───▶      ["<SOS>", "मैं", "खुश", "हूँ", "<EOS>"]
```

---

🧩 **Detailed Workflow**

```text
[“i”, “am”, “happy”]
       |
     Encoder
       ↓
[context vector (hidden + cell)]
       ↓
     Decoder
       |
["<SOS>" → "मैं" → "खुश" → "हूँ" → "<EOS>"]
```

---

🧠 **How It Works Under the Hood**

1. **Input Processing:**

   * Input is tokenized and converted to numbers.
   * Padded to fixed length (e.g., `[1, 3, 4, 2, 0]`).

2. **Encoder (LSTM/RNN):**

   * Reads input and produces `hidden_state` and `cell_state`.

3. **Decoder (LSTM):**

   * Uses `<SOS>` as first input.
   * Predicts one word at a time using:

     * Previous output word
     * Previous hidden/cell state
     * Context vector from encoder (passed in as the decoder's initial hidden/cell state)

4. **Training:**

   * Uses **Teacher Forcing** to feed the correct previous word during training.

---
🔥 Enhancements Over Basic Seq2Seq

| Technique                 | Purpose                                                                              |
| ------------------------- | ------------------------------------------------------------------------------------ |
| **Attention**             | Helps decoder focus on relevant parts of input (solves bottleneck of context vector) |
| **Beam Search**           | Generates better translations by considering multiple predictions                    |
| **Bidirectional Encoder** | Allows encoder to look at future as well as past                                     |
| **Transformer**           | Replaces RNNs/LSTMs with self-attention (used in BERT, GPT, etc.)                    |

---

✅ Summary

| Term          | Meaning                                                  |
| ------------- | -------------------------------------------------------- |
| **Seq2Seq**   | Neural architecture for sequence input → sequence output |
| **Encoder**   | Reads input and summarizes as context vector             |
| **Decoder**   | Generates output sequence using context                  |
| **Use cases** | Translation, Chatbots, Summarization, etc.               |

---


In [None]:
class Seq2Seq(tf.keras.Model):
    """Encoder–decoder wrapper with step-by-step decoding and optional teacher forcing.

    encoder : model whose call(src) returns (hidden, cell)
    decoder : model whose call(token, hidden, cell) returns
              (logits, hidden, cell). Its output projection must be exposed as
              `decoder.fc` (a Dense layer); its `units` gives the target
              vocabulary size for the empty-output fallback.
    """

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def call(self, inputs, training=False, teacher_forcing_ratio=0.5):
        """Forward pass: encode `src`, then decode one target position at a time.

        inputs                : pair (src, tgt)
                                src – source token ids, [batch_size, src_len]
                                tgt – target token ids, [batch_size, tgt_len],
                                      whose first column is the <SOS> token
        training              : when True, teacher forcing may be applied
        teacher_forcing_ratio : probability, drawn once per time step for the
                                whole batch, of feeding the ground-truth token
                                instead of the model's own prediction
        returns               : logits of shape [batch_size, tgt_len - 1, vocab_size];
                                if tgt has no position after <SOS>, a zero
                                tensor of shape [batch_size, 1, vocab_size]
        raises                : ValueError if the target length is not
                                statically known
        """
        src, tgt = inputs  # Unpack input pair

        # The decoding loop is a plain Python loop, so it needs a Python int.
        # tf.shape(tgt)[1] is a tensor; during symbolic tracing it cannot be
        # converted to an int ("'SymbolicTensor' object cannot be interpreted
        # as an integer"). The static shape is used instead.
        tgt_len = tgt.shape[1]
        if tgt_len is None:
            raise ValueError(
                "Seq2Seq needs a statically known target length; "
                "got a target tensor whose second dimension is None."
            )
        tgt_len = int(tgt_len)

        hidden, cell = self.encoder(src)  # Encode the source sentence

        outputs = []
        input_token = tgt[:, 0:1]  # Start with <SOS>

        # Position 0 is <SOS>, so predictions are produced for positions 1..tgt_len-1
        for t in range(1, tgt_len):
            output, hidden, cell = self.decoder(input_token, hidden, cell)
            outputs.append(output)  # [batch_size, 1, vocab_size]

            # Teacher forcing: with the given probability feed the ground-truth
            # token for position t; otherwise feed back the model's own best guess.
            use_ground_truth = training and random.random() < teacher_forcing_ratio
            if use_ground_truth:
                input_token = tgt[:, t:t+1]
            else:
                input_token = tf.argmax(output, axis=-1, output_type=tf.int32)

        if not outputs:
            # Nothing to decode: keep the output rank by returning zeros.
            # The vocabulary size comes from the decoder's own output layer
            # instead of a module-level vocabulary.
            batch_size = tf.shape(src)[0]
            return tf.zeros((batch_size, 1, self.decoder.fc.units))

        # Join the per-step [batch_size, 1, vocab_size] outputs along the time axis
        return tf.concat(outputs, axis=1)


## **Training Function**
- Standard training loop with masked loss to ignore `<PAD>` tokens.

In [None]:
def train(model, src_data, tgt_data, epochs=100):
    """Fit the Seq2Seq model on the full dataset, one gradient step per epoch.

    model    : Seq2Seq instance, called as model([src, tgt], training=True, ...)
    src_data : padded source sequences (token ids)
    tgt_data : padded target sequences (token ids), starting with <SOS>
    epochs   : number of full-batch update steps

    Prints the loss every 10 epochs; returns nothing.
    """
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)

    # Sparse: labels are integer ids, not one-hot vectors.
    # from_logits=True: the model output is treated as raw (un-softmaxed) scores.
    # reduction='none': keep one loss value per token so padding can be masked by hand.
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

    def train_step(src, tgt):
        """Run one forward/backward pass on a batch and return its mean per-token loss."""
        with tf.GradientTape() as tape:
            # teacher_forcing_ratio=1.0: always feed the ground-truth previous token
            logits = model([src, tgt], training=True, teacher_forcing_ratio=1.0)

            # The label at each step is the token after the decoder input, so drop <SOS>
            labels = tgt[:, 1:]

            # Trim the predictions to the label length so both line up
            logits = logits[:, :tf.shape(labels)[1], :]

            # 1.0 for real tokens, 0.0 for <PAD>, so padding adds nothing to the loss
            keep = tf.cast(labels != hin_vocab["<PAD>"], tf.float32)
            per_token = loss_fn(labels, logits) * keep

            # Average over real tokens only; the epsilon guards against an all-padding batch
            mean_loss = tf.reduce_sum(per_token) / (tf.reduce_sum(keep) + 1e-8)

        variables = model.trainable_variables
        grads = tape.gradient(mean_loss, variables)
        optimizer.apply_gradients(zip(grads, variables))
        return mean_loss

    print("Starting Training....")
    for epoch in range(epochs):
        # The whole dataset is used as a single batch
        loss = train_step(src_data, tgt_data)
        if epoch % 10 == 0:
            print(f"Epoch {epoch}, Loss : {loss.numpy():.4f}")

    print("Training completed")

## **Model Initialization**

In [None]:
# Hyperparameters: vocabulary sizes come from the data, layer widths are fixed
input_size, output_size = len(eng_vocab), len(hin_vocab)
embed_size, hidden_size = 50, 100

print("Initializing model with :")
print(f"Input vocab size : {input_size}")
print(f"Output vocab size : {output_size}")
print(f"Embedding size : {embed_size}")
print(f"Hidden size (lstm) : {hidden_size}")

# Assemble the encoder, the decoder and the wrapper that chains them
encoder = Encoder(input_size, embed_size, hidden_size)
decoder = Decoder(output_size, embed_size, hidden_size)
model = Seq2Seq(encoder, decoder)

# One throw-away forward pass on all-zero token ids to build the model
# before training starts
dummy_src = tf.zeros((1,5), dtype=tf.int32)
dummy_tgt = tf.zeros((1,5), dtype=tf.int32)
_ = model([dummy_src, dummy_tgt], training=False)

# Fit on the full padded corpus
train(model, src_data, tgt_data, epochs=50)

Initializing model with :
Input vocab size : 20
Output vocab size : 27
Embedding size : 50
Hidden size (lstm) : 100


1. The `call()` method of your layer may be crashing. Try to `__call__()` the layer eagerly on some test input first to see if it works. E.g. `x = np.random.random((3, 4)); y = layer(x)`
2. If the `call()` method is correct, then you may need to implement the `def build(self, input_shape)` method on your layer. It should create all variables used by the layer (e.g. by calling `layer.build()` on all its children layers).
Exception encountered: '''SymbolicTensor' object cannot be interpreted as an integer''


Starting Training....
Epoch 0, Loss : 3.2954
Epoch 10, Loss : 2.2147
Epoch 20, Loss : 0.8862
Epoch 30, Loss : 0.1943
Epoch 40, Loss : 0.0270
Training completed


## **Translation Function**
- Takes an English sentence, converts it into tokens, encodes it, and decodes into Hindi using the trained model.

In [None]:
def translate(model, sentence, eng_vocab, hin_vocab, max_len=15):
    """Greedily translate an English sentence into Hindi with a trained Seq2Seq model.

    model     : trained model exposing `encoder` and `decoder`
    sentence  : English input string (tokenised by splitting on whitespace)
    eng_vocab : English token -> id mapping; must contain <SOS> and <EOS>
    hin_vocab : Hindi token -> id mapping; must contain <SOS> and <EOS>
    max_len   : upper bound on the number of decoding steps, so decoding
                terminates even if <EOS> is never predicted
    returns   : the translated words joined by single spaces ("" if nothing
                was decoded)

    Progress is printed at every stage.
    """
    print(f"Translating your sentence ---> : {sentence}")

    tokens = sentence.split()

    # Out-of-vocabulary words use the same fallback as sentence_to_indices:
    # the <UNK> id when the vocabulary defines one, otherwise 0.
    unknown_id = eng_vocab.get("<UNK>", 0)
    indices = (
        [eng_vocab["<SOS>"]]
        + [eng_vocab.get(token, unknown_id) for token in tokens]
        + [eng_vocab["<EOS>"]]
    )

    print(f"Input tokens :{tokens}")
    print(f"Input Indices : {indices}")

    # Batch of one: [1, sequence_length]
    src_tensor = tf.convert_to_tensor([indices], dtype=tf.int32)

    # Context vector (final hidden and cell state) that seeds the decoder
    hidden, cell = model.encoder(src_tensor)
    print(f"Encoded to context vector of shape : {hidden.shape}")

    # id -> word lookup, built once before decoding. It does not change
    # between steps, and it must exist even when the loop body never runs.
    inv_hin_vocab = {index: word for word, index in hin_vocab.items()}
    eos_id = hin_vocab["<EOS>"]

    # Decoding starts from <SOS>, shape [1, 1]
    input_token = tf.convert_to_tensor([[hin_vocab['<SOS>']]], dtype=tf.int32)
    output_tokens = []

    print("Decoding Steps: ")

    for step in range(max_len):
        output, hidden, cell = model.decoder(input_token, hidden, cell)

        # Greedy choice: highest-scoring vocabulary id, as a plain Python int
        predicted_token = int(tf.argmax(output, axis=-1).numpy()[0, 0])
        predicted_word = inv_hin_vocab.get(predicted_token, "<UNK>")

        print(f"  Step {step+1}:{predicted_word} (token {predicted_token})")

        if predicted_token == eos_id:
            # <EOS> ends the sentence and is not part of the translation
            print("   Reached EOS token, stopping the process!")
            break

        output_tokens.append(predicted_token)

        # Feed the prediction back in as the next decoder input
        input_token = tf.convert_to_tensor([[predicted_token]], dtype=tf.int32)

    translation = " ".join(inv_hin_vocab.get(idx, "<UNK>") for idx in output_tokens)

    print(f"Final Translation : {translation}")
    return translation

**summary of above cell**

| Feature                  | Description                                 |
| ------------------------ | ------------------------------------------- |
| `model.encoder()`        | Converts input sentence into context vector |
| `model.decoder()`        | Decodes one word at a time                  |
| `argmax`                 | Chooses highest probability word            |
| `input_token` loop       | Sequential decoding                         |
| `<SOS>` / `<EOS>` tokens | Manage sentence start/end                   |
| `max_len`                | Limits the translation length               |


**Testing Translation**

In [None]:
# Three sentences taken from the training corpus plus one word combination
# ("tired is hungry") that does not appear in it.
test_sentences = ["i am happy", "You are sad", "we are ready", "tired is hungry"]

for sentence in test_sentences:
    # Reference translation: the Hindi side of the first corpus pair whose
    # English side matches exactly; None when the sentence is not in `data`.
    expected = next((hin for eng, hin in data if eng == sentence), None)
    translation = translate(model, sentence, eng_vocab, hin_vocab)
    print(f"Expected : '{expected}'")
    print(f"Got :      '{translation}'")
    print("**"*40)  # visual separator between test cases

Translating your sentence ---> : i am happy
Input tokens :['i', 'am', 'happy']
Input Indices : [1, 3, 4, 5, 2]
Encoded to context vector of shape : (1, 100)
Decoding Steps: 
  Step 1:मैं (token 3)
  Step 2:खुश (token 4)
  Step 3:हूँ (token 5)
  Step 4:<EOS> (token 2)
   Reached EOS token, stopping the process!
Final Translation : मैं खुश हूँ
Expected : 'मैं खुश हूँ'
Got :      'मैं खुश हूँ'
********************************************************************************
Translating your sentence ---> : You are sad
Input tokens :['You', 'are', 'sad']
Input Indices : [1, 6, 7, 8, 2]
Encoded to context vector of shape : (1, 100)
Decoding Steps: 
  Step 1:आप (token 6)
  Step 2:दुखी (token 7)
  Step 3:हैं (token 8)
  Step 4:<EOS> (token 2)
   Reached EOS token, stopping the process!
Final Translation : आप दुखी हैं
Expected : 'आप दुखी हैं'
Got :      'आप दुखी हैं'
********************************************************************************
Translating your sentence ---> : we are ready
In