# Milestone 2

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import pandas as pd
import os, zipfile , json , random, requests
import re
from pathlib import Path
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Input, LSTM, Dense
from tensorflow.keras.models import Model
from tensorflow.keras import layers
from tensorflow.keras import  Sequential

2025-04-21 17:07:57.443469: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745255277.645923      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745255277.704311      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


## Explorting dataset:

In [2]:

def is_kaggle():
    # Kaggle kernels always set this env var
    return 'KAGGLE_URL_BASE' in os.environ

def is_colab():
    return (not is_kaggle()) and os.path.exists('/content')

def maybe_mount_drive():
    if is_colab():
        from google.colab import drive
        if not os.path.isdir('/content/drive'):
            drive.mount('/content/drive')

def get_data_path():
    if is_kaggle():
        return '/kaggle/input/squad-2-0/'
    elif is_colab():
        return '/content/drive/MyDrive/SQuAD'
    else:
        return './data/'
def get_model_dir():
    if is_colab():
        model_dir = '/content/drive/MyDrive/models'
    elif is_kaggle():
        model_dir = '/kaggle/working/models'
    else:
        model_dir = './models'
    os.makedirs(model_dir, exist_ok=True)
    return model_dir

In [3]:
dataset_dir = get_data_path()
maybe_mount_drive()
os.makedirs(dataset_dir, exist_ok=True)

In [5]:
file_path = os.path.join(dataset_dir, 'train-v2.0.json')

In [6]:
with open(file_path, 'r', encoding='utf-8') as f:
    squad = json.load(f)

In [7]:
records = []
for article in squad['data']:
    for para in article['paragraphs']:
        ctx = para['context']
        for qa in para['qas']:
            answers = [a['text'] for a in qa.get('answers', [])]
            starts  = [a['answer_start'] for a in qa.get('answers', [])]
            ends    = [s + len(t) for s,t in zip(starts, answers)]
            records.append({
                'question': qa['question'],
                'answers': answers,
                'context': ctx,
                'answer_start': starts,
                'answer_end': ends
            })



In [8]:
df = pd.DataFrame(records)
df.head()

Unnamed: 0,question,answers,context,answer_start,answer_end
0,When did Beyonce start becoming popular?,[in the late 1990s],Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,[269],[286]
1,What areas did Beyonce compete in when she was...,[singing and dancing],Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,[207],[226]
2,When did Beyonce leave Destiny's Child and bec...,[2003],Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,[526],[530]
3,In what city and state did Beyonce grow up?,"[Houston, Texas]",Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,[166],[180]
4,In which decade did Beyonce become famous?,[late 1990s],Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,[276],[286]


In [9]:
print("Total QA pairs:", len(df))

Total QA pairs: 130319


In [10]:
#shuffling
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# only working on subset of 15k row
df_subset = df.head(15000).copy().reset_index(drop=True)

print("Subset size:", df_subset.shape)
df_subset.head()

Subset size: (15000, 5)


Unnamed: 0,question,answers,context,answer_start,answer_end
0,What year did the global recession that follow...,[2012],It threatened the collapse of large financial ...,[481],[485]
1,what was a popular club in ibiza that started ...,[Amnesia],"But house was also being developed on Ibiza,[c...",[251],[258]
2,In what century did Martin Luther honor Mary a...,[],Although Calvin and Huldrych Zwingli honored M...,[],[]
3,What is the climate like?,[varies from hot and subhumid tropical],"Due to extreme variation in elevation, great v...",[115],[152]
4,How many times has the Queen toured Canada?,[],The Queen addressed the United Nations for a s...,[],[]


## Data Cleaning

Dropping rows where answers are empty

In [11]:
df_subset = df_subset[df_subset['answers'].map(len) > 0].reset_index(drop=True)
print("Rows remaining after drop:", len(df_subset))

Rows remaining after drop: 10020


Removing Extra Whitespaces

In [12]:
def collapse_whitespace(s):
    if isinstance(s, str):
        return re.sub(r'\s+', ' ', s.strip())
    return s

In [13]:
for col in ['question', 'context', 'answers']:
    if col in df_subset.columns:
        df_subset[col] = df_subset[col].apply(collapse_whitespace)

**Lets explore the length of the sequences which will determine some hyperparameters in training the models**

In [14]:
df_subset['question'].str.len().max()

203

In [15]:
df_subset['context'].str.len().max()

3706

We just turn the array of the answers to a string since none have multiple answers

In [16]:
df_subset['answers']= df_subset['answers'].apply(lambda x: x[0])

In [17]:
df_subset['answers'].str.len().max()

202

## Embeddings

In [18]:
!pip install --quiet gensim

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.6/38.6 MB[0m [31m48.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tsfresh 0.21.0 requires scipy>=1.14.0; python_version >= "3.10", but you have scipy 1.13.1 which is incompatible.
nilearn 0.11.1 requires scikit-learn>=1.4.0, but you have scikit-learn 1.2.2 which is incompatible.
bigframes 1.36.0 requires rich<14,>=12.4.4, but you have rich 14.0.0 which is incompatible.
imbalanced-learn 0.13.0 requires scikit-learn<2,>=1.3.2, but you have scikit-learn 1.2.2 which is incompatible.
plotnine 0.14.5 requires matplotlib>=3.8.0, but you have matplotlib 3.7.5 which is incompatible.
mlxtend 0.23.4 requires scikit-learn>=1.3

**Tokenizer for phase 1 only**

In [19]:
tokenizer_phase_1 = Tokenizer(
    num_words=20000,
    oov_token='[UNK]',
    filters='''!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'''
)
tokenizer_phase_1.fit_on_texts(df_subset["question"].tolist()+df_subset["answers"].tolist())

q_seqs = tokenizer_phase_1.texts_to_sequences(df_subset['question'])
a_seqs = tokenizer_phase_1.texts_to_sequences(df_subset['answers'])

In [20]:
vocab_size = len(tokenizer_phase_1.word_index)
print("Total unique tokens:", vocab_size)

Total unique tokens: 18733


**Tokenizer for phase 2**

In [21]:
def truncate_context(context: str, ans_start: int, ans_end: int, max_len: int) -> str:
    """
    Return a substring of `context` of length up to max_len characters,
    centered on the character span [ans_start:ans_end], adjusted to word boundaries.
    """
    ans_len = ans_end - ans_start
    extra   = max_len - ans_len
    pre     = extra // 2
    post    = extra - pre

    # ideal window
    start = ans_start - pre
    end   = ans_end   + post

    # shift if off left edge
    if start < 0:
        start = 0
        end   = min(max_len, len(context))

    # shift if off right edge
    if end > len(context):
        end   = len(context)
        start = max(0, len(context) - max_len)

    # adjust start backward to nearest whitespace to avoid cutting a word
    if start > 0 and not context[start].isspace():
        m = re.search(r'\s', context[:start][::-1])
        if m:
            # position of last whitespace before start
            start = start - m.start()

    # adjust end forward to nearest whitespace to avoid cutting a word
    if end < len(context) and not context[end].isspace():
        m = re.search(r'\s', context[end:])
        if m:
            end = end + m.start()

    # final slice
    return context[start:end]

def build_truncated_context(df, max_len: int):
    """
    Returns a list of truncated context strings for each row in df,
    preserving at least the answer span and cutting only at word boundaries.
    """
    contexts = []
    for ctx, starts, ends in zip(df['context'], df['answer_start'], df['answer_end']):
        # pick the first span
        s = starts[0]
        e = ends[0]

        window = truncate_context(ctx, s, e, max_len)
        contexts.append(window)

    return contexts

In [22]:
#2000 is the number of allowed characters
contexts =  build_truncated_context(df_subset, 2000)
len(contexts[10])

1012

In [23]:
SEP_TOKEN = "[SEP]"

questions    = df_subset['question'].astype(str).tolist()
answers = df_subset['answers'].astype(str).tolist()

tokenizer_phase_2 = Tokenizer(
    num_words=20000,
    oov_token='[UNK]',
    filters='''!"#$%&()*+,-./:;<=>?@\\^_`{|}~\t\n'''
)

all_texts = questions + contexts + answers + [SEP_TOKEN]
tokenizer_phase_2.fit_on_texts(all_texts)

In [24]:
trunc_c_seqs = tokenizer_phase_2.texts_to_sequences(contexts)
max(len(x) for x in trunc_c_seqs)

374

In [25]:
sep_id = tokenizer_phase_2.word_index.get(SEP_TOKEN)
if sep_id is None:
    sep_id = max(tokenizer_phase_2.word_index.values()) + 1
    tokenizer_phase_2.word_index[SEP_TOKEN]   = sep_id
    tokenizer_phase_2.index_word[sep_id]      = SEP_TOKEN

print("✔️ SEP token id is", sep_id)

✔️ SEP token id is 57292


In [26]:
# assume VOCAB_SIZE = 20000
MAX_ALLOWED = tokenizer_phase_2.num_words - 1   # reserve 0 for padding

# get its current (too large) id
old_sep_id = tokenizer_phase_2.word_index[SEP_TOKEN]

# pick a new “legal” id
new_sep_id = MAX_ALLOWED

# find which token currently holds new_sep_id (if any)
swap_token = tokenizer_phase_2.index_word.get(new_sep_id)


if swap_token:
    tokenizer_phase_2.word_index[swap_token] = old_sep_id
    tokenizer_phase_2.index_word[old_sep_id] = swap_token


tokenizer_phase_2.word_index[SEP_TOKEN]   = new_sep_id
tokenizer_phase_2.index_word[new_sep_id]  = SEP_TOKEN

print(f"SEP_TOKEN remapped from {old_sep_id} → {new_sep_id}")


SEP_TOKEN remapped from 57292 → 19999


In [27]:
MAX_C_TRUNC   = max(len(seq) for seq in trunc_c_seqs)
MAX_ENCODER_LEN = max(len(s) for s in q_seqs) + 1 + MAX_C_TRUNC

**Load gloVe dictionary**

In [28]:
def prepare_glove(target_dim=100, work_subdir='glove',
                  input_dataset_slug='glove6b',
                  download_url='http://nlp.stanford.edu/data/glove.6B.zip'):
    maybe_mount_drive()

    if is_kaggle():
        work_dir = f'/kaggle/working/{work_subdir}'
        uploaded_zip = f'/kaggle/input/{input_dataset_slug}/glove.6B.zip'
    elif is_colab():
        work_dir = f'/content/drive/MyDrive/{work_subdir}'
        uploaded_zip = None
    else:
        work_dir = f'./data/{work_subdir}'
        uploaded_zip = None

    os.makedirs(work_dir, exist_ok=True)

    target_file = f'glove.6B.{target_dim}d.txt'
    txt_path = os.path.join(work_dir, target_file)
    zip_path = os.path.join(work_dir, os.path.basename(download_url))

    if os.path.exists(txt_path):
        return txt_path

    if is_kaggle() and uploaded_zip and os.path.exists(uploaded_zip):
        zip_path = uploaded_zip
    else:
        if requests is None:
            raise RuntimeError("`requests` not available; offline mode")
        with requests.get(download_url, stream=True) as r, open(zip_path, 'wb') as f:
            r.raise_for_status()
            for chunk in r.iter_content(8192):
                f.write(chunk)

    with zipfile.ZipFile(zip_path, 'r') as z:
        z.extract(target_file, path=work_dir)

    if not os.path.exists(txt_path):
        raise RuntimeError(f"Failed to extract {target_file}")

    return txt_path

# Usage
glove_path = prepare_glove()
print("GloVe file:", glove_path)

GloVe file: /kaggle/working/glove/glove.6B.100d.txt


In [29]:
def create_embedding_layer(
    tokenizer,
    glove_path: str,
    embedding_dim: int,
    mask_zero: bool = True,
    trainable: bool = False,
    oov_token: str = '[UNK]'
) -> Embedding:
    """
    Build a Keras Embedding layer from a fitted tokenizer and a GloVe file.

    Args:
        tokenizer: a fitted keras.preprocessing.text.Tokenizer
        glove_path: path to a GloVe‑style file (word + embedding_dim floats)
        max_num_words: max vocabulary size (typically tokenizer.num_words)
        embedding_dim: dimensionality of the GloVe vectors
        mask_zero: if True, reserve index 0 for padding (and mask it)
        trainable: if False, freeze the embedding weights
        oov_token: the out‑of‑vocab token (must match tokenizer.oov_token)

    Returns:
        A tf.keras.layers.Embedding instance with pretrained weights.
    """
   
    embeddings_index = {}
    with open(glove_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split(" ")
            word = parts[0]
            coefs = np.asarray(parts[1:], dtype='float32')
            if coefs.shape[0] != embedding_dim:
                continue  # skip any lines that don't match expected dim
            embeddings_index[word] = coefs

    
    vocab_size =  len(tokenizer.word_index) + 1
    embedding_matrix = np.random.normal(
        scale=0.01,
        size=(vocab_size, embedding_dim)
    ).astype('float32')

    
    for word, idx in tokenizer.word_index.items():
        if idx == 0 or idx >= vocab_size:
            continue
        vec = embeddings_index.get(word)
        if vec is not None:
            embedding_matrix[idx] = vec

    # 4) Build the Keras layer
    embedding_layer = Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        mask_zero=mask_zero,
        trainable=trainable,
        name='pretrained_embedding'
    )
    return embedding_layer


## Hyperparameters

In [30]:
MAX_Q_LEN = max(len(s) for s in q_seqs)
MAX_A_LEN = max(len(s) for s in a_seqs)
MAX_C_LEN   = max(len(s) for s in trunc_c_seqs)
MAX_ENCODER_LEN = max(len(s) for s in q_seqs) + 1 + MAX_C_TRUNC
VOCAB_SIZE  = 20000
EMB_DIM     = 100
UNITS       = 128
BATCH_SIZE  = 64

In [31]:
q_padded = pad_sequences(q_seqs, maxlen=MAX_Q_LEN, padding='post', truncating='post')
a_padded = pad_sequences(a_seqs, maxlen=MAX_A_LEN, padding='post', truncating='post')

## Phase One

In [32]:
EMB_DIM      = 100
GLOVE_PATH   = prepare_glove()

embedding_layer = create_embedding_layer(
    tokenizer=tokenizer_phase_1,
    glove_path=GLOVE_PATH,
    embedding_dim=EMB_DIM,
    mask_zero=True,      
    trainable=False      
)

I0000 00:00:1745255401.226533      31 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15513 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0


In [33]:
decoder_input  = a_padded[:, :-1]
decoder_target = a_padded[:, 1:]

Xq_tr, Xq_val, Din_tr, Din_val, Dt_tr, Dt_val = train_test_split(
    q_padded, decoder_input, decoder_target,
    test_size=0.1, random_state=42
)

def make_ds(q, d_in, d_tar, batch_size=64):
    mask = tf.cast(tf.not_equal(d_tar, 0), tf.float32)
    ds = tf.data.Dataset.from_tensor_slices(
        ((q, d_in), d_tar, mask)
    )
    return ds.shuffle(2000).batch(batch_size).prefetch(1)

train_ds = make_ds(Xq_tr, Din_tr, Dt_tr, batch_size=BATCH_SIZE)
val_ds   = make_ds(Xq_val, Din_val, Dt_val, batch_size=BATCH_SIZE)

In [34]:
print("Train batches:", tf.data.experimental.cardinality(train_ds).numpy())
print("Val batches:", tf.data.experimental.cardinality(val_ds).numpy())

Train batches: 141
Val batches: 16


**Building the model**

In [35]:
class Seq2SeqLSTM(tf.keras.Model):
    def __init__(self,vocab_size,emb_dim,units,max_q_len,max_a_len,embedding_matrix=None,pad_token_id=0,**kwargs):
        super().__init__(**kwargs)
        self.pad_token_id = pad_token_id


        if embedding_matrix is not None:
            self.embedding = Embedding(vocab_size, emb_dim,weights=[embedding_matrix],trainable=False,mask_zero=True)
        else:
            self.embedding = Embedding(vocab_size, emb_dim,mask_zero=True)

        #units is the vector size of the hidden state
        #return_state if true returns the final h and c
        self.encoder_lstm = LSTM(units, return_state=True, name='encoder_lstm')

        #return sequence returns all the hidden states from h_1 to h_n
        #return sequence is for evaluation
        #return state is for inference because after each token generated we need to feed the model the states again
        self.decoder_lstm = LSTM(units,return_sequences=True,return_state=True, name='decoder_lstm')

        #the layer needed to predict the next word
        self.dense = Dense(vocab_size, activation='softmax', name='decoder_dense')

    def call(self, inputs, training=False):
        encoder_inputs, decoder_inputs = inputs

        x_enc = self.embedding(encoder_inputs)
        _, state_h, state_c = self.encoder_lstm(x_enc, training=training)
        encoder_states = [state_h, state_c]

        x_dec = self.embedding(decoder_inputs)
        dec_outputs, _, _ = self.decoder_lstm(x_dec, initial_state=encoder_states, training=training)
        return self.dense(dec_outputs)


In [None]:
model = Seq2SeqLSTM( vocab_size=vocab_size,
                    emb_dim=100,units=128,
                     max_q_len=MAX_Q_LEN,max_a_len=MAX_A_LEN,embedding_matrix=embedding_matrix,pad_token_id=0)

dummy_q = tf.zeros((1, MAX_Q_LEN), dtype=tf.int32)
dummy_a = tf.zeros((1, MAX_A_LEN-1), dtype=tf.int32)
_ = model((dummy_q, dummy_a))


In [None]:
model.compile(
  optimizer='adam',
  loss='sparse_categorical_crossentropy',
  metrics=[tf.keras.metrics.SparseCategoricalAccuracy()]
)
model.summary()

In [None]:
!mkdir /content/drive/MyDrive/models

In [None]:
# MODEL_DIR = get_model_dir()
# CHECKPOINT_PATH = os.path.join(MODEL_DIR, 'seq2seq_lstm_best.keras')

# checkpoint_cb = ModelCheckpoint(
#     filepath=CHECKPOINT_PATH,
#     monitor='val_sparse_categorical_accuracy',
#     save_best_only=True,
#     mode='max',
#     verbose=1
# )
# history = model.fit(
#     train_ds,
#     validation_data=val_ds,
#     epochs=EPOCHS,
#     callbacks=[checkpoint_cb]
# )
# print(f"Best model will be saved to: {CHECKPOINT_PATH}")

## Phase 2 using a transformer

In [36]:
q_seqs = tokenizer_phase_2.texts_to_sequences(df_subset['question'])
a_seqs = tokenizer_phase_2.texts_to_sequences(df_subset['answers'])
trunc_c_seqs = tokenizer_phase_2.texts_to_sequences(contexts)

In [37]:
embedding_layer = create_embedding_layer(
    tokenizer=tokenizer_phase_2,
    glove_path=GLOVE_PATH,
    embedding_dim=EMB_DIM,
    mask_zero=True,      
    trainable=False      
)

In [38]:
sep_id = tokenizer_phase_2.word_index.get('[SEP]')
encoder_seqs = [
    q + [sep_id] + c
    for q, c in zip(q_seqs, trunc_c_seqs)
]

In [39]:
a_padded = pad_sequences(a_seqs,maxlen=MAX_A_LEN,padding='post',truncating='post')
encoder_inputs = pad_sequences(encoder_seqs,maxlen=MAX_ENCODER_LEN,padding='post',truncating='post')
decoder_inputs = a_padded[:, :-1] 
decoder_targets= a_padded[:,  1:]

(enc_tr, enc_val,
 decin_tr, decin_val,
 dectar_tr, dectar_val) = train_test_split(
    encoder_inputs,
    decoder_inputs,
    decoder_targets,
    test_size=0.1,
    random_state=42
)

In [40]:
def make_ds(enc, decin, dectar, batch_size=BATCH_SIZE):
    ds = tf.data.Dataset.from_tensor_slices(((enc, decin), dectar))
    return ds.shuffle(2000).batch(batch_size).prefetch(tf.data.AUTOTUNE)

train_ds = make_ds(enc_tr, decin_tr, dectar_tr)
val_ds   = make_ds(enc_val, decin_val, dectar_val)
print("   • train batches:", tf.data.experimental.cardinality(train_ds).numpy())
print("   • val batches:  ", tf.data.experimental.cardinality(val_ds).numpy())

   • train batches: 141
   • val batches:   16


In [41]:
class PositionalEmbedding(layers.Layer):
    def __init__(self, max_len: int, embed_dim: int, **kwargs):
        super().__init__(**kwargs)
        self.supports_masking = True
        self.max_len   = max_len
        self.embed_dim = embed_dim

        # build the full pos‑encoding once
        pos = np.arange(max_len)[:, np.newaxis]                 
        dim = np.arange(embed_dim)[np.newaxis, :]                
        angle_rates = 1.0 / np.power(10000.0, (2 * (dim//2)) / embed_dim)
        angle_rads  = pos * angle_rates                          
        angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])        
        angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])        

        # store as a constant so it's on the TF graph
        self.pos_encoding = tf.constant(angle_rads[np.newaxis, ...], dtype=tf.float32)
        # shape = (1, max_len, embed_dim)

    def call(self, x):
        # x.shape = (batch, seq_len, embed_dim)
        seq_len = tf.shape(x)[1]
        return x + self.pos_encoding[:, :seq_len, :]

    def compute_mask(self, inputs, mask=None):
        
        return mask

    def get_config(self):
        cfg = super().get_config()
        cfg.update({"max_len": self.max_len, "embed_dim": self.embed_dim})
        return cfg

In [42]:
class Encoder(layers.Layer):
    def __init__(self,embed_dim: int, num_heads: int, ff_dim: int, **kwargs):
        super().__init__(**kwargs)
        self.supports_masking = True 
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads,
            key_dim=embed_dim
        )
        self.layer1 = layers.LayerNormalization()
        self.layer2 = layers.LayerNormalization()
        self.ffn =  tf.keras.Sequential([
            layers.Dense(ff_dim, activation="relu"),
            layers.Dense(embed_dim),
        ])

    def compute_mask(self, inputs, mask=None):
        return mask
    
    def call(self,pos_matrix, padding_mask, **kwargs):
        att_out= self.attention(key=pos_matrix, value= pos_matrix, query=pos_matrix,attention_mask=padding_mask)
        norm1 = self.layer1(att_out+pos_matrix)
        ff_out = self.ffn(norm1)
        return self.layer2(ff_out)

        

In [43]:
class Decoder(layers.Layer):
    def __init__(self, embed_dim: int, num_heads: int, ff_dim: int, dropout=0.1, **kwargs):
        super().__init__(**kwargs)
        self.supports_masking = True 
        self.self_mha  = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.cross_mha = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn       = Sequential([
            layers.Dense(ff_dim, activation="relu"),
            layers.Dense(embed_dim),
        ])
        self.norm1 = layers.LayerNormalization()
        self.norm2 = layers.LayerNormalization()
        self.norm3 = layers.LayerNormalization()

    def compute_mask(self, inputs, mask=None):
        return mask

    def call(self, x, enc_out,
             look_ahead_mask=None,
             padding_mask=None,
             training=False):
        att1 = self.self_mha(x, x, x,
                             attention_mask=look_ahead_mask,
                             training=training)
        out1 = self.norm1(x + att1)

        att2 = self.cross_mha(out1, enc_out, enc_out,
                              attention_mask=padding_mask,
                              training=training)
        out2 = self.norm2(out1 + att2)

        ffn_out = self.ffn(out2, training=training)
        return self.norm3(out2 + ffn_out)
        

In [44]:
class Seq2SeqTransformer(Model):
    def __init__(self,
                 vocab_size,
                 embed_dim,
                 num_heads,
                 ff_dim,
                 max_q_len,
                 max_c_len,
                 max_a_len,
                 embedding_layer,
                 pad_token_id=0,
                 **kwargs):
        super().__init__(**kwargs)
        if embedding_layer is None:
            raise ValueError("`embedding_layer` must be provided")
        self.token_emb   = embedding_layer
        self.pad_id      = pad_token_id
        self.pos_emb_enc = PositionalEmbedding(max_q_len + 1 + max_c_len, embed_dim)
        self.pos_emb_dec = PositionalEmbedding(max_a_len, embed_dim)
        self.encoder     = Encoder(embed_dim, num_heads, ff_dim)
        self.decoder     = Decoder(embed_dim, num_heads, ff_dim)
        self.final_dense = layers.Dense(vocab_size, activation="softmax")

    def create_padding_mask(self, seq):
        mask = tf.cast(tf.equal(seq, self.pad_id), tf.float32)
        return mask[:, None, None, :]

    def create_look_ahead_mask(self, size):
        return 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)

    def call(self, inputs, training=False):
        enc_seq, dec_input = inputs
    
        enc_pad_mask4d = self.create_padding_mask(enc_seq)    
        dec_pad_bool = tf.equal(dec_input, self.pad_id)
    
        
        dec_pad_mask2d = dec_pad_bool[:, tf.newaxis, :]
    
        Ldec = tf.shape(dec_input)[1]
        look2d = self.create_look_ahead_mask(Ldec)         
        look2d = tf.cast(look2d, tf.bool)
    
       
        look3d = tf.broadcast_to(look2d[tf.newaxis, ...],
                                 [tf.shape(dec_input)[0], Ldec, Ldec])
    
        
        self_attn_mask = tf.logical_or(dec_pad_mask2d, look3d)
    
        x = self.token_emb(enc_seq)
        x = self.pos_emb_enc(x)
        enc_out = self.encoder(
            x,
            padding_mask=enc_pad_mask4d,  # still 4‑D, Keras will reduce it
            training=training
        )
    
       
        y = self.token_emb(dec_input)
        y = self.pos_emb_dec(y)
        dec_out = self.decoder(
            y,
            enc_out,
            look_ahead_mask=self_attn_mask,   # 3‑D mask for self‑attention
            padding_mask=enc_pad_mask4d,      # 4‑D mask for cross‑attention
            training=training
        )
    
        return self.final_dense(dec_out)

In [45]:
model = Seq2SeqTransformer(
    vocab_size=tokenizer_phase_2.num_words,
    embed_dim=100,
    num_heads=8,
    ff_dim=512,
    max_q_len=MAX_Q_LEN,
    max_c_len=MAX_C_TRUNC,
    max_a_len=MAX_A_LEN - 1,   # because we shift target by 1
    embedding_layer=embedding_layer,     # or your glove matrix
    pad_token_id=0,
)

In [46]:
model.summary()
model.compile(
  optimizer='adam',
  loss='sparse_categorical_crossentropy',
  metrics=[tf.keras.metrics.SparseCategoricalAccuracy()]
)

In [47]:
MODEL_DIR = get_model_dir()
CHECKPOINT_PATH = os.path.join(MODEL_DIR, 'seq2seq_lstm_best.keras')

checkpoint_cb = ModelCheckpoint(
    filepath=CHECKPOINT_PATH,
    monitor='val_sparse_categorical_accuracy',
    save_best_only=True,
    mode='max',
    verbose=1
)
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=50,
    callbacks=[checkpoint_cb]
)
print(f"Best model will be saved to: {CHECKPOINT_PATH}")

Epoch 1/50


I0000 00:00:1745255452.382317      95 service.cc:148] XLA service 0x7ad58000ae00 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1745255452.383033      95 service.cc:156]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0


InvalidArgumentError: Graph execution error:

Detected at node gradient_tape/seq2_seq_transformer_1/encoder_1/multi_head_attention_1/softmax_1/add/BroadcastGradientArgs defined at (most recent call last):
<stack traces unavailable>
Incompatible shapes: [64,8,408,408] vs. [64,64,408,408]

Stack trace for op definition: 
File "<frozen runpy>", line 198, in _run_module_as_main
File "<frozen runpy>", line 88, in _run_code
File "/usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py", line 37, in <module>
File "/usr/local/lib/python3.11/dist-packages/traitlets/config/application.py", line 992, in launch_instance
File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelapp.py", line 712, in start
File "/usr/local/lib/python3.11/dist-packages/tornado/platform/asyncio.py", line 205, in start
File "/usr/lib/python3.11/asyncio/base_events.py", line 608, in run_forever
File "/usr/lib/python3.11/asyncio/base_events.py", line 1936, in _run_once
File "/usr/lib/python3.11/asyncio/events.py", line 84, in _run
File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelbase.py", line 510, in dispatch_queue
File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelbase.py", line 499, in process_one
File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelbase.py", line 406, in dispatch_shell
File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelbase.py", line 730, in execute_request
File "/usr/local/lib/python3.11/dist-packages/ipykernel/ipkernel.py", line 383, in do_execute
File "/usr/local/lib/python3.11/dist-packages/ipykernel/zmqshell.py", line 528, in run_cell
File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 2975, in run_cell
File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3030, in _run_cell
File "/usr/local/lib/python3.11/dist-packages/IPython/core/async_helpers.py", line 78, in _pseudo_sync_runner
File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3257, in run_cell_async
File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3473, in run_ast_nodes
File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
File "/tmp/ipykernel_31/2051485659.py", line 11, in <cell line: 0>
File "/usr/local/lib/python3.11/dist-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler
File "/usr/local/lib/python3.11/dist-packages/keras/src/backend/tensorflow/trainer.py", line 320, in fit
File "/usr/local/lib/python3.11/dist-packages/keras/src/backend/tensorflow/trainer.py", line 121, in one_step_on_iterator
File "/usr/local/lib/python3.11/dist-packages/keras/src/backend/tensorflow/trainer.py", line 108, in one_step_on_data
File "/usr/local/lib/python3.11/dist-packages/keras/src/backend/tensorflow/trainer.py", line 70, in train_step

	 [[{{node gradient_tape/seq2_seq_transformer_1/encoder_1/multi_head_attention_1/softmax_1/add/BroadcastGradientArgs}}]]
	tf2xla conversion failed while converting __inference_one_step_on_data_10659[]. Run with TF_DUMP_GRAPH_PREFIX=/path/to/dump/dir and --vmodule=xla_compiler=2 to obtain a dump of the compiled functions.
	 [[StatefulPartitionedCall]] [Op:__inference_one_step_on_iterator_10946]