# Milestone 2

In [3]:
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import pandas as pd
import os, zipfile , json , random, requests
import re
from pathlib import Path
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Input, LSTM, Dense
from tensorflow.keras.models import Model, load_model
from tensorflow.keras import layers
from tensorflow.keras import  Sequential
import string

ImportError: cannot import name 'pywrap_tensorflow' from 'tensorflow.python' (c:\Users\Ahmed Labib\AppData\Local\Programs\Python\Python312\Lib\site-packages\tensorflow\python\__init__.py)

## Explorting dataset:

In [2]:

def is_kaggle():
    # Kaggle kernels always set this env var
    return 'KAGGLE_URL_BASE' in os.environ

def is_colab():
    return (not is_kaggle()) and os.path.exists('/content')

def maybe_mount_drive():
    if is_colab():
        from google.colab import drive
        if not os.path.isdir('/content/drive'):
            drive.mount('/content/drive')


def project_root(start: Path | None = None) -> Path:
    p = Path(start or Path.cwd()).resolve()
    for _ in range(6):
        if (p / ".git").exists() or any((p / d).exists() for d in ["milestone1", "milestone2", "milestone3"]):
            return p
        if p.parent == p:
            break
        p = p.parent
    return Path.cwd()

def get_data_path():
    if is_kaggle():
        return '/kaggle/input/squad-2-0/'
    elif is_colab():
        return '/content/drive/MyDrive/SQuAD'
    else:
        return str(project_root() / 'data') + os.sep
    

def get_model_dir():
    if is_colab():
        model_dir = '/content/drive/MyDrive/models'
    elif is_kaggle():
        model_dir = '/kaggle/working/models'
    else:
        model_dir = str(project_root() / 'models')
    os.makedirs(model_dir, exist_ok=True)
    return model_dir

NameError: name 'Path' is not defined

In [242]:
dataset_dir = get_data_path()
maybe_mount_drive()
os.makedirs(dataset_dir, exist_ok=True)

In [243]:
file_path = os.path.join(dataset_dir, 'train-v2.0.json')

In [244]:
with open(file_path, 'r', encoding='utf-8') as f:
    squad = json.load(f)

In [287]:
records = []
for article in squad['data']:
    for para in article['paragraphs']:
        ctx = para['context']
        for qa in para['qas']:
            answers = [a['text'] for a in qa.get('answers', [])]
            starts  = [a['answer_start'] for a in qa.get('answers', [])]
            ends    = [s + len(t) for s,t in zip(starts, answers)]
            records.append({
                'question': qa['question'],
                'answers': answers,
                'context': ctx,
                'answer_start': starts,
                'answer_end': ends
            })



In [288]:
df = pd.DataFrame(records)
df.head()

Unnamed: 0,question,answers,context,answer_start,answer_end
0,When did Beyonce start becoming popular?,[in the late 1990s],Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,[269],[286]
1,What areas did Beyonce compete in when she was...,[singing and dancing],Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,[207],[226]
2,When did Beyonce leave Destiny's Child and bec...,[2003],Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,[526],[530]
3,In what city and state did Beyonce grow up?,"[Houston, Texas]",Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,[166],[180]
4,In which decade did Beyonce become famous?,[late 1990s],Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,[276],[286]


In [289]:
print("Total QA pairs:", len(df))

Total QA pairs: 130319


In [290]:
#shuffling
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# only working on subset of 15k row
df_subset = df.head(15000).copy().reset_index(drop=True)

print("Subset size:", df_subset.shape)
df_subset.head()

Subset size: (15000, 5)


Unnamed: 0,question,answers,context,answer_start,answer_end
0,What year did the global recession that follow...,[2012],It threatened the collapse of large financial ...,[481],[485]
1,what was a popular club in ibiza that started ...,[Amnesia],"But house was also being developed on Ibiza,[c...",[251],[258]
2,In what century did Martin Luther honor Mary a...,[],Although Calvin and Huldrych Zwingli honored M...,[],[]
3,What is the climate like?,[varies from hot and subhumid tropical],"Due to extreme variation in elevation, great v...",[115],[152]
4,How many times has the Queen toured Canada?,[],The Queen addressed the United Nations for a s...,[],[]


## Data Cleaning

Dropping rows where answers are empty

In [291]:
df_subset = df_subset[df_subset['answers'].map(len) > 0].reset_index(drop=True)
print("Rows remaining after drop:", len(df_subset))

Rows remaining after drop: 10020


Removing Extra Whitespaces

In [292]:
def collapse_whitespace(s):
    if isinstance(s, str):
        return re.sub(r'\s+', ' ', s.strip())
    return s

In [293]:
for col in ['question', 'context', 'answers']:
    if col in df_subset.columns:
        df_subset[col] = df_subset[col].apply(collapse_whitespace)

**Lets explore the length of the sequences which will determine some hyperparameters in training the models**

In [294]:
df_subset['question'].str.len().max()

203

In [295]:
df_subset['context'].str.len().max()

3706

We just turn the array of the answers to a string since none have multiple answers

In [296]:
df_subset['answers']= df_subset['answers'].apply(lambda x: x[0])

In [297]:
df_subset['answers'].str.len().max()

202

## Embeddings

In [298]:
!pip install --quiet gensim

**Tokenizer for phase 1 only**

In [299]:
tokenizer_phase_1 = Tokenizer(
    num_words=20000,
    oov_token='[UNK]',
    filters='''!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'''
)
tokenizer_phase_1.fit_on_texts(df_subset["question"].tolist()+df_subset["answers"].tolist())

q_seqs = tokenizer_phase_1.texts_to_sequences(df_subset['question'])
a_seqs = tokenizer_phase_1.texts_to_sequences(df_subset['answers'])



In [300]:
vocab_size = len(tokenizer_phase_1.word_index)
print("Total unique tokens:", vocab_size)

Total unique tokens: 18733


**Tokenizer for phase 2**

In [301]:
def truncate_context(context: str, ans_start: int, ans_end: int, max_len: int) -> str:
    """
    Return a substring of `context` of length up to max_len characters,
    centered on the character span [ans_start:ans_end], adjusted to word boundaries.
    """
    ans_len = ans_end - ans_start
    extra   = max_len - ans_len
    pre     = extra // 2
    post    = extra - pre

    # ideal window
    start = ans_start - pre
    end   = ans_end   + post

    # shift if off left edge
    if start < 0:
        start = 0
        end   = min(max_len, len(context))

    # shift if off right edge
    if end > len(context):
        end   = len(context)
        start = max(0, len(context) - max_len)

    # adjust start backward to nearest whitespace to avoid cutting a word
    if start > 0 and not context[start].isspace():
        m = re.search(r'\s', context[:start][::-1])
        if m:
            # position of last whitespace before start
            start = start - m.start()

    # adjust end forward to nearest whitespace to avoid cutting a word
    if end < len(context) and not context[end].isspace():
        m = re.search(r'\s', context[end:])
        if m:
            end = end + m.start()

    # final slice
    return context[start:end]

def build_truncated_context(df, max_len: int):
    """
    Returns a list of truncated context strings for each row in df,
    preserving at least the answer span and cutting only at word boundaries.
    """
    contexts = []
    for ctx, starts, ends in zip(df['context'], df['answer_start'], df['answer_end']):
        # pick the first span
        s = starts[0]
        e = ends[0]

        window = truncate_context(ctx, s, e, max_len)
        contexts.append(window)

    return contexts

In [302]:

questions = df_subset['question'].astype(str).tolist()
answers   = df_subset['answers'].astype(str).tolist()
contexts  = build_truncated_context(df_subset, 2000)
tokenizer_phase_2 = Tokenizer(
    num_words=50000,
    oov_token='[UNK]',
    filters='''!"#$%&()*+,-./:;<=>?@\\^_`{|}~\t\n'''
)
tokenizer_phase_2.fit_on_texts(questions + answers + contexts + ['[SEP]'])


**Adding sep sos and eos tag**

In [303]:
sep_token = "[SEP]"
sep_id = tokenizer_phase_2.word_index.get(sep_token)
if sep_id is None:
    sep_id = tokenizer_phase_2.num_words - 1
    occupant = tokenizer_phase_2.index_word.get(sep_id)
    if occupant:
        del tokenizer_phase_2.word_index[occupant] 
    tokenizer_phase_2.word_index[sep_token] = sep_id
    tokenizer_phase_2.index_word[sep_id]  = sep_token
print("✅ SEP_TOKEN id =", sep_id)


for token, offset in {"[SOS]":1, "[EOS]":2}.items():
    tok_id = tokenizer_phase_2.word_index.get(token)
    if tok_id is None:
        new_id = tokenizer_phase_2.num_words - offset - 1
        occupant = tokenizer_phase_2.index_word.get(new_id)
        if occupant:
            del tokenizer_phase_2.word_index[occupant]
        tokenizer_phase_2.word_index[token]      = new_id
        tokenizer_phase_2.index_word[new_id]     = token
        print(f"✅ {token} injected at id {new_id}")
    else:
        print(f"✅ {token} already at id {tok_id}")

sep_id = tokenizer_phase_2.word_index['[SEP]']
sos_id = tokenizer_phase_2.word_index['[SOS]']
eos_id = tokenizer_phase_2.word_index['[EOS]']
pad_id = 0

print("Special IDs:", {'SEP':sep_id, 'SOS':sos_id, 'EOS':eos_id, 'PAD':pad_id})


✅ SEP_TOKEN id = 49999
✅ [SOS] injected at id 49998
✅ [EOS] injected at id 49997
Special IDs: {'SEP': 49999, 'SOS': 49998, 'EOS': 49997, 'PAD': 0}


In [304]:
q_seqs = tokenizer_phase_2.texts_to_sequences(questions)
c_seqs = tokenizer_phase_2.texts_to_sequences(contexts)
a_raw  = tokenizer_phase_2.texts_to_sequences(answers)
a_seqs = [[sos_id] + seq + [eos_id] for seq in a_raw]

MAX_Q_LEN  = max(len(s) for s in q_seqs)
MAX_C_LEN  = max(len(s) for s in c_seqs)
MAX_A_LEN  = max(len(s) for s in a_seqs)
MAX_ENCODER_LEN = MAX_Q_LEN + 1 + MAX_C_LEN

print("Lengths:", {'Q':MAX_Q_LEN, 'C':MAX_C_LEN, 'A':MAX_A_LEN, 'Enc':MAX_ENCODER_LEN})

Lengths: {'Q': 33, 'C': 374, 'A': 32, 'Enc': 408}


**Add padding to the encoder and decoder inputs**

In [305]:
enc_seqs = [q + [sep_id] + c for q, c in zip(q_seqs, c_seqs)]
encoder_inputs = pad_sequences(
    enc_seqs,
    maxlen=MAX_ENCODER_LEN,
    padding='post',
    truncating='post',
    value=pad_id
)

a_padded = pad_sequences(
    a_seqs,
    maxlen=MAX_A_LEN,
    padding='post',
    truncating='post',
    value=pad_id
)

**Split the data**

In [306]:
decoder_inputs  = a_padded[:, :-1]  # begins with SOS
decoder_targets = a_padded[:,  1:]  # ends with EOS

print("encoder_inputs:", encoder_inputs.shape)
print("decoder_inputs:", decoder_inputs.shape)
print("decoder_targets:", decoder_targets.shape)
assert decoder_inputs.shape == decoder_targets.shape

(enc_tr, enc_val,
 decin_tr, decin_val,
 dectar_tr, dectar_val) = train_test_split(
    encoder_inputs,
    decoder_inputs,
    decoder_targets,
    test_size=0.1,
    random_state=42
)

encoder_inputs: (10020, 408)
decoder_inputs: (10020, 31)
decoder_targets: (10020, 31)


In [307]:
# for (enc_batch, _), _, _ in train_ds.take(1):
#     ids    = enc_batch[0].numpy()  # first example in the batch
#     tokens = [tokenizer_phase_2.index_word.get(i, "") for i in ids]
#     tokens = [t for t in tokens if t]
#     print("Token sequence:")
#     print(tokens)
#     print("\nReconstructed text:")
#     print(" ".join(tokens))
#     break
    

Token sequence:
['who', 'joining', 'that', 'the', 'second', 'songs', 'deemed', 'bird', 'influenced', 'change', 'flow', 'species', 'signal', 'at', 'the', 'european', 'that', 'the', 'second', 'songs', 'capital', 'several', '[SEP]', 'the', 'capital', 'several', 'had', 'have', 'response', 'birds', 'most', 'afro', 'phanerozoic', 'to', 'mw', 'unit', 'of', 'benjamin', 'ice', '1950s', 'the', 'bird', 'influenced', 'their', 'system', 'threat', 'birds', 'most', 'grotius', 'and', 'd–log', 'hugo', 'prior', 'from', 'the', 'although', 'on', 'locations', 'of', 'benjamin', 'adjacent', 'to', 'its', 'important', 'follow', 'on', 'its', "darwin's", 'are', 'continental', 'to', 'the', 'jesus', 'of', 'census', 'and', 'the', '1', 'building', 'some', 'the', 'administration', 'paris', 'mm', 'of', 'the', 'bhaktapur', 'capital', 'or', 'was', 'basic', 'to', 'last', '2', 'a', 'steel', 'hymns', 'that', 'would', 'cover', 'as', 'a', 'appear', 'diffusive', 'arline', 'for', 'the', 'capital', 'certain', 'on', 'from', '85'

**Load gloVe dictionary**

In [308]:
def prepare_glove(target_dim=100, work_subdir='glove',
                  input_dataset_slug='glove6b',
                  download_url='http://nlp.stanford.edu/data/glove.6B.zip'):
    maybe_mount_drive()

    if is_kaggle():
        work_dir = f'/kaggle/working/{work_subdir}'
        uploaded_zip = f'/kaggle/input/{input_dataset_slug}/glove.6B.zip'
    elif is_colab():
        work_dir = f'/content/drive/MyDrive/{work_subdir}'
        uploaded_zip = None
    else:
        work_dir = f'./data/{work_subdir}'
        uploaded_zip = None

    os.makedirs(work_dir, exist_ok=True)

    target_file = f'glove.6B.{target_dim}d.txt'
    txt_path = os.path.join(work_dir, target_file)
    zip_path = os.path.join(work_dir, os.path.basename(download_url))

    if os.path.exists(txt_path):
        return txt_path

    if is_kaggle() and uploaded_zip and os.path.exists(uploaded_zip):
        zip_path = uploaded_zip
    else:
        if requests is None:
            raise RuntimeError("`requests` not available; offline mode")
        with requests.get(download_url, stream=True) as r, open(zip_path, 'wb') as f:
            r.raise_for_status()
            for chunk in r.iter_content(8192):
                f.write(chunk)

    with zipfile.ZipFile(zip_path, 'r') as z:
        z.extract(target_file, path=work_dir)

    if not os.path.exists(txt_path):
        raise RuntimeError(f"Failed to extract {target_file}")

    return txt_path

# Usage
glove_path = prepare_glove()
print("GloVe file:", glove_path)

GloVe file: /kaggle/working/glove/glove.6B.100d.txt


In [309]:
def create_embedding_layer(
    tokenizer,
    glove_path: str,
    embedding_dim: int,
    mask_zero: bool = True,
    trainable: bool = False,
    oov_token: str = '[UNK]'
) -> Embedding:
    """
    Build a Keras Embedding layer from a fitted tokenizer and a GloVe file.

    Args:
        tokenizer: a fitted keras.preprocessing.text.Tokenizer
        glove_path: path to a GloVe‑style file (word + embedding_dim floats)
        max_num_words: max vocabulary size (typically tokenizer.num_words)
        embedding_dim: dimensionality of the GloVe vectors
        mask_zero: if True, reserve index 0 for padding (and mask it)
        trainable: if False, freeze the embedding weights
        oov_token: the out‑of‑vocab token (must match tokenizer.oov_token)

    Returns:
        A tf.keras.layers.Embedding instance with pretrained weights.
    """
   
    embeddings_index = {}
    with open(glove_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split(" ")
            word = parts[0]
            coefs = np.asarray(parts[1:], dtype='float32')
            if coefs.shape[0] != embedding_dim:
                continue  # skip any lines that don't match expected dim
            embeddings_index[word] = coefs

    
    vocab_size =  len(tokenizer.word_index) + 1
    embedding_matrix = np.random.normal(
        scale=0.01,
        size=(vocab_size, embedding_dim)
    ).astype('float32')

    
    for word, idx in tokenizer.word_index.items():
        if idx == 0 or idx >= vocab_size:
            continue
        vec = embeddings_index.get(word)
        if vec is not None:
            embedding_matrix[idx] = vec

    embedding_layer = Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        mask_zero=mask_zero,
        trainable=trainable,
        name='pretrained_embedding'
    )
    return embedding_layer


## Phase One

In [None]:
MAX_Q_LEN = max(len(s) for s in q_seqs)
MAX_A_LEN = max(len(s) for s in a_seqs)
MAX_C_LEN   = max(len(s) for s in trunc_c_seqs)
MAX_ENCODER_LEN = max(len(s) for s in q_seqs) + 1 + MAX_C_TRUNC
VOCAB_SIZE  = 20000
EMB_DIM     = 100
UNITS       = 128
BATCH_SIZE  = 64

In [None]:
q_padded = pad_sequences(q_seqs, maxlen=MAX_Q_LEN, padding='post', truncating='post')
a_padded = pad_sequences(a_seqs, maxlen=MAX_A_LEN, padding='post', truncating='post')

In [None]:
EMB_DIM      = 100
GLOVE_PATH   = prepare_glove()

embedding_layer = create_embedding_layer(
    tokenizer=tokenizer_phase_1,
    glove_path=GLOVE_PATH,
    embedding_dim=EMB_DIM,
    mask_zero=True,      
    trainable=False      
)

In [None]:
decoder_input  = a_padded[:, :-1]
decoder_target = a_padded[:, 1:]

Xq_tr, Xq_val, Din_tr, Din_val, Dt_tr, Dt_val = train_test_split(
    q_padded, decoder_input, decoder_target,
    test_size=0.1, random_state=42
)

def make_ds(q, d_in, d_tar, batch_size=64):
    mask = tf.cast(tf.not_equal(d_tar, 0), tf.float32)
    ds = tf.data.Dataset.from_tensor_slices(
        ((q, d_in), d_tar, mask)
    )
    return ds.shuffle(2000).batch(batch_size).prefetch(1)

train_ds = make_ds(Xq_tr, Din_tr, Dt_tr, batch_size=BATCH_SIZE)
val_ds   = make_ds(Xq_val, Din_val, Dt_val, batch_size=BATCH_SIZE)

In [None]:
print("Train batches:", tf.data.experimental.cardinality(train_ds).numpy())
print("Val batches:", tf.data.experimental.cardinality(val_ds).numpy())

**Building the model**

In [None]:
class Seq2SeqLSTM(tf.keras.Model):
    def __init__(self,vocab_size,emb_dim,units,max_q_len,max_a_len,embedding_matrix=None,pad_token_id=0,**kwargs):
        super().__init__(**kwargs)
        self.pad_token_id = pad_token_id


        if embedding_matrix is not None:
            self.embedding = Embedding(vocab_size, emb_dim,weights=[embedding_matrix],trainable=False,mask_zero=True)
        else:
            self.embedding = Embedding(vocab_size, emb_dim,mask_zero=True)

        #units is the vector size of the hidden state
        #return_state if true returns the final h and c
        self.encoder_lstm = LSTM(units, return_state=True, name='encoder_lstm')

        #return sequence returns all the hidden states from h_1 to h_n
        #return sequence is for evaluation
        #return state is for inference because after each token generated we need to feed the model the states again
        self.decoder_lstm = LSTM(units,return_sequences=True,return_state=True, name='decoder_lstm')

        #the layer needed to predict the next word
        self.dense = Dense(vocab_size, activation='softmax', name='decoder_dense')

    def call(self, inputs, training=False):
        encoder_inputs, decoder_inputs = inputs

        x_enc = self.embedding(encoder_inputs)
        _, state_h, state_c = self.encoder_lstm(x_enc, training=training)
        encoder_states = [state_h, state_c]

        x_dec = self.embedding(decoder_inputs)
        dec_outputs, _, _ = self.decoder_lstm(x_dec, initial_state=encoder_states, training=training)
        return self.dense(dec_outputs)
    


In [None]:
model = Seq2SeqLSTM( vocab_size=vocab_size,
                    emb_dim=100,units=128,
                     max_q_len=MAX_Q_LEN,max_a_len=MAX_A_LEN,embedding_matrix=embedding_matrix,pad_token_id=0)

dummy_q = tf.zeros((1, MAX_Q_LEN), dtype=tf.int32)
dummy_a = tf.zeros((1, MAX_A_LEN-1), dtype=tf.int32)
_ = model((dummy_q, dummy_a))


In [None]:
model.compile(
  optimizer='adam',
  loss='sparse_categorical_crossentropy',
  metrics=[tf.keras.metrics.SparseCategoricalAccuracy()]
)
model.summary()

In [None]:
# !mkdir /content/drive/MyDrive/models

In [None]:
# MODEL_DIR = get_model_dir()
# CHECKPOINT_PATH = os.path.join(MODEL_DIR, 'seq2seq_lstm_best.keras')

# checkpoint_cb = ModelCheckpoint(
#     filepath=CHECKPOINT_PATH,
#     monitor='val_sparse_categorical_accuracy',
#     save_best_only=True,
#     mode='max',
#     verbose=1
# )
# history = model.fit(
#     train_ds,
#     validation_data=val_ds,
#     epochs=EPOCHS,
#     callbacks=[checkpoint_cb]
# )
# print(f"Best model will be saved to: {CHECKPOINT_PATH}")

## Phase 2 using a transformer

In [310]:
EMB_DIM      = 100
GLOVE_PATH   = prepare_glove()

embedding_layer = create_embedding_layer(
    tokenizer=tokenizer_phase_2,
    glove_path=GLOVE_PATH,
    embedding_dim=EMB_DIM,
    mask_zero=False,      
    trainable=False      
)

In [311]:
PAD_ID = 0  
BATCH_SIZE= 64
def make_ds(enc, decin, dectar, batch_size=BATCH_SIZE):
    
    ds = tf.data.Dataset.from_tensor_slices(((enc, decin), dectar))
    
    
    def add_sample_weight(inputs, target):
        weights = tf.cast(tf.not_equal(target, PAD_ID), tf.float32)
        return inputs, target, weights
    
    return (
        ds
        .shuffle(2000)
        .map(add_sample_weight, num_parallel_calls=tf.data.AUTOTUNE)
        .batch(batch_size)
        .prefetch(tf.data.AUTOTUNE)
    )

train_ds = make_ds(enc_tr, decin_tr, dectar_tr)
val_ds   = make_ds(enc_val, decin_val, dectar_val)

print("   • train batches:", tf.data.experimental.cardinality(train_ds).numpy())
print("   • val batches:  ", tf.data.experimental.cardinality(val_ds).numpy())


   • train batches: 141
   • val batches:   16


In [312]:
class PositionalEmbedding(layers.Layer):
    def __init__(self, max_len: int, embed_dim: int, **kwargs):
        super().__init__(**kwargs)
        # self.supports_masking = True
        self.max_len   = max_len
        self.embed_dim = embed_dim

        pos = np.arange(max_len)[:, np.newaxis]                 
        dim = np.arange(embed_dim)[np.newaxis, :]                
        angle_rates = 1.0 / np.power(10000.0, (2 * (dim//2)) / embed_dim)
        angle_rads  = pos * angle_rates                          
        angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])        
        angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])        
        self.pos_encoding = tf.constant(angle_rads[np.newaxis, ...], dtype=tf.float32)
        

    def call(self, x):
        seq_len = tf.shape(x)[1]
        return x + self.pos_encoding[:, :seq_len, :]

    def get_config(self):
        config = super().get_config()
        config.update({
            "max_len": self.max_len,
            "embed_dim": self.embed_dim,
        })
        return config

In [313]:
class Encoder(layers.Layer):
    def __init__(self,embed_dim: int, num_heads: int, ff_dim: int, **kwargs):
        super().__init__(**kwargs)
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads,
            key_dim=embed_dim
        )
        self.layer1 = layers.LayerNormalization()
        self.layer2 = layers.LayerNormalization()
        self.ffn =  tf.keras.Sequential([
            layers.Dense(ff_dim, activation="relu"),
            layers.Dense(embed_dim),
        ])

    
    def call(self,pos_matrix, padding_mask, **kwargs):
        att_out= self.attention(query=pos_matrix, value=pos_matrix, key=pos_matrix,attention_mask=padding_mask)
        norm1 = self.layer1(att_out+pos_matrix)
        ff_out = self.ffn(norm1)
        return self.layer2(ff_out)

    def get_config(self):
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "ff_dim": self.ff_dim,
        })
        return config

        

In [314]:
class Decoder(layers.Layer):
    def __init__(self, embed_dim: int, num_heads: int, ff_dim: int, dropout=0.1, **kwargs):
        super().__init__(**kwargs)
        # no supports_masking = True
        self.self_mha  = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.cross_mha = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn       = tf.keras.Sequential([
            layers.Dense(ff_dim, activation="relu"),
            layers.Dense(embed_dim),
        ])
        self.norm1 = layers.LayerNormalization()
        self.norm2 = layers.LayerNormalization()
        self.norm3 = layers.LayerNormalization()

    def call(self, x, enc_out,
             look_ahead_mask=None,
             padding_mask=None,
             training=False):
        # 1) Decoder self‑attention with your 3D look‑ahead+pad mask
        att1 = self.self_mha(
            query=x, value=x, key=x,
            attention_mask=look_ahead_mask,
            training=training
        )
        out1 = self.norm1(x + att1, training=training)

        
        att2 = self.cross_mha(
            query=out1, value=enc_out, key=enc_out,
            attention_mask=padding_mask,
            training=training
        )
        out2 = self.norm2(out1 + att2, training=training)

        
        ffn_out = self.ffn(out2, training=training)
        return self.norm3(out2 + ffn_out, training=training)

    def get_config(self):
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "ff_dim": self.ff_dim,
            "embedding_layer": serialize_keras_object(self.token_emb)
            
        })
        return config

    @classmethod
    def from_config(cls, config):
        
        emb_conf = config.pop("embedding_layer")
        
        embedding_layer = deserialize_keras_object(
            emb_conf,
            module_objects=globals(),       # globals() now contains Embedding
            custom_objects=None
        )
        return cls(embedding_layer=embedding_layer, **config)


In [315]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.utils import serialize_keras_object, deserialize_keras_object
from tensorflow.keras.layers import Embedding

class Seq2SeqTransformer(tf.keras.Model):
    def __init__(
        self,
        vocab_size,
        embed_dim,
        num_heads,
        ff_dim,
        max_enc_in_len,
        max_a_len,
        embedding_layer,
        pad_token_id=0,
        **kwargs
    ):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.max_enc_in_len = max_enc_in_len
        self.max_a_len = max_a_len
        self.pad_token_id = pad_token_id

        if embedding_layer is None:
            raise ValueError("`embedding_layer` must be provided")
        self.token_emb = embedding_layer
        self.pos_emb_enc = PositionalEmbedding(max_enc_in_len, embed_dim)
        self.pos_emb_dec = PositionalEmbedding(max_a_len, embed_dim)
        self.encoder1 = Encoder(embed_dim, num_heads, ff_dim)
        self.encoder2 = Encoder(embed_dim, num_heads, ff_dim)
        self.decoder1 = Decoder(embed_dim, num_heads, ff_dim)
        self.decoder2 = Decoder(embed_dim, num_heads, ff_dim)
        self.final_dense = layers.Dense(vocab_size, activation="softmax")

    def create_padding_mask(self, seq):
        mask = tf.equal(seq, self.pad_token_id)
        return mask[:, None, None, :]

    def create_look_ahead_mask(self, size):
        return 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)

    def call(self, inputs, training=False):
        enc_seq, dec_seq = inputs
        dec_len = tf.shape(dec_seq)[1]
        enc_pad = self.create_padding_mask(enc_seq)
        dec_pad_bool = tf.equal(dec_seq, self.pad_token_id)
        look2d = tf.cast(self.create_look_ahead_mask(dec_len), tf.bool)
        dec_pad_3d = dec_pad_bool[:, None, :]
        look3d = look2d[None, :, :]
        self_attn = tf.logical_or(dec_pad_3d, look3d)

        enc_x = self.pos_emb_enc(self.token_emb(enc_seq))
        x = self.encoder1(enc_x, padding_mask=enc_pad, training=training)
        x = self.encoder2(x, padding_mask=enc_pad, training=training)

        dec_x = self.pos_emb_dec(self.token_emb(dec_seq))
        y = self.decoder1(
            dec_x,
            x,
            look_ahead_mask=self_attn,
            padding_mask=enc_pad,
            training=training
        )
        y = self.decoder2(
            y,
            x,
            look_ahead_mask=self_attn,
            padding_mask=enc_pad,
            training=training
        )
        return self.final_dense(y)

    def get_config(self):
        config = super().get_config()
        config.update({
            "vocab_size": self.vocab_size,
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "ff_dim": self.ff_dim,
            "max_enc_in_len": self.max_enc_in_len,
            "max_a_len": self.max_a_len,
            "pad_token_id": self.pad_token_id,
            "embedding_layer": serialize_keras_object(self.token_emb),
        })
        return config

    @classmethod
    def from_config(cls, config):
        emb_conf = config.pop("embedding_layer")
        embedding_layer = deserialize_keras_object(
            emb_conf,
            module_objects=globals(),
            custom_objects={"Embedding": Embedding}
        )
        return cls(embedding_layer=embedding_layer, **config)


In [316]:
model = Seq2SeqTransformer(
    vocab_size=tokenizer_phase_2.num_words,
    embed_dim=100,
    num_heads=8,
    ff_dim=512,
    max_enc_in_len=MAX_ENCODER_LEN,
    max_a_len=MAX_A_LEN - 1, 
    embedding_layer=embedding_layer,
    pad_token_id=0,
)

### Custom Evaluation metrics

In [317]:
def normalize_answer(s):
    """Lower, strip punctuation/extra whitespace, collapse to tokens."""
    s = s.lower()
    s = re.sub(f"[{re.escape(string.punctuation)}]", " ", s)
    s = " ".join(s.split())
    return s

def exact_match_score(pred, truth):
    return int(normalize_answer(pred) == normalize_answer(truth))

def f1_score(pred, truth):
    pred_tokens = normalize_answer(pred).split()
    truth_tokens = normalize_answer(truth).split()
    if not pred_tokens or not truth_tokens:
        return int(pred_tokens == truth_tokens)
  
    common = {}
    for t in pred_tokens:
        common[t] = common.get(t, 0) + 1
    same = 0
    for t in truth_tokens:
        if common.get(t, 0) > 0:
            same += 1
            common[t] -= 1
    if same == 0:
        return 0.0
    prec = same / len(pred_tokens)
    rec  = same / len(truth_tokens)
    return 2 * prec * rec / (prec + rec)




In [318]:

def decode_batch(seqs, tokenizer, pad_id=0, sos_id=None, eos_id=None):
    texts = tokenizer.sequences_to_texts(seqs)
    clean_texts = []
    for txt in texts:
        tokens = txt.split()
        tokens = [
            t for t in tokens
            if t not in {tokenizer.index_word.get(pad_id, ""), 
                         tokenizer.index_word.get(sos_id, ""), 
                         tokenizer.index_word.get(eos_id, "")}
        ]
        clean_texts.append(" ".join(tokens))
    return clean_texts

class QAEvalCallback(tf.keras.callbacks.Callback):
    def __init__(self, val_ds, tokenizer, pad_id=0, sos_id=None, eos_id=None):
        super().__init__()
        self.val_ds    = val_ds
        self.tokenizer = tokenizer
        self.pad_id    = pad_id
        self.sos_id    = sos_id
        self.eos_id    = eos_id

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        total_em = 0.0
        total_f1 = 0.0
        count    = 0

        for (enc_batch, dec_in_batch), dec_tar_batch, _ in self.val_ds:
            logits = self.model((enc_batch, dec_in_batch), training=False)
            pred_ids = tf.argmax(logits, axis=-1).numpy()

            preds = decode_batch(pred_ids,
                                 tokenizer=self.tokenizer,
                                 pad_id=self.pad_id,
                                 sos_id=self.sos_id,
                                 eos_id=self.eos_id)
            reals = decode_batch(dec_tar_batch.numpy(),
                                 tokenizer=self.tokenizer,
                                 pad_id=self.pad_id,
                                 sos_id=self.sos_id,
                                 eos_id=self.eos_id)

            for p_str, r_str in zip(preds, reals):
                total_em += exact_match_score(p_str, r_str)
                total_f1 += f1_score(p_str, r_str)
                count   += 1

        val_em = 100.0 * total_em / count
        val_f1 = 100.0 * total_f1 / count
        logs["val_em"] = val_em
        logs["val_f1"] = val_f1
        print(f" — val_EM: {val_em:.2f}%  — val_F1: {val_f1:.2f}%")


### Model training

In [319]:
dummy_enc = tf.zeros((1, MAX_ENCODER_LEN), dtype=tf.int32)
dummy_dec = tf.zeros((1,MAX_A_LEN - 1 ), dtype=tf.int32)
_ = model((dummy_enc, dummy_dec), training=False)
model.summary()


model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=[]    
)


In [None]:
MODEL_DIR       = get_model_dir()
CHECKPOINT_PATH = os.path.join(MODEL_DIR, 'best_transformer_1.keras')

checkpoint_cb = ModelCheckpoint(
    filepath=CHECKPOINT_PATH,
    monitor='val_f1',      
    save_best_only=True,
    mode='max',
    verbose=1
)

qa_eval_cb = QAEvalCallback(
    val_ds,
    tokenizer_phase_2,
    pad_id=pad_id,
    sos_id=sos_id,
    eos_id=eos_id
)
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=10,
    callbacks=[qa_eval_cb,checkpoint_cb]
)

print(f"Best model will be saved to: {CHECKPOINT_PATH}")


In [322]:
best_model = load_model(
    '/kaggle/working/models/best_transformer_1.keras',
    custom_objects={
        "Seq2SeqTransformer": Seq2SeqTransformer,
        "PositionalEmbedding": PositionalEmbedding,
        "Encoder": Encoder,
        "Decoder": Decoder,
        "Embedding": Embedding,   
    }
)

best_model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=[]
)


history = best_model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=4,
    callbacks=[qa_eval_cb, checkpoint_cb]
)

Epoch 1/4
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 175ms/step - loss: 0.0131 — val_EM: 0.00%  — val_F1: 10.74%

Epoch 1: val_f1 did not improve from 11.21321
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 224ms/step - loss: 0.0132 - val_loss: 0.4020 - val_em: 0.0000e+00 - val_f1: 10.7399
Epoch 2/4
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 124ms/step - loss: 0.0126 — val_EM: 0.00%  — val_F1: 11.47%

Epoch 2: val_f1 improved from 11.21321 to 11.46981, saving model to /kaggle/working/models/best_transformer_1.keras
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 155ms/step - loss: 0.0126 - val_loss: 0.4201 - val_em: 0.0000e+00 - val_f1: 11.4698
Epoch 3/4
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 124ms/step - loss: 0.0074 — val_EM: 0.00%  — val_F1: 11.02%

Epoch 3: val_f1 did not improve from 11.46981
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 150ms/step -

In [None]:


print("  • # train batches:", tf.data.experimental.cardinality(train_ds).numpy())
print("  • # val   batches:", tf.data.experimental.cardinality(val_ds).numpy())


for (enc_batch, decin_batch), dec_tar_batch, weight_batch in train_ds.take(1):
    print("Encoder shape:", enc_batch.shape)       # (B, enc_len)
    print("Decoder‑in shape:", decin_batch.shape)  # (B, ans_len)
    print("Decoder‑tar shape:", dec_tar_batch.shape)
    print("Weights shape:", weight_batch.shape)
    
    enc_ids = enc_batch[0].numpy().tolist()
    decin_ids = decin_batch[0].numpy().tolist()
    detar_ids = dec_tar_batch[0].numpy().tolist()
    
    
    enc_ids = [i for i in enc_ids if i != pad_id]
    decin_ids = [i for i in decin_ids if i not in (pad_id, sos_id, eos_id)]
    detar_ids = [i for i in detar_ids if i not in (pad_id, sos_id, eos_id)]
    
    q_c_text = tokenizer_phase_2.sequences_to_texts([enc_ids])[0]
    inp_ans  = tokenizer_phase_2.sequences_to_texts([decin_ids])[0]
    tar_ans  = tokenizer_phase_2.sequences_to_texts([detar_ids])[0]
    
    print("\nSample #1")
    print("  Q+SEP+C →", q_c_text[:200] + "…")
    print("  Decoder‑in →", inp_ans)
    print("  Decoder‑tar→", tar_ans)
    break


In [202]:

def show_errors_greedy(model, dataset, tokenizer, max_encoder_len, max_answer_len, sep_id, pad_id, sos_id, eos_id, n=5):
    it = dataset.as_numpy_iterator()
    samples = random.sample(list(it), n)
    for (enc, _), dec_tar, _ in samples:
        enc_row = [i for i in enc[0].tolist() if i != pad_id]
        sep_index = enc_row.index(sep_id)
        q_ids = enc_row[:sep_index]
        c_ids = enc_row[sep_index+1:]
        q_txt = tokenizer.sequences_to_texts([q_ids])[0]
        c_txt = tokenizer.sequences_to_texts([c_ids])[0]
        dec_input = [sos_id]
        for _ in range(max_answer_len):
            dec_pad = pad_sequences([dec_input], maxlen=max_answer_len, padding="post", value=pad_id)
            logits = model((enc, dec_pad), training=False)
            next_id = int(tf.argmax(logits[0, len(dec_input)-1]).numpy())
            if next_id == eos_id:
                break
            dec_input.append(next_id)
        pred_ids = [i for i in dec_input[1:] if i not in (pad_id, sos_id, eos_id)]
        true_ids = [i for i in dec_tar[0].tolist() if i not in (pad_id, sos_id, eos_id)]
        pred_txt = tokenizer.sequences_to_texts([pred_ids])[0]
        true_txt = tokenizer.sequences_to_texts([true_ids])[0]
        print(f"Q:  {q_txt!r}")
        print(f"C:  {c_txt[:200]!r}…")
        print(f"GT → {true_txt!r}")
        print(f"PR → {pred_txt!r}")
        print("-"*80)

show_errors_greedy(model, val_ds, tokenizer_phase_2, MAX_ENCODER_LEN, MAX_A_LEN-1, sep_id, pad_id, sos_id, eos_id, n=5)


Q:  'which criteria is used to rank the clubs'
C:  'there are 20 clubs in the premier league during the course of a season from august to may each club plays the others twice a double round robin system once at their home stadium and once at that of th'…
GT → 't'
PR → ''
--------------------------------------------------------------------------------
Q:  'who coordinates the study program of samskritam as a foreign language'
C:  'st james junior school in london england offers sanskrit as part of the curriculum in the united states since september 2009 high school students have been able to receive credits as independent study'…
GT → 's'
PR → ''
--------------------------------------------------------------------------------
Q:  'when did taiwanese hokkien have a fast change in development'
C:  'in the 1990s marked by the liberalization of language development and mother tongue movement in taiwan taiwanese hokkien had undergone a fast pace in its development in 1993 taiwan became the f

In [326]:
import pandas as pd
from collections import Counter

# Suppose decoder_targets is your unbatched numpy array of shape (N, seq_len)
# You padded to MAX_A_LEN, so decoder_targets exists as detar before train_test_split.
all_answers = []
for seq in decoder_targets[:500]:
    clean_ids = [i for i in seq if i not in (pad_id, sos_id, eos_id)]
    text = tokenizer_phase_2.sequences_to_texts([clean_ids])[0]
    all_answers.append(text)

answer_counts = Counter(all_answers)
df_top = pd.DataFrame(answer_counts.items(), columns=['answer', 'count']) \
    .sort_values('count', ascending=False) \
    .reset_index(drop=True)

print(df_top.head(20))


                                               answer  count
0                                                  45      2
1                                                1988      2
2                                               1 000      2
3                                               india      2
4                                           the bible      2
5                                                2010      2
6                                               eight      2
7                                            couplets      1
8                digitize and offer nara video online      1
9                          banking financial services      1
10            pollen either fails to reach the stigma      1
11            the communist party of the soviet union      1
12                                               rich      1
13                                      louis agassiz      1
14                                                 56      1
15                      

## Post Processing

In [352]:
def greedy_decode_contextual(model,
                             question: str,
                             context: str,
                             tokenizer,
                             max_encoder_len: int,
                             max_answer_len: int,
                             sep_token_id: int,
                             pad_token_id: int,
                             sos_token_id: int,
                             eos_token_id: int):
    # 1) build encoder input
    q_ids = tokenizer.texts_to_sequences([question])[0]
    c_ids = tokenizer.texts_to_sequences([context])[0]
    enc_seq = q_ids + [sep_token_id] + c_ids
    enc_input = pad_sequences(
        [enc_seq],
        maxlen=max_encoder_len,
        padding='post',
        truncating='post',
        value=pad_token_id
    )

    # 2) start decoding
    dec_input = [sos_token_id]

    # only context tokens + PAD are allowed at first step
    allowed_set = set(c_ids) | {pad_token_id}

    for t in range(max_answer_len):
        # after first token, allow EOS too
        if t > 0:
            allowed_set.add(eos_token_id)

        # build decoder input
        dec_pad = pad_sequences(
            [dec_input],
            maxlen=max_answer_len,
            padding='post',
            truncating='post',
            value=pad_token_id
        )

        # 3) run model and grab the logits for the last position
        preds = model((enc_input, dec_pad), training=False)          # shape (1, T, V)
        last_logits = preds[0, len(dec_input) - 1]                  # shape (V,)

        # 4) mask out everything but allowed_set
        probs = tf.nn.softmax(last_logits).numpy()
        mask = np.zeros_like(probs, dtype=bool)
        mask[list(allowed_set)] = True
        masked_probs = probs * mask

        # 5) pick next token
        next_id = int(np.argmax(masked_probs))
        if next_id == eos_token_id:
            break

        dec_input.append(next_id)

    # 6) convert IDs back to tokens
    decoded = [tokenizer.index_word.get(i, "") for i in dec_input[1:]]
    return " ".join(decoded)


# usage
sos_id = tokenizer_phase_2.word_index["[SOS]"]
eos_id = tokenizer_phase_2.word_index["[EOS]"]
sep_id = tokenizer_phase_2.word_index["[SEP]"]
pad_id = 0

question = "What is the capital of France?"
context  = ("France is a country in Western Europe. "
            "Its capital city is Paris, known for the Eiffel Tower.")

answer = greedy_decode_contextual(
    best_model,
    question,
    context,
    tokenizer_phase_2,
    max_encoder_len=MAX_ENCODER_LEN,
    max_answer_len=MAX_A_LEN-1,
    sep_token_id=sep_id,
    pad_token_id=pad_id,
    sos_token_id=sos_id,
    eos_token_id=eos_id
)

print("Answer:", answer)


Answer: a


In [328]:
question = "What is my name?"
context = "My name is ahmed and please my name is ahmed"

answer2 = greedy_decode(
    best_model,
    question,
    context,
    tokenizer_phase_2,
    max_encoder_len=MAX_ENCODER_LEN,
    max_answer_len=MAX_A_LEN-1,
    sep_token_id=sep_id,
    pad_token_id=pad_id,
    sos_token_id=sos_id,
    eos_token_id=eos_id
)
print("Answer: ", answer2 )

Answer:  bandurria


## Inference using the best model

In [337]:
import evaluate
from datasets import load_dataset
from tqdm.auto import tqdm

dev_ds = load_dataset("squad_v2", split="validation")

metric = evaluate.load("squad_v2")


Downloading builder script:   0%|          | 0.00/6.47k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/11.3k [00:00<?, ?B/s]

In [338]:
predictions = []
references  = []

for ex in tqdm(dev_ds, desc="Decoding & collecting"):
    pred = greedy_decode(
        best_model,
        ex["question"],
        ex["context"],
        tokenizer_phase_2,
        max_encoder_len=MAX_ENCODER_LEN,
        max_answer_len=MAX_A_LEN-1,
        sep_token_id=sep_id,
        pad_token_id=pad_id,
        sos_token_id=sos_id,
        eos_token_id=eos_id,
    )
    predictions.append({"id": ex["id"], "prediction_text": pred})
    references.append({"id": ex["id"], "answers": ex["answers"]})

results = metric.compute(predictions=predictions, references=references)
print(f"Exact Match = {results['exact']:.2f}%")
print(f"F1 Score     = {results['f1']:.2f}%")


Decoding & collecting:   0%|          | 0/11873 [00:00<?, ?it/s]

KeyError: 'no_answer_probability'

In [341]:
for pred in predictions:
    pred.setdefault("no_answer_probability", 0.0)

In [342]:
results = metric.compute(predictions=predictions, references=references)
print(f"Exact Match = {results['exact']:.2f}%")
print(f"F1 Score    = {results['f1']:.2f}%")

Exact Match = 0.00%
F1 Score    = 0.00%


In [355]:
predictions[0:5]

[{'id': '56ddde6b9a695914005b9628',
  'prediction_text': 'bandurria',
  'no_answer_probability': 0.0},
 {'id': '56ddde6b9a695914005b9629',
  'prediction_text': 'bandurria',
  'no_answer_probability': 0.0},
 {'id': '56ddde6b9a695914005b962a',
  'prediction_text': 'bandurria',
  'no_answer_probability': 0.0},
 {'id': '56ddde6b9a695914005b962b',
  'prediction_text': 'bandurria',
  'no_answer_probability': 0.0},
 {'id': '56ddde6b9a695914005b962c',
  'prediction_text': 'bandurria',
  'no_answer_probability': 0.0}]