# Milestone 2

In [92]:
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import pandas as pd
import os, zipfile , json , random, requests
import re
from pathlib import Path
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Input, LSTM, Dense
from tensorflow.keras.models import Model
from tensorflow.keras import layers

## Explorting dataset:

In [2]:

def is_kaggle():
    # Kaggle kernels always set this env var
    return 'KAGGLE_URL_BASE' in os.environ

def is_colab():
    return (not is_kaggle()) and os.path.exists('/content')

def maybe_mount_drive():
    if is_colab():
        from google.colab import drive
        if not os.path.isdir('/content/drive'):
            drive.mount('/content/drive')

def get_data_path():
    if is_kaggle():
        return '/kaggle/input/squad-2-0/'
    elif is_colab():
        return '/content/drive/MyDrive/SQuAD'
    else:
        return './data/'
def get_model_dir():
    if is_colab():
        model_dir = '/content/drive/MyDrive/models'
    elif is_kaggle():
        model_dir = '/kaggle/working/models'
    else:
        model_dir = './models'
    os.makedirs(model_dir, exist_ok=True)
    return model_dir

In [3]:
dataset_dir = get_data_path()
maybe_mount_drive()
os.makedirs(dataset_dir, exist_ok=True)

In [4]:
file_path = os.path.join(dataset_dir, 'train-v2.0.json')

In [5]:
with open(file_path, 'r', encoding='utf-8') as f:
    squad = json.load(f)

In [6]:
records = []
for article in squad['data']:
    for para in article['paragraphs']:
        ctx = para['context']
        for qa in para['qas']:
            answers = [a['text'] for a in qa.get('answers', [])]
            starts  = [a['answer_start'] for a in qa.get('answers', [])]
            ends    = [s + len(t) for s,t in zip(starts, answers)]
            records.append({
                'question': qa['question'],
                'answers': answers,
                'context': ctx,
                'answer_start': starts,
                'answer_end': ends
            })



In [7]:
df = pd.DataFrame(records)
df.head()

Unnamed: 0,question,answers,context,answer_start,answer_end
0,When did Beyonce start becoming popular?,[in the late 1990s],Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,[269],[286]
1,What areas did Beyonce compete in when she was...,[singing and dancing],Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,[207],[226]
2,When did Beyonce leave Destiny's Child and bec...,[2003],Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,[526],[530]
3,In what city and state did Beyonce grow up?,"[Houston, Texas]",Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,[166],[180]
4,In which decade did Beyonce become famous?,[late 1990s],Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,[276],[286]


In [8]:
print("Total QA pairs:", len(df))

Total QA pairs: 130319


In [9]:
#shuffling
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# only working on subset of 15k row
df_subset = df.head(15000).copy().reset_index(drop=True)

print("Subset size:", df_subset.shape)
df_subset.head()

Subset size: (15000, 5)


Unnamed: 0,question,answers,context,answer_start,answer_end
0,What year did the global recession that follow...,[2012],It threatened the collapse of large financial ...,[481],[485]
1,what was a popular club in ibiza that started ...,[Amnesia],"But house was also being developed on Ibiza,[c...",[251],[258]
2,In what century did Martin Luther honor Mary a...,[],Although Calvin and Huldrych Zwingli honored M...,[],[]
3,What is the climate like?,[varies from hot and subhumid tropical],"Due to extreme variation in elevation, great v...",[115],[152]
4,How many times has the Queen toured Canada?,[],The Queen addressed the United Nations for a s...,[],[]


## Data Cleaning

Dropping rows where answers are empty

In [10]:
df_subset = df_subset[df_subset['answers'].map(len) > 0].reset_index(drop=True)
print("Rows remaining after drop:", len(df_subset))

Rows remaining after drop: 10020


Removing Extra Whitespaces

In [11]:
def collapse_whitespace(s):
    if isinstance(s, str):
        return re.sub(r'\s+', ' ', s.strip())
    return s

In [12]:
for col in ['question', 'context', 'answers']:
    if col in df_subset.columns:
        df_subset[col] = df_subset[col].apply(collapse_whitespace)

**Lets explore the length of the sequences which will determine some hyperparameters in training the models**

In [13]:
df_subset['question'].str.len().max()

203

In [14]:
df_subset['context'].str.len().max()

3706

We just turn the array of the answers to a string since none have multiple answers

In [15]:
df_subset['answers']= df_subset['answers'].apply(lambda x: x[0])

In [16]:
df_subset['answers'].str.len().max()

202

## Embeddings

In [17]:
!pip install --quiet gensim

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.6/38.6 MB[0m [31m44.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tsfresh 0.21.0 requires scipy>=1.14.0; python_version >= "3.10", but you have scipy 1.13.1 which is incompatible.
nilearn 0.11.1 requires scikit-learn>=1.4.0, but you have scikit-learn 1.2.2 which is incompatible.
bigframes 1.36.0 requires rich<14,>=12.4.4, but you have rich 14.0.0 which is incompatible.
imbalanced-learn 0.13.0 requires scikit-learn<2,>=1.3.2, but you have scikit-learn 1.2.2 which is incompatible.
plotnine 0.14.5 requires matplotlib>=3.8.0, but you have matplotlib 3.7.5 which is incompatible.
mlxtend 0.23.4 requires scikit-learn>=1.3

In [18]:
all_texts = (
    df_subset['question'].tolist() +
    df_subset['context'].tolist() +
    df_subset['answers'].tolist()
)
tokenizer = Tokenizer(
    num_words=20000,
    oov_token='[UNK]',
    filters='''!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'''
)
tokenizer.fit_on_texts(all_texts)

q_seqs = tokenizer.texts_to_sequences(df_subset['question'])
c_seqs = tokenizer.texts_to_sequences(df_subset['context'])
a_seqs = tokenizer.texts_to_sequences(df_subset['answers'])

In [19]:
vocab_size = len(tokenizer.word_index)
print("Total unique tokens:", vocab_size)

Total unique tokens: 57044


**Load gloVe dictionary**

In [20]:
def prepare_glove(target_dim=100, work_subdir='glove',
                  input_dataset_slug='glove6b',
                  download_url='http://nlp.stanford.edu/data/glove.6B.zip'):
    maybe_mount_drive()

    if is_kaggle():
        work_dir = f'/kaggle/working/{work_subdir}'
        uploaded_zip = f'/kaggle/input/{input_dataset_slug}/glove.6B.zip'
    elif is_colab():
        work_dir = f'/content/drive/MyDrive/{work_subdir}'
        uploaded_zip = None
    else:
        work_dir = f'./data/{work_subdir}'
        uploaded_zip = None

    os.makedirs(work_dir, exist_ok=True)

    target_file = f'glove.6B.{target_dim}d.txt'
    txt_path = os.path.join(work_dir, target_file)
    zip_path = os.path.join(work_dir, os.path.basename(download_url))

    if os.path.exists(txt_path):
        return txt_path

    if is_kaggle() and uploaded_zip and os.path.exists(uploaded_zip):
        zip_path = uploaded_zip
    else:
        if requests is None:
            raise RuntimeError("`requests` not available; offline mode")
        with requests.get(download_url, stream=True) as r, open(zip_path, 'wb') as f:
            r.raise_for_status()
            for chunk in r.iter_content(8192):
                f.write(chunk)

    with zipfile.ZipFile(zip_path, 'r') as z:
        z.extract(target_file, path=work_dir)

    if not os.path.exists(txt_path):
        raise RuntimeError(f"Failed to extract {target_file}")

    return txt_path

# Usage
glove_path = prepare_glove()
print("GloVe file:", glove_path)

GloVe file: /kaggle/working/glove/glove.6B.100d.txt


**Creating embeddings index (mapping words to vectors)**

In [21]:
embeddings_index = {}
with open(glove_path, 'r', encoding='utf-8') as f:
    for line in f:
        parts = line.rstrip().split(" ")
        word = parts[0]
        vec  = np.asarray(parts[1:], dtype='float32')
        embeddings_index[word] = vec

**Creating our look-up table (embedding matrix)**

In [22]:
vocab_size = 20000
emb_dim = 100
embedding_matrix = np.random.normal(size=(vocab_size, emb_dim)) * 0.01

In [23]:
for word, idx in tokenizer.word_index.items():
    if idx >= vocab_size:
        continue
    if word in embeddings_index:
        embedding_matrix[idx] = embeddings_index[word]

In [24]:
word = tokenizer.index_word[2]
print(word)
print(embedding_matrix[2])

the
[-0.038194   -0.24487001  0.72812003 -0.39961001  0.083172    0.043953
 -0.39140999  0.3344     -0.57545     0.087459    0.28786999 -0.06731
  0.30906001 -0.26383999 -0.13231    -0.20757     0.33395001 -0.33848
 -0.31742999 -0.48335999  0.1464     -0.37303999  0.34577     0.052041
  0.44946    -0.46970999  0.02628    -0.54154998 -0.15518001 -0.14106999
 -0.039722    0.28277001  0.14393     0.23464    -0.31020999  0.086173
  0.20397     0.52623999  0.17163999 -0.082378   -0.71787    -0.41531
  0.20334999 -0.12763     0.41367     0.55186999  0.57907999 -0.33476999
 -0.36559001 -0.54856998 -0.062892    0.26583999  0.30204999  0.99774998
 -0.80480999 -3.0243001   0.01254    -0.36941999  2.21670008  0.72201002
 -0.24978     0.92136002  0.034514    0.46744999  1.10790002 -0.19358
 -0.074575    0.23353    -0.052062   -0.22044     0.057162   -0.15806
 -0.30798    -0.41624999  0.37972     0.15006    -0.53211999 -0.20550001
 -1.25259995  0.071624    0.70564997  0.49744001 -0.42063001  0.2614

**Create embedding layer**

In [25]:
embedding_layer = Embedding(
    input_dim=vocab_size,
    output_dim=emb_dim,
    weights=[embedding_matrix],
    mask_zero=True,
    trainable=False,
    name='glove_embedding'
)

2025-04-20 21:44:31.294619: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


## Phase One

In [26]:
MAX_Q_LEN   = df_subset['question'].str.len().max()
MAX_A_LEN   = df_subset['answers'].str.len().max()
VOCAB_SIZE  = 20000
EMB_DIM     = embedding_matrix.shape[1]
UNITS       = 128
BATCH_SIZE  = 64
EPOCHS      = 30

In [27]:
q_padded = pad_sequences(q_seqs, maxlen=MAX_Q_LEN, padding='post', truncating='post')
a_padded = pad_sequences(a_seqs, maxlen=MAX_A_LEN, padding='post', truncating='post')

In [28]:
decoder_input  = a_padded[:, :-1]
decoder_target = a_padded[:, 1:]

Xq_tr, Xq_val, Din_tr, Din_val, Dt_tr, Dt_val = train_test_split(
    q_padded, decoder_input, decoder_target,
    test_size=0.1, random_state=42
)

def make_ds(q, d_in, d_tar, batch_size=64):
    mask = tf.cast(tf.not_equal(d_tar, 0), tf.float32)
    ds = tf.data.Dataset.from_tensor_slices(
        ((q, d_in), d_tar, mask)
    )
    return ds.shuffle(2000).batch(batch_size).prefetch(1)

train_ds = make_ds(Xq_tr, Din_tr, Dt_tr, batch_size=BATCH_SIZE)
val_ds   = make_ds(Xq_val, Din_val, Dt_val, batch_size=BATCH_SIZE)

In [29]:
print("Train batches:", tf.data.experimental.cardinality(train_ds).numpy())
print("Val   batches:", tf.data.experimental.cardinality(val_ds).numpy())

Train batches: 141
Val   batches: 16


**Building the model**

In [30]:
class Seq2SeqLSTM(tf.keras.Model):
    def __init__(self,vocab_size,emb_dim,units,max_q_len,max_a_len,embedding_matrix=None,pad_token_id=0,**kwargs):
        super().__init__(**kwargs)
        self.pad_token_id = pad_token_id


        if embedding_matrix is not None:
            self.embedding = Embedding(vocab_size, emb_dim,weights=[embedding_matrix],trainable=False,mask_zero=True)
        else:
            self.embedding = Embedding(vocab_size, emb_dim,mask_zero=True)

        #units is the vector size of the hidden state
        #return_state if true returns the final h and c
        self.encoder_lstm = LSTM(units, return_state=True, name='encoder_lstm')

        #return sequence returns all the hidden states from h_1 to h_n
        #return sequence is for evaluation
        #return state is for inference because after each token generated we need to feed the model the states again
        self.decoder_lstm = LSTM(units,return_sequences=True,return_state=True, name='decoder_lstm')

        #the layer needed to predict the next word
        self.dense = Dense(vocab_size, activation='softmax', name='decoder_dense')

    def call(self, inputs, training=False):
        encoder_inputs, decoder_inputs = inputs

        x_enc = self.embedding(encoder_inputs)
        _, state_h, state_c = self.encoder_lstm(x_enc, training=training)
        encoder_states = [state_h, state_c]

        x_dec = self.embedding(decoder_inputs)
        dec_outputs, _, _ = self.decoder_lstm(x_dec, initial_state=encoder_states, training=training)
        return self.dense(dec_outputs)


In [31]:
model = Seq2SeqLSTM( vocab_size=vocab_size,
                    emb_dim=100,units=128,
                     max_q_len=MAX_Q_LEN,max_a_len=MAX_A_LEN,embedding_matrix=embedding_matrix,pad_token_id=0)

dummy_q = tf.zeros((1, MAX_Q_LEN), dtype=tf.int32)
dummy_a = tf.zeros((1, MAX_A_LEN-1), dtype=tf.int32)
_ = model((dummy_q, dummy_a))


In [32]:
model.compile(
  optimizer='adam',
  loss='sparse_categorical_crossentropy',
  metrics=[tf.keras.metrics.SparseCategoricalAccuracy()]
)
model.summary()

In [33]:
!mkdir /content/drive/MyDrive/models

mkdir: cannot create directory ‘/content/drive/MyDrive/models’: No such file or directory


In [34]:
# MODEL_DIR = get_model_dir()
# CHECKPOINT_PATH = os.path.join(MODEL_DIR, 'seq2seq_lstm_best.keras')

# checkpoint_cb = ModelCheckpoint(
#     filepath=CHECKPOINT_PATH,
#     monitor='val_sparse_categorical_accuracy',
#     save_best_only=True,
#     mode='max',
#     verbose=1
# )
# history = model.fit(
#     train_ds,
#     validation_data=val_ds,
#     epochs=EPOCHS,
#     callbacks=[checkpoint_cb]
# )
# print(f"Best model will be saved to: {CHECKPOINT_PATH}")

### Phase 2 using a transformer

**Putting context into the equation**

In [85]:
def truncate_context(context: str, ans_start: int, ans_end: int, max_len: int) -> str:
    """
    Return a substring of `context` of length at most max_len characters,
    centered on the character span [ans_start:ans_end].
    """
    ans_len = ans_end - ans_start
    extra   = max_len - ans_len
    pre     = extra // 2
    post    = extra - pre

    # ideal window
    start = ans_start - pre
    end   = ans_end   + post

    # shift right if off the left edge
    if start < 0:
        start = 0
        end   = min(max_len, len(context))

    # shift left if off the right edge
    if end > len(context):
        end   = len(context)
        start = max(0, len(context) - max_len)

    return context[start:end]


def build_truncated_context(df, max_len: int):
    """
    Returns a list of truncated context strings for each row in df,
    preserving at least the answer span.
    """
    contexts = []
    for ctx, starts, ends in zip(
        df['context'], df['answer_start'], df['answer_end']
    ):
        # pick the first span
        s = starts[0]
        e = ends[0]

        window = truncate_context(ctx, s, e, max_len)
        contexts.append(window)

    return contexts


In [88]:
# contexts =  build_truncated_context(df_subset, 500)
len(contexts[10])

500

In [38]:
df_subset.columns

Index(['question', 'answers', 'context', 'answer_start', 'answer_end'], dtype='object')

In [None]:
c_padded = pad_sequences(
    trunc_c_seqs,
    maxlen=MAX_C_LEN,
    padding='post',
    truncating='post'
)

In [95]:
class PositionalEmbedding(layers.Layer):
    def __init__(self, max_length: int, embed_dim: int, **kwargs):
        super().__init__(**kwargs)
        self.max_length = max_length
        self.embed_dim   = embed_dim
        # learned positional embeddings
        self.pos_emb = layers.Embedding(
            input_dim=max_length, output_dim=embed_dim
        )

    def call(self, x):
        """
        x: Tensor of shape (batch, seq_len, embed_dim)
        returns x + positional embeddings
        """
        seq_len = tf.shape(x)[1]
        positions = tf.range(start=0, limit=seq_len, delta=1)
        pos_vectors = self.pos_emb(positions)           
        return x + pos_vectors                         

In [93]:
class TransformerEncoder(layers.Layer):
    def __init__(self,
                 embed_dim: int,
                 num_heads: int,
                 ff_dim: int,
                 dropout_rate: float = 0.1,
                 **kwargs):
        super().__init__(**kwargs)
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads,
            key_dim=embed_dim
        )
        self.ffn = tf.keras.Sequential([
            layers.Dense(ff_dim, activation="relu"),
            layers.Dense(embed_dim),
        ])
        self.norm1    = layers.LayerNormalization()
        self.norm2    = layers.LayerNormalization()
        self.dropout1 = layers.Dropout(dropout_rate)
        self.dropout2 = layers.Dropout(dropout_rate)

    def call(self, x, padding_mask=None, training=False):
        attn_output = self.attention(
            query=x, value=x, key=x,
            attention_mask=padding_mask,
            training=training
        )
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.norm1(x + attn_output)

        # Feed‑forward block
        ffn_output = self.ffn(out1, training=training)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.norm2(out1 + ffn_output)




In [94]:
class TransformerDecoder(layers.Layer):
    def __init__(self,
                 embed_dim: int,
                 num_heads: int,
                 ff_dim: int,
                 dropout_rate: float = 0.1,
                 **kwargs):
        super().__init__(**kwargs)
        self.self_attention = layers.MultiHeadAttention(
            num_heads=num_heads,
            key_dim=embed_dim
        )
        self.cross_attention = layers.MultiHeadAttention(
            num_heads=num_heads,
            key_dim=embed_dim
        )
        # 3) feed‑forward
        self.ffn = tf.keras.Sequential([
            layers.Dense(ff_dim, activation="relu"),
            layers.Dense(embed_dim),
        ])
        
        self.norm1    = layers.LayerNormalization()
        self.norm2    = layers.LayerNormalization()
        self.norm3    = layers.LayerNormalization()
        self.dropout1 = layers.Dropout(dropout_rate)
        self.dropout2 = layers.Dropout(dropout_rate)
        self.dropout3 = layers.Dropout(dropout_rate)

    def call(self,
             x,
             enc_output,
             look_ahead_mask=None,
             padding_mask=None,
             training=False):
        
        attn1 = self.self_attention(
            query=x, value=x, key=x,
            attention_mask=look_ahead_mask,
            training=training
        )
        attn1 = self.dropout1(attn1, training=training)
        out1 = self.norm1(x + attn1)
        attn2 = self.cross_attention(
            query=out1,
            value=enc_output,
            key=enc_output,
            attention_mask=padding_mask,
            training=training
        )
        attn2 = self.dropout2(attn2, training=training)
        out2 = self.norm2(out1 + attn2)

        ffn_out = self.ffn(out2, training=training)
        ffn_out = self.dropout3(ffn_out, training=training)
        return self.norm3(out2 + ffn_out)


In [97]:
class Seq2SeqTransformer(tf.keras.Model):
    def __init__(self, vocab_size, emb_dim, num_heads,
                 ff_dim, max_q_len, max_c_len, max_a_len,
                 embedding_matrix=None, pad_token_id=0, **kwargs):
        super().__init__(**kwargs)
        self.pad_id = pad_token_id

        
        if embedding_matrix is not None:
            self.token_emb = layers.Embedding(
                vocab_size, emb_dim,
                weights=[embedding_matrix],
                trainable=False,
                mask_zero=True
            )
        else:
            self.token_emb = layers.Embedding(
                vocab_size, emb_dim,
                mask_zero=True
            )

        
        self.pos_emb_enc = PositionalEmbedding(
            max_length=max_q_len + 1 + max_c_len,
            embed_dim=emb_dim
        )
        self.pos_emb_dec = PositionalEmbedding(
            max_length=max_a_len,
            embed_dim=emb_dim
        )

        
        self.encoder = TransformerEncoder(
            embed_dim, num_heads, ff_dim
        )
        self.decoder = TransformerDecoder(
            embed_dim, num_heads, ff_dim
        )

        
        self.final_dense = layers.Dense(vocab_size, activation="softmax")

        # store lengths for mask creation
        self.max_q_len = max_q_len
        self.max_c_len = max_c_len
        self.max_a_len = max_a_len

    def create_padding_mask(self, seq):
        mask = tf.cast(tf.equal(seq, self.pad_id), tf.float32)
        return mask[:, tf.newaxis, tf.newaxis, :]  

    def create_look_ahead_mask(self, size):
        mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
        return mask 

    def call(self, inputs, training=False):
        ctx_input, q_input, dec_input = inputs

        
        sep = tf.fill([tf.shape(q_input)[0], 1], self.pad_id)
        enc_seq = tf.concat([q_input, sep, ctx_input], axis=1)

        
        enc_padding_mask = self.create_padding_mask(enc_seq)
        look_ahead_mask  = self.create_look_ahead_mask(
            tf.shape(dec_input)[1]
        )

        
        x = self.token_emb(enc_seq)        
        x = self.pos_emb_enc(x)
        enc_out = self.encoder(x, padding_mask=enc_padding_mask,
                               training=training)  

        
        y = self.token_emb(dec_input)      
        y = self.pos_emb_dec(y)
        dec_out = self.decoder(
            y,
            enc_output=enc_out,
            look_ahead_mask=look_ahead_mask,
            padding_mask=enc_padding_mask,
            training=training
        )  
        return self.final_dense(dec_out)   # (B, La, V)
