# Milestone 2

In [23]:

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import pandas as pd
import os, zipfile , json , random, requests
import re
from pathlib import Path
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Input, LSTM, Dense
from tensorflow.keras.models import Model

## Explorting dataset:

In [60]:

def is_kaggle():
    # Kaggle kernels always set this env var
    return 'KAGGLE_URL_BASE' in os.environ

def is_colab():
    return (not is_kaggle()) and os.path.exists('/content')

def maybe_mount_drive():
    if is_colab():
        from google.colab import drive
        if not os.path.isdir('/content/drive'):
            drive.mount('/content/drive')

def get_data_path():
    if is_kaggle():
        return '/kaggle/input/squad-2-0/'
    elif is_colab():
        return '/content/drive/MyDrive/SQuAD'
    else:
        return './data/'
def get_model_dir():
    if is_colab():
        model_dir = '/content/drive/MyDrive/models'
    elif is_kaggle():
        model_dir = '/kaggle/working/models'
    else:
        model_dir = './models'
    os.makedirs(model_dir, exist_ok=True)
    return model_dir

In [25]:
dataset_dir = get_data_path()
maybe_mount_drive()
os.makedirs(dataset_dir, exist_ok=True)

In [26]:
file_path = os.path.join(dataset_dir, 'train-v2.0.json')

In [27]:
with open(file_path, 'r', encoding='utf-8') as f:
    squad = json.load(f)

In [28]:
records = []
for article in squad['data']:
    for para in article['paragraphs']:
        ctx = para['context']
        for qa in para['qas']:
            answers = [a['text'] for a in qa.get('answers', [])]
            starts  = [a['answer_start'] for a in qa.get('answers', [])]
            ends    = [s + len(t) for s,t in zip(starts, answers)]
            records.append({
                'question': qa['question'],
                'answers': answers,
                'context': ctx,
                'answer_start': starts,
                'answer_end': ends
            })



In [29]:
df = pd.DataFrame(records)
df.head()

Unnamed: 0,question,answers,context,answer_start,answer_end
0,When did Beyonce start becoming popular?,[in the late 1990s],Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,[269],[286]
1,What areas did Beyonce compete in when she was...,[singing and dancing],Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,[207],[226]
2,When did Beyonce leave Destiny's Child and bec...,[2003],Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,[526],[530]
3,In what city and state did Beyonce grow up?,"[Houston, Texas]",Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,[166],[180]
4,In which decade did Beyonce become famous?,[late 1990s],Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,[276],[286]


In [30]:
print("Total QA pairs:", len(df))

Total QA pairs: 130319


In [31]:
#shuffling
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# only working on subset of 15k row
df_subset = df.head(15000).copy().reset_index(drop=True)

print("Subset size:", df_subset.shape)
df_subset.head()

Subset size: (15000, 5)


Unnamed: 0,question,answers,context,answer_start,answer_end
0,What year did the global recession that follow...,[2012],It threatened the collapse of large financial ...,[481],[485]
1,what was a popular club in ibiza that started ...,[Amnesia],"But house was also being developed on Ibiza,[c...",[251],[258]
2,In what century did Martin Luther honor Mary a...,[],Although Calvin and Huldrych Zwingli honored M...,[],[]
3,What is the climate like?,[varies from hot and subhumid tropical],"Due to extreme variation in elevation, great v...",[115],[152]
4,How many times has the Queen toured Canada?,[],The Queen addressed the United Nations for a s...,[],[]


## Data Cleaning

Dropping rows where answers are empty

In [32]:
df_subset = df_subset[df_subset['answers'].map(len) > 0].reset_index(drop=True)
print("Rows remaining after drop:", len(df_subset))

Rows remaining after drop: 10020


Removing Extra Whitespaces

In [33]:
def collapse_whitespace(s):
    if isinstance(s, str):
        return re.sub(r'\s+', ' ', s.strip())
    return s

In [34]:
for col in ['question', 'context', 'answers']:
    if col in df_subset.columns:
        df_subset[col] = df_subset[col].apply(collapse_whitespace)

**Lets explore the length of the sequences which will determine some hyperparameters in training the models**

In [35]:
df_subset['question'].str.len().max()

203

In [36]:
df_subset['context'].str.len().max()

3706

We just turn the array of the answers to a string since none have multiple answers

In [37]:
df_subset['answers']= df_subset['answers'].apply(lambda x: x[0])

In [38]:
df_subset['answers'].str.len().max()

202

## Embeddings

In [40]:
!pip install --quiet gensim

In [41]:
all_texts = (
    df_subset['question'].tolist() +
    df_subset['context'].tolist() +
    df_subset['answers'].tolist()
)
tokenizer = Tokenizer(
    num_words=20000,
    oov_token='[UNK]',
    filters='''!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'''
)
tokenizer.fit_on_texts(all_texts)

q_seqs = tokenizer.texts_to_sequences(df['question'])
c_seqs = tokenizer.texts_to_sequences(df['context'])
a_seqs = tokenizer.texts_to_sequences(df['answers'])

In [42]:
vocab_size = len(tokenizer.word_index)
print("Total unique tokens:", vocab_size)

Total unique tokens: 57044


In [43]:
q_seqs[0]

[18, 71, 44, 2, 961, 6123, 13, 577, 2, 468, 946, 3, 438, 170]

**Load gloVe dictionary**

In [46]:
def prepare_glove(target_dim=100, work_subdir='glove',
                  input_dataset_slug='glove6b',
                  download_url='http://nlp.stanford.edu/data/glove.6B.zip'):
    maybe_mount_drive()

    if is_kaggle():
        work_dir = f'/kaggle/working/{work_subdir}'
        uploaded_zip = f'/kaggle/input/{input_dataset_slug}/glove.6B.zip'
    elif is_colab():
        work_dir = f'/content/drive/MyDrive/{work_subdir}'
        uploaded_zip = None
    else:
        work_dir = f'./data/{work_subdir}'
        uploaded_zip = None

    os.makedirs(work_dir, exist_ok=True)

    target_file = f'glove.6B.{target_dim}d.txt'
    txt_path = os.path.join(work_dir, target_file)
    zip_path = os.path.join(work_dir, os.path.basename(download_url))

    if os.path.exists(txt_path):
        return txt_path

    if is_kaggle() and uploaded_zip and os.path.exists(uploaded_zip):
        zip_path = uploaded_zip
    else:
        if requests is None:
            raise RuntimeError("`requests` not available; offline mode")
        with requests.get(download_url, stream=True) as r, open(zip_path, 'wb') as f:
            r.raise_for_status()
            for chunk in r.iter_content(8192):
                f.write(chunk)

    with zipfile.ZipFile(zip_path, 'r') as z:
        z.extract(target_file, path=work_dir)

    if not os.path.exists(txt_path):
        raise RuntimeError(f"Failed to extract {target_file}")

    return txt_path

# Usage
glove_path = prepare_glove()
print("GloVe file:", glove_path)

GloVe file: /kaggle/working/glove/glove.6B.100d.txt


**Creating embeddings index (mapping words to vectors)**

In [47]:
embeddings_index = {}
with open(glove_path, 'r', encoding='utf-8') as f:
    for line in f:
        parts = line.rstrip().split(" ")
        word = parts[0]
        vec  = np.asarray(parts[1:], dtype='float32')
        embeddings_index[word] = vec

**Creating our look-up table (embedding matrix)**

In [49]:
vocab_size = 20000
emb_dim = 100
embedding_matrix = np.random.normal(size=(vocab_size, emb_dim)) * 0.01

In [50]:
for word, idx in tokenizer.word_index.items():
    if idx >= vocab_size:
        continue
    if word in embeddings_index:
        embedding_matrix[idx] = embeddings_index[word]

In [51]:
word = tokenizer.index_word[2]
print(word)
print(embedding_matrix[2])

the
[-0.038194   -0.24487001  0.72812003 -0.39961001  0.083172    0.043953
 -0.39140999  0.3344     -0.57545     0.087459    0.28786999 -0.06731
  0.30906001 -0.26383999 -0.13231    -0.20757     0.33395001 -0.33848
 -0.31742999 -0.48335999  0.1464     -0.37303999  0.34577     0.052041
  0.44946    -0.46970999  0.02628    -0.54154998 -0.15518001 -0.14106999
 -0.039722    0.28277001  0.14393     0.23464    -0.31020999  0.086173
  0.20397     0.52623999  0.17163999 -0.082378   -0.71787    -0.41531
  0.20334999 -0.12763     0.41367     0.55186999  0.57907999 -0.33476999
 -0.36559001 -0.54856998 -0.062892    0.26583999  0.30204999  0.99774998
 -0.80480999 -3.0243001   0.01254    -0.36941999  2.21670008  0.72201002
 -0.24978     0.92136002  0.034514    0.46744999  1.10790002 -0.19358
 -0.074575    0.23353    -0.052062   -0.22044     0.057162   -0.15806
 -0.30798    -0.41624999  0.37972     0.15006    -0.53211999 -0.20550001
 -1.25259995  0.071624    0.70564997  0.49744001 -0.42063001  0.2614

**Create embedding layer**

In [53]:
embedding_layer = Embedding(
    input_dim=vocab_size,
    output_dim=emb_dim,
    weights=[embedding_matrix],
    mask_zero=True,
    trainable=False,
    name='glove_embedding'
)

## Phase One

In [54]:
MAX_Q_LEN   = df_subset['question'].str.len().max()
MAX_A_LEN   = df_subset['answers'].str.len().max()
VOCAB_SIZE  = len(tokenizer.word_index) + 1
EMB_DIM     = embedding_matrix.shape[1]
UNITS       = 128
BATCH_SIZE  = 64
EPOCHS      = 30

In [55]:
q_padded = pad_sequences(q_seqs, maxlen=MAX_Q_LEN, padding='post', truncating='post')
a_padded = pad_sequences(a_seqs, maxlen=MAX_A_LEN, padding='post', truncating='post')

In [56]:
decoder_input  = a_padded[:, :-1]
decoder_target = a_padded[:, 1:]

Xq_tr, Xq_val, Din_tr, Din_val, Dt_tr, Dt_val = train_test_split(
    q_padded, decoder_input, decoder_target,
    test_size=0.1, random_state=42
)

def make_ds(q, d_in, d_tar, batch_size=32):
    ds = tf.data.Dataset.from_tensor_slices(((q, d_in), d_tar))
    return ds.shuffle(2000).batch(batch_size).prefetch(1)

train_ds = make_ds(Xq_tr, Din_tr, Dt_tr)
val_ds   = make_ds(Xq_val, Din_val, Dt_val)

**Building the model**

In [57]:
class Seq2SeqLSTM(tf.keras.Model):
    def __init__(self,vocab_size,emb_dim,units,max_q_len,max_a_len,embedding_matrix=None,pad_token_id=0,**kwargs):
        super().__init__(**kwargs)
        self.pad_token_id = pad_token_id


        if embedding_matrix is not None:
            self.embedding = Embedding(vocab_size, emb_dim,weights=[embedding_matrix],trainable=False,mask_zero=True)
        else:
            self.embedding = Embedding(vocab_size, emb_dim,mask_zero=True)

        #units is the vector size of the hidden state
        #return_state if true returns the final h and c
        self.encoder_lstm = LSTM(units, return_state=True, name='encoder_lstm')

        #return sequence returns all the hidden states from h_1 to h_n
        #return sequence is for evaluation
        #return state is for inference because after each token generated we need to feed the model the states again
        self.decoder_lstm = LSTM(units,return_sequences=True,return_state=True, name='decoder_lstm')

        #the layer needed to predict the next word
        self.dense = Dense(vocab_size, activation='softmax', name='decoder_dense')

    def call(self, inputs, training=False):
        encoder_inputs, decoder_inputs = inputs

        x_enc = self.embedding(encoder_inputs)
        _, state_h, state_c = self.encoder_lstm(x_enc, training=training)
        encoder_states = [state_h, state_c]

        x_dec = self.embedding(decoder_inputs)
        dec_outputs, _, _ = self.decoder_lstm(x_dec, initial_state=encoder_states, training=training)
        return self.dense(dec_outputs)

    # def compute_masked_accuracy(self, y_true, y_pred):
    #     pad = self.pad_token_id
    #     y_pred_id = tf.argmax(y_pred, axis=-1, output_type=tf.int32)
    #     y_true    = tf.cast(y_true, tf.int32)
    #     mask      = tf.cast(tf.not_equal(y_true, pad), tf.float32)
    #     matches   = tf.cast(tf.equal(y_true, y_pred_id), tf.float32) * mask
    #     return tf.reduce_sum(matches) / tf.reduce_sum(mask)


In [58]:
model = Seq2SeqLSTM( vocab_size=vocab_size,
                    emb_dim=100,units=128,
                     max_q_len=MAX_Q_LEN,max_a_len=MAX_A_LEN,embedding_matrix=embedding_matrix,pad_token_id=0)

dummy_q = tf.zeros((1, MAX_Q_LEN), dtype=tf.int32)
dummy_a = tf.zeros((1, MAX_A_LEN-1), dtype=tf.int32)
_ = model((dummy_q, dummy_a))

I0000 00:00:1745171839.083770     129 cuda_dnn.cc:529] Loaded cuDNN version 90300


In [59]:
model.compile(
  optimizer='adam',
  loss='sparse_categorical_crossentropy',
  metrics=[tf.keras.metrics.SparseCategoricalAccuracy()]
)
model.summary()

In [None]:
!mkdir /content/drive/MyDrive/models

In [None]:
MODEL_DIR = get_model_dir()
CHECKPOINT_PATH = os.path.join(MODEL_DIR, 'seq2seq_lstm_best.keras')

checkpoint_cb = ModelCheckpoint(
    filepath=CHECKPOINT_PATH,
    monitor='val_sparse_categorical_accuracy',
    save_best_only=True,
    mode='max',
    verbose=1
)
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=EPOCHS,
    callbacks=[checkpoint_cb]
)
print(f"Best model will be saved to: {CHECKPOINT_PATH}")

Epoch 1/30
[1m1123/3666[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m3:31[0m 83ms/step - loss: 0.9587 - sparse_categorical_accuracy: 0.9927