# Milestone 2

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import pandas as pd
import os, zipfile , json , random, requests
import re
from pathlib import Path
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

from tensorflow.keras.layers import Input, LSTM, Dense
from tensorflow.keras.models import Model

## Explorting dataset:

In [None]:
from google.colab import drive
drive.flush_and_unmount()
drive.mount('/content/drive')


Drive not mounted, so nothing to flush and unmount.


In [None]:
drive_dir = '/content/drive/MyDrive/SQuAD'
os.makedirs(drive_dir, exist_ok=True)

In [None]:
file_path = os.path.join(drive_dir, 'train-v2.0.json')

In [None]:
with open(file_path, 'r', encoding='utf-8') as f:
    squad = json.load(f)

In [None]:
records = []
for article in squad['data']:
    for para in article['paragraphs']:
        ctx = para['context']
        for qa in para['qas']:
            answers = [a['text'] for a in qa.get('answers', [])]
            starts  = [a['answer_start'] for a in qa.get('answers', [])]
            ends    = [s + len(t) for s,t in zip(starts, answers)]
            records.append({
                'question': qa['question'],
                'answers': answers,
                'context': ctx,
                'answer_start': starts,
                'answer_end': ends
            })



In [None]:
df = pd.DataFrame(records)
df.head()

In [None]:
#random.shuffle(examples)
#subset = examples[:15000]
#len(subset)
print("Total QA pairs:", len(df))

In [None]:
#shuffling
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# only working on subset of 15k row
df_subset = df.head(15000).copy().reset_index(drop=True)

print("Subset size:", df_subset.shape)
df_subset.head()

##Data Cleaning

Dropping rows where answers are empty

In [None]:
df_subset = df_subset[df_subset['answers'].map(len) > 0].reset_index(drop=True)
print("Rows remaining after drop:", len(df))

Removing Extra Whitespaces

In [None]:
def collapse_whitespace(s):
    if isinstance(s, str):
        return re.sub(r'\s+', ' ', s.strip())
    return s

In [None]:
for col in ['question', 'context', 'answers']:
    if col in df_subset.columns:
        df_subset[col] = df_subset[col].apply(collapse_whitespace)

**Lets explore the length of the sequences which will determine some hyperparameters in training the models**

In [None]:
df_subset['question'].str.len().max()

In [None]:
df_subset['context'].str.len().max()

In [None]:
df_mult = df[df['answers'].map(len) > 1].reset_index(drop=True)
print("Rows with multiple answers:", df_mult.shape[0])
display(df_mult[['question', 'answers', 'answer_start', 'answer_end']].head())

We just turn the array of the answers to a string since none have multiple answers

In [None]:
df_subset['answers']= df_subset['answers'].apply(lambda x: x[0])

In [None]:
df_subset['answers'].str.len().max()

## Embeddings

In [None]:
!pip install --quiet gensim

In [None]:
all_texts = (
    df_subset['question'].tolist() +
    df_subset['context'].tolist() +
    df_subset['answers'].tolist()
)
tokenizer = Tokenizer(
    num_words=20000,
    oov_token='[UNK]',
    filters='''!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'''
)
tokenizer.fit_on_texts(all_texts)

q_seqs = tokenizer.texts_to_sequences(df['question'])
c_seqs = tokenizer.texts_to_sequences(df['context'])
a_seqs = tokenizer.texts_to_sequences(df['answers'])

In [None]:
vocab_size = len(tokenizer.word_index)
print("Total unique tokens:", vocab_size)


In [None]:
q_seqs[0]

**Load gloVe dictionary**

In [None]:
drive_dir = '/content/drive/MyDrive/glove'
os.makedirs(drive_dir, exist_ok=True)
zip_path = os.path.join(drive_dir, 'glove.6B.zip')
glove_path = os.path.join(drive_dir, 'glove.6B.100d.txt')

if not os.path.exists(glove_path):
    if not os.path.exists(zip_path):
        url = "http://nlp.stanford.edu/data/glove.6B.zip"
        with requests.get(url, stream=True) as r, open(zip_path, 'wb') as f:
            for chunk in r.iter_content(8192):
                f.write(chunk)
    with zipfile.ZipFile(zip_path, 'r') as z:
        z.extract('glove.6B.100d.txt', path=drive_dir)
print(f"GloVe ready at {glove_path}")


**Creating embeddings index (mapping words to vectors)**

In [None]:
embeddings_index = {}
with open(glove_path, 'r', encoding='utf-8') as f:
    for line in f:
        parts = line.rstrip().split(" ")
        word = parts[0]
        vec  = np.asarray(parts[1:], dtype='float32')
        embeddings_index[word] = vec

**Creating our look-up table (embedding matrix)**

In [None]:
vocab_size = len(tokenizer.word_index) + 1
emb_dim = 100
embedding_matrix = np.random.normal(size=(vocab_size, emb_dim)) * 0.01

In [None]:
for word, idx in tokenizer.word_index.items():
    if idx >= vocab_size:
        continue
    if word in embeddings_index:
        embedding_matrix[idx] = embeddings_index[word]

In [None]:
word = tokenizer.index_word[2]
print(word)
print(embedding_matrix[2])

**Create embedding layer**

In [None]:
embedding_layer = Embedding(
    input_dim=vocab_size,
    output_dim=emb_dim,
    weights=[embedding_matrix],
    mask_zero=True,
    trainable=False,
    name='glove_embedding'
)

##Phase One

In [None]:
MAX_Q_LEN   = df_subset['question'].str.len().max()
MAX_A_LEN   = df_subset['answers'].str.len().max()
VOCAB_SIZE  = len(tokenizer.word_index) + 1
EMB_DIM     = embedding_matrix.shape[1]
UNITS       = 128
BATCH_SIZE  = 64
EPOCHS      = 30

In [None]:
print(len(tokenizer.word_index))

In [None]:
q_padded = pad_sequences(q_seqs, maxlen=MAX_Q_LEN, padding='post', truncating='post')
a_padded = pad_sequences(a_seqs, maxlen=MAX_A_LEN, padding='post', truncating='post')

In [None]:
decoder_input  = a_padded[:, :-1]
decoder_target = a_padded[:, 1:]

Xq_tr, Xq_val, Din_tr, Din_val, Dt_tr, Dt_val = train_test_split(
    q_padded, decoder_input, decoder_target,
    test_size=0.1, random_state=42
)

def make_ds(q, d_in, d_tar, batch_size=32):
    ds = tf.data.Dataset.from_tensor_slices(((q, d_in), d_tar))
    return ds.shuffle(2000).batch(batch_size).prefetch(1)

train_ds = make_ds(Xq_tr, Din_tr, Dt_tr)
val_ds   = make_ds(Xq_val, Din_val, Dt_val)

**Building encoder**

In [None]:
encoder_inputs = Input(shape=(MAX_Q_LEN,), batch_size= BATCH_SIZE, name='encoder_input')
enc_embedded   = embedding_layer(encoder_inputs)                # (batch, Q, emb_dim)
_, state_h, state_c = LSTM(UNITS, return_state=True, name='encoder_lstm')(enc_embedded)
encoder_states = [state_h, state_c]

**Building encoder**

In [None]:
decoder_inputs= Input(shape=(MAX_A_LEN-1,), batch_size=BATCH_SIZE ,name='decoder_input')
dec_embedded= embedding_layer(decoder_inputs)
dec_lstm = LSTM(UNITS, return_sequences=True, return_state=True, name='decoder_lstm')
dec_outputs, _, _ = dec_lstm(dec_embedded, initial_state=encoder_states)
decoder_dense   = Dense(vocab_size, activation='softmax', name='decoder_dense')
decoder_outputs = decoder_dense(dec_outputs)

**Building the model**

In [None]:
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)
model.summary()

In [1]:
#training the model
model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=EPOCHS
)

NameError: name 'model' is not defined