# Milestone 2

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import pandas as pd
import os, zipfile , json , random
import re
from pathlib import Path
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

from tensorflow.keras.layers import Input, LSTM, Dense
from tensorflow.keras.models import Model

## Explorting dataset:

In [5]:
from google.colab import drive
drive.flush_and_unmount()
drive.mount('/content/drive')


Drive not mounted, so nothing to flush and unmount.
Mounted at /content/drive


In [6]:
drive_dir = '/content/drive/MyDrive/SQuAD'
os.makedirs(drive_dir, exist_ok=True)

In [7]:
file_path = os.path.join(drive_dir, 'train-v2.0.json')

In [8]:
with open(file_path, 'r', encoding='utf-8') as f:
    squad = json.load(f)

In [6]:
#!tar -xzf /content/TriviaQA_RC.zip -C /content/TriviaQA_RC

In [7]:
#!find /content/TriviaQA_RC -maxdepth 2 | sed -e '1,5!d'

/content/TriviaQA_RC
/content/TriviaQA_RC/README
/content/TriviaQA_RC/qa
/content/TriviaQA_RC/qa/wikipedia-train.json
/content/TriviaQA_RC/qa/web-train.json


In [8]:
#!sed -n '1,50p' /content/TriviaQA_RC/README

-------------------------------------------------------------------------------------------------------
The University of Washington TriviaQA Dataset (version 1.0)
-------------------------------------------------------------------------------------------------------

TriviaQA is a reading comprehension dataset containing over 650K question-answer-evidence triples. TriviaQA includes 95K question-answer pairs authored by trivia enthusiasts and independently gathered evidence documents, six per question on average, that provide high quality distant supervision for answering the questions. The details can be found in our paper

@InProceedings{JoshiTriviaQA2017,
  author    = {Joshi, Mandar  and  Choi, Eunsol  and  Weld, Daniel S. and Zettlemoyer, Luke},
  title     = {TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  mo

In [9]:
#adding answer end
records = []
for article in squad['data']:
    for para in article['paragraphs']:
        ctx = para['context']
        for qa in para['qas']:
            answers = [a['text'] for a in qa.get('answers', [])]
            starts  = [a['answer_start'] for a in qa.get('answers', [])]
            ends    = [s + len(t) for s,t in zip(starts, answers)]
            records.append({
                'question': qa['question'],
                'answers': answers,
                'context': ctx,
                'answer_start': starts,
                'answer_end': ends
            })



In [10]:
df = pd.DataFrame(records)
df.head()

Unnamed: 0,question,answers,context,answer_start,answer_end
0,When did Beyonce start becoming popular?,[in the late 1990s],Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,[269],[286]
1,What areas did Beyonce compete in when she was...,[singing and dancing],Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,[207],[226]
2,When did Beyonce leave Destiny's Child and bec...,[2003],Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,[526],[530]
3,In what city and state did Beyonce grow up?,"[Houston, Texas]",Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,[166],[180]
4,In which decade did Beyonce become famous?,[late 1990s],Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,[276],[286]


In [11]:
#random.shuffle(examples)
#subset = examples[:15000]
#len(subset)
print("Total QA pairs:", len(df))

Total QA pairs: 130319


In [12]:
#shuffling
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# only working on subset of 15k row
df_subset = df.head(15000).copy().reset_index(drop=True)

print("Subset size:", df_subset.shape)
df_subset.head()

Subset size: (15000, 5)


Unnamed: 0,question,answers,context,answer_start,answer_end
0,What year did the global recession that follow...,[2012],It threatened the collapse of large financial ...,[481],[485]
1,what was a popular club in ibiza that started ...,[Amnesia],"But house was also being developed on Ibiza,[c...",[251],[258]
2,In what century did Martin Luther honor Mary a...,[],Although Calvin and Huldrych Zwingli honored M...,[],[]
3,What is the climate like?,[varies from hot and subhumid tropical],"Due to extreme variation in elevation, great v...",[115],[152]
4,How many times has the Queen toured Canada?,[],The Queen addressed the United Nations for a s...,[],[]


##Data Cleaning

Dropping rows where answers are empty

In [21]:
df_subset = df_subset[df_subset['answers'].map(len) > 0].reset_index(drop=True)

print("Rows remaining after drop:", len(df))

Rows remaining after drop: 10020


Removing Extra Whitespaces

In [22]:
def collapse_whitespace(s):
    if isinstance(s, str):
        return re.sub(r'\s+', ' ', s.strip())
    return s


In [25]:
for col in ['question', 'context', 'answers']:
    if col in df_subset.columns:
        df_subset[col] = df_subset[col].apply(collapse_whitespace)


## Embeddings

In [16]:
!pip install --quiet gensim

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m71.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m94.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.6/38.6 MB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.26.4 which is incompatible.[0m[31m
[0m

In [28]:
vocab_size = len(tokenizer.word_index)
print("Total unique tokens:", vocab_size)


Total unique tokens: 63784


In [27]:
all_texts = (
    df_subset['question'].tolist() +
    df_subset['context'].tolist() +
    df_subset['answers'].tolist()
)
tokenizer = Tokenizer(
    num_words=20000,
    oov_token='[UNK]',
    filters='''!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'''
)
tokenizer.fit_on_texts(all_texts)

q_seqs = tokenizer.texts_to_sequences(df['question'])
c_seqs = tokenizer.texts_to_sequences(df['context'])
a_seqs = tokenizer.texts_to_sequences(df['answers'])

In [29]:
q_seqs[0]

[18, 71, 44, 2, 973, 6688, 13, 564, 2, 474, 974, 3, 435, 169]

**Load gloVe dictionary**

In [30]:
glove_zip = tf.keras.utils.get_file(
    fname="glove.6B.zip",
    origin="http://nlp.stanford.edu/data/glove.6B.zip",
    extract=False
)
glove_dir = os.path.dirname(glove_zip)

with zipfile.ZipFile(glove_zip, 'r') as z:
    files = z.namelist()
    target = "glove.6B.100d.txt"
    if target in files and not os.path.exists(os.path.join(glove_dir, target)):
        z.extract(target, path=glove_dir)

glove_path = os.path.join(glove_dir, "glove.6B.100d.txt")


Downloading data from http://nlp.stanford.edu/data/glove.6B.zip
[1m862182613/862182613[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m159s[0m 0us/step


**Creating embeddings index (mapping words to vectors)**

In [31]:
embeddings_index = {}
with open(glove_path, 'r', encoding='utf-8') as f:
    for line in f:
        parts = line.rstrip().split(" ")
        word = parts[0]
        vec  = np.asarray(parts[1:], dtype='float32')
        embeddings_index[word] = vec

**Creating our look-up table (embedding matrix)**

In [32]:
vocab_size = len(tokenizer.word_index) + 1
emb_dim = 100   #features of each vector embeddings
# each word is a column vector
embedding_matrix = np.random.normal(size=(vocab_size, emb_dim)) * 0.01

In [33]:
for word, idx in tokenizer.word_index.items():
    if idx >= vocab_size:
        continue
    if word in embeddings_index:
        embedding_matrix[idx] = embeddings_index[word]

In [34]:
word = tokenizer.index_word[2]
print(word)
print(embedding_matrix[2])

the
[-0.038194   -0.24487001  0.72812003 -0.39961001  0.083172    0.043953
 -0.39140999  0.3344     -0.57545     0.087459    0.28786999 -0.06731
  0.30906001 -0.26383999 -0.13231    -0.20757     0.33395001 -0.33848
 -0.31742999 -0.48335999  0.1464     -0.37303999  0.34577     0.052041
  0.44946    -0.46970999  0.02628    -0.54154998 -0.15518001 -0.14106999
 -0.039722    0.28277001  0.14393     0.23464    -0.31020999  0.086173
  0.20397     0.52623999  0.17163999 -0.082378   -0.71787    -0.41531
  0.20334999 -0.12763     0.41367     0.55186999  0.57907999 -0.33476999
 -0.36559001 -0.54856998 -0.062892    0.26583999  0.30204999  0.99774998
 -0.80480999 -3.0243001   0.01254    -0.36941999  2.21670008  0.72201002
 -0.24978     0.92136002  0.034514    0.46744999  1.10790002 -0.19358
 -0.074575    0.23353    -0.052062   -0.22044     0.057162   -0.15806
 -0.30798    -0.41624999  0.37972     0.15006    -0.53211999 -0.20550001
 -1.25259995  0.071624    0.70564997  0.49744001 -0.42063001  0.2614

**Create embedding layer**

In [35]:
embedding_layer = Embedding(
    input_dim=vocab_size,
    output_dim=emb_dim,
    weights=[embedding_matrix],
    mask_zero=True,
    trainable=False,
    name='glove_embedding'
)

##Phase One

In [25]:
# Hyperparameters
MAX_Q_LEN   = 50
MAX_A_LEN   = 20
VOCAB_SIZE  = len(tokenizer.word_index) + 1
EMB_DIM     = embedding_matrix.shape[1]
UNITS       = 128
BATCH_SIZE  = 32
EPOCHS      = 10

In [27]:
print(len(tokenizer.word_index))

538110


In [25]:
#padding sequences
q_padded = pad_sequences(q_seqs, maxlen=MAX_Q_LEN, padding='post', truncating='post')
a_padded = pad_sequences(a_seqs, maxlen=MAX_A_LEN, padding='post', truncating='post')

In [28]:
decoder_input  = a_padded[:, :-1]
decoder_target = a_padded[:, 1:]
Xq_tr, Xq_val, Din_tr, Din_val, Dt_tr, Dt_val = train_test_split(
    q_padded, decoder_input, decoder_target,
    test_size=0.1, random_state=42
)
def make_ds(q, d_in, d_tar, batch_size=32):
    ds = tf.data.Dataset.from_tensor_slices(((q, d_in), d_tar))
    return ds.shuffle(2000).batch(batch_size).prefetch(1)

train_ds = make_ds(Xq_tr, Din_tr, Dt_tr)
val_ds   = make_ds(Xq_val, Din_val, Dt_val)

In [35]:
#building encoder
encoder_inputs = Input(shape=(MAX_Q_LEN,), name='encoder_input')
enc_embedded   = embedding_layer(encoder_inputs)                # (batch, Q, emb_dim)
_, state_h, state_c = LSTM(UNITS, return_state=True, name='encoder_lstm')(enc_embedded)
encoder_states = [state_h, state_c]


In [36]:
#building decoder
decoder_inputs  = Input(shape=(MAX_A_LEN-1,), name='decoder_input')
dec_embedded    = embedding_layer(decoder_inputs)              # (batch, A-1, emb_dim)
dec_lstm        = LSTM(UNITS, return_sequences=True, return_state=True, name='decoder_lstm')
dec_outputs, _, _ = dec_lstm(dec_embedded, initial_state=encoder_states)
decoder_dense   = Dense(vocab_size, activation='softmax', name='decoder_dense')
decoder_outputs = decoder_dense(dec_outputs)

In [37]:
#assembling model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)
model.summary()

In [None]:
#training the model
model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=10
)

Epoch 1/10
