# Milestone 2

In [152]:
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import pandas as pd
import os, zipfile , json , random
from pathlib import Path
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.text import Tokenizer


## Exploring the dataset

In [57]:
# Colab-only: import the Drive helper, flush/unmount first, then mount
# Google Drive at /content/drive.
from google.colab import drive
drive.flush_and_unmount()
drive.mount('/content/drive')


Mounted at /content/drive


In [4]:
# Download the TriviaQA-RC archive (~2.5 GB) to Drive only when it is not already there,
# so re-running the notebook does not fetch it again. Delete the file to force a fresh download.
# NOTE: the payload is a gzip-compressed tarball even though it is saved under a .zip name.
![ -f /content/drive/MyDrive/TriviaQA_RC.zip ] || wget -O /content/drive/MyDrive/TriviaQA_RC.zip "https://nlp.cs.washington.edu/triviaqa/data/triviaqa-rc.tar.gz"

--2025-04-13 11:20:19--  https://nlp.cs.washington.edu/triviaqa/data/triviaqa-rc.tar.gz
Resolving nlp.cs.washington.edu (nlp.cs.washington.edu)... 128.208.3.117, 2607:4000:200:12:3eec:efff:fe5e:6f68
Connecting to nlp.cs.washington.edu (nlp.cs.washington.edu)|128.208.3.117|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2665779500 (2.5G) [application/x-gzip]
Saving to: ‘/content/drive/MyDrive/TriviaQA_RC.zip’


2025-04-13 11:21:48 (28.6 MB/s) - ‘/content/drive/MyDrive/TriviaQA_RC.zip’ saved [2665779500/2665779500]



In [79]:
# Copy the downloaded archive from the Drive mount to /content.
!cp "/content/drive/MyDrive/TriviaQA_RC.zip" /content/TriviaQA_RC.zip

In [85]:
# Create the extraction directory (-p: no error if it already exists).
# Prefixed with `!` like the other shell cells so it does not depend on IPython automagic.
!mkdir -p /content/TriviaQA_RC

In [102]:
# Unpack into /content/TriviaQA_RC; -z because the file is a gzip'd tarball despite its .zip name.
!tar -xzf /content/TriviaQA_RC.zip -C /content/TriviaQA_RC

^C


In [89]:
# List paths at most two levels deep and keep only the first five lines
# (sed deletes every line outside the range 1-5).
!find /content/TriviaQA_RC -maxdepth 2 | sed -e '1,5!d'

/content/TriviaQA_RC
/content/TriviaQA_RC/README
/content/TriviaQA_RC/qa
/content/TriviaQA_RC/qa/wikipedia-train.json
/content/TriviaQA_RC/qa/web-train.json


In [90]:
# Print the first 50 lines of the dataset README.
!sed -n '1,50p' /content/TriviaQA_RC/README

-------------------------------------------------------------------------------------------------------
The University of Washington TriviaQA Dataset (version 1.0)
-------------------------------------------------------------------------------------------------------

TriviaQA is a reading comprehension dataset containing over 650K question-answer-evidence triples. TriviaQA includes 95K question-answer pairs authored by trivia enthusiasts and independently gathered evidence documents, six per question on average, that provide high quality distant supervision for answering the questions. The details can be found in our paper

@InProceedings{JoshiTriviaQA2017,
  author    = {Joshi, Mandar  and  Choi, Eunsol  and  Weld, Daniel S. and Zettlemoyer, Luke},
  title     = {TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  mo

In [103]:
# Point at the extracted dataset and parse the Wikipedia training split.
base_dir = Path('/content/TriviaQA_RC')
qa_file = base_dir.joinpath('qa', 'wikipedia-train.json')
examples = json.loads(qa_file.read_text(encoding='utf-8'))
# Show the top-level keys of the parsed JSON object.
examples.keys()



dict_keys(['Data', 'Domain', 'Split', 'VerifiedEval', 'Version'])

In [104]:




# 1. Load the QA file; its top-level JSON object wraps the list of examples
with open(base_dir / 'qa' / 'wikipedia-train.json', 'r', encoding='utf-8') as f:
    wrapper = json.load(f)

# 2. Extract the list of examples
examples = wrapper['Data']
print(f"Total examples available: {len(examples)}\n")

# 3. Inspect the first 3 entries
for idx, sample in enumerate(examples[:3]):
    print(f"=== Example {idx+1} ===")
    print("QuestionId :", sample['QuestionId'])
    print("Question   :", sample['Question'])
    print("Answer     :", sample['Answer']['Value'])
    print("Answer Aliases:", sample['Answer']['Aliases'][:5], "…\n")

    # A record with no evidence pages would raise IndexError on [0];
    # report it and move on to the next example instead.
    pages = sample.get('EntityPages') or []
    if not pages:
        print("No entity pages listed for this question\n")
        continue

    # Preview only the first evidence page of each example.
    page = pages[0]
    ctx_path = base_dir / 'evidence' / 'wikipedia' / page['Filename']
    if ctx_path.exists():
        # errors='ignore' drops undecodable bytes instead of raising.
        text = ctx_path.read_text(encoding='utf-8', errors='ignore')
        print(f"Context file ({page['Filename']}) snippet:")
        print(text[:300].replace('\n', ' '), "…\n")
    else:
        print(f"Context file not found: {ctx_path}\n")


Total examples available: 61888

=== Example 1 ===
QuestionId : tc_3
Question   : Where in England was Dame Judi Dench born?
Answer     : York
Answer Aliases: ['Park Grove (1895)', 'York UA', 'Yorkish', 'UN/LOCODE:GBYRK', 'York, UK'] …

Context file (England.txt) snippet:
England is a country that is part of the United Kingdom.   It shares land borders with Scotland to the north and Wales to the west. The Irish Sea lies northwest of England and the Celtic Sea lies to the southwest. England is separated from continental Europe by the North Sea to the east and the Engl …

=== Example 2 ===
QuestionId : tc_8
Question   : From which country did Angola achieve independence in 1975?
Answer     : Portugal
Answer Aliases: ['Portogało', 'Republic of Portugal', 'PORTUGAL', 'Portekiz', 'Portugallu'] …

Context file (Nation_state.txt) snippet:
A nation state is a type of state that conjoins the political entity of a state to the cultural entity of a nation, from which it aims to derive its politic

In [105]:
# Draw a reproducible random subset without reordering `examples` itself:
# a seeded generator yields the same questions on every Restart & Run All,
# and re-running this cell no longer reshuffles the source list in place.
SUBSET_SIZE = 15000
SUBSET_SEED = 42
subset = random.Random(SUBSET_SEED).sample(examples, min(SUBSET_SIZE, len(examples)))
len(subset)

15000

In [112]:
# Display one complete raw record from the subset to inspect its fields.
subset[4]

{'Answer': {'Aliases': ["I'll Stand by You (Pretenders song)",
   "I'll Stand by You",
   "I'll Stand By You (Carrie Underwood song)",
   'I’ll Stand By You',
   "I'll Stand By You (Girls Aloud song)",
   "I'll Stand By You (song)",
   "I'll Stand By You",
   'Ill stand by you'],
  'MatchedWikiEntityName': "I'll Stand by You",
  'NormalizedAliases': ['i ll stand by you carrie underwood song',
   'i ll stand by you song',
   'ill stand by you',
   'i ll stand by you',
   'i ll stand by you pretenders song',
   'i ll stand by you girls aloud song'],
  'NormalizedMatchedWikiEntityName': 'i ll stand by you',
  'NormalizedValue': 'i ll stand by you',
  'Type': 'WikipediaEntity',
  'Value': 'I’ll Stand By You'},
 'EntityPages': [{'DocSource': 'TagMe',
   'Filename': 'The_Pretenders.txt',
   'Title': 'The Pretenders'},
  {'DocSource': 'TagMe',
   'Filename': 'Girls_Aloud.txt',
   'Title': 'Girls Aloud'},
  {'DocSource': 'Search',
   'Filename': "I'll_Stand_by_You.txt",
   'Title': "I'll Stand

## Data Cleaning

**Context needed**: each question only references its evidence files by name, so we read the text of those files and attach it to every entry in the subset list.

In [113]:
def get_context(sample, base_dir):
    """Attach the text of a sample's evidence pages under ``sample["Context"]``.

    Each entry of ``sample["EntityPages"]`` names a file through its
    ``"Filename"`` key; the file is looked up under
    ``<base_dir>/evidence/wikipedia/``. Files that are not present are
    skipped, so ``Context`` may be shorter than ``EntityPages`` (or empty).

    Args:
        sample: Example dict with an ``"EntityPages"`` list. Mutated in place.
        base_dir: Dataset root, as a ``pathlib.Path`` or a plain string.

    Returns:
        The same ``sample`` object, now carrying a ``"Context"`` list of strings.
    """
    # Coerce once so a string root also works with the `/` operator below.
    evidence_dir = Path(base_dir) / 'evidence' / 'wikipedia'
    contexts = []
    for page in sample["EntityPages"]:
        ctx_path = evidence_dir / page["Filename"]
        # is_file() (rather than exists()) also skips a directory of that name,
        # which read_text could not open.
        if ctx_path.is_file():
            # errors='ignore' drops undecodable bytes instead of raising.
            contexts.append(
                ctx_path.read_text(encoding='utf-8', errors='ignore')
            )
    sample["Context"] = contexts
    return sample


# Attach contexts to every example in the subset. get_context writes the
# "Context" key onto each dict in place, so its return value is not needed here.
for sample in subset:
    get_context(sample, base_dir)

In [115]:
# Check the keys of the first record after contexts were attached.
subset[0].keys()

dict_keys(['Answer', 'EntityPages', 'Question', 'QuestionId', 'QuestionSource', 'Context'])

**Keeping only important features**

In [116]:
def simplify_sample(sample):
    """
    Reduce a full TriviaQA sample to the three fields used for modeling.

    Returns a new dict with:
      - question: the value of sample['Question']
      - context : the first entry of sample['Context'], or "" when the key
                  is missing or the list is empty
      - answer  : the value of sample['Answer']['Value']
    """
    question_text = sample['Question']

    # Fall back to an empty string when no context passage is available.
    passages = sample.get('Context', [])
    if not passages:
        first_passage = ""
    else:
        first_passage = passages[0]

    answer_text = sample['Answer']['Value']

    return dict(question=question_text, context=first_passage, answer=answer_text)


# Reduce every record in the subset to its question / context / answer fields.
cleaned = list(map(simplify_sample, subset))



## Embeddings

In [119]:
!pip install --quiet gensim

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.6/38.6 MB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.26.4 which is incompatible.[0m[31m
[0m

In [121]:
# Split the cleaned records into three parallel lists, one per field.
questions, contexts, answers = (
    [record[field] for record in cleaned]
    for field in ('question', 'context', 'answer')
)

# A single tokenizer is fitted on all three fields so they share one vocabulary.
tokenizer = Tokenizer(num_words=20000, oov_token='[UNK]')
tokenizer.fit_on_texts(questions + contexts + answers)

# Convert each field to sequences of integer word ids, in the same order as above.
q_seqs, c_seqs, a_seqs = map(
    tokenizer.texts_to_sequences, (questions, contexts, answers)
)

In [123]:
# Inspect the integer id sequence produced for the first question.
q_seqs[0]

[12, 182, 136, 254, 448, 1550, 9927, 8860, 8538, 681, 8987, 6064, 4, 6250]

**Load the GloVe dictionary**

In [133]:
# Fetch the GloVe 6B archive through Keras' download helper (kept as a zip, not auto-extracted).
glove_zip = tf.keras.utils.get_file(
    fname="glove.6B.zip",
    origin="http://nlp.stanford.edu/data/glove.6B.zip",
    extract=False
)
glove_dir = os.path.dirname(glove_zip)

# Only the 100-dimensional vectors are used, so extract that single member.
target = "glove.6B.100d.txt"
glove_path = os.path.join(glove_dir, target)

# Open the archive only when the extracted file is missing, and fail here with a
# clear message if the member is absent instead of leaving glove_path pointing
# at a file that does not exist.
if not os.path.exists(glove_path):
    with zipfile.ZipFile(glove_zip, 'r') as z:
        if target not in z.namelist():
            raise FileNotFoundError(f"{target} not found inside {glove_zip}")
        z.extract(target, path=glove_dir)


Downloading data from http://nlp.stanford.edu/data/glove.6B.zip
[1m862182613/862182613[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m159s[0m 0us/step


NameError: name 'zipfile' is not defined

**Creating embeddings index (mapping words to vectors)**

In [136]:
# Parse the GloVe text file into a word -> vector mapping: the first
# space-separated field of each line is the word, the rest are its components.
embeddings_index = {}
with open(glove_path, 'r', encoding='utf-8') as glove_file:
    for raw_line in glove_file:
        token, *coefficients = raw_line.rstrip().split(" ")
        embeddings_index[token] = np.asarray(coefficients, dtype='float32')

TypeError: 'tuple' object is not callable

**Creating our look-up table (embedding matrix)**

In [143]:
# Number of rows the embedding table needs. The tokenizer was created with a
# num_words cap and texts_to_sequences only emits ids below that cap, so rows
# beyond it would never be looked up. Fall back to the full vocabulary
# (+1 because word ids start at 1; id 0 is left for padding) when no cap is set.
full_vocab_size = len(tokenizer.word_index) + 1
vocab_size = min(tokenizer.num_words or full_vocab_size, full_vocab_size)
emb_dim = 100   # length of each GloVe vector (glove.6B.100d)

# One ROW per word id. Rows for words missing from GloVe keep this small random
# initialisation; a seeded generator makes them identical across runs.
embedding_matrix = np.random.default_rng(42).normal(scale=0.01, size=(vocab_size, emb_dim))

In [144]:
# Copy the pretrained vector into the row of every tokenizer word that GloVe
# knows, skipping ids that fall outside the matrix.
for word, idx in tokenizer.word_index.items():
    pretrained = embeddings_index.get(word)
    if idx < vocab_size and pretrained is not None:
        embedding_matrix[idx] = pretrained

In [150]:
# Sanity check: look up the word stored at id 2 and print its row of the
# embedding matrix, to compare by eye against the GloVe vector for that word.
word = tokenizer.index_word[2]
print(word)
print(embedding_matrix[2])

the
[-0.038194   -0.24487001  0.72812003 -0.39961001  0.083172    0.043953
 -0.39140999  0.3344     -0.57545     0.087459    0.28786999 -0.06731
  0.30906001 -0.26383999 -0.13231    -0.20757     0.33395001 -0.33848
 -0.31742999 -0.48335999  0.1464     -0.37303999  0.34577     0.052041
  0.44946    -0.46970999  0.02628    -0.54154998 -0.15518001 -0.14106999
 -0.039722    0.28277001  0.14393     0.23464    -0.31020999  0.086173
  0.20397     0.52623999  0.17163999 -0.082378   -0.71787    -0.41531
  0.20334999 -0.12763     0.41367     0.55186999  0.57907999 -0.33476999
 -0.36559001 -0.54856998 -0.062892    0.26583999  0.30204999  0.99774998
 -0.80480999 -3.0243001   0.01254    -0.36941999  2.21670008  0.72201002
 -0.24978     0.92136002  0.034514    0.46744999  1.10790002 -0.19358
 -0.074575    0.23353    -0.052062   -0.22044     0.057162   -0.15806
 -0.30798    -0.41624999  0.37972     0.15006    -0.53211999 -0.20550001
 -1.25259995  0.071624    0.70564997  0.49744001 -0.42063001  0.2614

**Create embedding layer**

In [153]:
# Lookup layer seeded with embedding_matrix; created with trainable=False and
# mask_zero=True.
embedding_layer = Embedding(
    vocab_size,                  # input_dim
    emb_dim,                     # output_dim
    name='glove_embedding',
    trainable=False,
    mask_zero=True,
    weights=[embedding_matrix],
)