All of this is ran in a docker container using the following image:

nvcr.io/nvidia/tensorflow:23.12-tf2-py3

In [1]:
import os
import sys
import numpy as np
from tqdm import tqdm

# Add root directory (one level up from notebooks/)
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

download and install wikiextractor

In [6]:
if not os.path.isdir(r"../wikiextractor-master"):
    # Step 1: Download the ZIP file
    !curl -L -o ../wikiextractor.zip https://github.com/attardi/wikiextractor/archive/refs/heads/master.zip

    # Step 2: Extract it
    import zipfile
    import os

    zip_path = r"../wikiextractor.zip"
    extract_to = r"../"

    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)

    # Step 3: Delete the ZIP file
    os.remove(zip_path)

    # Step 4: Install Wikiextractor
    !pip install -e ../wikiextractor-master
else:
    print("Wikiextractor already exists")

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 49444    0 49444    0     0  58443      0 --:--:-- --:--:-- --:--:-- 58443
Obtaining file:///opt/files/Capstone/WikipediaNLP/wikiextractor-master
  Preparing metadata (setup.py) ... [?25ldone
[?25hInstalling collected packages: wikiextractor
  Running setup.py develop for wikiextractor
Successfully installed wikiextractor-3.0.7
[0m

Get wikipedia dump (takes like 2 hours to download)

In [4]:
os.makedirs("../data/raw", exist_ok=True)
if not os.path.isfile(r"../data/raw/enwiki-latest-pages-articles.xml.bz2"):
    !wget -P ../data/raw https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2
else:
    print("Wikipedia dump already downloaded")

Wikipedia dump already downloaded


Extract xml data from wikidump

In [7]:
if not os.path.isdir(r"../data/raw/extracted_wikidata"):
    !python -m wikiextractor.WikiExtractor ../data/raw/enwiki-latest-pages-articles.xml.bz2 -o ../data/raw/extracted_wikidata --no-templates
else:
    print("Wikipedia XML extract already exists")

INFO: Starting page extraction from ../data/raw/enwiki-latest-pages-articles.xml.bz2.
INFO: Using 11 extract processes.
INFO: Extracted 100000 articles (859.4 art/s)
INFO: Extracted 200000 articles (1178.7 art/s)
INFO: Extracted 300000 articles (1529.5 art/s)
INFO: Extracted 400000 articles (1920.3 art/s)
INFO: Extracted 500000 articles (2589.3 art/s)
INFO: Extracted 600000 articles (2087.0 art/s)
INFO: Extracted 700000 articles (2244.5 art/s)
INFO: Extracted 800000 articles (2306.4 art/s)
INFO: Extracted 900000 articles (2479.6 art/s)
INFO: Extracted 1000000 articles (2615.2 art/s)
INFO: Extracted 1100000 articles (2713.7 art/s)
INFO: Extracted 1200000 articles (2815.1 art/s)
INFO: Extracted 1300000 articles (2705.0 art/s)
INFO: Extracted 1400000 articles (2814.9 art/s)
INFO: Extracted 1500000 articles (2917.8 art/s)
INFO: Extracted 1600000 articles (3070.0 art/s)
INFO: Extracted 1700000 articles (3071.0 art/s)
INFO: Extracted 1800000 articles (3152.4 art/s)
INFO: Extracted 1900000 ar

Create json data from wiki-dump

In [6]:
from utils.data_prep import traverse_directory

input_dir = r'../data/raw/extracted_wikidata'
output_dir = r'../data/processed/wikidata_json'

if not os.path.isdir(output_dir):
    traverse_directory(input_dir, output_dir)
else:
    print("wikidata_json already exists")

wikidata_json already exists


download and extract all wiki.sql files needed

In [None]:
if not os.path.isfile(r"../data/raw/enwiki-latest-page.sql"):
    !wget -P ../data/raw https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-page.sql.gz
    !gunzip ../data/raw/enwiki-latest-page.sql.gz
else:
    print("Wikipedia pages already downloaded")

In [None]:
if not os.path.isfile(r"../data/raw/enwiki-latest-pagelinks.sql"):
    !wget -P ../data/raw https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pagelinks.sql.gz
    !gunzip ../data/raw/enwiki-latest-pagelinks.sql.gz
else:
    print("Wikipedia pagelinks already downloaded")

In [None]:
if not os.path.isfile(r"../data/raw/enwiki-latest-linktarget.sql"):
    !wget -P ../data/raw https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-linktarget.sql.gz
    !gunzip ../data/raw/enwiki-latest-linktarget.sql.gz
else:
    print("Wikipedia linktarget already downloaded")

Create sqlite3 db

In [None]:
from utils.sqlite_lookup import build_linkgraph_sqlite

WIKI_LINK_GRAPH_JSONL_PATH = r"../data/processed/wiki_link_graph_jsonl"
WIKI_LINK_GRAPH_DB_PATH = r"../data/processed/wiki_link_graph.db"

build_linkgraph_sqlite(WIKI_LINK_GRAPH_JSONL_PATH, WIKI_LINK_GRAPH_DB_PATH)

Create wiki_link_graph_jsonl dir

In [None]:
from utils.link_graph import export_link_graph_to_jsonl

In [None]:
page_sql_path = "../data/raw/enwiki-latest-page.sql"
pagelinks_sql_path = "../data/raw/enwiki-latest-pagelinks.sql"
linktarget_sql_path = "../data/raw/enwiki-latest-linktarget.sql"
jsonl_output_path = "../data/processed/wiki_link_graph_jsonl"

export_link_graph_to_jsonl(page_sql_path, pagelinks_sql_path, linktarget_sql_path, jsonl_output_path)

Initialize spark session for metadata and create random triplets

Create metadata from articles

Create Training data of random triplets using Pyspark from json wikidata

Had to install winutils for hadoop and pyspark to work on windows locally.

In [None]:
input_dir = r"../data/processed/wikidata_jsonl"
output_dir = r"../data/processed/triplets/parts"
metadata_path = r"../data/custom_model/article_metadata.json"

In [None]:
from utils.spark_functions import *
from pyspark.sql import SparkSession

# Initialize Spark session
spark = SparkSession.builder \
        .appName("Capstone") \
        .master("local[*]") \
        .config("spark.driver.memory", "20g") \
        .config("spark.sql.shuffle.partitions", "100") \
        .config("spark.local.dir", "../spark-temp") \
        .config("spark.driver.maxResultSize", "2g") \
        .getOrCreate()

# Calculate total lines so that we can determine epoch size
total_lines = spark.read.json(f"../data/processed/triplets/parallel_parts/*.jsonl", multiLine=False).count()
print("Total lines across training files:", total_lines)

# Stop spark
spark.stop()

                                                                                

Total lines across training files: 15123359


In [None]:
from utils.triplet_mining import load_articles_titles_only, mine_triplets_from_file, load_faiss_index
from multiprocessing import Pool
from tqdm import tqdm

os.makedirs(TRIPLET_OUTPUT_DIR, exist_ok=True)
article_titles = load_articles_titles_only(TITLES_JSON)
index, all_texts, text_to_meta = load_faiss_index(FAISS_INDEX_PATH, FAISS_META_PATH)

files = [os.path.join(root, file)
        for root, _, filenames in os.walk(WIKI_DATA_DIR_JSONL)
        for file in filenames if file.endswith(".jsonl")]

with Pool(processes=3, maxtasksperchild=10) as pool:
    results = list(tqdm(pool.imap_unordered(mine_triplets_from_file, 
                    files, 
                    chunksize=2), 
                    total=len(files), 
                    desc="Parallel Triplet Mining"))

print(f"Triplet Writing Complete")

Training embedding model using Mined Triplets Data

In [4]:
import tensorflow as tf
from keras.callbacks import EarlyStopping, ModelCheckpoint
from utils.custom_embedder import create_vectorizer, load_triplet_dataset_streamed, CustomEncoder, TripletTrainer

2025-06-04 03:24:47.660623: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-04 03:24:47.761044: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9360] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-06-04 03:24:47.761133: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-06-04 03:24:47.761365: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1537] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-06-04 03:24:47.803064: I tensorflow/core/platform/cpu_feature_g

In [None]:
input_dir = "../data/processed/triplets/parallel_parts"
vectorizer_dir = "../data/custom_model/saved_vectorizer"
vocab_size = 30000
max_len = 32
embed_dim = 128
num_heads = 8
ff_dim = 256
batch_size = 512
num_epochs = 30
learning_rate = 1e-4
total_lines = 15123359
weights_dir = "../data/custom_model"

# Load or create vectorizer
if os.path.exists(vectorizer_dir):
    print("Loading saved vectorizer")
    vectorizer = tf.keras.models.load_model(vectorizer_dir)
else:
    vectorizer = create_vectorizer(input_dir, vectorizer_dir, vocab_size, max_len)

# Load dataset
train_dataset = load_triplet_dataset_streamed(input_dir, vectorizer, batch_size)

# Model setup
encoder = CustomEncoder(vocab_size, max_len, embed_dim, num_heads, ff_dim)
trainer = TripletTrainer(encoder)
trainer.compile(optimizer=tf.keras.optimizers.Adam(learning_rate))

# Callbacks
callbacks = [
    EarlyStopping(monitor="loss", patience=2),
    ModelCheckpoint(
        filepath=f"{weights_dir}/best_encoder.weights.h5",
        monitor="loss",
        save_best_only=True,
        save_weights_only=True
    )
]
# Training
trainer.fit(
    train_dataset.repeat(),  # infinite generator
    steps_per_epoch=total_lines // batch_size,
    epochs=num_epochs,
    callbacks=callbacks
)

print("Training complete and weights saved.")

Embed and Index Wikidata

In [6]:
input_dir = "../data/processed/triplets/parallel_parts"
vectorizer_dir = "../data/custom_model/saved_vectorizer"
weights_dir = "../data/custom_model"

In [8]:
vocab_size = 30000
max_len = 32
embed_dim = 128
num_heads = 8
ff_dim = 256
batch_size = 512
num_epochs = 30
learning_rate = 1e-4
total_lines = 15123359
weights_dir = "../data/custom_model"

encoder = CustomEncoder(vocab_size, max_len, embed_dim, num_heads, ff_dim, num_layers=2)
encoder.load_weights(f"{weights_dir}/best_encoder.weights.h5")

2025-06-04 02:51:25.619531: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2025-06-04 02:51:25.621564: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2025-06-04 02:51:25.621589: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2025-06-04 02:51:25.623775: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2025-06-04 02:51:25.623802: I tensorflow/compile

In [10]:
anchors = set()
files = os.listdir(input_dir)
for file in tqdm(files, desc="Loading Anchors"):
    if not file.endswith(".jsonl"):
        continue
    with open(os.path.join(input_dir, file), "r", encoding="utf-8") as f:
        for line in f:
            triplet = json.loads(line)
            anchors.add(triplet["anchor"])

# Save to disk
os.makedirs("../data/custom_model/embeddings_output", exist_ok=True)
with open("../data/custom_model/embeddings_output/unique_anchors.txt", "w", encoding="utf-8") as out_f:
    for anchor in tqdm(sorted(anchors), desc="Saving Unique Anchors"):
        out_f.write(anchor + "\n")


Loading Anchors: 100%|██████████| 16425/16425 [07:30<00:00, 36.43it/s] 
Saving Embeddings: 100%|██████████| 14882224/14882224 [01:54<00:00, 130485.63it/s]


In [11]:
def anchor_batch_generator(filepath, batch_size):
    with open(filepath, "r", encoding="utf-8") as f:
        batch = []
        for line in f:
            batch.append(line.strip())
            if len(batch) == batch_size:
                yield batch
                batch = []
        if batch:
            yield batch

In [17]:
vectorizer = tf.keras.models.load_model(vectorizer_dir)
os.makedirs("../data/custom_model/embeddings_output/chunks", exist_ok=True)

batch_size = 4096
for i, anchor_batch in enumerate(tqdm(anchor_batch_generator(
        "../data/custom_model/embeddings_output/unique_anchors.txt", batch_size),
        total=len(anchors) // batch_size,
        desc="Encoding Anchors")):
    
    anchor_batch_tensor = tf.constant(anchor_batch)
    tokenized = vectorizer(anchor_batch_tensor)
    embeddings = encoder(tokenized, training=False).numpy()
    
    if i % 100 == 0 and i != 0:
        np.save(f"../data/custom_model/embeddings_output/chunks/embeddings_chunk_{i//100}.npy", embeddings)
    if i == len(anchors) // batch_size:
        np.save(f"../data/custom_model/embeddings_output/chunks/embeddings_chunk_{(i//100)+1}.npy", embeddings)



Encoding Anchors: 3634it [11:29,  5.27it/s]                          


Create faiss index if not already existing

In [2]:
from utils.faiss_index import create_faiss_index_from_dir

embedding_path = "../data/custom_model/embeddings_output/chunks"
faiss_path = "../data/custom_model/faiss/faiss_index.index"

# Creates embeddings and faiss.index
create_faiss_index_from_dir(embedding_path, faiss_path)

Found 37 embedding files. Building FAISS index...


Processing embedding files: 100%|██████████| 37/37 [00:00<00:00, 56.22it/s]


FAISS index saved.


In [16]:
vectorizer = tf.keras.models.load_model(vectorizer_dir)

# Looad the weights
vocab_size = 30000
max_len = 32
embed_dim = 128
num_heads = 8
ff_dim = 256
batch_size = 512
num_epochs = 30
learning_rate = 1e-4

encoder = CustomEncoder(vocab_size, max_len, embed_dim, num_heads, ff_dim, num_layers=2)
encoder.load_weights(f"{weights_dir}/best_encoder.weights.h5")

# Your query
query = "Who was involved in World War 2?"

query_seq = vectorizer(tf.constant([query]))
query_embedding = encoder(query_seq).numpy()
query_embedding = query_embedding.astype(np.float32)



In [17]:
from utils.faiss_index import query_faiss
import json

faiss_path = "../data/custom_model/faiss/faiss_index.index"
indices = query_faiss(faiss_path, query_embedding, 10)

# Load article metadata
metadata_path = "../data/custom_model/article_metadata.json"
with open(metadata_path, encoding='utf-8') as f:
    metadata = json.load(f)

# Retrieve top-k articles
results = [metadata[i] for i in indices[0]]

In [18]:
from sklearn.metrics.pairwise import cosine_similarity

embedding_chunk_dir = "../data/custom_model/embeddings_output/chunks"
chunk_files = sorted([os.path.join(embedding_chunk_dir, f) for f in os.listdir(embedding_chunk_dir) if f.endswith(".npy")])
chunk_size = 4096

# Flatten FAISS indices into one list
faiss_indices = indices[0].tolist()

# Map: index -> (chunk_idx, local_offset)
index_map = {
    idx: (idx // chunk_size, idx % chunk_size)
    for idx in faiss_indices
}

# Load only the needed chunks
required_chunks = sorted(set(chunk_idx for chunk_idx, _ in index_map.values()))
chunk_data = {}

for chunk_idx in required_chunks:
    chunk_path = chunk_files[chunk_idx]
    chunk_data[chunk_idx] = np.load(chunk_path)

# Reconstruct article embeddings
article_embeddings = {
    idx: chunk_data[chunk_idx][offset]
    for idx, (chunk_idx, offset) in index_map.items()
}

# Load metadata aligned to the full dataset
with open("../data/custom_model/article_metadata.json", "r", encoding="utf-8") as f:
    metadata = json.load(f)

# Normalize query vector
query_vec = query_embedding[0].reshape(1, -1)

# Collect article dicts for reranking
top_articles = [
    metadata[i] | {"vec": article_embeddings[i]}
    for i in faiss_indices
]

# Rerank using cosine similarity
top_articles.sort(
    key=lambda x: cosine_similarity(query_vec, x["vec"].reshape(1, -1))[0][0],
    reverse=True
)

In [19]:
import pprint

# Final top-k results
final_results = [
    {
        "title": article["title"],
        "url": article.get("url", "N/A")
    }
    for article in top_articles#[:5]
]

pprint.pprint(final_results)

[{'title': 'Huntly, New Zealand',
  'url': 'https://en.wikipedia.org/wiki?curid=264191'},
 {'title': 'Halfway, Oregon',
  'url': 'https://en.wikipedia.org/wiki?curid=130677'},
 {'title': 'Growth stock', 'url': 'https://en.wikipedia.org/wiki?curid=426130'},
 {'title': 'Polycystine', 'url': 'https://en.wikipedia.org/wiki?curid=320877'},
 {'title': 'Voices Carry (album)',
  'url': 'https://en.wikipedia.org/wiki?curid=1056233'},
 {'title': 'House, New Mexico',
  'url': 'https://en.wikipedia.org/wiki?curid=125952'},
 {'title': 'Red Hill, South Carolina',
  'url': 'https://en.wikipedia.org/wiki?curid=134536'},
 {'title': 'RKKY interaction',
  'url': 'https://en.wikipedia.org/wiki?curid=1227105'},
 {'title': 'Diplomatic mission',
  'url': 'https://en.wikipedia.org/wiki?curid=8970'},
 {'title': 'Elizabeth Proctor',
  'url': 'https://en.wikipedia.org/wiki?curid=1047314'}]


In [20]:
from utils.top_k_testing import evaluate_all_metrics, retrieval_function, load_test_set

test_set = load_test_set("../data/test_data/test_queries.json")
results = evaluate_all_metrics(test_set, retrieval_function)

print("Evaluation Metrics:")
for metric, value in results.items():
    print(f"{metric}: {value}")

FileNotFoundError: [Errno 2] No such file or directory: '../data/test_data/test_queries.json'