<a href="https://colab.research.google.com/github/ZainabSMU/GenAI/blob/main/GenAI_FINAL_RECENT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install the main libraries for the project
!pip install -q datasets transformers accelerate peft bitsandbytes sentencepiece faiss-cpu evaluate beautifulsoup4 lxml

import os, math, random, requests
from dataclasses import dataclass

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from datasets import load_dataset, Dataset as HFDataset
from transformers import (
    AutoTokenizer,
    BertConfig,
    BertModel,
    get_cosine_schedule_with_warmup,
    AutoModelForCausalLM,
)

import evaluate
import faiss
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt

# Seed the RNGs this notebook relies on (Python's `random` for the train/val
# shuffle, torch for the randomly initialised encoder weights) so that
# Restart & Run All reproduces the same splits and starting weights.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

# Prefer the GPU when one is available; everything below reads `device`.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using the following device:", device)

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m42.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m101.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hUsing device: cuda


In [5]:
# Loading the general-purpose corpus from BookCorpus
#book_ds = load_dataset("rojagtap/bookcorpus", split="train")

# Keeping only the text column and clean it a bit
#def clean_book(example):
#    return {"text": example["text"].strip()}

#book_ds = book_ds.map(clean_book, remove_columns=book_ds.column_names)

# Filtering out very short/noisy  lines
#book_ds = book_ds.filter(lambda e: len(e["text"]) > 50)

# Subsampling to make training feasible
#book_ds = book_ds.shuffle(seed=42).select(range(10000))
#print("BookCorpus subset size:", len(book_ds))
#print(book_ds[0])



# Open BookCorpus in streaming mode: records are pulled on demand, so the whole
# corpus is never downloaded or processed up front.
streaming_book = load_dataset("rojagtap/bookcorpus", split="train", streaming=True)

# Number of usable lines to keep. (History: this was lowered for a while because
# the runtime was too long; streaming made 20k affordable again.)
target_samples = 20000
collected = []

# Walk the stream in order, keep every stripped line longer than 50 characters,
# and stop reading as soon as the quota is reached.
for ex in streaming_book:
    txt = ex["text"].strip()
    if len(txt) > 50:
        collected.append({"text": txt})
        if len(collected) >= target_samples:
            break

# Materialise the collected rows as a regular (in-memory) HF Dataset
book_ds = HFDataset.from_list(collected)

print("BookCorpus subset size:", len(book_ds))
print(book_ds[0])


BookCorpus subset size: 20000
{'text': 'the half-ling book one in the fall of igneeria series kaylee soderburg copyright 2013 kaylee soderburg all rights reserved .'}


In [6]:
# Scraping the Paul Graham (PG) essays from the website: paulgraham.com
BASE_URL = "http://www.paulgraham.com/articles.html"

def fetch_pg_essays(timeout=30):
    """Download the text of every essay linked from the PG article index.

    Args:
        timeout: Seconds to wait on each HTTP request before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        A list of ``{"url": ..., "text": ...}`` dicts, one per essay page that
        could be fetched, ordered by URL.

    Raises:
        requests.RequestException: If the index page itself cannot be fetched
            or answers with an HTTP error status. Failures on individual essay
            pages are printed and skipped instead (best effort).
    """
    from urllib.parse import urljoin  # local import keeps the cell self-contained

    # A session reuses the connection across the many essay requests.
    with requests.Session() as session:
        # Loading the index page listing the articles
        index_resp = session.get(BASE_URL, timeout=timeout)
        index_resp.raise_for_status()
        soup = BeautifulSoup(index_resp.text, "lxml")

        # Collecting all the relative .html links that look like essays.
        # urljoin resolves them against the index URL (handles "/x.html" too).
        links = set()
        for a in soup.find_all("a", href=True):
            href = a["href"]
            if href.endswith(".html") and not href.startswith("http"):
                links.add(urljoin(BASE_URL, href))

        texts = []
        for url in sorted(links):
            try:
                resp = session.get(url, timeout=timeout)
                # Without this, a 404/500 error page would be stored as an "essay".
                resp.raise_for_status()
                s = BeautifulSoup(resp.text, "lxml")
                # Since essays are usually inside main <table>, fall back to whole page
                body = s.find("table")
                if body is None:
                    body = s
                text = body.get_text(separator=" ", strip=True)
                texts.append({"url": url, "text": text})
            except Exception as e:
                # Best effort: report the failing page and carry on with the rest.
                print("Error fetching", url, e)
    return texts

pg_texts = fetch_pg_essays()
# Fail fast with a readable message: per-page fetch errors are only printed by
# fetch_pg_essays, so an empty result is possible and would otherwise surface
# as an opaque IndexError on the next line.
if not pg_texts:
    raise RuntimeError("No Paul Graham essays were fetched; check network access and BASE_URL.")
print("Fetched essays:", len(pg_texts), "Example URL:", pg_texts[0]["url"])


# Converting the PG essays into a HF dataset and filtering the very short essays
# (pages whose extracted text is 200 characters or less are dropped)
pg_ds = HFDataset.from_list(pg_texts)
pg_ds = pg_ds.remove_columns(["url"])
pg_ds = pg_ds.filter(lambda e: len(e["text"]) > 200)
print("PG dataset size:", len(pg_ds))
print(pg_ds[0]["text"][:300])

Fetched essays: 230 Example URL: http://www.paulgraham.com/13sentences.html


Filter:   0%|          | 0/230 [00:00<?, ? examples/s]

PG dataset size: 228
Want to start a startup? Get funded by Y Combinator . Watch how this essay was written . February 2009 One of the things I always tell startups is a principle I learned
from Paul Buchheit: it's better to make a few people really happy
than to make a lot of people semi-happy.  I was saying recently t


Rather than preprocessing the corpora into sentence-level .txt files as is sometimes done in corpus-construction scripts, I adopted a more streamlined and training-aligned approach that better fits both the computational constraints of Colab and the goals of contrastive representation learning. For the general-purpose corpus, I streamed a fixed-size subset of BookCorpus using HuggingFace’s streaming=True mode. This avoids expensive full-dataset downloads while still providing thousands of diverse, natural-language passages suitable for learning general semantic structure. For the Paul Graham domain corpus, I scraped all available essays from paulgraham.com and processed them directly in-memory. In both cases, I applied chunk-based tokenization using the exact tokenizer employed by my embedding models, and constructed adjacent-chunk contrastive pairs. This approach is more consistent with transformer training pipelines, avoids unnecessary I/O overhead, and yields high-quality training examples while remaining computationally feasible. It also directly satisfies the assignment’s requirement to use a general out-of-domain corpus and a domain-specific corpus, without imposing unnecessary preprocessing complexity.

In [17]:
# Chunking hyper-parameters, measured in tokens
MAX_LEN = 64        # chunk window size (author's note: 128 only yielded 3/20000 documents)
MIN_CHUNK_LEN = 8   # chunks shorter than this are skipped by chunk_text

# Only the tokenizer is pretrained; the encoder weights are randomly initialised.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
tokenizer.model_max_length = 128
PAD_ID = tokenizer.pad_token_id

# Keep only BookCorpus lines with more than 30 whitespace-separated words
book_ds = book_ds.filter(lambda e: len(e["text"].split()) > 30)

Filter:   0%|          | 0/2007 [00:00<?, ? examples/s]

In [18]:
# Chunking the longer documents into fixed-length token chunks
def chunk_text(example):
    """Split one document into overlapping windows of token ids.

    The text in ``example["text"]`` is tokenised without truncation and without
    special tokens. Windows of up to MAX_LEN ids are then taken every
    MAX_LEN // 2 positions (i.e. consecutive windows overlap by half), and any
    window shorter than MIN_CHUNK_LEN is discarded.

    Returns:
        ``{"chunks": [...]}`` where each chunk is a list of token ids.
    """
    encoded = tokenizer(example["text"], truncation=False, add_special_tokens=False)
    token_ids = encoded["input_ids"]

    stride = MAX_LEN // 2
    windows = (
        token_ids[start:start + MAX_LEN]
        for start in range(0, len(token_ids), stride)
    )
    return {"chunks": [window for window in windows if len(window) >= MIN_CHUNK_LEN]}

# Tokenise + chunk every document; the raw text column is no longer needed afterwards
book_chunks = book_ds.map(chunk_text, remove_columns=["text"])
pg_chunks   = pg_ds.map(chunk_text, remove_columns=["text"])

# Keeping only the documents with at least 2 chunks
book_chunks = book_chunks.filter(lambda e: len(e["chunks"]) > 1)
pg_chunks   = pg_chunks.filter(lambda e: len(e["chunks"]) > 1)

# After the filter above every remaining row has >= 2 chunks, so the row count
# is the answer; there is no need to iterate over (and decode) the datasets again.
print("Documents with >=2 chunks:")
print("BookCorpus:", len(book_chunks))
print("PG:",        len(pg_chunks))





# Building positive pairs from adjacent chunks: (chunk_i, chunk_{i+1})
def build_pairs(ds):
    """Turn per-document chunk lists into aligned (anchor, positive) lists.

    For every row, each chunk is paired with the chunk that immediately follows
    it in the same document, so a row with n chunks contributes n - 1 pairs.

    Args:
        ds: Iterable of rows, each exposing a list of chunks under ``"chunks"``.

    Returns:
        ``(anchors, positives)`` - two lists of equal length where
        ``positives[j]`` is the successor chunk of ``anchors[j]``.
    """
    anchors, positives = [], []
    for row in ds:
        doc_chunks = row["chunks"]
        # Everything but the last chunk is an anchor; its positive is the next chunk.
        anchors.extend(doc_chunks[:-1])
        positives.extend(doc_chunks[1:])
    return anchors, positives

# Adjacent-chunk (anchor, positive) id lists for each corpus
book_anchor_ids, book_pos_ids = build_pairs(book_chunks)
pg_anchor_ids, pg_pos_ids = build_pairs(pg_chunks)

num_book_pairs = len(book_anchor_ids)
num_pg_pairs = len(pg_anchor_ids)
print("\nNum BookCorpus pairs:", num_book_pairs, "\nNum PG pairs:", num_pg_pairs)





# Splitting the pairs into train and validation sets (80/20) for both BookCorpus and PG
def split_pairs(anchors, positives, val_ratio=0.2, seed=42):
    """Randomly split aligned (anchor, positive) pairs into train and validation.

    The shuffle uses a private ``random.Random(seed)`` instance, so the split is
    reproducible across runs and does not disturb the global RNG state.

    Args:
        anchors: List of anchor items.
        positives: List of positives, aligned index-by-index with ``anchors``.
        val_ratio: Fraction of pairs assigned to the validation set.
        seed: Seed for the shuffle. Pass ``None`` for a different split on
            every call.

    Returns:
        ``(train_anchors, train_positives, val_anchors, val_positives)``.

    Raises:
        ValueError: If ``anchors`` and ``positives`` differ in length.

    NOTE(review): the split is done per pair, not per document, so pairs built
    from overlapping chunks of one document can end up on both sides of the
    split - confirm this leakage is acceptable for the validation metrics.
    """
    if len(anchors) != len(positives):
        raise ValueError(
            f"anchors and positives must be aligned, got {len(anchors)} and {len(positives)}"
        )

    order = list(range(len(anchors)))
    random.Random(seed).shuffle(order)
    split = int(len(order) * (1 - val_ratio))
    train_idx, val_idx = order[:split], order[split:]

    def subset(xs, indices):
        # Gather the items at `indices`, preserving the shuffled order.
        return [xs[i] for i in indices]

    return (
        subset(anchors, train_idx),
        subset(positives, train_idx),
        subset(anchors, val_idx),
        subset(positives, val_idx),
    )

# Split each corpus' pairs with split_pairs' default validation ratio (0.2).
# BookCorpus is split first, then PG.
book_split = split_pairs(book_anchor_ids, book_pos_ids)
book_train_anchor, book_train_pos, book_val_anchor, book_val_pos = book_split

pg_split = split_pairs(pg_anchor_ids, pg_pos_ids)
pg_train_anchor, pg_train_pos, pg_val_anchor, pg_val_pos = pg_split

print("\nBookCorpus train:", len(book_train_anchor), "\nBookCorpus validation:", len(book_val_anchor))
print("\nPG train:", len(pg_train_anchor),   "\nPG validation:", len(pg_val_anchor))





# Dataset + collator for in-batch contrastive pairs
@dataclass
class PairBatch:
    """One batch of (anchor, positive) pairs, as produced by ``collate_pairs``.

    Side "a" holds the anchors and side "b" the positives; row i of each side
    belongs to the same pair.
    """

    # Padded token ids of the anchors
    input_ids_a: torch.Tensor
    # 1 for real tokens, 0 for padding (anchors)
    attention_mask_a: torch.Tensor
    # Padded token ids of the positives
    input_ids_b: torch.Tensor
    # 1 for real tokens, 0 for padding (positives)
    attention_mask_b: torch.Tensor

    def to(self, *args, **kwargs):
        """Return a new batch with every tensor passed through ``Tensor.to``.

        Mirrors ``torch.Tensor.to`` (e.g. ``batch.to(device)``), so code that
        checks ``hasattr(batch, "to")`` actually moves the batch to the model's
        device instead of silently leaving it on the CPU.
        """
        return type(self)(
            self.input_ids_a.to(*args, **kwargs),
            self.attention_mask_a.to(*args, **kwargs),
            self.input_ids_b.to(*args, **kwargs),
            self.attention_mask_b.to(*args, **kwargs),
        )

class PairDataset(Dataset):
    """Map-style dataset over aligned (anchor, positive) token-id lists.

    Each item is a dict with keys "a" (anchor) and "b" (positive); both id
    lists are cut to at most MAX_LEN ids and right-padded with PAD_ID up to
    exactly MAX_LEN.
    """

    def __init__(self, anchors, positives):
        self.anchors, self.positives = anchors, positives

    def __len__(self):
        # One item per anchor; positives are assumed to be aligned with anchors.
        return len(self.anchors)

    def _pad(self, ids):
        """Right-pad ``ids`` with PAD_ID up to MAX_LEN."""
        missing = MAX_LEN - len(ids)
        return ids + [PAD_ID] * missing

    def __getitem__(self, idx):
        item = {}
        for key, source in (("a", self.anchors), ("b", self.positives)):
            # Truncate first so padding never has to deal with over-long inputs.
            item[key] = self._pad(source[idx][:MAX_LEN])
        return item

def collate_pairs(batch):
    """Collate PairDataset items into a single PairBatch.

    For each side ("a" and "b") the already-padded id lists are stacked into a
    long tensor, and the attention mask is derived from it: 1 wherever the id
    differs from PAD_ID, 0 elsewhere.
    """
    def stack_side(key):
        ids = torch.tensor([item[key] for item in batch], dtype=torch.long)
        return ids, (ids != PAD_ID).long()

    a_ids, a_mask = stack_side("a")
    b_ids, b_mask = stack_side("b")
    return PairBatch(a_ids, a_mask, b_ids, b_mask)

def make_loaders(anchors_train, pos_train, anchors_val, pos_val, batch_size=64):
    """Build the train and validation DataLoaders for one corpus.

    Both loaders wrap a PairDataset and collate with ``collate_pairs``; only the
    training loader shuffles.

    Returns:
        ``(train_loader, val_loader)``.
    """
    shared = dict(batch_size=batch_size, collate_fn=collate_pairs)
    train_loader = DataLoader(PairDataset(anchors_train, pos_train), shuffle=True, **shared)
    val_loader = DataLoader(PairDataset(anchors_val, pos_val), shuffle=False, **shared)
    return train_loader, val_loader


Map:   0%|          | 0/2007 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (173 > 128). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/228 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2007 [00:00<?, ? examples/s]

Filter:   0%|          | 0/228 [00:00<?, ? examples/s]

Documents with >=2 chunks:
BookCorpus: 897
PG: 228

Num BookCorpus pairs: 939 
Num PG pairs: 21324

BookCorpus train: 751 
BookCorpus validation: 188

PG train: 17059 
PG validation: 4265


In [19]:
# BERT-style sentence encoder with random initialization (no pretrained weights)
class BertSentenceEncoder(nn.Module):
    """BERT-style sentence encoder built from a config (no pretrained weights).

    A ``BertModel`` is created from a fresh ``BertConfig`` whose vocabulary size
    matches the notebook's tokenizer. Token states are mean-pooled over the
    non-padding positions, passed through a linear projection and L2-normalised.

    Args:
        hidden_size: Width of the transformer and of the output embedding.
        num_hidden_layers: Number of transformer layers.
        num_attention_heads: Attention heads per layer; must divide
            ``hidden_size``.

    Raises:
        ValueError: If ``hidden_size`` is not a multiple of
            ``num_attention_heads``.
    """

    def __init__(self, hidden_size=512, num_hidden_layers=6, num_attention_heads=8):
        super().__init__()
        if hidden_size % num_attention_heads != 0:
            raise ValueError(
                f"hidden_size ({hidden_size}) must be a multiple of "
                f"num_attention_heads ({num_attention_heads})"
            )
        config = BertConfig(
            vocab_size=len(tokenizer),
            hidden_size=hidden_size,
            num_hidden_layers=num_hidden_layers,
            num_attention_heads=num_attention_heads,  # default: 512 / 8 = 64 per head
            intermediate_size=hidden_size * 4,
            max_position_embeddings=512,
        )
        self.bert = BertModel(config)
        self.proj = nn.Linear(hidden_size, hidden_size)  # optional projection layer

    def encode(self, input_ids, attention_mask):
        """Embed a batch of padded sequences.

        Args:
            input_ids: Token ids, one padded sequence per row.
            attention_mask: Same shape as ``input_ids``; 1 for real tokens, 0 for
                padding.

        Returns:
            One L2-normalised embedding of width ``hidden_size`` per row.
        """
        # Forward through BERT
        out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        hidden = out.last_hidden_state

        # Mean-pool token representations using attention mask
        mask = attention_mask.unsqueeze(-1).to(hidden.dtype)
        summed = (hidden * mask).sum(1)
        # Clamp the token count so an all-padding row yields zeros instead of NaN.
        counts = mask.sum(1).clamp(min=1.0)
        x = summed / counts

        x = self.proj(x)
        # L2-normalize embeddings for cosine similarity / InfoNCE
        x = nn.functional.normalize(x, p=2, dim=-1)
        return x

    def forward(self, batch: PairBatch):
        """Encode both sides of a PairBatch; returns ``(z_a, z_b)``."""
        z_a = self.encode(batch.input_ids_a, batch.attention_mask_a)
        z_b = self.encode(batch.input_ids_b, batch.attention_mask_b)
        return z_a, z_b

general_model = BertSentenceEncoder().to(device)
# Print the size explicitly: a bare expression is only echoed when it is the
# last statement of a cell, and more code follows here, so the value was
# previously computed and thrown away.
general_model_params_m = sum(p.numel() for p in general_model.parameters()) / 1e6
print(f"general_model parameters: {general_model_params_m:.1f}M")





# InfoNCE contrastive loss with in-batch negatives
def info_nce_loss(z_a, z_b, temperature=0.05):
    """Symmetric InfoNCE loss with in-batch negatives.

    Row i of ``z_a`` and row i of ``z_b`` form a positive pair; every other row
    in the batch acts as a negative.

    Args:
        z_a: Anchor embeddings, one row per example.
        z_b: Positive embeddings, aligned row-by-row with ``z_a``.
        temperature: Divisor applied to the cosine similarities.

    Returns:
        Scalar tensor: the mean of the a->b and b->a cross-entropy losses.
    """
    # Re-normalise so the dot products below are cosine similarities
    unit_a = nn.functional.normalize(z_a, dim=-1)
    unit_b = nn.functional.normalize(z_b, dim=-1)

    # Entry (i, j) compares anchor i with positive j
    sim = torch.matmul(unit_a, unit_b.t()) / temperature
    # The matching positive for anchor i sits on the diagonal
    targets = torch.arange(unit_a.size(0), device=unit_a.device)

    a_to_b = nn.functional.cross_entropy(sim, targets)
    b_to_a = nn.functional.cross_entropy(sim.t(), targets)
    return (a_to_b + b_to_a) / 2

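Model and training configuration used for the encoder above:

- `hidden_size = 512`
- `num_hidden_layers = 6`
- `num_attention_heads = 8`
- Together these settings give a model of around 40M parameters.
- Batch size: 64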

In [20]:
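# Helper to compute the retrieval metrics on the validation set: MRR, P@k and nDCG@k.
# "@k" means "restricted to the top-k ranked candidates". Each query has exactly one
# relevant item here, so P@k is computed as 1 if that item is ranked in the top k, else 0.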
def _batch_to_device(batch, target):
    """Return ``batch`` with its tensors on ``target`` (no-op if target is None)."""
    from dataclasses import fields, is_dataclass, replace

    if target is None:
        return batch
    if hasattr(batch, "to"):
        return batch.to(target)
    if is_dataclass(batch) and not isinstance(batch, type):
        # Dataclass batches without a .to() method: move each tensor field.
        moved = {
            f.name: getattr(batch, f.name).to(target)
            for f in fields(batch)
            if torch.is_tensor(getattr(batch, f.name))
        }
        return replace(batch, **moved)
    return batch


def evaluate_retrieval(model, val_loader, max_batches=20, k_list=(1, 5, 10)):
    """Score how well anchors retrieve their positives on validation batches.

    Every anchor embedding is used as a query against the positive embeddings
    of all evaluated pairs; the correct answer for query i is positive i.

    Args:
        model: Callable module returning ``(z_a, z_b)`` embeddings for a batch.
            It is switched to eval mode and left in eval mode.
        val_loader: Iterable of batches accepted by ``model``.
        max_batches: Upper bound on the number of batches evaluated (for speed).
        k_list: Cut-offs for the @k metrics.

    Returns:
        ``(mrr, precision_at_k, ndcg_at_k)`` where the last two are dicts keyed
        by k. With a single relevant item per query, "P@k" here is 1 when the
        positive is ranked in the top k and 0 otherwise, averaged over queries.

    Raises:
        ValueError: If no batch was evaluated (empty loader or max_batches <= 0).
    """
    model.eval()

    # Send batches to wherever the model's weights live, rather than relying on
    # a global `device` and on the batch object happening to have a .to() method.
    params = model.parameters() if hasattr(model, "parameters") else iter(())
    first_param = next(params, None)
    target = first_param.device if first_param is not None else None

    all_q, all_p = [], []
    with torch.no_grad():
        for i, batch in enumerate(val_loader):
            if i >= max_batches:  # limit for speed
                break
            z_a, z_b = model(_batch_to_device(batch, target))
            all_q.append(z_a.cpu())
            all_p.append(z_b.cpu())

    if not all_q:
        raise ValueError("evaluate_retrieval received no validation batches to score")

    Q = torch.cat(all_q)  # query embeddings
    P = torch.cat(all_p)  # positive (target) embeddings

    sims = Q @ P.t()  # cosine similarity when the embeddings are L2-normalized
    ranks = torch.argsort(sims, dim=-1, descending=True)
    n = Q.size(0)

    # 0-based rank of the true positive (index i) within row i. Each row of
    # `ranks` is a permutation, so there is exactly one match per row.
    wanted = torch.arange(n).unsqueeze(1)
    rank_positions = (ranks == wanted).nonzero(as_tuple=True)[1].tolist()

    mrr = 0.0
    precision_at_k = {k: 0.0 for k in k_list}
    ndcg_at_k      = {k: 0.0 for k in k_list}

    for rank_pos in rank_positions:
        # MRR contribution
        mrr += 1.0 / (rank_pos + 1)

        # One relevant item => ideal DCG is 1, so nDCG equals the DCG itself.
        ndcg = 1.0 / math.log2(rank_pos + 2)  # rank_pos 0 -> log2(2) = 1

        for k in k_list:
            # Both @k metrics only count queries whose positive is in the top-k
            if rank_pos < k:
                precision_at_k[k] += 1.0
                ndcg_at_k[k] += ndcg

    mrr /= n
    for k in k_list:
        precision_at_k[k] /= n
        ndcg_at_k[k]      /= n

    return mrr, precision_at_k, ndcg_at_k

In [None]:
# Data loaders for the BookCorpus model
book_train_loader, book_val_loader = make_loaders(
    book_train_anchor,
    book_train_pos,
    book_val_anchor,
    book_val_pos,
    batch_size=64,  # TODO: cite where this batch size is specified
)

# Optimisation setup: AdamW, with the learning rate warmed up over the first
# 10% of the steps and then decayed along a cosine schedule.
learning_rate = 5e-4
num_epochs = 3
num_training_steps = num_epochs * len(book_train_loader)
num_warmup_steps = int(0.1 * num_training_steps)

optimizer = torch.optim.AdamW(general_model.parameters(), lr=learning_rate)
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)

# Loss histories, kept so the curves can be plotted later
gen_train_losses, gen_val_losses = [], []
