# Initial Setup

In [1]:
!pip install -q pydantic evaluate transformers datasets accelerate bitsandbytes peft trl wandb
!pip install -q torch torchinfo sentence-transformers faiss-cpu chromadb whoosh
!pip install -q scikit-learn matplotlib seaborn pandas numpy
!pip install transformers

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m544.8/544.8 kB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m30.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.8/19.8 MB[0m [31m59.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.8/468.8 kB[0m [31m21.6 MB/s[0m eta [3

In [8]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import List, Dict, Tuple, Optional, Any, Union
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict, Counter
import time
import math

# Pydantic for data validation
from pydantic import BaseModel, Field, field_validator, ConfigDict

# HuggingFace ecosystem
from transformers import (
    AutoTokenizer,
    AutoModel,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer
)
from datasets import Dataset as HFDataset
import evaluate  # HF evaluate library for metrics

# PEFT for parameter-efficient fine-tuning
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training

# TRL for RLHF and instruction tuning
from trl import SFTTrainer, DPOTrainer

# Accelerate for distributed training
from accelerate import Accelerator

# 8-bit optimization
import bitsandbytes as bnb

# IR libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import faiss  # Facebook AI Similarity Search for efficient nearest neighbor

# Experiment tracking
import wandb

# Reproducibility: fix the NumPy and PyTorch RNG seeds
np.random.seed(42)
torch.manual_seed(42)

# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

# Report the name and total memory of GPU 0 when running on CUDA
if device.type == 'cuda':
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")



Using device: cpu


# Data Structures and Pydantic Validation

In [9]:
class Document(BaseModel):
  """A single text document, validated with pydantic v2."""

  # Reject unknown fields and re-validate whenever an attribute is reassigned.
  model_config = ConfigDict(extra='forbid', validate_assignment=True)

  doc_id: str = Field(..., min_length=1, description="Document identifier")
  title: str = Field(..., min_length=1, description="Document title")
  content: str = Field(..., min_length=1, description="Document text content")
  metadata: Dict[str, Any] = Field(default_factory=dict, description="Additional metadata")

  @field_validator('content')
  @classmethod
  def validate_content(cls, v: str) -> str:
    """Return the content with surrounding whitespace removed.

    Raises:
      ValueError: if nothing is left after stripping.
    """
    stripped = v.strip()
    if stripped == "":
      raise ValueError("Content cannot be empty")
    return stripped

  def get_tokens(self) -> List[str]:
    """Return the lowercased content split on whitespace."""
    lowered = self.content.lower()
    return lowered.split()


class Query(BaseModel):
  """A search query, validated with pydantic."""

  query_id: str = Field(..., min_length=1)
  text: str = Field(..., min_length=1)
  metadata: Dict[str, Any] = Field(default_factory=dict)

  def get_tokens(self) -> List[str]:
    """Return the lowercased query text split on whitespace."""
    lowered = self.text.lower()
    return lowered.split()


class DocumentCollection(BaseModel):
  """Ordered collection of validated `Document` objects.

  Attributes:
    documents: The documents, in insertion order. Defaults to a new empty
      list, so `DocumentCollection()` starts out empty.
  """

  # The keyword must be spelled `default_factory`. Misspelled, it is not
  # applied as a default, so the empty-list default never took effect.
  documents: List[Document] = Field(default_factory=list)

  def add_document(self, doc: Document) -> None:
    """Append an already-validated document to the collection."""

    self.documents.append(doc)


  def get_vocabulary(self) -> List[str]:
    """Return the sorted list of unique tokens across all documents.

    Tokens come from `Document.get_tokens()` of each stored document.
    """

    vocab = set()
    for doc in self.documents:
      vocab.update(doc.get_tokens())  # add every token of this document

    # sorted() accepts any iterable, so no intermediate list is needed
    return sorted(vocab)



## Simple test with the Shakespeare corpus from the book

In [13]:
# Toy corpus: one short bag of words per play, built from (id, title, content, genre, year) rows
shakespeare_docs = [
  Document(
    doc_id=play_id,
    title=play_title,
    content=play_text,
    metadata={"genre": play_genre, "year": play_year}
  )
  for play_id, play_title, play_text, play_genre, play_year in (
    ("as_you_like_it", "As You Like It",
     "battle good fool wit love forest magic", "comedy", 1599),
    ("twelfth_night", "Twelfth Night",
     "good fool wit love comedy mistaken identity", "comedy", 1602),
    ("julius_caesar", "Julius Caesar",
     "battle battle battle good fool war rome politics", "tragedy", 1599),
    ("henry_v", "Henry V",
     "battle battle battle battle good wit war king england", "history", 1599),
  )
]

# Wrap the documents in a collection and report its size and vocabulary
collection = DocumentCollection(documents=shakespeare_docs)
print(f"Created collection with {len(collection.documents)} documents")
print(f"Vocabulary size: {len(collection.get_vocabulary())} unique terms")
print(f"Sample vocabulary: {collection.get_vocabulary()[:5]}")

Created collection with 4 documents
Vocabulary size: 15 unique terms
Sample vocabulary: ['battle', 'comedy', 'england', 'fool', 'forest']


# TF-IDF with scikit-learn

In [30]:
class TFIDFRetriever:
  """TF-IDF retriever over a `DocumentCollection`, built on scikit-learn.

  Call `fit()` to build the TF-IDF matrix, then `search()` to rank the
  collection's documents by cosine similarity to a query.
  """

  def __init__(self, collection: DocumentCollection):
    """Store the collection and configure the (not yet fitted) vectorizer."""
    self.collection = collection
    self.vectorizer = TfidfVectorizer(
      lowercase=True,  # convert to lowercase
      max_features=1000, # max vocab size
      ngram_range=(1,2), # use unigrams and bigrams
      sublinear_tf=True, # 1 + log(tf) instead of raw tf
      smooth_idf=True, # add one to document frequencies to avoid division by zero
      norm='l2' # scale each row to unit Euclidean length
    )

    self.tfidf_matrix = None  # populated by fit()
    self.doc_ids = []         # doc ids in matrix row order, populated by fit()


  def fit(self):
    """Build the TF-IDF matrix from the collection's documents.

    Prints each document's content, then the matrix shape and the matrix.
    Safe to call repeatedly: `doc_ids` is rebuilt on every call.
    """

    documents = self.collection.documents
    texts = [doc.content for doc in documents]
    # Rebuild rather than append, so a second fit() does not duplicate ids
    self.doc_ids = [doc.doc_id for doc in documents]

    # Show the rows that are actually being indexed
    print("Documents from test")
    for row, text in enumerate(texts):
      print(f"Row {row}: {text}")


    # fit and transform docs to tf idf matrix
    self.tfidf_matrix = self.vectorizer.fit_transform(texts) # [n_docs, n_features]
    print(f"\nself.tfidf_matrix.shape: {self.tfidf_matrix.shape}\n")
    print(f"\nself.tfidf_matrix:\n {self.tfidf_matrix}\n")


  def search(self,query: Query, top_k: int=3) -> List[Tuple[Document,float]]:
    """Rank documents by TF-IDF cosine similarity to `query`.

    Args:
      query: The query whose `text` is vectorized with the fitted vectorizer.
      top_k: Maximum number of results; 0 yields an empty list.

    Returns:
      Up to `top_k` `(document, score)` pairs, highest score first.

    Raises:
      ValueError: if `top_k` is negative.
    """

    if top_k < 0:
      raise ValueError("top_k must be non-negative")

    # Transform qry to tf-idf vector
    query_vector = self.vectorizer.transform([query.text]) # [1, n_features]

    # Compute cos similarity against every document row
    similarities = cosine_similarity(query_vector, self.tfidf_matrix).flatten()

    print(f"\nQuery: '{query.text}'")
    print(f"Query vector shape: {query_vector.shape}")
    print(f"Similarities: {similarities}")

    # Reverse the ascending argsort, then truncate. Slicing with [-top_k:]
    # would return every document when top_k == 0.
    top_indices = np.argsort(similarities)[::-1][:top_k]

    results = []

    for idx in top_indices:
      doc = self.collection.documents[idx]
      score = float(similarities[idx])  # plain float, as the return annotation states
      results.append((doc, score))
      print(f"  Rank {len(results)}: '{doc.title}' (score={score:.4f})")

    return results


# Simple test for the TF-IDF Retriever

In [35]:
# Build the TF-IDF index over `collection` (fit() also prints the matrix)
retriever = TFIDFRetriever(collection)
retriever.fit()


# Get the feature names (what each column represents)
print("\nUnigrams and bigrams:\n")
feature_names = retriever.vectorizer.get_feature_names_out()
for idx, feature in enumerate(feature_names):
    print(f"Column {idx}: '{feature}'")


# Run a sample two-term query and keep the top 3 (document, score) pairs
test_query = Query(query_id="q1", text="battle war")
results = retriever.search(test_query, top_k=3)

Documents from test
Row 0: battle good fool wit love forest magic
Row 1: good fool wit love comedy mistaken identity
Row 2: battle battle battle good fool war rome politics
Row 3: battle battle battle battle good wit war king england

self.tfidf_matrix.shape: (4, 32)


self.tfidf_matrix:
 <Compressed Sparse Row sparse matrix of dtype 'float64'
	with 50 stored elements and shape (4, 32)>
  Coords	Values
  (0, 0)	0.22325169970289915
  (0, 11)	0.18252289313304706
  (0, 6)	0.22325169970289915
  (0, 29)	0.22325169970289915
  (0, 17)	0.2757602638693091
  (0, 9)	0.34976692846571494
  (0, 20)	0.34976692846571494
  (0, 2)	0.22325169970289915
  (0, 12)	0.22325169970289915
  (0, 8)	0.2757602638693091
  (0, 30)	0.2757602638693091
  (0, 19)	0.34976692846571494
  (0, 10)	0.34976692846571494
  (1, 11)	0.17057535199597096
  (1, 6)	0.20863814180702328
  (1, 29)	0.20863814180702328
  (1, 17)	0.25770961257841624
  (1, 12)	0.20863814180702328
  (1, 8)	0.25770961257841624
  (1, 30)	0.25770961257841624
  (1