In [4]:
# Path of the PDF this notebook is meant to process.
# NOTE(review): no cell visible here reads this global -- the functions below
# take their own `pdf_path` argument and the driver cell at the bottom assigns
# a separate `pdf_file` -- confirm which path is the intended one.
pdf_path = "asda.pdf"

In [5]:
import fitz
import re

def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF as one normalised string.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        The text of all pages in page order, with every run of whitespace
        (spaces, tabs, newlines) collapsed to a single space and with
        leading/trailing whitespace removed. An empty string is returned
        when no page yields any text.

    Raises:
        Any exception raised by ``fitz.open`` or ``page.get_text`` is
        propagated. Once the document has been opened it is always closed
        before the exception leaves this function.
    """
    doc = fitz.open(pdf_path)
    try:
        # Gather the per-page text in a list and join once, instead of
        # growing a string with ``+=`` on every page.
        page_texts = [page.get_text() for page in doc]
    finally:
        # Release the file handle even if a page fails to parse.
        doc.close()

    full_text = "\n".join(page_texts)

    # Collapse all whitespace runs to one space. Newlines are whitespace too,
    # so nothing is left for a separate newline clean-up pass afterwards.
    return re.sub(r'\s+', ' ', full_text).strip()

In [6]:
import warnings
# Install an "ignore" filter for FutureWarning; the `module` argument is a
# regular expression matched against the name of the module that triggers the
# warning, so this targets warnings raised from `colbert` code.
# This cell runs before the cells that import `colbert`.
warnings.filterwarnings('ignore', category=FutureWarning, module='colbert')


In [7]:
def chunk_text(text, chunk_size=512, overlap=128):
    """Split text into overlapping, word-based chunks.

    Args:
        text: Input string. It is tokenised with ``str.split()``, so any
            whitespace separates words and the original spacing is not kept.
        chunk_size: Maximum number of words per chunk; must be at least 1.
        overlap: Number of words shared by two consecutive chunks; must
            satisfy ``0 <= overlap < chunk_size`` so the window always
            advances.

    Returns:
        A list of chunks, each a single-space-joined run of at most
        ``chunk_size`` words. Consecutive chunks start
        ``chunk_size - overlap`` words apart. The last chunk may be shorter.
        An empty list is returned for empty or whitespace-only text.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, "
            f"got overlap={overlap}, chunk_size={chunk_size}"
        )

    words = text.split()
    step = chunk_size - overlap
    chunks = []

    for start in range(0, len(words), step):
        chunks.append(' '.join(words[start:start + chunk_size]))
        # Once a window reaches the end of the text, every later window would
        # be a strict suffix of this one and only add duplicate passages.
        if start + chunk_size >= len(words):
            break

    return chunks

In [8]:
from colbert import Indexer
from colbert.infra import Run, RunConfig
from colbert.data import Queries, Collection

def initialize_colbert(index_name="pdf_index", experiment_name="pdf_experiment"):
    """Assemble a dictionary of ColBERT settings and return it with the index name.

    The dictionary is built while a ColBERT ``Run`` context for
    ``experiment_name`` (single rank) is active.

    NOTE(review): nothing in this function hands the dictionary to ColBERT;
    it is only returned to the caller -- confirm that this is intended.

    Args:
        index_name: Name of the index; returned unchanged.
        experiment_name: Experiment name given to ``RunConfig``.

    Returns:
        A ``(config, index_name)`` tuple where ``config`` is a plain dict.
    """
    run_scope = Run().context(RunConfig(nranks=1, experiment=experiment_name))

    with run_scope:
        settings = {}
        settings['doc_maxlen'] = 512    # max document length in tokens
        settings['query_maxlen'] = 128  # max query length in tokens
        # NOTE(review): confirm 256 matches the checkpoint's embedding size.
        settings['dim'] = 256           # embedding dimension
        settings['similarity'] = 'cosine'
        settings['checkpoint'] = 'colbert-ir/colbertv2.0'  # ColBERT v2.0 weights

    return settings, index_name

In [9]:
def prepare_collection(pdf_path, chunk_size=512, overlap=128, preview_len=100):
    """Turn a PDF into a list of text chunks plus per-chunk metadata.

    Args:
        pdf_path: Path of the PDF; forwarded to ``extract_text_from_pdf``.
        chunk_size: Words per chunk, forwarded to ``chunk_text``.
        overlap: Overlapping words between chunks, forwarded to ``chunk_text``.
        preview_len: Number of leading characters of each chunk stored as a
            preview in the metadata.

    Returns:
        A ``(collection, metadata)`` tuple. ``collection`` is the list of
        chunk strings. ``metadata`` is a parallel list of dicts with the keys
        ``'chunk_id'`` (position of the chunk in ``collection``),
        ``'source'`` (``pdf_path``) and ``'text'`` (the preview).

    NOTE(review): chunks are measured in words while the indexer's
    ``doc_maxlen`` is documented in this notebook as a token limit --
    confirm that 512-word chunks fit within it.
    """
    text = extract_text_from_pdf(pdf_path)
    chunks = chunk_text(text, chunk_size=chunk_size, overlap=overlap)

    collection = list(chunks)
    metadata = []
    for idx, chunk in enumerate(collection):
        preview = chunk[:preview_len]
        # Only mark the preview as truncated when text was actually cut off.
        if len(chunk) > preview_len:
            preview += '...'
        metadata.append({
            'chunk_id': idx,
            'source': pdf_path,
            'text': preview,
        })

    return collection, metadata

In [10]:
from colbert import Indexer
from colbert.infra import Run, RunConfig, ColBERTConfig
import torch

def index_documents(collection, index_name="pdf_index"):
    """Build a ColBERT v2.0 index over a collection of passages.

    Args:
        collection: The passages to index; handed unchanged to
            ``Indexer.index``.
        index_name: Name under which the index is created. An existing index
            with the same name is overwritten.

    Returns:
        The ``Indexer`` instance that built the index.

    Raises:
        ValueError: If ``collection`` is empty, since there is nothing to
            index.
    """
    if not collection:
        raise ValueError("collection is empty; nothing to index")

    # Check if CUDA is available
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")
    if device == "cuda":
        print(f"GPU: {torch.cuda.get_device_name(0)}")

    # Only ask ColBERT for a GPU when one is actually visible, instead of
    # always requesting one regardless of the detected device.
    gpus = 1 if device == "cuda" else 0

    with Run().context(RunConfig(nranks=1, gpus=gpus)):
        config = ColBERTConfig(
            doc_maxlen=512,   # max passage length in tokens
            nbits=2,          # compression bits
            kmeans_niters=4,
            checkpoint='colbert-ir/colbertv2.0'
        )

        indexer = Indexer(checkpoint="colbert-ir/colbertv2.0", config=config)

        # Create the index, replacing any previous index of the same name.
        indexer.index(
            name=index_name,
            collection=collection,
            overwrite=True
        )

        # Report what get_index() returns rather than discarding it.
        print(f"Index stored at: {indexer.get_index()}")

    return indexer

In [11]:
from colbert import Searcher
from colbert.infra import Run, RunConfig, ColBERTConfig
import torch

def search_pdf(query, index_name="pdf_index", k=5):
    """Search a previously built ColBERT index.

    Args:
        query: Query text.
        index_name: Name of the index to search.
        k: Maximum number of passages to return.

    Returns:
        A list of dicts, best match first, each with the keys
        ``'passage_id'``, ``'rank'`` (1-based) and ``'score'``
        (the relevance score reported by ColBERT).
    """
    # Check if CUDA is available
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device for search: {device}")

    # Only ask ColBERT for a GPU when one is actually visible.
    gpus = 1 if device == "cuda" else 0

    with Run().context(RunConfig(nranks=1, gpus=gpus)):
        config = ColBERTConfig(
            checkpoint='colbert-ir/colbertv2.0'
        )

        searcher = Searcher(index=index_name, config=config)

        # search() returns three parallel sequences:
        # (passage_ids, ranks, scores). The score is the THIRD element;
        # pairing ids with the second one would report ranks as scores.
        passage_ids, ranks, scores = searcher.search(query, k=k)

        search_results = [
            {'passage_id': passage_id, 'rank': rank, 'score': score}
            for passage_id, rank, score in zip(passage_ids, ranks, scores)
        ]

    return search_results

In [12]:
def embed_pdf_with_colbert(pdf_path, index_name="pdf_index"):
    """Run the full pipeline: PDF -> text chunks -> ColBERT v2.0 index.

    The PDF is read and chunked exactly once, through
    ``prepare_collection``, so the chunk count that is printed is the number
    of passages that actually get indexed.

    Side effects:
        Builds the ColBERT index via ``index_documents`` and writes the chunk
        metadata to ``"<index_name>_metadata.json"`` relative to the current
        working directory.

    Args:
        pdf_path: Path of the PDF to index.
        index_name: Name of the index to create.

    Returns:
        A ``(index_name, metadata)`` tuple, where ``metadata`` is the list of
        per-chunk dicts produced by ``prepare_collection``.

    Raises:
        ValueError: If no text chunks could be produced from the PDF.
    """
    import json

    print("Step 1: Extracting text from PDF...")
    print("Step 2: Chunking document...")
    # One call does both extraction and chunking; doing them separately here
    # as well would parse the PDF twice and report a chunk count that does
    # not describe the indexed collection.
    collection, metadata = prepare_collection(pdf_path)
    print(f"Created {len(collection)} chunks")

    print("Step 3: Initializing ColBERT...")
    # The settings dict returned here is not consumed by the indexer.
    _settings, index_name = initialize_colbert(index_name=index_name)

    print("Step 4: Preparing collection...")
    if not collection:
        # E.g. a PDF with no extractable text; fail with a clear message
        # before handing an empty collection to the indexer.
        raise ValueError(f"No text chunks could be extracted from {pdf_path!r}")

    print("Step 5: Indexing documents...")
    index_documents(collection, index_name)

    print("Step 6: Index created successfully!")

    # Save metadata for later reference
    with open(f"{index_name}_metadata.json", "w") as f:
        json.dump(metadata, f)

    return index_name, metadata

In [None]:
# Driver cell: run the whole pipeline on one PDF and keep the returned index
# name and chunk metadata as notebook-level variables.
# NOTE(review): the output recorded under this cell reports 'asda.pdf'
# (without the "../" prefix), so it appears to come from a run made before
# the path below was edited -- re-run the cell to refresh it.
if __name__ == "__main__":
    pdf_file = "../asda.pdf"
    index_name, metadata = embed_pdf_with_colbert(pdf_file)

Step 1: Extracting text from PDF...


FileNotFoundError: no such file: 'asda.pdf'