# Creating Local RAG Pipeline from Scratch

## 1. Data Preparation and Embedding Creation

### 1.1 Importing PDF Document for our Book

In [6]:
import os
import re
import fitz
import torch
import random
import requests
import textwrap
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from typing import List, Dict# for type hints
from spacy.lang.en import English
from transformers.utils import is_flash_attn_2_available
from sentence_transformers import util, SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM

In [5]:
# getting pdf documents from local system
pdf_path = "/Users/adityamishra/Documents/Machine Learning Tutorial/4. RAG/clrs.pdf"

# download pdf if not present
if not os.path.exists(pdf_path):
    print(f"Given path {pdf_path} does not exist. Downloading the pdf file!!!")
    url = "https://www.cs.mcgill.ca/~akroit/math/compsci/Cormen%20Introduction%20to%20Algorithms.pdf"
    
    filename = pdf_path
    response = requests.get(url)# download the file
    
    if response.status_code == 200: # check if the download was successful
        with open(filename, 'wb') as file:
            file.write(response.content)# save the file
        print(f"File downloaded and saved as {filename}")
    else:
        print(f"Failed to download file. Status code: {response.status_code}")
else:
    print(f"File already exists at {pdf_path}. Proceeding to read the file.")

File already exists at /Users/adityamishra/Documents/Machine Learning Tutorial/4. RAG/clrs.pdf. Proceeding to read the file.


Sine now we have imported our file now next step is to preprocess the text as we read it. We have imported the pages of book in the `file_path` and now we can open and read it with `PyMuPDF` by typing command `import fitz`.

In [7]:
#performing text formatting according to our need for clrs pdf

def text_formatter(text:str):
    
    #fixing hyphenated words
    text = re.sub(r'(\w+)-\s*\n\s*(\w+)', r'\1\2', text)
    
    #fixing indentation issues in paragraphs
    lines = text.split('\n')# split text into lines
    processed_lines = []# list to hold processed lines
    
    for line in lines:
        if re.match(r'^\s{4,}', line) or line.strip().startswith('//'):
            processed_lines.append('__CODE__' + line)
        else:
            processed_lines.append(line)

    text = '\n'.join(processed_lines)# join lines back into text
    
    #removing excessive blank lines
    text = re.sub(r'\n{3,}', '\n\n', text)
    text = re.sub(r'(?<!__CODE__)  +', ' ', text)# collapse multiple spaces to single space except in code blocks
    #emove __CODE__ markers but keep the indentation
    text = text.replace('__CODE__', '')
    # fixing spacing around punctuation except mathematical notation
    text = re.sub(r'\s+([.,;:!?])', r'\1', text)
    # Preserving mathematical notation spacing
    text = re.sub(r'^\d+\s*$', '', text, flags=re.MULTILINE)
    # Remove excessive whitespace at start/end
    text = text.strip()
    return text

#to detext psuedo code blocks in the text of clrs pdf
def is_algorithm_block(text: str) -> bool:
    indicators = [
        r'\bif\b.*\bthen\b',# if-then statements
        r'\bfor\b.*\bdo\b',# for-do loops
        r'\bwhile\b.*\bdo\b',# while-do loops
        r'\breturn\b',# return statements
        r'\belse\b',# else statements
        r'\bfunction\b',# function definitions
        r'\bprocedure\b',# procedure definitions
        r'\bbegin\b',  # begin blocks
        r'\bend\b',  # end blocks
        r'^\s*//.*',  # comments
        r'[A-Z][A-Z-]+\(',  # Function calls like sorting(), MERGE()
        r'^\s*[\d]+\.',  # Numbered steps
    ]
    
    # check each pattern in text and return True if any matches
    return any(re.search(pattern, text, re.IGNORECASE | re.MULTILINE) for pattern in indicators)

# detecting section headers in the text of clrs pdf
def is_section_header(text: str) -> bool:
    #check if the text is in title case and not too long
    text = text.strip()
    return (len(text) < 100 and
            len(text.split()) < 10 and
            not text.endswith('.') and
            len(text) > 0 and
            not text[0].isdigit())


# chunking texts for preserving context
def smart_chunker(pages_and_texts: List[Dict],
                  chunk_size: int = 1000,
                  overlap: int = 200) -> List[Dict]:
    chunks = []

    for page_data in pages_and_texts:
        text = page_data['text']
        page_num = page_data['page_number']

        # Split by double newlines to get paragraphs/blocks
        blocks = re.split(r'\n\n+', text)

        current_chunk = ""
        current_chunk_metadata = {
            'page_number': page_num,
            'has_algorithm': False,
            'section_header': None
        }

        for block in blocks:
            block = block.strip()
            if not block:
                continue

            # Check if this is a section header
            if is_section_header(block):
                # If we have a current chunk, save it
                if current_chunk:
                    chunks.append({
                        'text': current_chunk.strip(),
                        'page_number': current_chunk_metadata['page_number'],
                        'chunk_char_count': len(current_chunk),
                        'chunk_word_count': len(current_chunk.split()),
                        'chunk_token_count': len(current_chunk) / 4,
                        'has_algorithm': current_chunk_metadata['has_algorithm'],
                        'section_header': current_chunk_metadata['section_header']
                    })

                # Start new chunk with this header
                current_chunk = block + "\n\n"
                current_chunk_metadata = {
                    'page_number': page_num,
                    'has_algorithm': False,
                    'section_header': block
                }
                continue

            # Check if this is an algorithm block
            if is_algorithm_block(block):
                current_chunk_metadata['has_algorithm'] = True

                # If adding this would exceed chunk_size and we have content then save current chunk
                if len(current_chunk) + len(block) > chunk_size and current_chunk:
                    chunks.append({
                        'text': current_chunk.strip(),
                        'page_number': current_chunk_metadata['page_number'],
                        'chunk_char_count': len(current_chunk),
                        'chunk_word_count': len(current_chunk.split()),
                        'chunk_token_count': len(current_chunk) / 4,
                        'has_algorithm': current_chunk_metadata['has_algorithm'],
                        'section_header': current_chunk_metadata['section_header']
                    })

                    # Start new chunk with overlap from previous
                    overlap_text = current_chunk[-overlap:] if len(
                        current_chunk) > overlap else current_chunk
                    current_chunk = overlap_text + "\n\n" + block + "\n\n"
                    current_chunk_metadata = {
                        'page_number': page_num,
                        'has_algorithm': True,
                        'section_header': current_chunk_metadata['section_header']
                    }
                else:
                    # Add algorithm block to current chunk (keep it together!)
                    current_chunk += block + "\n\n"
            else:
                # Regular text block
                if len(current_chunk) + len(block) > chunk_size and current_chunk:
                    # Save current chunk
                    chunks.append({
                        'text': current_chunk.strip(),
                        'page_number': current_chunk_metadata['page_number'],
                        'chunk_char_count': len(current_chunk),
                        'chunk_word_count': len(current_chunk.split()),
                        'chunk_token_count': len(current_chunk) / 4,
                        'has_algorithm': current_chunk_metadata['has_algorithm'],
                        'section_header': current_chunk_metadata['section_header']
                    })

                    # Start new chunk with overlap
                    overlap_text = current_chunk[-overlap:] if len(
                        current_chunk) > overlap else current_chunk
                    current_chunk = overlap_text + "\n\n" + block + "\n\n"
                    current_chunk_metadata = {
                        'page_number': page_num,
                        'has_algorithm': False,
                        'section_header': current_chunk_metadata['section_header']
                    }
                else:
                    current_chunk += block + "\n\n"

        # checking the last chunk from this page
        if current_chunk.strip():
            chunks.append({
                'text': current_chunk.strip(),
                'page_number': current_chunk_metadata['page_number'],
                'chunk_char_count': len(current_chunk),
                'chunk_word_count': len(current_chunk.split()),
                'chunk_token_count': len(current_chunk) / 4,
                'has_algorithm': current_chunk_metadata['has_algorithm'],
                'section_header': current_chunk_metadata['section_header']
            })

    # Add chunk IDs
    for i, chunk in enumerate(chunks):
        chunk['chunk_id'] = i

    return chunks

def open_and_read_pdf(pdf_path: str) -> List[Dict]:
    doc = fitz.open(pdf_path)# opening doc
    pages_and_texts = []# 
    for page_number, page in tqdm(enumerate(doc), desc="Reading PDF pages"):
        text = page.get_text()
        text = text_formatter(text)

        pages_and_texts.append({
            "page_number": page_number - 41,#adjusting to start of content for book
            "page_char_count": len(text),
            "page_word_count": len(text.split()),
            "page_sentence_count_raw": len(text.split(". ")),
            "page_token_count": len(text) / 4,
            "text": text
        })

    return pages_and_texts

In [8]:
pages_and_texts = open_and_read_pdf(pdf_path)
#creating smart chunks
chunks = smart_chunker(pages_and_texts, chunk_size=1000, overlap=200)
#results
print(f"Total pages: {len(pages_and_texts)}")
print(f"Total chunks: {len(chunks)}")
print(f"Chunks with algorithms: {sum(1 for c in chunks if c['has_algorithm'])}")
# Looking at a sample chunk
print("\nSample chunk:")
print(chunks[10])

Reading PDF pages: 0it [00:00, ?it/s]

Total pages: 1313
Total chunks: 4570
Chunks with algorithms: 1859

Sample chunk:
{'text': 'vi\nContents\nII\nSorting and Order Statistics\nIntroduction', 'page_number': -35, 'chunk_char_count': 58, 'chunk_word_count': 8, 'chunk_token_count': 14.5, 'has_algorithm': False, 'section_header': 'vi\nContents\nII\nSorting and Order Statistics\nIntroduction', 'chunk_id': 10}
