# Creating Local RAG Pipeline from Scratch

## 1. Data Preparation and Embedding Creation

### 1.1 Importing PDF Document for our Book

In [1]:
import os
import re
import fitz
import random
import requests
import textwrap
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from typing import List, Dict  # for type hints
from spacy.lang.en import English
from transformers.utils import is_flash_attn_2_available
from sentence_transformers import util, SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM

In [2]:
import torch

In [3]:
# getting pdf documents from local system
pdf_path = "/Users/adityamishra/Documents/Machine Learning Tutorial/4. RAG/clrs.pdf"

# download pdf if not present
if not os.path.exists(pdf_path):
    print(f"Given path {pdf_path} does not exist. Downloading the pdf file!!!")
    url = "https://www.cs.mcgill.ca/~akroit/math/compsci/Cormen%20Introduction%20to%20Algorithms.pdf"
    
    filename = pdf_path
    response = requests.get(url)# download the file
    
    if response.status_code == 200: # check if the download was successful
        with open(filename, 'wb') as file:
            file.write(response.content)# save the file
        print(f"File downloaded and saved as {filename}")
    else:
        print(f"Failed to download file. Status code: {response.status_code}")
else:
    print(f"File already exists at {pdf_path}. Proceeding to read the file.")

File already exists at /Users/adityamishra/Documents/Machine Learning Tutorial/4. RAG/clrs.pdf. Proceeding to read the file.


Sine now we have imported our file now next step is to preprocess the text as we read it. We have imported the pages of book in the `file_path` and now we can open and read it with `PyMuPDF` by typing command `import fitz`.

In [4]:
def text_formatter(text: str) -> str:
    #fixing hyphenated words split across lines
    text = re.sub(r'(\w+)-\s*\n\s*(\w+)', r'\1\2', text)
    # fixing words split across lines WITHOUT hyphens
    text = re.sub(r'(\w+)\s*\n\s*(\w+)', lambda m: m.group(1) + m.group(2) if m.group(2)[0].islower() else m.group(0), text)
    # preserve code blocks
    lines = text.split('\n')
    processed_lines = []
    in_code_block = False

    for i, line in enumerate(lines):
        stripped = line.strip()
        #skipping empty lines
        if not stripped:
            processed_lines.append('')
            continue

        # checking if code or pseudocode
        is_code_line = (
            len(line) - len(line.lstrip()) >= 4 or  # 4+ space indent
            stripped.startswith('//') or  # comment
            re.match(r'^(if|for|while|return|else)\b', stripped, re.IGNORECASE) or
            re.match(r'^[A-Z][A-Z\-]+\(', stripped)  # FUNCTION-NAME(
        )

        if is_code_line:
            in_code_block = True
            # keeping the line with a marker
            processed_lines.append('__CODELINE__' + line)
        else:
            # checking if we just exited a code block
            if in_code_block:
                processed_lines.append('__CODEEND__')
                in_code_block = False
            processed_lines.append(line)

    text = '\n'.join(processed_lines)

    #paragraph joining for non-code text where we join lines that are part of the same paragraph
    text = re.sub(
        r'(?<!__CODELINE__)(?<!__CODEEND__)(?<!\n)\n(?!__CODELINE__)(?!__CODEEND__)(?!\n)(?![A-Z])', ' ', text)
    #cleaning up markers
    text = text.replace('__CODELINE__', '')
    text = text.replace('__CODEEND__', '\n')
    #removing excessive blank lines (3+) but keep double newlines for sections
    text = re.sub(r'\n{3,}', '\n\n', text)
    #cleaning up multiple spaces (but not at line start - that's indentation)
    text = re.sub(r'([^\n]) {2,}', r'\1 ', text)
    #fixing spacing around punctuation
    text = re.sub(r'\s+([.,;:!?])', r'\1', text)
    #removing standalone page numbers (just digits on their own line)
    text = re.sub(r'^\s*\d{1,4}\s*$', '', text, flags=re.MULTILINE)
    #removing common header and footer patterns
    text = re.sub(r'^(Chapter|Section)\s+\d+.*$', '',
                  text, flags=re.MULTILINE | re.IGNORECASE)
    text = text.strip()
    return text

#detecting block of algorithm in clrs
def is_algorithm_block(text: str) -> bool:
    indicators = [
        bool(re.search(r'\b(if|then|else)\b', text, re.IGNORECASE)),
        bool(re.search(r'\b(for|while|do)\b', text, re.IGNORECASE)),
        bool(re.search(r'\breturn\b', text, re.IGNORECASE)),
        bool(re.search(r'^\s*//.*', text, re.MULTILINE)),  # comments
        bool(re.search(r'[A-Z][A-Z\-]+\([^)]*\)', text)),  # FUNCTION(...)
        bool(re.search(r'A\[\s*\d+', text)),  # Array notation A[1]
        bool(re.search(r'←|:=|=', text)),  # Assignment operators
    ]
    # Need at least 2 indicators and text should be substantial
    return sum(indicators) >= 2 and len(text.split()) > 10

#section header text
def is_section_header(text: str) -> bool:
    text = text.strip()
    if not text or len(text) > 100:
        return False

    words = text.split()
    if len(words) > 15 or len(words) < 2:
        return False
    # Skip table of contents entries (have lots of dots)
    if text.count('.') > 3:
        return False
    # Skip if it's just "Contents" or roman numerals
    if text.lower() in ['contents', 'preface', 'index', 'references']:
        return False
    # Likely a header if it doesn't end with period and isn't too long
    return not text.endswith('.')

#detecting table of contents
def is_toc_or_front_matter(text: str) -> bool:
    indicators = [
        'contents' in text.lower()[:50],
        'preface' in text.lower()[:50],
        text.count('...') > 2,  # TOC dots
        # Too many periods (TOC page numbers)
        text.count('.') > len(text.split()) * 0.5,
        # Roman numerals only
        bool(re.search(r'^[ivxlcdm]+$', text.strip(), re.IGNORECASE)),
    ]
    return any(indicators) or len(text.split()) < 5

#preserving chunk for contexts
def smart_chunker(pages_and_texts: List[Dict],
                  chunk_size: int = 1000,
                  overlap: int = 200,
                  skip_front_matter: bool = True) -> List[Dict]:
    chunks = []

    for page_data in pages_and_texts:
        text = page_data['text']
        page_num = page_data['page_number']

        # Skip if this looks like front matter
        if skip_front_matter and is_toc_or_front_matter(text):
            continue
        # Skip very short pages (likely artifacts)
        if len(text.split()) < 10:
            continue
        # Split by double newlines to get paragraphs/blocks
        blocks = re.split(r'\n\n+', text)

        current_chunk = ""
        current_chunk_metadata = {
            'page_number': page_num,
            'has_algorithm': False,
            'section_header': None
        }
        for block in blocks:
            block = block.strip()
            if not block or len(block) < 10:
                continue
            if is_section_header(block):# if we have a current chunk, save it
                if current_chunk and len(current_chunk.split()) > 10:
                    chunks.append({
                        'text': current_chunk.strip(),
                        'page_number': current_chunk_metadata['page_number'],
                        'chunk_char_count': len(current_chunk),
                        'chunk_word_count': len(current_chunk.split()),
                        'chunk_token_count': len(current_chunk) / 4,
                        'has_algorithm': current_chunk_metadata['has_algorithm'],
                        'section_header': current_chunk_metadata['section_header']
                    })

                # Start new chunk with this header
                current_chunk = block + "\n\n"
                current_chunk_metadata = {
                    'page_number': page_num,
                    'has_algorithm': False,
                    'section_header': block
                }
                continue

            # Check if this is an algorithm block
            block_has_algo = is_algorithm_block(block)
            if block_has_algo:
                current_chunk_metadata['has_algorithm'] = True

                # If adding this would exceed chunk_size significantly and we have content, save current chunk
                if len(current_chunk) + len(block) > chunk_size * 1.5 and len(current_chunk.split()) > 20:
                    chunks.append({
                        'text': current_chunk.strip(),
                        'page_number': current_chunk_metadata['page_number'],
                        'chunk_char_count': len(current_chunk),
                        'chunk_word_count': len(current_chunk.split()),
                        'chunk_token_count': len(current_chunk) / 4,
                        'has_algorithm': current_chunk_metadata['has_algorithm'],
                        'section_header': current_chunk_metadata['section_header']
                    })

                    # Start new chunk with overlap from previous
                    if len(current_chunk) > overlap:
                        # Get more than we need to find clean word boundary
                        overlap_start = max(
                            0, len(current_chunk) - overlap - 50)
                        overlap_text = current_chunk[overlap_start:]
                        # Find first complete word (space followed by word followed by space)
                        match = re.search(r'\s+(\S+\s+)', overlap_text)
                        if match:
                            # Start from the beginning of that complete word
                            overlap_text = overlap_text[match.start(1):]
                        else:
                            # Fallback: just find first space
                            space_idx = overlap_text.find(' ')
                            if space_idx > 0:
                                overlap_text = overlap_text[space_idx+1:]
                    else:
                        overlap_text = current_chunk
                    current_chunk = overlap_text + "\n\n" + block + "\n\n"
                    current_chunk_metadata = {
                        'page_number': page_num,
                        'has_algorithm': True,
                        'section_header': current_chunk_metadata['section_header']
                    }
                else:
                    # Add algorithm block to current chunk (keep it together!)
                    current_chunk += block + "\n\n"
            else:
                # Regular text block
                if len(current_chunk) + len(block) > chunk_size and len(current_chunk.split()) > 20:
                    # Save current chunk
                    chunks.append({
                        'text': current_chunk.strip(),
                        'page_number': current_chunk_metadata['page_number'],
                        'chunk_char_count': len(current_chunk),
                        'chunk_word_count': len(current_chunk.split()),
                        'chunk_token_count': len(current_chunk) / 4,
                        'has_algorithm': current_chunk_metadata['has_algorithm'],
                        'section_header': current_chunk_metadata['section_header']
                    })

                    # Start new chunk with overlap
                    if len(current_chunk) > overlap:
                        # Find word boundary for clean overlap
                        overlap_text = current_chunk[-overlap:]
                        # Adjust to start at word boundary
                        space_idx = overlap_text.find(' ')
                        if space_idx > 0:
                            overlap_text = overlap_text[space_idx+1:]
                    else:
                        overlap_text = current_chunk
                    current_chunk = overlap_text + "\n\n" + block + "\n\n"
                    current_chunk_metadata = {
                        'page_number': page_num,
                        'has_algorithm': False,
                        'section_header': current_chunk_metadata['section_header']
                    }
                else:
                    current_chunk += block + "\n\n"

        # Save the last chunk from this page if it's substantial
        if current_chunk.strip() and len(current_chunk.split()) > 10:
            chunks.append({
                'text': current_chunk.strip(),
                'page_number': current_chunk_metadata['page_number'],
                'chunk_char_count': len(current_chunk),
                'chunk_word_count': len(current_chunk.split()),
                'chunk_token_count': len(current_chunk) / 4,
                'has_algorithm': current_chunk_metadata['has_algorithm'],
                'section_header': current_chunk_metadata['section_header']
            })

    # Add chunk IDs
    for i, chunk in enumerate(chunks):
        chunk['chunk_id'] = i

    return chunks

In [5]:
def open_and_read_pdf(pdf_path: str, page_offset: int = 0) -> List[Dict]:
    doc = fitz.open(pdf_path)
    pages_and_texts = []
    for page_number, page in tqdm(enumerate(doc), desc="Reading PDF pages", total=len(doc)):
        text = page.get_text()
        text = text_formatter(text)
        # Skip completely empty pages
        if not text.strip():
            continue
        pages_and_texts.append({
            "page_number": page_number - page_offset,
            "pdf_page_number": page_number,  # Keep original for reference
            "page_char_count": len(text),
            "page_word_count": len(text.split()),
            "page_sentence_count_raw": len(text.split(". ")),
            "page_token_count": len(text) / 4,
            "text": text
        })

    return pages_and_texts

def find_content_start_page(pdf_path: str, sample_size: int = 50) -> int:
    doc = fitz.open(pdf_path)
    for page_num in range(min(sample_size, len(doc))):
        page = doc[page_num]
        text = page.get_text()

        #looking for indicators of actual content like chapter 1 and introduction etc.
        if re.search(r'chapter\s+1|^1\s+introduction', text, re.IGNORECASE | re.MULTILINE):
            print(f"Found likely content start at PDF page {page_num}")
            print(f"First 200 chars: {text[:200]}")
            return page_num

    print("Could not automatically find content start. Please check manually.")
    return 0

In [6]:
content_start = find_content_start_page(pdf_path)
pages_and_texts = open_and_read_pdf(pdf_path, page_offset=41)
chunks = smart_chunker(pages_and_texts, chunk_size=1000,
                       overlap=200, skip_front_matter=True)
print(f"Total pages: {len(pages_and_texts)}")
print(f"Total chunks: {len(chunks)}")
print(
    f"Chunks with algorithms: {sum(1 for c in chunks if c['has_algorithm'])}")
for i in range(50, 53):
    print(f"\n{'='*60}")
    print(f"Chunk {i}:")
    print(
        f"Page: {chunks[i]['page_number']}, Has algo: {chunks[i]['has_algorithm']}")
    print(f"Header: {chunks[i]['section_header']}")
    print(f"Text preview: {chunks[i]['text'][:300]}...")

Found likely content start at PDF page 23
First 200 chars: Introduction
This part will start you thinking about designing and analyzing algorithms. It is
intended to be a gentle introduction to how we specify algorithms, some of the
design strategies we will 


Reading PDF pages:   0%|          | 0/1313 [00:00<?, ?it/s]

Total pages: 1306
Total chunks: 1931
Chunks with algorithms: 1260

Chunk 50:
Page: 14, Has algo: True
Header: None
Text preview: 2.3
Designing algorithms 35 5 2 4 7 1 3 2 6 2 5 4 7 1 3 2 6 2 4 5 7 1 2 3 6 1 2 2 3 4 5 6 7 mergemerge mergesorted sequenceinitial sequencemerge mergemerge merge
Figure 2.4
The operation of merge sort on the array A D h5; 2; 4; 7; 1; 3; 2; 6i. The lengths of thesorted sequences being merged increase...

Chunk 51:
Page: 15, Has algo: True
Header: None
Text preview: Getting Startedthe original problem size is a power of 2. Each divide step then yields two subsequences of size exactly n=2. In Chapter 4, we shall see that this assumption doesnot affect the order of growth of the solution to the recurrence.
We reason as follows to set up the recurrence for T.n/, t...

Chunk 52:
Page: 15, Has algo: True
Header: 2T.n=2/ C ‚.n/
if n > 1:
Text preview: 2T.n=2/ C ‚.n/
if n > 1:

(2.1)
In Chapter 4, we shall see the “master theorem,” which we can use to showthat T.n/ is

In [7]:
random.sample(pages_and_texts, k=3)# finding random text from random pages

[{'page_number': 156,
  'pdf_page_number': 197,
  'page_char_count': 2503,
  'page_word_count': 439,
  'page_sentence_count_raw': 15,
  'page_token_count': 625.75,
  'text': '7.2\nPerformance of quicksort 177 n 0 n–1 (n–1)/2 – 1 (n–1)/2n (n–1)/2 (a) (b) (n–1)/2 Θ(n) Θ(n)\nFigure 7.5 (a) Two levels of a recursion tree for quicksort. The partitioning at the root costs nand produces a “bad” split: two subarrays of sizes 0 and n \x00 1. The partitioning of the subarray ofsize n \x00 1 costs n \x00 1 and produces a “good” split: subarrays of size.n \x00 1/=2 \x00 1 and.n \x00 1/=2. (b) A single level of a recursion tree that is very well balanced. In both parts, the partitioning cost forthe subproblems shown with elliptical shading is ‚.n/. Yet the subproblems remaining to be solvedin (a), shown with square shading, are no larger than the corresponding subproblems remaining to besolved in (b).\nIntuition for the average case\nTo develop a clear notion of the randomized behavior of quicksort

### 1.2 Getting Text Statistics


In [8]:
df = pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,page_number,pdf_page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,-41,0,189,92,4,47.25,A L G O R I T H M S\nI N T R O D U C T I O N T...
1,-40,1,40,5,1,10.0,Introduction to Algorithms\nThird Edition
2,-38,3,165,23,4,41.25,Thomas H. Cormen\nCharles E. Leiserson\nRonald...
3,-37,4,883,127,13,220.75,c 2009 Massachusetts Institute of Technology\n...
4,-36,5,876,134,1,219.0,Contents\nPreface xiii\nI\nFoundations\nIntrod...


In [9]:
df.describe().round(2)# getting basic statistics of the dataframe

Unnamed: 0,page_number,pdf_page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,1306.0,1306.0,1306.0,1306.0,1306.0,1306.0
mean,615.66,656.66,1886.35,333.99,13.02,471.59
std,378.27,378.27,562.75,94.74,9.72,140.69
min,-41.0,0.0,13.0,2.0,1.0,3.25
25%,288.25,329.25,1547.25,279.0,9.0,386.81
50%,615.5,656.5,1918.0,339.0,12.0,479.5
75%,942.75,983.75,2282.25,396.0,15.0,570.56
max,1271.0,1312.0,3225.0,570.0,103.0,806.25


### 1.3 Splitting Sentences

The purpose of splitting our texts into smaller groups is that we want them to fit into our embedding model context window which is having 384 tokens limit.

In [10]:
num_sentence_chunk_size = 10# setting chunk size for sentences
print(f"Sentence chunk size set to: {num_sentence_chunk_size}")

#creating function to split text into sentence chunks
def split_list(input_list: list[str], slice_size:int = num_sentence_chunk_size) -> list[list[str]]:    
    return [input_list[i:i + slice_size] for i in range(0, len(input_list), slice_size)]

test_list = list(range(25))
split_list(test_list)

Sentence chunk size set to: 10


[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
 [20, 21, 22, 23, 24]]