In [1]:
print("Hello")

Hello


In [None]:
from dotenv import load_dotenv
import os

load_dotenv()  # loads .env file

OPENAI_API_KEY = os.getenv("openai_api_key")

### Reading the PDF

In [None]:
import fitz  # pip install pymupdf

def extract_with_headings(pdf_path, output_file=None):
    doc = fitz.open(pdf_path)
    output_lines = []

    for page_num, page in enumerate(doc, start=1):
        output_lines.append(f"\n--- Page {page_num} ---\n")
        blocks = page.get_text("dict")["blocks"]

        for b in blocks:
            if "lines" not in b:
                continue
            for line in b["lines"]:
                for span in line["spans"]:
                    text = span["text"].strip()
                    size = span["size"]
                    font = span["font"]

                    # Simple rule: bigger font = heading
                    if size >= 16:
                        output_lines.append(f"\n[HEADING] {text}\n")
                    else:
                        output_lines.append(f"{text} ")

        output_lines.append("\n")
    
    doc.close()
    
    output_text = "".join(output_lines)
    
    if output_file:
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(output_text)
        print(f"Output saved to {output_file}")
    else:
        print(output_text)
    
    return output_text

extract_with_headings("THE_BHARATIYA_NYAYA_SANHITA_2023.pdf", "extracted_text.txt")


### Regex 

#### Chapter

In [3]:
import re

# Read the extracted text
with open('extracted_text.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Regex to find chapters and clean headings
pattern = r'CHAPTER([IVX]+)\s+([A-Z\s,]+?)(?=\s+\d+\.|$)'
chapters = []

for match in re.finditer(pattern, text):
    chapter_num = match.group(1)
    heading = match.group(2).strip().lstrip()
    
    # Join single letters with next capitalized part: "P RELIMINARY" -> "PRELIMINARY"
    parts = heading.split()
    cleaned = []
    i = 0
    while i < len(parts):
        if len(parts[i]) == 1 and i + 1 < len(parts) and parts[i+1][0].isupper():
            cleaned.append(parts[i] + parts[i+1])
            i += 2
        else:
            cleaned.append(parts[i])
            i += 1
    
    heading_clean = ' '.join(cleaned).replace(' ,', ',')
    chapters.append({'chapter': f'CHAPTER {chapter_num}', 'heading': heading_clean})
    print(f"{chapters[-1]['chapter']}: {chapters[-1]['heading']}")

print(f"\nTotal chapters found: {len(chapters)}")

CHAPTER I: PRELIMINARY
CHAPTER II: OF PUNISHMENTS
CHAPTER III: GENERAL EXCEPTIONS
CHAPTER VII: OF OFFENCES AGAINST THE STATE
CHAPTER VIII: OF OFFENCES RELATING TO THE ARMY,N AVY AND AIR FORCE
CHAPTER IX: OF OFFENCES RELATING TO ELECTIONS
CHAPTER XI: OF OFFENCES AGAINST THE PUBLIC TRANQUILLITY
CHAPTER XII: OF OFFENCES BY OR RELATING TO PUBLIC SERVANTS
CHAPTER XIII: OF CONTEMPTS OF THE LAWFUL AUTHORITY OF PUBLIC SERVANTS
CHAPTER XIV: OF FALSE EVIDENCE AND OFFENCES AGAINST PUBLIC JUSTICE
CHAPTER XX: REPEAL AND SAVINGS

Total chapters found: 11


#### Sections

In [13]:
# Find section numbers like " 1." not followed by "-"
# Pattern: space + number + period, NOT followed by dash
pattern = r'\s+(\d{1,3})\.(?!\s*[-–—])'

sections = []
for match in re.finditer(pattern, text):
    section_num = match.group(1)
    sections.append(int(section_num))

    cleaned = match.group(0).strip()
    if cleaned:
        print(cleaned,end=" | ")

1. | 2. | 3. | 4. | 5. | 6. | 7. | 8. | 9. | 10. | 11. | 12. | 13. | 14. | 15. | 16. | 17. | 18. | 19. | 20. | 21. | 22. | 23. | 24. | 25. | 26. | 27. | 28. | 29. | 30. | 31. | 32. | 33. | 34. | 35. | 36. | 37. | 38. | 39. | 40. | 41. | 42. | 43. | 44. | 45. | 46. | 47. | 48. | 49. | 50. | 51. | 52. | 53. | 54. | 55. | 56. | 57. | 58. | 59. | 60. | 61. | 62. | 63. | 64. | 65. | 66. | 67. | 63. | 68. | 63. | 64. | 69. | 70. | 71. | 72. | 73. | 74. | 75. | 76. | 77. | 78. | 79. | 80. | 81. | 82. | 83. | 84. | 85. | 86. | 87. | 88. | 89. | 90. | 91. | 92. | 93. | 94. | 95. | 96. | 97. | 98. | 99. | 98. | 100. | 101. | 102. | 103. | 104. | 105. | 106. | 107. | 108. | 109. | 110. | 111. | 112. | 113. | 114. | 115. | 116. | 117. | 118. | 119. | 120. | 121. | 122. | 101. | 123. | 124. | 125. | 126. | 127. | 128. | 129. | 130. | 131. | 132. | 133. | 134. | 135. | 136. | 131. | 137. | 138. | 139. | 140. | 141. | 142. | 143. | 145. | 146. | 147. | 148. | 149. | 150. | 151. | 152. | 153. | 147. |

In [11]:
pattern = r'--- Page (\d+) ---'

page_numbers = [int(n) for n in re.findall(pattern, text)]

print(page_numbers)

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102]


#### for sub sections

In [15]:
# Find numbers in format "( 3 )" - parentheses with spaces around number
pattern = r'\(\s+(\d+)\s+\)'

numbered_items = []
for match in re.finditer(pattern, text):
    num = match.group(1)
    numbered_items.append(int(num))
    print(f"{match.group(0)}", end=" ")

( 1 ) ( 2 ) ( 3 ) ( 4 ) ( 5 ) ( 6 ) ( 1 ) ( 2 ) ( 3 ) ( 4 ) ( 5 ) ( 6 ) ( 7 ) ( 8 ) ( 9 ) ( 10 ) ( 11 ) ( 12 ) ( 13 ) ( 14 ) ( 15 ) ( 16 ) ( 17 ) ( 18 ) ( 19 ) ( 20 ) ( 21 ) ( 22 ) ( 23 ) ( 24 ) ( 2 ) ( 3 ) ( 4 ) ( 5 ) ( 7 ) ( 8 ) ( 6 ) ( 7 ) ( 2 ) ( 1 ) ( 1 ) ( 25 ) ( 26 ) ( 27 ) ( 28 ) ( 31 ) ( 45 ) ( 29 ) ( 30 ) ( 31 ) ( 32 ) ( 33 ) ( 34 ) ( 35 ) ( 36 ) ( 37 ) ( 38 ) ( 39 ) ( 1 ) ( 2 ) ( 3 ) ( 4 ) ( 5 ) ( 6 ) ( 7 ) ( 8 ) ( 9 ) ( 1 ) ( 2 ) ( 1 ) ( 2 ) ( 3 ) ( 4 ) ( 5 ) ( 6 ) ( 7 ) ( 1 ) ( 2 ) ( 1 ) ( 2 ) ( 3 ) ( 4 ) ( 1 ) ( 2 ) ( 1 ) ( 2 ) ( 1 ) ( 2 ) ( 2 ) ( 1 ) ( 2 ) ( 1 ) ( 2 ) ( 2 ) ( 1 ) ( 2 ) ( 1 ) ( 2 ) ( 1 ) ( 1 ) ( 2 ) ( 1 ) ( 3 ) ( 1 ) ( 1 ) ( 2 ) ( 1 ) ( 2 ) ( 1 ) ( 2 ) ( 1 ) ( 1 ) ( 2 ) ( 1 ) ( 1 ) ( 2 ) ( 1 ) ( 2 ) ( 1 ) ( 2 ) ( 1 ) ( 1 ) ( 1 ) ( 2 ) ( 3 ) ( 4 ) ( 5 ) ( 6 ) ( 7 ) ( 1 ) ( 2 ) ( 1 ) ( 2 ) ( 3 ) ( 4 ) ( 5 ) ( 6 ) ( 7 ) ( 1 ) ( 2 ) ( 1 ) ( 1 ) ( 2 ) ( 2 ) ( 3 ) ( 1 ) ( 4 ) ( 1 ) ( 1 ) ( 2 ) ( 2 ) ( 1 ) ( 1 ) ( 2 ) ( 1 ) ( 1 ) ( 2 ) ( 1 ) ( 1 ) ( 2 ) ( 1 ) ( 

### Chunks

In [20]:
import re
import json

pdf_name = "THE_BHARATIYA_NYAYA_SANHITA_2023.pdf"
source_url = "https://www.mha.gov.in/sites/default/files/250883_english_01042024.pdf"

with open('extracted_text.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Clean heading helper
def clean_heading(h):
    parts = h.strip().lstrip().split()
    cleaned = []
    i = 0
    while i < len(parts):
        if len(parts[i]) == 1 and i + 1 < len(parts) and parts[i+1][0].isupper():
            cleaned.append(parts[i] + parts[i+1])
            i += 2
        else:
            cleaned.append(parts[i])
            i += 1
    return ' '.join(cleaned).replace(' ,', ',')

# Extract all markers with positions
page_positions = [(int(m.group(1)), m.start()) for m in re.finditer(r'--- Page (\d+) ---', text)]
chapters = [(m.group(1), clean_heading(m.group(2)), m.start()) for m in re.finditer(r'CHAPTER([IVX]+)\s+([A-Z\s,]+?)(?=\s+\d+\.|$)', text)]
sections = [(int(m.group(1)), m.start()) for m in re.finditer(r'\s+(\d{1,3})\.(?!\s*[-–—])', text)]
# Subsection pattern: full stop + space + ( number ) - avoids picking numbers from middle of sentences
subsections = [(int(m.group(1)), m.start()) for m in re.finditer(r'\.\s+\(\s+(\d+)\s+\)', text)]

# Helper to get page number for a position
def get_page(pos):
    for i, (pnum, ppos) in enumerate(page_positions):
        if i + 1 < len(page_positions):
            if ppos <= pos < page_positions[i+1][1]:
                return pnum
        elif ppos <= pos:
            return pnum
    return None

# Helper to find subsections within a section
def get_subsections_for_section(sec_pos, sec_end_pos):
    return [(num, pos) for num, pos in subsections if sec_pos <= pos < sec_end_pos]

# Create chunks hierarchically: Chapter -> Section -> Subsection
chunks = []

# Iterate through chapters
for i, (chap_num, chap_heading, chap_pos) in enumerate(chapters):
    # Get sections for this chapter
    chap_end_pos = chapters[i+1][2] if i+1 < len(chapters) else len(text)
    chapter_sections = [(num, pos) for num, pos in sections if chap_pos <= pos < chap_end_pos]
    
    # Iterate through sections in this chapter
    for j, (sec_num, sec_pos) in enumerate(chapter_sections):
        # Get subsections for this section
        sec_end_pos = chapter_sections[j+1][1] if j+1 < len(chapter_sections) else chap_end_pos
        section_subsections = get_subsections_for_section(sec_pos, sec_end_pos)
        
        if section_subsections:
            # Section has subsections - create chunk for each
            for k, (subnum, subpos) in enumerate(section_subsections):
                sub_end_pos = section_subsections[k+1][1] if k+1 < len(section_subsections) else sec_end_pos
                # subpos points to period, find actual ( number ) position for page tracking
                subsection_match = re.search(r'\(\s+\d+\s+\)', text[subpos:subpos+20])
                actual_subpos = subpos + subsection_match.start() if subsection_match else subpos
                # Remove period + space + ( number ) from content
                content = re.sub(r'\.\s+\(\s+\d+\s+\)', '', text[subpos:sub_end_pos], count=1).strip()
                
                page_num = get_page(actual_subpos)
                chunks.append({
                    'page_number': page_num,
                    'pdf_name': pdf_name,
                    'source_url': f'{source_url}#page={page_num}' if page_num else source_url,
                    'chapter': f'CHAPTER {chap_num}',
                    'chapter_heading': chap_heading,
                    'section': f'Section {sec_num}',
                    'subsection': f'({subnum})',
                    'content': content
                })
        else:
            # Section has no subsections - create single chunk for entire section
            content = text[sec_pos:sec_end_pos].strip()
            # Remove section number pattern if at start
            content = re.sub(r'^\s*\d+\.\s*', '', content).strip()
            
            page_num = get_page(sec_pos)
            chunks.append({
                'page_number': page_num,
                'pdf_name': pdf_name,
                'source_url': f'{source_url}#page={page_num}' if page_num else source_url,
                'chapter': f'CHAPTER {chap_num}',
                'chapter_heading': chap_heading,
                'section': f'Section {sec_num}',
                'subsection': None,
                'content': content
            })

# Save chunks to JSON
with open('chunks.json', 'w', encoding='utf-8') as f:
    json.dump(chunks, f, indent=2, ensure_ascii=False)

print(f"Total chunks: {len(chunks)}")
print(f"Saved to chunks.json")
if chunks:
    c = chunks[0]
    print(f"\nSample: Page {c['page_number']} | {c['chapter']} | {c['section']} | Subsection: {c['subsection']}")
    print(f"Content: {c['content'][:150]}...")

Total chunks: 532
Saved to chunks.json

Sample: Page 1 | CHAPTER I | Section 1 | Subsection: (1)
Content: ThisAct maybe called the Bharatiya Nyaya Sanhita, 2023...


### FAISS

In [21]:
import faiss
import numpy as np
from langchain_openai import OpenAIEmbeddings
from typing import List, Dict
import os

class LegalFAISSIndex:
    def __init__(self, openai_api_key: str):
        """
        Initialize FAISS for legal document retrieval using OpenAI embeddings (text-embedding-3-large)
        """
        self.embedding_model = OpenAIEmbeddings(
            model='text-embedding-3-large',
            api_key=openai_api_key,
        )
        # OpenAI text-embedding-3-large has 3072 dimensions
        self.dimension = 3072
        self.index = None
        self.chunks = []
        
    def create_index(self, chunks: List[Dict]):
        """
        Create FAISS Flat index for maximum accuracy
        """
        self.chunks = chunks
        texts = [chunk['content'] for chunk in chunks]
        
        # Generate embeddings using OpenAI
        print("Generating embeddings with OpenAI text-embedding-3-large...")
        embeddings = self.embedding_model.embed_documents(texts)
        embeddings = np.array(embeddings).astype('float32')
        
        # Normalize for cosine similarity
        faiss.normalize_L2(embeddings)
        
        # Use Flat index for maximum accuracy
        self.index = faiss.IndexFlatIP(self.dimension)
        self.index.add(embeddings)
        print(f"Added {len(embeddings)} vectors to Flat index")
        
    def search(self, query: str, k: int = 5):
        """
        Search for relevant sections using pure semantic search
        
        Args:
            query: Legal query or crime description
            k: Number of results to return
        """
        # Generate query embedding
        query_vector = self.embedding_model.embed_query(query)
        query_vector = np.array([query_vector]).astype('float32')
        faiss.normalize_L2(query_vector)
        
        # Search
        scores, indices = self.index.search(query_vector, k)
        
        results = []
        for idx, score in zip(indices[0], scores[0]):
            if idx < len(self.chunks):
                result = {
                    'chunk': self.chunks[idx],
                    'score': float(score)
                }
                results.append(result)
        
        return results
    
    def save_index(self, index_path: str, chunks_path: str):
        """Save FAISS index and chunks"""
        faiss.write_index(self.index, index_path)
        import json
        with open(chunks_path, 'w', encoding='utf-8') as f:
            json.dump(self.chunks, f, indent=2, ensure_ascii=False)
        print(f"Saved index to {index_path} and chunks to {chunks_path}")
    
    def load_index(self, index_path: str, chunks_path: str):
        """Load FAISS index and chunks"""
        self.index = faiss.read_index(index_path)
        import json
        with open(chunks_path, 'r', encoding='utf-8') as f:
            self.chunks = json.load(f)
        print(f"Loaded index from {index_path} and {len(self.chunks)} chunks")

In [None]:
import json
import os

# Load chunks
with open('chunks.json', 'r', encoding='utf-8') as f:
    chunks = json.load(f)

# Create index
indexer = LegalFAISSIndex(openai_api_key=OPENAI_API_KEY)
indexer.create_index(chunks)

# Optional: Save index for future use
indexer.save_index('legal_index.faiss', 'chunks.json')

Generating embeddings with OpenAI text-embedding-3-large...
Added 532 vectors to Flat index
Saved index to legal_index.faiss and chunks to chunks.json


### Query

In [None]:
import faiss
import numpy as np
from langchain_openai import OpenAIEmbeddings
import json
import os

# Load index and chunks
index = faiss.read_index('legal_index.faiss')
with open('chunks.json', 'r', encoding='utf-8') as f:
    chunks = json.load(f)

# Initialize embedding model
embedding_model = OpenAIEmbeddings(model='text-embedding-3-large', api_key=OPENAI_API_KEY)

# Query
query = "What is the punishment for murder?"
k = 5

# Generate query embedding
query_vector = embedding_model.embed_query(query)
query_vector = np.array([query_vector]).astype('float32')
faiss.normalize_L2(query_vector)

# Search
scores, indices = index.search(query_vector, k)

# Display results
print(f"Query: {query}\n")
print("=" * 80)
for i, (idx, score) in enumerate(zip(indices[0], scores[0]), 1):
    chunk = chunks[idx]
    print(f"\n[{i}] Score: {score:.4f}")
    print(f"    {chunk['chapter']} - {chunk['chapter_heading']}")
    print(f"    {chunk['section']} - {chunk.get('subsection', 'N/A')}")
    print(f"    Page: {chunk['page_number']}")
    print(f"    Content: {chunk['content'][:200]}...")
    print(f"    Source: {chunk['source_url']}")
    print("-" * 80)


Query: What is the punishment for murder?


[1] Score: 0.5848
    CHAPTER III - GENERAL EXCEPTIONS
    Section 103 - (1)
    Page: 34
    Content: Whoever commits murder shall be punished with death or imprisonment for life, and shall also be liable to fine...
    Source: https://www.mha.gov.in/sites/default/files/250883_english_01042024.pdf#page=34
--------------------------------------------------------------------------------

[2] Score: 0.5826
    CHAPTER III - GENERAL EXCEPTIONS
    Section 104 - None
    Page: 34
    Content: Whoever, being under sentence of imprisonment for life, commits murder, shall be punished with death or with imprisonment for life, which shall mean the remainder of that person’s natural life....
    Source: https://www.mha.gov.in/sites/default/files/250883_english_01042024.pdf#page=34
--------------------------------------------------------------------------------

[3] Score: 0.5433
    CHAPTER III - GENERAL EXCEPTIONS
    Section 105 - None
    Page: 34
 