In [1]:
print("Hello4")

Hello4


In [None]:
from dotenv import load_dotenv
import os

load_dotenv()  # loads .env file

OPENAI_API_KEY = os.getenv("openai_api_key")

### Reading the PDF

In [10]:
import fitz  # pip install pymupdf

def extract_with_headings(pdf_path, output_file=None):
    doc = fitz.open(pdf_path)
    output_lines = []

    for page_num, page in enumerate(doc, start=1):
        output_lines.append(f"\n--- Page {page_num} ---\n")
        blocks = page.get_text("dict")["blocks"]

        for b in blocks:
            if "lines" not in b:
                continue
            for line in b["lines"]:
                for span in line["spans"]:
                    text = span["text"].strip()
                    size = span["size"]
                    font = span["font"]

                    # Simple rule: bigger font = heading
                    if size >= 16:
                        output_lines.append(f"\n[HEADING] {text}\n")
                    else:
                        output_lines.append(f"{text} ")

        output_lines.append("\n")
    
    doc.close()
    
    output_text = "".join(output_lines)
    
    if output_file:
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(output_text)
        print(f"Output saved to {output_file}")
    else:
        print(output_text)
    
    return output_text

extract_with_headings("narcotic_drugs_and_psychotropic_substances_act_1985.pdf", "extracted_text.txt")


Output saved to extracted_text.txt


'\n--- Page 1 ---\n1  THE NARCOTIC DRUGS AND PSYCHOTROPIC SUBSTANCES, ACT, 1985 ____________  ARRANGEMENT OF SECTIONS Last Update 3-1-2022 ___________________ CHAPTER I P RELIMINARY  S ECTIONS 1.  Short title, extent and commencement .  2.  Definitions. 3.  Power to add to or omit from the list of psychotropic substances.  CHAPTER II A UTHORITIES AND OFFICERS  4.  Central Government to take measures for preventing and combating abuse of and illicit traffic in narcotic drugs, etc. 5.  Officers of Central Government. 6.  The Narcotic Drugs and Psychotropie Substances Consultative Committee. 7.  Officers of State Government.  CHAPTER IIA N ATIONAL F UND FOR C ONTROL OF D RUG A BUSE  7A. National Fund for Control of Drug Abuse. 7B. Annual report of activities financed under the Fund.   CHAPTER III P ROHIBITION, CONTROL AND REGULATION  8.  Prohibition of certain operations. 8A. Prohibition of certain activities relating to property derived from offence. 9.  Power of Central Government to pe

### Regex 

#### Chapter

In [8]:
import re

# Read the extracted text
with open('extracted_text.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Regex to find chapters and clean headings
pattern = r'CHAPTER\s*([IVXLCDM]+)\s+([A-Z\s,\-]{5,})'
chapters = []

for match in re.finditer(pattern, text):
    chapter_num = match.group(1)
    heading = match.group(2).strip().lstrip()
    
    # Join single letters with next capitalized part: "P RELIMINARY" -> "PRELIMINARY"
    parts = heading.split()
    cleaned = []
    i = 0
    while i < len(parts):
        if len(parts[i]) == 1 and i + 1 < len(parts) and parts[i+1][0].isupper():
            cleaned.append(parts[i] + parts[i+1])
            i += 2
        else:
            cleaned.append(parts[i])
            i += 1
    
    heading_clean = ' '.join(cleaned).replace(' ,', ',')
    chapters.append({'chapter': f'CHAPTER {chapter_num}', 'heading': heading_clean})
    print(f"{chapters[-1]['chapter']}: {chapters[-1]['heading']}")

print(f"\nTotal chapters found: {len(chapters)}")

CHAPTER I: PRELIMINARY SECTIONS
CHAPTER II: AUTHORITIES AND OFFICERS
CHAPTER III: PROHIBITION, CONTROL AND REGULATION
CHAPTER IV: OFFENCES AND PENALTIES
CHAPTER V: PROCEDURE
CHAPTER VI: MISCELLANEOUS
CHAPTER I: PRELIMINARY
CHAPTER II: AUTHORITIES AND OFFICERS
CHAPTER III: PROHIBITION, CONTROL AND REGULATION
CHAPTER IV: OFFENCES AND PENALTIES
CHAPTER V: PROCEDURE
CHAPTER VI: MISCELLANEOUS

Total chapters found: 12


#### Sections

In [11]:
import re

# Read file
with open("extracted_text.txt", "r", encoding="utf-8") as f:
    text = f.read()

print("Original length:", len(text))

# Safer pattern: stops strictly before page marker line
pattern = r'\d+\s*\.\s*(?:Ins|Subs).*?(?=\n--- Page \d+ ---)'

# Remove only the unwanted content
cleaned_text = re.sub(pattern, '', text, flags=re.DOTALL)

print("Cleaned length:", len(cleaned_text))

# Write back
with open("extracted_text.txt", "w", encoding="utf-8") as f:
    f.write(cleaned_text)

print("✅ Cleaned and saved (page markers fully preserved)")

Original length: 169881
Cleaned length: 161574
✅ Cleaned and saved (page markers fully preserved)


In [13]:
# Find section numbers like " 1." not followed by "-"
# Pattern: space + number + period, NOT followed by dash

with open("extracted_text.txt", "r", encoding="utf-8") as f:
    text = f.read()

pattern = r'(?:\s+|\[\s*)(\d{1,3})\.(?!\s*[-–—]) '

sections = []
for match in re.finditer(pattern, text):
    cleaned = match.group(1).strip()
    if cleaned:
        print(cleaned,end=" | ")

1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 27 | 35 | 36 | 36 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 66 | 1 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 

#### Page

In [14]:
pattern = r'--- Page (\d+) ---'

page_numbers = [int(n) for n in re.findall(pattern, text)]

print(page_numbers)

[5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54]


#### for sub sections

In [15]:
# Find numbers in format "( 3 )" - parentheses with spaces around number
pattern = r'\(\s*(\d+)\s*\)'

numbered_items = []
for match in re.finditer(pattern, text):
    num = match.group(1)
    numbered_items.append(int(num))
    print(f"({match.group(1)})", end=" ")

(1) (2) (3) (1) (2) (3) (1) (2) (3) (4) (1) (2) (1) (3) (2) (1) (3) (2) (3) (1) (1) (2) (3) (4) (5) (1) (2) (1) (1) (2) (3) (4) (5) (1) (2) (1) (2) (1) (1) (2) (1) (1) (2) (1) (2) (1) (3) (1) (2) (1) (1) (2) (1) (2) (7) (11) (2) (1) (1) (2) (3) (4) (1) (2) (1) (2) (3) (1) (2) (2) (3) (1) (4) (2) (5) (1) (2) (1) (1) (1) (2) (1) (1) (2) (1) (1) (2) (1) (1) (2) (1) (3) (1) (2) (3) (1) (2) (2) (1) (1) (2) (1) (3) (4) (5) (6) (5) (3) (1) (2) (1) (3) (2) (4) (2) (3) (1) (2) (1) (1) (3) (2) (4) (2) (1) (2) (1) (2) (1) (1) (2) (1) (2) (3) (1) (2) (1) (2) (1) (1) (3) (1) (2) (1) (2) (1) (2) (3) (3) (1) (2) (2) (1) (1) (2) (1) (2) (3) (4) (5) (6) (2) (3) (4) (5) (7) (2) (3) (1) (2) (1) (1) (2) (1) (2) (1) (3) (2) (1) (1) (2) (1) (1) (2) (1) (1) (3) (1) (2) (1) (2) (1) (2) (1) (3) (2) (4) (1) (2) (1) (3) (1) (1) (1) (1) (1) (1) (1) (2) (1) (3) (4) (3) (5) (6) (1) (2) (1) (1) (3) (2) (1) (3) (2) (1) (1) (2) (2) (1) (2) (1) (1) (2) (1) (2) (1) (2) (1) (2) (2) (6) (1) (1) (5) (1) (2) (1) (1) (3) (1)

### Chunks

In [16]:
import re
import json

pdf_name = "narcotic_drugs_and_psychotropic_substances_act_1985.pdf"
source_url = "https://www.indiacode.nic.in/bitstream/123456789/18974/1/narcotic-drugs-and-psychotropic-substances-act-1985.pdf"

with open('extracted_text.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Clean heading helper
def clean_heading(h):
    parts = h.strip().lstrip().split()
    cleaned = []
    i = 0
    while i < len(parts):
        if len(parts[i]) == 1 and i + 1 < len(parts) and parts[i+1][0].isupper():
            cleaned.append(parts[i] + parts[i+1])
            i += 2
        else:
            cleaned.append(parts[i])
            i += 1
    return ' '.join(cleaned).replace(' ,', ',')

# Extract all markers with positions
page_positions = [(int(m.group(1)), m.start()) for m in re.finditer(r'--- Page (\d+) ---', text)]
chapters = [(m.group(1), clean_heading(m.group(2)), m.start()) for m in re.finditer(r'CHAPTER\s*([IVXLCDM]+)\s+([A-Z\s,\-]{5,})', text)]
sections = [(int(m.group(1)), m.start()) for m in re.finditer(r'(?:\s+|\[\s*)(\d{1,3})\.(?!\s*[-–—]) ', text)]
# Subsection pattern: full stop + space + ( number ) - avoids picking numbers from middle of sentences
subsections = [(int(m.group(1)), m.start()) for m in re.finditer(r'\(\s*(\d+)\s*\)', text)]

# Helper to get page number for a position
def get_page(pos):
    for i, (pnum, ppos) in enumerate(page_positions):
        if i + 1 < len(page_positions):
            if ppos <= pos < page_positions[i+1][1]:
                return pnum
        elif ppos <= pos:
            return pnum
    return None

# Helper to find subsections within a section
def get_subsections_for_section(sec_pos, sec_end_pos):
    return [(num, pos) for num, pos in subsections if sec_pos <= pos < sec_end_pos]

# Create chunks hierarchically: Chapter -> Section -> Subsection
chunks = []

# Iterate through chapters
for i, (chap_num, chap_heading, chap_pos) in enumerate(chapters):
    # Get sections for this chapter
    chap_end_pos = chapters[i+1][2] if i+1 < len(chapters) else len(text)
    chapter_sections = [(num, pos) for num, pos in sections if chap_pos <= pos < chap_end_pos]
    
    # Iterate through sections in this chapter
    for j, (sec_num, sec_pos) in enumerate(chapter_sections):
        # Get subsections for this section
        sec_end_pos = chapter_sections[j+1][1] if j+1 < len(chapter_sections) else chap_end_pos
        section_subsections = get_subsections_for_section(sec_pos, sec_end_pos)
        
        if section_subsections:
            # Section has subsections - create chunk for each
            for k, (subnum, subpos) in enumerate(section_subsections):
                sub_end_pos = section_subsections[k+1][1] if k+1 < len(section_subsections) else sec_end_pos
                # subpos points to period, find actual ( number ) position for page tracking
                subsection_match = re.search(r'\(\s+\d+\s+\)', text[subpos:subpos+20])
                actual_subpos = subpos + subsection_match.start() if subsection_match else subpos
                # Remove period + space + ( number ) from content
                content = re.sub(r'\.\s+\(\s+\d+\s+\)', '', text[subpos:sub_end_pos], count=1).strip()
                
                page_num = get_page(actual_subpos)
                chunks.append({
                    'page_number': page_num,
                    'pdf_name': pdf_name,
                    'source_url': f'{source_url}#page={page_num}' if page_num else source_url,
                    'chapter': f'CHAPTER {chap_num}',
                    'chapter_heading': chap_heading,
                    'section': f'Section {sec_num}',
                    'subsection': f'({subnum})',
                    'content': content
                })
        else:
            # Section has no subsections - create single chunk for entire section
            content = text[sec_pos:sec_end_pos].strip()
            # Remove section number pattern if at start
            content = re.sub(r'^\s*\d+\.\s*', '', content).strip()
            
            page_num = get_page(sec_pos)
            chunks.append({
                'page_number': page_num,
                'pdf_name': pdf_name,
                'source_url': f'{source_url}#page={page_num}' if page_num else source_url,
                'chapter': f'CHAPTER {chap_num}',
                'chapter_heading': chap_heading,
                'section': f'Section {sec_num}',
                'subsection': None,
                'content': content
            })

# Save chunks to JSON
with open('chunks.json', 'w', encoding='utf-8') as f:
    json.dump(chunks, f, indent=2, ensure_ascii=False)

print(f"Total chunks: {len(chunks)}")
print(f"Saved to chunks.json")
if chunks:
    c = chunks[0]
    print(f"\nSample: Page {c['page_number']} | {c['chapter']} | {c['section']} | Subsection: {c['subsection']}")
    print(f"Content: {c['content'][:150]}...")

Total chunks: 337
Saved to chunks.json

Sample: Page 5 | CHAPTER I | Section 1 | Subsection: (1)
Content: ( 1 ) This Act may be called the Narcotic Drugs and Psychotropic Substances Act, 1985....


### FAISS

In [17]:
import faiss
import numpy as np
from langchain_openai import OpenAIEmbeddings
from typing import List, Dict
import os

class LegalFAISSIndex:
    def __init__(self, openai_api_key: str):
        """
        Initialize FAISS for legal document retrieval using OpenAI embeddings (text-embedding-3-large)
        """
        self.embedding_model = OpenAIEmbeddings(
            model='text-embedding-3-large',
            api_key=openai_api_key,
        )
        # OpenAI text-embedding-3-large has 3072 dimensions
        self.dimension = 3072
        self.index = None
        self.chunks = []
        
    def create_index(self, chunks: List[Dict]):
        """
        Create FAISS Flat index for maximum accuracy
        """
        self.chunks = chunks
        texts = [chunk['content'] for chunk in chunks]
        
        # Generate embeddings using OpenAI
        print("Generating embeddings with OpenAI text-embedding-3-large...")
        embeddings = self.embedding_model.embed_documents(texts)
        embeddings = np.array(embeddings).astype('float32')
        
        # Normalize for cosine similarity
        faiss.normalize_L2(embeddings)
        
        # Use Flat index for maximum accuracy
        self.index = faiss.IndexFlatIP(self.dimension)
        self.index.add(embeddings)
        print(f"Added {len(embeddings)} vectors to Flat index")
        
    def search(self, query: str, k: int = 5):
        """
        Search for relevant sections using pure semantic search
        
        Args:
            query: Legal query or crime description
            k: Number of results to return
        """
        # Generate query embedding
        query_vector = self.embedding_model.embed_query(query)
        query_vector = np.array([query_vector]).astype('float32')
        faiss.normalize_L2(query_vector)
        
        # Search
        scores, indices = self.index.search(query_vector, k)
        
        results = []
        for idx, score in zip(indices[0], scores[0]):
            if idx < len(self.chunks):
                result = {
                    'chunk': self.chunks[idx],
                    'score': float(score)
                }
                results.append(result)
        
        return results
    
    def save_index(self, index_path: str, chunks_path: str):
        """Save FAISS index and chunks"""
        faiss.write_index(self.index, index_path)
        import json
        with open(chunks_path, 'w', encoding='utf-8') as f:
            json.dump(self.chunks, f, indent=2, ensure_ascii=False)
        print(f"Saved index to {index_path} and chunks to {chunks_path}")
    
    def load_index(self, index_path: str, chunks_path: str):
        """Load FA
        ISS index and chunks"""
        self.index = faiss.read_index(index_path)
        import json
        with open(chunks_path, 'r', encoding='utf-8') as f:
            self.chunks = json.load(f)
        print(f"Loaded index from {index_path} and {len(self.chunks)} chunks")

In [None]:
import json
import os

# Load chunks
with open('chunks.json', 'r', encoding='utf-8') as f:
    chunks = json.load(f)

# Initialize with OpenAI API key
# Set your OpenAI API key as environment variable or replace with your key

# Create index
indexer = LegalFAISSIndex(openai_api_key=OPENAI_API_KEY)
indexer.create_index(chunks)

# Optional: Save index for future use
indexer.save_index('legal_index.faiss', 'chunks.json')

Generating embeddings with OpenAI text-embedding-3-large...
Added 337 vectors to Flat index
Saved index to legal_index.faiss and chunks to chunks.json


### Query

In [None]:
import faiss
import numpy as np
from langchain_openai import OpenAIEmbeddings
import json
import os

# Load index and chunks
index = faiss.read_index('legal_index.faiss')
with open('chunks.json', 'r', encoding='utf-8') as f:
    chunks = json.load(f)

# Initialize embedding model
embedding_model = OpenAIEmbeddings(model='text-embedding-3-large', api_key=OPENAI_API_KEY)

# Query
query = "What to do after the user was caugght with Ganja "
k = 5

# Generate query embedding
query_vector = embedding_model.embed_query(query)
query_vector = np.array([query_vector]).astype('float32')
faiss.normalize_L2(query_vector)

# Search
scores, indices = index.search(query_vector, k)

# Display results
print(f"Query: {query}\n")
print("=" * 80)
for i, (idx, score) in enumerate(zip(indices[0], scores[0]), 1):
    chunk = chunks[idx]
    print(f"\n[{i}] Score: {score:.4f}")
    print(f"    {chunk['chapter']} - {chunk['chapter_heading']}")
    print(f"    {chunk['section']} - {chunk.get('subsection', 'N/A')}")
    print(f"    Page: {chunk['page_number']}")
    print(f"    Content: {chunk['content'][:200]}...")
    print(f"    Source: {chunk['source_url']}")
    print("-" * 80)


Query: What to do after the user was caugght with Ganja 


[1] Score: 0.4689
    CHAPTER IV - OFFENCES AND PENALTIES
    Section 20 - None
    Page: 16
    Content: Punishment for contravention in relation to cannabis plant and cannabis .—Whoever, in contravention of any provision of this Act or any rule or order made or condition of licence granted thereunder,— ...
    Source: https://www.indiacode.nic.in/bitstream/123456789/18974/1/narcotic-drugs-and-psychotropic-substances-act-1985.pdf#page=16
--------------------------------------------------------------------------------

[2] Score: 0.4258
    CHAPTER IV - OFFENCES AND PENALTIES
    Section 39 - (1)
    Page: 25
    Content: ( 1 ) When any addict is found guilty of an offence punishable under section 27 1 [or for offences relating to small quantity of any narcotic drug or psychotropic substance] and if the court by which ...
    Source: https://www.indiacode.nic.in/bitstream/123456789/18974/1/narcotic-drugs-and-psychotropic-substa

### Creating Index only for Chapter 5

In [None]:
import json
import os

# Load chunks
with open('chunks.json', 'r', encoding='utf-8') as f:
    all_chunks = json.load(f)

# Filter chunks to only Chapter 5 (PROCEDURE)
chapter5_chunks = [chunk for chunk in all_chunks if chunk.get('chapter') == 'CHAPTER V' and chunk.get('chapter_heading') == 'PROCEDURE']

print(f"Total chunks: {len(all_chunks)}")
print(f"Chapter 5 (PROCEDURE) chunks: {len(chapter5_chunks)}")


# Create index only for Chapter 5
indexer = LegalFAISSIndex(openai_api_key=OPENAI_API_KEY)
indexer.create_index(chapter5_chunks)

# Save index with "procedure" in the name
indexer.save_index('legal_index_procedure.faiss', 'chunks_procedure.json')

Total chunks: 325
Chapter 5 (PROCEDURE) chunks: 146
Generating embeddings with OpenAI text-embedding-3-large...
Added 146 vectors to Flat index
Saved index to legal_index_procedure.faiss and chunks to chunks_procedure.json


In [None]:
import faiss
import numpy as np
from langchain_openai import OpenAIEmbeddings
import json
import os

# Load index and chunks
index = faiss.read_index('legal_index_procedure.faiss')
with open('chunks_procedure.json', 'r', encoding='utf-8') as f:
    chunks = json.load(f)

# Initialize embedding model
embedding_model = OpenAIEmbeddings(model='text-embedding-3-large', api_key=OPENAI_API_KEY)

# Query
query = "What to do after the suspect was caught with 15kg of Ganja "
k = 15

# Generate query embedding
query_vector = embedding_model.embed_query(query)
query_vector = np.array([query_vector]).astype('float32')
faiss.normalize_L2(query_vector)

# Search
scores, indices = index.search(query_vector, k)

# Display results
print(f"Query: {query}\n")
print("=" * 80)
for i, (idx, score) in enumerate(zip(indices[0], scores[0]), 1):
    chunk = chunks[idx]
    print(f"\n[{i}] Score: {score:.4f}")
    print(f"    {chunk['chapter']} - {chunk['chapter_heading']}")
    print(f"    {chunk['section']} - {chunk.get('subsection', 'N/A')}")
    print(f"    Page: {chunk['page_number']}")
    print(f"    Content: {chunk['content']}...")
    print(f"    Source: {chunk['source_url']}")
    print("-" * 80)


Query: What to do after the suspect was caught with 15kg of Ganja 


[1] Score: 0.4354
    CHAPTER V - PROCEDURE
    Section 63 - (2)
    Page: 32
    Content: ( 2 ) Where any article or thing seized under this Act appears to be liable to confiscation under section 60 or section 61 or section 62, but the person who committed the offence in connection therewith is not known or cannot be found, the court may inquire into and decide such liability, and may order confiscation accordingly: Provided that no order of confiscation of an article or thing shall be made until the expiry of one month from the date of seizure, or without hearing any person who may claim any right thereto and the evidence, if any, which he produces in respect of his claim: Provided further that if any such article or thing, other than a narcotic drug, psychotropic substance 5 [controlled substance], the opium poppy, coca plant or cannabis plant is liable to speedy and natural decay, or if the court is of opinion tha