In [1]:
# Prints a fixed string; presumably a quick check that the kernel runs cells.
print("Hello1")

Hello1


In [2]:
from dotenv import load_dotenv
import os

# Load the local .env file before any environment lookup happens
load_dotenv()

# Looked up under the lowercase name "openai_api_key"; None when it is not set
OPENAI_API_KEY = os.environ.get("openai_api_key")

In [3]:
import re
import json
from typing import List, Dict

# Read the text file
with open('english_only_output.txt', 'r', encoding='utf-8') as f:
    text = f.read()

print(f"Total text length: {len(text)} characters")

# A case marker is a line that starts with a 1-3 digit case number (but not a
# three-digit number beginning with 9), an optional trailing dot, and then a
# 4-digit year at the start of a following line. The named groups expose the
# case number and the year directly, so the marker text never has to be
# re-parsed with further regexes.
pattern = r'^(?!9\d{2})(?P<case_number>\d{1,3})\.?\s*\n(?P<year>\d{4})\b'

# Chunks whose stripped text is not longer than this many characters are dropped
MIN_CHUNK_CHARS = 50

# Find all markers and their positions (MULTILINE lets ^ anchor at every line start)
matches = list(re.finditer(pattern, text, re.MULTILINE))

print(f"Found {len(matches)} case markers")

# Each chunk spans from one marker to the start of the next marker (or to the
# end of the text for the last marker). Text before the first marker is not
# part of any chunk.
chunks = []
for i, match in enumerate(matches):
    start_pos = match.start()
    end_pos = matches[i + 1].start() if i + 1 < len(matches) else len(text)

    chunk_text = text[start_pos:end_pos].strip()

    # Only include substantial chunks
    if len(chunk_text) > MIN_CHUNK_CHARS:
        chunks.append({
            # Ids are assigned after filtering, so they stay dense (0, 1, 2, ...)
            'chunk_id': len(chunks),
            'case_number': match.group('case_number'),
            'year': match.group('year'),
            'content': chunk_text,
            # Offsets refer to the unstripped span in the source text
            'start_pos': start_pos,
            'end_pos': end_pos
        })

print(f"\n✅ Created {len(chunks)} chunks")
if chunks:
    print("\nFirst chunk preview:")
    print(f"Case: {chunks[0]['case_number']}, Year: {chunks[0]['year']}")
    print(f"Content preview: {chunks[0]['content'][:200]}...")

# Save chunks to JSON (ensure_ascii=False keeps non-ASCII text readable)
with open('chunks.json', 'w', encoding='utf-8') as f:
    json.dump(chunks, f, indent=2, ensure_ascii=False)

print("\n✅ Saved chunks to chunks.json")

Total text length: 215195 characters
Found 137 case markers

✅ Created 132 chunks

First chunk preview:
Case: 1, Year: 1990
Content preview: 1.
1990
886/1990
1990
Kadukkak
unnil
Appachan
& Another
vs. Excise
Circle
Inspector
Kerala
High court
Crl M.C.
886/1990
Honorable Supreme Court, ultimately
came to the conclusion that section 167
(2),...

✅ Saved chunks to chunks.json


In [4]:
import faiss
import numpy as np
from langchain_openai import OpenAIEmbeddings
from typing import List, Dict
import os

class NDPSJudgementsFAISSIndex:
    """
    Flat (brute-force) FAISS inner-product index over judgement chunks that are
    embedded with OpenAI's text-embedding-3-large model.

    Attributes:
        embedding_model: OpenAIEmbeddings client used for documents and queries.
        dimension: Embedding width. Starts at 3072 and is replaced by the
            actual width of the embeddings when create_index runs.
        index: The FAISS index, or None until create_index/load_index is called.
        chunks: Chunk dicts; position i corresponds to vector i in the index.
    """

    def __init__(self, openai_api_key: str):
        """
        Initialize FAISS for NDPS judgements retrieval using OpenAI embeddings (text-embedding-3-large)

        Args:
            openai_api_key: API key handed to the OpenAIEmbeddings client.
        """
        self.embedding_model = OpenAIEmbeddings(
            model='text-embedding-3-large',
            api_key=openai_api_key,
        )
        # OpenAI text-embedding-3-large has 3072 dimensions
        self.dimension = 3072
        self.index = None
        self.chunks = []

    def create_index(self, chunks: List[Dict]) -> None:
        """
        Create FAISS Flat index for maximum accuracy

        Args:
            chunks: Non-empty list of dicts, each with a 'content' string to embed.

        Raises:
            ValueError: If `chunks` is empty.
        """
        if not chunks:
            raise ValueError("Cannot build an index from an empty list of chunks")

        texts = [chunk['content'] for chunk in chunks]

        # Generate embeddings using OpenAI
        print("Generating embeddings with OpenAI text-embedding-3-large...")
        embeddings = self.embedding_model.embed_documents(texts)
        embeddings = np.array(embeddings).astype('float32')

        # Normalize for cosine similarity (inner product of unit vectors)
        faiss.normalize_L2(embeddings)

        # Size the index from the vectors actually returned rather than from a
        # hard-coded constant, so the two can never disagree.
        dimension = embeddings.shape[1]

        # Use Flat index for maximum accuracy
        index = faiss.IndexFlatIP(dimension)
        index.add(embeddings)

        # Commit state only after everything succeeded, so a failed embedding
        # call cannot leave `chunks` and `index` out of step with each other.
        self.dimension = dimension
        self.index = index
        self.chunks = chunks
        print(f"✅ Added {len(embeddings)} vectors to Flat index")

    def search(self, query: str, k: int = 5) -> List[Dict]:
        """
        Search for relevant judgements

        Args:
            query: Legal query or case description
            k: Number of results to return

        Returns:
            Up to `k` dicts of the form {'chunk': <chunk dict>, 'score': <float>},
            in the order FAISS returned them.

        Raises:
            RuntimeError: If no index has been created or loaded yet.
        """
        if self.index is None:
            raise RuntimeError(
                "No index available: call create_index() or load_index() first"
            )

        # Generate query embedding
        query_vector = self.embedding_model.embed_query(query)
        query_vector = np.array([query_vector]).astype('float32')
        faiss.normalize_L2(query_vector)

        # Search
        scores, indices = self.index.search(query_vector, k)

        results = []
        for idx, score in zip(indices[0], scores[0]):
            # FAISS pads the label array with -1 when fewer than k vectors are
            # available. A negative label must be rejected explicitly, because
            # Python would otherwise index the chunk list from the end.
            if 0 <= idx < len(self.chunks):
                results.append({
                    'chunk': self.chunks[int(idx)],
                    'score': float(score)
                })

        return results

    def save_index(self, index_path: str, chunks_path: str) -> None:
        """
        Save FAISS index and chunks

        Args:
            index_path: Destination file for the FAISS index.
            chunks_path: Destination JSON file for the chunk list.

        Raises:
            RuntimeError: If no index has been created or loaded yet.
        """
        if self.index is None:
            raise RuntimeError(
                "No index available: call create_index() or load_index() first"
            )
        faiss.write_index(self.index, index_path)
        import json
        with open(chunks_path, 'w', encoding='utf-8') as f:
            json.dump(self.chunks, f, indent=2, ensure_ascii=False)
        print(f"✅ Saved index to {index_path} and chunks to {chunks_path}")

    def load_index(self, index_path: str, chunks_path: str) -> None:
        """
        Load FAISS index and chunks

        Args:
            index_path: File previously written by save_index.
            chunks_path: JSON file holding the matching chunk list.
        """
        self.index = faiss.read_index(index_path)
        import json
        with open(chunks_path, 'r', encoding='utf-8') as f:
            self.chunks = json.load(f)
        print(f"✅ Loaded index with {self.index.ntotal} vectors and {len(self.chunks)} chunks")

# Read back the chunk records stored in chunks.json
with open('chunks.json', encoding='utf-8') as chunks_file:
    chunks = json.load(chunks_file)

# Embed every chunk, then write the index and the chunk list to disk
indexer = NDPSJudgementsFAISSIndex(openai_api_key=OPENAI_API_KEY)
indexer.create_index(chunks)
indexer.save_index(index_path='legal_index.faiss', chunks_path='chunks.json')

print("\n✅ FAISS index created successfully!")

Generating embeddings with OpenAI text-embedding-3-large...
✅ Added 132 vectors to Flat index
✅ Saved index to legal_index.faiss and chunks to chunks.json

✅ FAISS index created successfully!


In [5]:
# Example retrieval against the index saved on disk
query = "bail application in NDPS cases"

heavy_rule = "=" * 80
light_rule = "-" * 80

print(f"Query: {query}\n")
print(heavy_rule)

# Fresh wrapper instance; the index and chunks are restored from disk
indexer = NDPSJudgementsFAISSIndex(OPENAI_API_KEY)
indexer.load_index('legal_index.faiss', 'chunks.json')

# Ask for the five best-scoring chunks
results = indexer.search(query, k=5)

print(f"Found {len(results)} relevant results:\n")

for rank, hit in enumerate(results, start=1):
    case = hit['chunk']
    similarity = hit['score']
    body = case['content']

    print(f"\n{heavy_rule}")
    print(f"Result {rank} (Score: {similarity:.4f})")
    print(f"Case Number: {case.get('case_number', 'N/A')}")
    print(f"Year: {case.get('year', 'N/A')}")
    print("\nContent:")
    print(light_rule)
    # Show at most the first 500 characters; "..." marks a truncated chunk
    print(body[:500])
    if len(body) > 500:
        print("...")
    print(light_rule)

Query: bail application in NDPS cases

✅ Loaded index with 132 vectors and 132 chunks
Found 5 relevant results:


Result 1 (Score: 0.6588)
Case Number: 24
Year: 2001

Content:
--------------------------------------------------------------------------------
24
2001
Union of
India Vs.
Ashok
Kumar
Jaiswal
(2007)15SC
C569
Supreme Court held that Under the
mandatory
conditions provided in
Section 37 before granting bail the
Court is to be satisfied that there are
reasonable grounds for believing that
the accused is not guilty of offence and
that he is not likely to commit offences
under the Act while on bail. This Court
in various judgments while quashing
the orders granting bail to accused of
offence under the Act have cautioned
the courts about the m
...
--------------------------------------------------------------------------------

Result 2 (Score: 0.6484)
Case Number: 3
Year: 1990

Content:
--------------------------------------------------------------------------------
3.
1990
Raj
Ku