# RAG Preprocessing

In [1]:
import os
import fitz  # PyMuPDF
import re
from langchain.text_splitter import RecursiveCharacterTextSplitter
import json
from datetime import datetime

## Configuration

In [2]:
# PATHS
dataset_root = r"D:\Projects\legal\supreme_court_judgments"
output_path = "judgments_chunks.jsonl"

# CHUNK CONFIGURATION
def get_text_splitter():
    return RecursiveCharacterTextSplitter(
        chunk_size=500,     
        chunk_overlap=100
    )


## PDF Extraction and Cleaning

In [3]:
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    return "\n".join(page.get_text() for page in doc)

def clean_text(text):
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'Page \d+', '', text)
    text = re.sub(r"Indian Kanoon.*?", "", text)
    return text.strip()

## Metadata Extraction

In [4]:
def extract_metadata(text, file_name):
    case_title_match = re.search(r'^(.*?vs.*?) on ', text, re.IGNORECASE)
    case_title = case_title_match.group(1).strip() if case_title_match else file_name.replace("_", " ")

    date_match = re.search(r'on (\d{1,2} [A-Za-z]+,? \d{4})', text)
    try:
        date = datetime.strptime(date_match.group(1).replace(",", ""), "%d %B %Y")
        date_of_judgment = date.strftime("%Y-%m-%d")
    except:
        date_of_judgment = None

    citation = ""
    bench = []
    lines = text.split("\n")
    i = 0
    while i < len(lines):
        stripped = lines[i].strip()

        # Citation Extraction
        if re.match(r'^Equivalent\s+Citations?\s*[:：]?', stripped, re.IGNORECASE):
            citation_lines = []

            # extracting content after colon
            match = re.split(r'[:：]', stripped, maxsplit=1)
            if len(match) > 1 and match[1].strip():
                citation_lines.append(match[1].strip())

            # for extracting multi-line citations until another subheading is found
            i += 1
            while i < len(lines):
                next_line = lines[i].strip()
                if re.match(r'^(Bench|Coram|Date|Author|HEADNOTE)\s*[:：]', next_line, re.IGNORECASE):
                    break
                if next_line:
                    citation_lines.append(next_line)
                i += 1

            citation = " ".join(citation_lines)
            continue
        
        # Bench 
        elif re.match(r'^(Bench|Coram)\s*[:：]\s*', stripped, re.IGNORECASE):
            members = stripped.split(":", 1)[-1].strip()
            bench += [b.strip() for b in re.split(r'[;,]', members) if b.strip()]
        
        i += 1

    # Article References Extraction
    article_regex = r'\b(?:Art\.?|Article|Section|Sec\.?|s\.)\s*\d+[A-Za-z0-9\-()]*\)?(?: of the [A-Za-z ()]+)?'

    articles = list(set(re.findall(article_regex, text)))
    raw_articles = re.findall(article_regex, text)

    cleaned_articles = set()
    for art in raw_articles:
        cleaned = re.sub(r'\s+', ' ', art).strip()
        
        if '(' in cleaned and ')' not in cleaned:
            continue

        cleaned_articles.add(cleaned)

    articles = list(cleaned_articles)

    return {
        "doc_id": f"{date_of_judgment or 'unknown'}_{case_title.replace(' ', '_')[:50]}",
        "case_title": case_title,
        "date_of_judgment": date_of_judgment,
        "citation": citation,
        "bench": bench,
        "article_references": articles,
        "bench_strength": len(bench),
        "source_pdf": file_name
    }

## Main Preprocessing

In [5]:
def process_all_pdfs(root_dir, output_json_path):
    all_chunks = []
    splitter = get_text_splitter()

    for subdir, _, files in os.walk(root_dir):
        for file in files:
            if file.lower().endswith(".pdf"):
                try:
                    pdf_path = os.path.join(subdir, file)
                    file_name = os.path.basename(pdf_path)

                    raw_text = extract_text_from_pdf(pdf_path)
                    cleaned = clean_text(raw_text)

                    normalized = raw_text.replace('\r\n', '\n').replace('\r', '\n')
                    lines = normalized.splitlines()

                    judgment_start_index = None
                    for i, line in enumerate(lines):
                        if re.match(r'^\s*(J\s+U\s+D\s+G\s+M\s+E\s+N\s+T|JUDGMENT)\s*:?', line.strip(), re.IGNORECASE):
                            judgment_start_index = i
                            break

                    if judgment_start_index is not None:
                        judgment_lines = lines[judgment_start_index + 1:]
                        if re.match(r'^[A-Z][a-z]+, J\.', judgment_lines[0]):
                            judgment_lines = judgment_lines[1:]
                        main_text = "\n".join(judgment_lines).strip()
                    else:
                        main_text = normalized

                    metadata = extract_metadata(normalized, file_name)
                    chunks = splitter.split_text(main_text)
                    total_chunks = len(chunks)

                    for i, chunk in enumerate(chunks):
                        chunk_dict = {
                            **metadata,
                            "chunk_id": f"{metadata['doc_id']}_{i+1:03}",
                            "chunk_index": i + 1,
                            "total_chunks": total_chunks,
                            "chunk_text": chunk
                        }
                        all_chunks.append(chunk_dict)

                except Exception as e:
                    print(f"Failed: {file} — {e}")

    with open(output_json_path, "w", encoding="utf-8") as f:
        for chunk in all_chunks:
            f.write(json.dumps(chunk) + "\n")

    print("All judgments processed!\n", f"Output saved to: {output_json_path}")

## Saving the preprocessed data

In [251]:
process_all_pdfs(dataset_root, output_path)


All judgments processed!
 Output saved to: judgments_chunks.jsonl


## Checking the head of the JSONL file

In [6]:
with open(output_path, "r", encoding="utf-8") as f:
    for i in range(5):
        line = f.readline()
        data = json.loads(line)
        print(json.dumps(data, indent=2))

{
  "doc_id": "1950-03-14_Abdulla_Ahmed_vs_Animendra_Kissen_Mitter",
  "case_title": "Abdulla Ahmed vs Animendra Kissen Mitter",
  "date_of_judgment": "1950-03-14",
  "citation": "1950 AIR 15, 1950 SCR 30, AIR 1950 SUPREME COURT 15",
  "bench": [
    "Hiralal J. Kania",
    "Saiyid Fazal Ali",
    "Mehr Chand Mahajan"
  ],
  "article_references": [],
  "bench_strength": 3,
  "source_pdf": "Abdulla_Ahmed_vs_Animendra_Kissen_Mitter_on_14_March_1950_1.PDF",
  "chunk_id": "1950-03-14_Abdulla_Ahmed_vs_Animendra_Kissen_Mitter_001",
  "chunk_index": 1,
  "total_chunks": 186,
  "chunk_text": "APPEAL from the High Court of Judicature at Cal cutta:\nCivil Appeal No. XLIV of 1949.\nThis was an appeal by special leave from a judgment and decree of the High Court of\nJudicature at Calcutta (Hatties C.J. and Mukherjea J.) dated 5th January 1948 which\nvaried a judgment passed by a single Judge sitting on the Original Side of the same\nHigh Court (Gentle J.) dated 11th June, 1945. The facts of the ca

## Ensuring All PDFs were processed

In [7]:
# Get all PDF files
pdf_files = set()
for root, _, files in os.walk(dataset_root):
    for file in files:
        if file.lower().endswith(".pdf"):
            pdf_files.add(file)

print(f"Total PDF files found: {len(pdf_files)}")

# Read JSONL and extract source PDFs
jsonl_pdfs = set()
with open(output_path, "r", encoding="utf-8") as f:
    for line in f:
        try:
            obj = json.loads(line)
            jsonl_pdfs.add(obj.get("source_pdf", ""))
        except json.JSONDecodeError:
            print("JSON decode error in a line")

print(f"Unique PDFs processed in JSONL: {len(jsonl_pdfs)}")

# Compare
missing = pdf_files - jsonl_pdfs
extra = jsonl_pdfs - pdf_files

if missing:
    print(f"\n{len(missing)} PDFs missing from JSONL:")
    for m in sorted(missing):
        print(" -", m)
else:
    print("\nAll PDFs processed.")

if extra:
    print(f"\n{len(extra)} extra entries in JSONL not found in folder:")
    for e in sorted(extra):
        print(" -", e)


Total PDF files found: 26688
Unique PDFs processed in JSONL: 26688

All PDFs processed.


## Cleaning and Checking the Data

In [8]:
missing_dates = []
dates = []

with open(output_path, "r", encoding="utf-8") as f:
    for line in f:
        record = json.loads(line)
        date = record.get("date_of_judgment")
        if date:
            dates.append(date)
        else:
            missing_dates.append(record.get("source_pdf"))

if dates:
    print("Earliest date:", min(dates))
    print("Latest date:", max(dates))

if missing_dates:
    print(f"\n{len(missing_dates)} records missing dates:")
    for pdf in sorted(set(missing_dates)):
        print(" -", pdf)


Earliest date: 1950-01-24
Latest date: 2025-04-04

291 records missing dates:
 - Abdul_Khader_Rowther_vs_P_K_Sara_Bai_And_Ors_on_28_August_1989_1.PDF
 - Bharat_Petroleum_Corpn_Ltd_vs_Maddula_Ratnavalli_Ors_on_27_April_2007_1.PDF
 - Bridge_Roof_Co_India_Ltd_vs_Union_Of_India_Uoi_on_11_September_1962_1.PDF
 - India_Tobacco_Co_Ltd_vs_The_Commercial_Tax_Officer_on_6_November_1974_1.PDF
 - Krishan_Gopal_vs_Sri_Prakashchandra_And_Ors_on_8_November_1973_1.PDF


In [None]:
target_pdfs = set([
    "Abdul_Khader_Rowther_vs_P_K_Sara_Bai_And_Ors_on_28_August_1989_1.PDF",
    "Bharat_Petroleum_Corpn_Ltd_vs_Maddula_Ratnavalli_Ors_on_27_April_2007_1.PDF",
    "Bridge_Roof_Co_India_Ltd_vs_Union_Of_India_Uoi_on_11_September_1962_1.PDF",
    "India_Tobacco_Co_Ltd_vs_The_Commercial_Tax_Officer_on_6_November_1974_1.PDF",
    "Krishan_Gopal_vs_Sri_Prakashchandra_And_Ors_on_8_November_1973_1.PDF",
])

count=0
with open("judgments_chunks_.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        record = json.loads(line)
        if record["source_pdf"] in target_pdfs:
            print(f"\n📄 File: {record['source_pdf']}")
            print(f"🔹 Doc ID: {record['doc_id']}")
            print(f"🔹 Chunk ID: {record['chunk_id']}")
            print(f"🔹 Date of Judgment: {record['date_of_judgment']}")
            print(f"🔹 Case Title: {record['case_title']}")
            print(f"🔹 Bench: {record['bench']}")
            print(f"🔹 Citation: {record['citation']}")
            print(f"🔹 Chunk Text (start): {record['chunk_text'][:300]}...")
            count+=1

print(f"\nTotal matching records: {count}")



📄 File: Bridge_Roof_Co_India_Ltd_vs_Union_Of_India_Uoi_on_11_September_1962_1.PDF
🔹 Doc ID: unknown_Bridge_&_Roof_Co._(India)_Ltd._vs_Union_Of_India_(
🔹 Chunk ID: unknown_Bridge_&_Roof_Co._(India)_Ltd._vs_Union_Of_India_(_001
🔹 Date of Judgment: None
🔹 Case Title: Bridge & Roof Co. (India) Ltd. vs Union Of India (Uoi)
🔹 Bench: ['B.P. Sinha', 'J.C. Shah', 'K. Subba Rao', 'K.N. Wanchoo', 'N. Rajgopala']
🔹 Citation: AIR1963SC1474, [1962(5)FLR423], (1962)IILLJ490SC, [1963]3SCR978
🔹 Chunk Text (start): 1. The short question raised in this writ petition under Art. 32 of the Constitution is whether
production bonus is included within the term "basic wages" as defined in s. 2(b) of the Employees'
Provident Funds Act, No. 19 of 1952, (hereinafter referred to as the Act) Writ Petition 64 of 1962
(The J...

📄 File: Bridge_Roof_Co_India_Ltd_vs_Union_Of_India_Uoi_on_11_September_1962_1.PDF
🔹 Doc ID: unknown_Bridge_&_Roof_Co._(India)_Ltd._vs_Union_Of_India_(
🔹 Chunk ID: unknown_Bridge_&_Roof_Co._(I

In [269]:
input_path = "judgments_chunks.jsonl"
output_path = "judgments_chunks_fixed.jsonl"

# Manually providing correct dates
manual_dates = {
    "Abdul_Khader_Rowther_vs_P_K_Sara_Bai_And_Ors_on_28_August_1989_1.PDF": "1989-08-28",
    "Bharat_Petroleum_Corpn_Ltd_vs_Maddula_Ratnavalli_Ors_on_27_April_2007_1.PDF": "2007-04-27",
    "Bridge_Roof_Co_India_Ltd_vs_Union_Of_India_Uoi_on_11_September_1962_1.PDF": "1962-09-11",
    "India_Tobacco_Co_Ltd_vs_The_Commercial_Tax_Officer_on_6_November_1974_1.PDF": "1974-11-06",
    "Krishan_Gopal_vs_Sri_Prakashchandra_And_Ors_on_8_November_1973_1.PDF": "1973-11-08"
}

# Keeping track of number of chunks per PDF
chunk_counts = {}

# First pass to count chunks per file
with open(input_path, "r", encoding="utf-8") as f:
    for line in f:
        record = json.loads(line)
        fname = record["source_pdf"]
        chunk_counts[fname] = chunk_counts.get(fname, 0) + 1

# Second pass to apply fixes
with open(input_path, "r", encoding="utf-8") as infile, open(output_path, "w", encoding="utf-8") as outfile:
    chunk_index_tracker = {}

    for line in infile:
        record = json.loads(line)
        fname = record["source_pdf"]

        # If this file is one of the manually corrected ones
        if fname in manual_dates:
            fixed_date = manual_dates[fname]
            record["date_of_judgment"] = fixed_date

            # Prepare fixed doc_id using updated case title
            case_title = record["case_title"]
            fixed_doc_id = f"{fixed_date}_{case_title.replace(' ', '_')[:50]}"
            record["doc_id"] = fixed_doc_id

            # Update chunk index
            chunk_index_tracker[fname] = chunk_index_tracker.get(fname, 0) + 1
            i = chunk_index_tracker[fname]
            total_chunks = chunk_counts[fname]

            record["chunk_index"] = i
            record["total_chunks"] = total_chunks
            record["chunk_id"] = f"{fixed_doc_id}_{i:03}"

        outfile.write(json.dumps(record) + "\n")

print("All missing date and ID fields updated.")

All missing date and ID fields updated.


In [270]:
missing_dates = []
dates = []

with open(output_path, "r", encoding="utf-8") as f:
    for line in f:
        record = json.loads(line)
        date = record.get("date_of_judgment")
        if date:
            dates.append(date)
        else:
            missing_dates.append(record.get("source_pdf"))

if dates:
    print("Earliest date:", min(dates))
    print("Latest date:", max(dates))

if missing_dates:
    print(f"\n❌ {len(missing_dates)} records missing dates:")
    for pdf in sorted(set(missing_dates)):
        print(" -", pdf)



Earliest date: 1950-01-24
Latest date: 2025-04-04


In [272]:
target_pdfs = set([
    "Abdul_Khader_Rowther_vs_P_K_Sara_Bai_And_Ors_on_28_August_1989_1.PDF",
    "Bharat_Petroleum_Corpn_Ltd_vs_Maddula_Ratnavalli_Ors_on_27_April_2007_1.PDF",
    "Bridge_Roof_Co_India_Ltd_vs_Union_Of_India_Uoi_on_11_September_1962_1.PDF",
    "India_Tobacco_Co_Ltd_vs_The_Commercial_Tax_Officer_on_6_November_1974_1.PDF",
    "Krishan_Gopal_vs_Sri_Prakashchandra_And_Ors_on_8_November_1973_1.PDF",
])

count=0
with open("judgments_chunks_fixed.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        record = json.loads(line)
        if record["source_pdf"] in target_pdfs:
            print(f"\n📄 File: {record['source_pdf']}")
            print(f"🔹 Doc ID: {record['doc_id']}")
            print(f"🔹 Chunk ID: {record['chunk_id']}")
            print(f"🔹 Date of Judgment: {record['date_of_judgment']}")
            print(f"🔹 Case Title: {record['case_title']}")
            print(f"🔹 Bench: {record['bench']}")
            print(f"🔹 Citation: {record['citation']}")
            print(f"🔹 Chunk Text (start): {record['chunk_text'][:300]}...")
            count+=1

print(f"\nTotal matching records: {count}")



📄 File: Bridge_Roof_Co_India_Ltd_vs_Union_Of_India_Uoi_on_11_September_1962_1.PDF
🔹 Doc ID: 1962-09-11_Bridge_&_Roof_Co._(India)_Ltd._vs_Union_Of_India_(
🔹 Chunk ID: 1962-09-11_Bridge_&_Roof_Co._(India)_Ltd._vs_Union_Of_India_(_001
🔹 Date of Judgment: 1962-09-11
🔹 Case Title: Bridge & Roof Co. (India) Ltd. vs Union Of India (Uoi)
🔹 Bench: ['B.P. Sinha', 'J.C. Shah', 'K. Subba Rao', 'K.N. Wanchoo', 'N. Rajgopala']
🔹 Citation: AIR1963SC1474, [1962(5)FLR423], (1962)IILLJ490SC, [1963]3SCR978
🔹 Chunk Text (start): 1. The short question raised in this writ petition under Art. 32 of the Constitution is whether
production bonus is included within the term "basic wages" as defined in s. 2(b) of the Employees'
Provident Funds Act, No. 19 of 1952, (hereinafter referred to as the Act) Writ Petition 64 of 1962
(The J...

📄 File: Bridge_Roof_Co_India_Ltd_vs_Union_Of_India_Uoi_on_11_September_1962_1.PDF
🔹 Doc ID: 1962-09-11_Bridge_&_Roof_Co._(India)_Ltd._vs_Union_Of_India_(
🔹 Chunk ID: 1962-09-11_Br

In [273]:
input_path = "judgments_chunks_fixed.jsonl"  # or your latest .jsonl
output_path = "judgments_chunks_cleaned.jsonl"

def clean_kanoon(text):
    # Remove Indian Kanoon links and headers
    text = re.sub(r'Indian Kanoon\s*[-–—]?\s*https?://\S+', '', text, flags=re.IGNORECASE)
    text = re.sub(r'https?://(?:www\.)?indiankanoon\.org/\S+', '', text, flags=re.IGNORECASE)
    return text.strip()

with open(input_path, "r", encoding="utf-8") as infile, open(output_path, "w", encoding="utf-8") as outfile:
    for line in infile:
        record = json.loads(line)
        original = record["chunk_text"]
        cleaned = clean_kanoon(original)
        record["chunk_text"] = cleaned
        outfile.write(json.dumps(record) + "\n")

print("Indian Kanoon references removed from all chunk_text fields.")


Indian Kanoon references removed from all chunk_text fields.


In [274]:
jsonl_path = "judgments_chunks_cleaned.jsonl" 
kanoon_count = 0
files_with_kanoon = set()

with open(jsonl_path, "r", encoding="utf-8") as f:
    for line in f:
        record = json.loads(line)
        chunk = record.get("chunk_text", "")
        matches = re.findall(r'Indian Kanoon|https?://(?:www\.)?indiankanoon\.org/\S+', chunk, re.IGNORECASE)
        if matches:
            kanoon_count += len(matches)
            files_with_kanoon.add(record.get("source_pdf", "Unknown"))

print(f"Total 'Indian Kanoon' references in chunk_text: {kanoon_count}")
print(f"Total unique files with at least one mention: {len(files_with_kanoon)}")


Total 'Indian Kanoon' references in chunk_text: 0
Total unique files with at least one mention: 0


In [3]:
from collections import Counter

jsonl_path = "judgments_chunks_cleaned.jsonl"  # ← Change this to your actual file

chunk_ids = []

with open(jsonl_path, "r", encoding="utf-8") as f:
    for line in f:
        data = json.loads(line)
        chunk_ids.append(data["chunk_id"])

# Count how many times each chunk_id appears
counts = Counter(chunk_ids)

# Identify duplicates (appearing more than once)
duplicate_ids = [chunk_id for chunk_id, count in counts.items() if count > 1]

total = len(chunk_ids)
duplicates = len(duplicate_ids)

print(f"Found {duplicates} duplicate chunk_ids / {total} total chunks")


Found 1976 duplicate chunk_ids / 2150563 total chunks


In [3]:
from collections import defaultdict
import csv

output_csv = "duplicate_chunks.csv"  # Optional export

# Count all chunk_ids
chunk_id_counts = Counter()
with open(jsonl_path, "r", encoding="utf-8") as f:
    for line in f:
        data = json.loads(line)
        chunk_id_counts[data["chunk_id"]] += 1

# Store duplicates with metadata
duplicates = defaultdict(list)

with open(jsonl_path, "r", encoding="utf-8") as f:
    for line in f:
        data = json.loads(line)
        chunk_id = data["chunk_id"]
        if chunk_id_counts[chunk_id] > 1:
            duplicates[chunk_id].append({
                "chunk_text": data["chunk_text"][:200].replace("\n", " ") + "...",  # show only start
                "source_pdf": data["source_pdf"]
            })

# Display a few examples
print(f"Total duplicate chunk_ids found: {len(duplicates)}\n")
for chunk_id, entries in list(duplicates.items())[:5]:  # show just 5 for preview
    print(f"chunk_id: {chunk_id}")
    for i, entry in enumerate(entries):
        print(f"  - [#{i+1}] from PDF: {entry['source_pdf']}")
        print(f"    Text: {entry['chunk_text']}\n")

# Export to CSV for review
with open(output_csv, "w", encoding="utf-8", newline="") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=["chunk_id", "source_pdf", "chunk_text"])
    writer.writeheader()
    for chunk_id, entries in duplicates.items():
        for entry in entries:
            writer.writerow({
                "chunk_id": chunk_id,
                "source_pdf": entry["source_pdf"],
                "chunk_text": entry["chunk_text"]
            })

print(f"✅ Full duplicate data written to: {output_csv}")


Total duplicate chunk_ids found: 1976

chunk_id: 1952-12-22_Anglo_French_Textile_Co_Ltd_vs_Commissioner_Of_Inc_001
  - [#1] from PDF: Anglo_French_Textile_Co_Ltd_vs_Commissioner_Of_Income_Tax_Madras_on_22_December_1952 (1)_1.PDF
    Text: CIVIL APPELLATE JURISDICTION: Civil Appeal No. 12 of 1952. Appeal from the Judgment and Order dated the 18th January, 1950, of the High Court of Judicature at Madras (Satyanarayana Rao and Viswanatha ...

  - [#2] from PDF: Anglo_French_Textile_Co_Ltd_vs_Commissioner_Of_Income_Tax_Madras_on_22_December_1952 (2)_1.PDF
    Text: CIVIL APPELLATE JURISDICTION: Civil Appeal No. 13 of 1952. Appeal from the Judgment and Order dated 18th January, 1950, of the High Court of Judicature at Madras (Satyanarayana Rao and Viswanaths Sast...

  - [#3] from PDF: Anglo_French_Textile_Co_Ltd_vs_Commissioner_Of_Income_Tax_Madras_on_22_December_1952_1.PDF
    Text: CIVIL APPELLATE JURISDICTION: Appeal No. 1 1 of 1952. Appeal from the Judgment and Order dated January 18, 

In [4]:
duplicate_pdfs = set()

for chunk_id, entries in duplicates.items():
    for entry in entries:
        duplicate_pdfs.add(entry["source_pdf"])

print(f"Total unique PDFs with duplicate chunks: {len(duplicate_pdfs)}")
print("Sample names of PDFs with duplicates:",duplicate_pdfs)

Total unique PDFs with duplicate chunks: 56
Sample names of PDFs with duplicates: {'The_Morvi_Mercantile_Bank_Ltd_And_Anr_vs_Union_Of_India_Through_The_General_on_3_March_1965_1.PDF', 'Commissioner_Of_Income_Tax_vs_M_S_Hindustan_Bulk_Carriers_M_S_Damani_on_17_December_2002_1.PDF', 'Keshav_Nilkanth_Joglekar_vs_The_Commissioner_Of_Police_on_17_September_1956_1.PDF', 'Swadeshi_Cotton_Mills_Co_Ltd_vs_Commissioner_Of_Income_Tax_Uttar_on_20_September_1966 (1)_1.PDF', 'Commissioner_Of_Income_Tax_Punjab_vs_Raghbir_Singh_on_9_April_1965_1.PDF', 'Anglo_French_Textile_Co_Ltd_vs_Commissioner_Of_Income_Tax_Madras_on_22_December_1952_1.PDF', 'Minister_Of_National_Revenue_vs_Anaconda_American_Brass_Ltd_on_13_December_1955_1.PDF', 'Dosa_Satyanarayanamurty_Etc_vs_The_Andhra_Pradesh_State_Roadtransport_on_8_September_1960_1.PDF', 'A_N_Lakshmana_Shenoy_vs_The_Income_Tax_Officer_Ernakulam_on_28_April_1958_1.PDF', 'Shoorji_Vallabhdas_Co_Bombay_vs_The_Commissioner_Of_Income_Tax_Excess_on_19_April_1960_1.PDF

In [5]:
input_jsonl = "judgments_chunks_cleaned.jsonl"         
output_jsonl = "judgments_chunks_cleaned_unique.jsonl"  

# Load entries into memory by doc_id
grouped_by_doc = defaultdict(list)
doc_id_to_pdf = {}

with open(input_jsonl, "r", encoding="utf-8") as f:
    for line in f:
        data = json.loads(line)
        grouped_by_doc[data["doc_id"]].append(data)
        doc_id_to_pdf[data["doc_id"]] = data["source_pdf"]

# Only process doc_ids where the PDF was in the duplicate set
with open(output_jsonl, "w", encoding="utf-8") as fout:
    for doc_id, chunks in grouped_by_doc.items():
        source_pdf = doc_id_to_pdf[doc_id]
        if source_pdf in duplicate_pdfs:
            # Reassign chunk_ids only for these
            chunks.sort(key=lambda x: x["chunk_index"])
            for i, chunk in enumerate(chunks):
                chunk["chunk_id"] = f"{doc_id}_{i+1:03}"
                chunk["chunk_index"] = i + 1
                chunk["total_chunks"] = len(chunks)
                fout.write(json.dumps(chunk) + "\n")
        else:
            # Leave untouched for non-duplicate ones
            for chunk in chunks:
                fout.write(json.dumps(chunk) + "\n")

print(f"✅ New JSONL saved: {output_jsonl}")


✅ New JSONL saved: judgments_chunks_cleaned_unique.jsonl


In [4]:
with open('judgments_chunks_cleaned_unique.jsonl', "r", encoding="utf-8") as f:
    ids = [json.loads(line)["chunk_id"] for line in f]

dupes = [k for k, v in Counter(ids).items() if v > 1]
print(f"🚨 Duplicates found after renaming: {len(dupes)}")

🚨 Duplicates found after renaming: 0


In [6]:
# Get all PDF files
pdf_files = set()
for root, _, files in os.walk('supreme_court_judgments'):
    for file in files:
        if file.lower().endswith(".pdf"):
            pdf_files.add(file)

print(f"Total PDF files found: {len(pdf_files)}")

# Read JSONL and extract source PDFs
jsonl_pdfs = set()
with open('judgments_chunks_cleaned_unique.jsonl', "r", encoding="utf-8") as f:
    for line in f:
        try:
            obj = json.loads(line)
            jsonl_pdfs.add(obj.get("source_pdf", ""))
        except json.JSONDecodeError:
            print("JSON decode error in a line")

print(f"Unique PDFs processed in JSONL: {len(jsonl_pdfs)}")

# Compare
missing = pdf_files - jsonl_pdfs
extra = jsonl_pdfs - pdf_files

if missing:
    print(f"\n{len(missing)} PDFs missing from JSONL:")
    for m in sorted(missing):
        print(" -", m)
else:
    print("\nAll PDFs processed.")

if extra:
    print(f"\n{len(extra)} extra entries in JSONL not found in folder:")
    for e in sorted(extra):
        print(" -", e)


Total PDF files found: 26688
Unique PDFs processed in JSONL: 26688

All PDFs processed.


In [13]:
# Total no. of chunks

with open('judgments_chunks_cleaned_unique.jsonl', "r", encoding="utf-8") as f:
    total_chunks = sum(1 for _ in f)
print(f"Total number of chunks: {total_chunks}")


Total number of chunks: 2150563


In [17]:
# Check the first few entries of the embedded output
with open('embeddings\embedding_output_part_3.jsonl', "r", encoding="utf-8") as f:
    for i in range(5):
        line = f.readline()
        data = json.loads(line)
        print(json.dumps(data, indent=2))

{
  "id": "2012-02-28_Ramnaresh_&_Ors_vs_State_Of_Chhattisgarh_056",
  "embedding": [
    -0.04436200112104416,
    0.1315470039844513,
    -0.048899874091148376,
    -0.02480805106461048,
    0.08776503801345825,
    0.03942601755261421,
    0.013300788588821888,
    0.037708599120378494,
    0.0229124017059803,
    0.026483871042728424,
    0.05283160135149956,
    -0.032794706523418427,
    0.016042737290263176,
    -0.012913988903164864,
    -0.04021573066711426,
    0.007192423567175865,
    0.00023793577565811574,
    -0.05029822885990143,
    -0.001662302645854652,
    0.022295866161584854,
    -0.04978184774518013,
    -0.010004037991166115,
    0.06583981215953827,
    -0.020235400646924973,
    -0.06202244386076927,
    0.008952728472650051,
    -0.039913296699523926,
    -0.017075305804610252,
    0.040204551070928574,
    0.0027290338184684515,
    -0.0340082049369812,
    0.023543106392025948,
    -0.05846500024199486,
    -0.01147310808300972,
    0.08454354107379913,
   

In [16]:
# Check the last few entries of the embedded output

from collections import deque
import json

file_path = 'embeddings/embedding_output_part_2.jsonl'

# Keep only the last 5 lines in memory
with open(file_path, "r", encoding="utf-8") as f:
    last_lines = deque(f, maxlen=5)

# Print the last 5 parsed JSON lines
for line in last_lines:
    data = json.loads(line)
    print(json.dumps(data, indent=2))

{
  "id": "2012-02-28_Ramnaresh_&_Ors_vs_State_Of_Chhattisgarh_051",
  "embedding": [
    -0.06306172907352448,
    0.07187744230031967,
    -0.11659900099039078,
    -0.01039380393922329,
    -0.012525205500423908,
    0.013459695503115654,
    0.05617637559771538,
    0.025270020589232445,
    -0.06053391098976135,
    0.07882948219776154,
    0.06367697566747665,
    0.006695875432342291,
    -0.030578430742025375,
    0.03294745832681656,
    0.0029580635018646717,
    -0.022937452420592308,
    0.04401610419154167,
    -0.058624017983675,
    -0.05341615900397301,
    0.07899939268827438,
    -5.807403795188293e-05,
    0.016218865290284157,
    0.08863861858844757,
    -0.042525604367256165,
    0.008231116458773613,
    0.05690210312604904,
    0.012486227788031101,
    -0.024902714416384697,
    0.015997570008039474,
    0.009917204268276691,
    0.010451455600559711,
    0.020097719505429268,
    -0.048979662358760834,
    -0.07015587389469147,
    0.0713820829987526,
    -0.0