In [1]:
# check pdf
import fitz  # PyMuPDF
import re
import json
import random


def inspect_pdf_page(pdf_path, page_number):
    doc = fitz.open(pdf_path)
    page = doc.load_page(page_number)  # 0-indexed!

    blocks = page.get_text("blocks")

    for i, block in enumerate(blocks):
        x0, y0, x1, y1, text, block_type, block_no = block
        print(f"\n--- Block {i} ---")
        print(f"Position: ({x0:.1f}, {y0:.1f}) to ({x1:.1f}, {y1:.1f})")
        print(f"Type: {block_type} ({'Text' if block_type == 0 else 'Image/Shape'})")
        print(f"Content:\n{text.strip()}\n")

def inspect_pdf_fonts(pdf_path, page_number):
    doc = fitz.open(pdf_path)
    page = doc.load_page(page_number)

    text_dict = page.get_text("dict")

    for block in text_dict["blocks"]:
        if block["type"] == 0:  # Text block
            for line in block["lines"]:
                for span in line["spans"]:
                    print("\n--- Span ---")
                    print(f"Text: {span['text']}")
                    print(f"Font: {span['font']}")
                    print(f"Size: {span['size']:.2f}")
                    print(f"Color: {span['color']}")

def inspect_all_blocks_for_titles(pdf_path, page_number=0):
    """
    Prints all text blocks from the given page and marks which ones match the title pattern.
    """
    import re
    doc = fitz.open(pdf_path)
    page = doc.load_page(page_number)
    text_dict = page.get_text("dict")

    for i, block in enumerate(text_dict["blocks"]):
        if block["type"] != 0:
            continue

        full_text = " ".join(
            span["text"] for line in block["lines"] for span in line["spans"]
        ).strip()

        match = re.match(r"^\d{3,5}\.", full_text)
        print(f"\n--- Block {i} ---")
        print(f"Text: {repr(full_text)}")
        if match:
            print("MATCHED AS TITLE")
        else:
            print("Not a match")

pdf_path = "jury_instructions_abridged.pdf"
page_number = 1# for example, page 11 if you count from 1
inspect_all_blocks_for_titles(pdf_path, page_number)



--- Block 0 ---
Text: 'Cal.App.3d 314, 325 [276 Cal.Rptr. 430] (quoting  People v. Miller  (1916) 171 Cal. 649, 652 [154 P. 468] and holding that it was prejudicial misconduct for jurors to refer to the dictionary for deﬁnition of the word “preponderance”).) Secondary Sources'
❌ Not a match

--- Block 1 ---
Text: '1 Witkin, California Evidence (6th ed. 2023) Burden of Proof and Presumptions, § 39 Jefferson, California Evidence Benchbook (3d ed. 1997) Ch. 45, Burdens of Proof and of Producing Evidence; Presumptions 48 California Forms of Pleading and Practice, Ch. 551,  Trial , §§ 551.90, 551.92 (Matthew Bender)'
❌ Not a match

--- Block 2 ---
Text: 'EVIDENCE CACI No. 200'
❌ Not a match

--- Block 3 ---
Text: '43'
❌ Not a match


In [3]:

import fitz  # PyMuPDF
import re
import json
import random
import time

# Vary question templates
question_templates_main = [
    "What is the legal standard for the {title}?",
    "What is the principle of {title}?",
    "What is the jury instruction for {title}?",
    "How is {title} explained to the jury?",
    "What are the elements of the {title}?",
    "How should a jury apply {title}?",
    "What does {title} require under the law?"
]

question_templates_sources = [
    "What case law supports {title}?",
    "What authority discusses the {title}?",
    "Which case relates to {title}?",
    "Which legal source is cited for {title}?"
]

# Match regex titles to find the title and all body text
def extract_title_and_body_blocks(pdf_path):

    doc = fitz.open(pdf_path)
    results = []

    current_title = None
    current_body = []
    capturing = False

    for page in doc:
        text_dict = page.get_text("dict")
        for block in text_dict["blocks"]:
            if block["type"] != 0:
                continue

            block_text = " ".join(
                span["text"] for line in block["lines"] for span in line["spans"]
            ).strip()

            if re.match(r"^\d{3,5}\s*\.\s+", block_text):
                has_bold = any(
                    span["font"] == "Helvetica-Bold"
                    for line in block["lines"]
                    for span in line["spans"]
                )
                if has_bold:
                    # Save previous block set
                    if current_title:
                        results.append((current_title, "\n".join(current_body).strip()))
                    # Start new one
                    current_title = re.sub(r"^\d{3,5}\s*\.\s+", "", block_text)
                    current_body = []
                    capturing = True
                    continue

            if capturing:
                current_body.append(block_text)

    if current_title and current_body:
        results.append((current_title, "\n".join(current_body).strip()))

    return results


# Extract summary section from text body
# Ends when it encounters a line like "New September 2003; Revised...
def extract_summary_from_body(body_text):

    lines = body_text.splitlines()
    summary_lines = []
    for line in lines:
        if re.search(r"New \w+ \d{4}|Revised \w+ \d{4}", line):
            break
        summary_lines.append(line.strip())
    return " ".join(summary_lines).strip()


# Extract sources and authorit bullet points section from text body
# Ends when it reaches Secondary Sources
def extract_sources_from_body(body_text):

    lines = body_text.splitlines()
    in_sources = False
    sources = []

    for line in lines:
        if line.strip().startswith("Sources and Authority"):
            in_sources = True
            continue
        if line.strip().startswith("Secondary Sources"):
            break
        if in_sources:
            clean_line = re.sub(r"^(\d+\.|\u2022)\s*", "", line.strip())
            if clean_line:
                sources.append(clean_line)

    return [s for s in sources if len(s) > 20]  # filter out short or invalid entries


# ask for a summary
def create_summary_qna(title, summary):
    question = random.choice(question_templates_main).format(title=title)
    return {
        "messages": [
            {"role": "user", "content": question},
            {"role": "assistant", "content": summary}
        ]
    }

# ask for sources
def create_random_source_qna(title, sources):
    if not sources:
        return None
    selected_sources = random.sample(sources, min(2, len(sources)))
    qna_pairs = []
    for source in selected_sources:
        question = random.choice(question_templates_sources).format(title=title)
        qna_pairs.append({
            "messages": [
                {"role": "user", "content": question},
                {"role": "assistant", "content": source}
            ]
        })
    return qna_pairs


def generate_qna_from_pdf(pdf_path, output_jsonl):
    entries = extract_title_and_body_blocks(pdf_path)
    results = []

    for title, body in entries:
        summary = extract_summary_from_body(body)
        sources = extract_sources_from_body(body)

        if summary:
            results.append(create_summary_qna(title, summary))

        source_qnas = create_random_source_qna(title, sources)
        if source_qnas:
            for source_qna in source_qnas:
                results.append(source_qna)
    
    with open(output_jsonl, "w", encoding="utf-8") as f:
        for item in results:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")

    print(f"Extracted {len(results)} Q&A pairs into {output_jsonl}")




pdf_path = "jury_instructions_abridged.pdf" 
output_jsonl = "caci_instructions.jsonl"
generate_qna_from_pdf(pdf_path, output_jsonl)


#results = extract_title_and_body_blocks(pdf_path)



Extracted 2352 Q&A pairs into caci_instructions.jsonl
