In [1]:

import os, io, re, json, tempfile, mimetypes, math
from typing import Tuple, Dict, Any, Optional, List

# --- File extraction deps
# Handle fitz import conflict - try PyMuPDF first, fallback to alternative
try:
    # Try to import PyMuPDF's fitz module specifically
    import sys
    import importlib.util
    
    # Check if PyMuPDF is installed and get the correct fitz module
    try:
        import pymupdf
        # PyMuPDF is installed, try to get the fitz module from it
        fitz = pymupdf.fitz
        print("✅ PyMuPDF (fitz) imported successfully via pymupdf")
    except ImportError:
        # Try direct import of fitz (PyMuPDF)
        import fitz  # PyMuPDF
        print("✅ PyMuPDF (fitz) imported successfully")
        
except ImportError as e:
    print(f"⚠️  PyMuPDF not available: {e}")
    print("📝 PDF processing will be limited. Install PyMuPDF: pip install PyMuPDF")
    fitz = None
except Exception as e:
    print(f"⚠️  Unexpected error importing fitz: {e}")
    print("📝 This might be due to a naming conflict with another 'fitz' package")
    print("💡 Try: pip uninstall fitz && pip install PyMuPDF")
    fitz = None

try:
    from pdf2image import convert_from_path
    print("✅ pdf2image imported successfully")
except ImportError as e:
    print(f"⚠️  pdf2image not available: {e}")
    print("📝 OCR fallback will be limited. Install pdf2image: pip install pdf2image")
    convert_from_path = None

try:
    import pytesseract
    print("✅ pytesseract imported successfully")
except ImportError as e:
    print(f"⚠️  pytesseract not available: {e}")
    print("📝 OCR processing will be limited. Install pytesseract: pip install pytesseract")
    pytesseract = None

try:
    import cv2
    import numpy as np
    print("✅ OpenCV and NumPy imported successfully")
except ImportError as e:
    print(f"⚠️  OpenCV/NumPy not available: {e}")
    print("📝 Image processing will be limited. Install opencv-python: pip install opencv-python")
    cv2 = None
    np = None

try:
    from PIL import Image
    print("✅ PIL (Pillow) imported successfully")
except ImportError as e:
    print(f"⚠️  PIL not available: {e}")
    print("📝 Image processing will be limited. Install Pillow: pip install Pillow")
    Image = None

try:
    import docx2txt
    print("✅ docx2txt imported successfully")
except ImportError as e:
    print(f"⚠️  docx2txt not available: {e}")
    print("📝 DOCX processing will be limited. Install docx2txt: pip install docx2txt")
    docx2txt = None

# --- OpenAI (Responses API with Structured Outputs)
try:
    from openai import OpenAI
    print("✅ OpenAI imported successfully")
except ImportError as e:
    print(f"❌ OpenAI not available: {e}")
    print("📝 Core functionality requires OpenAI. Install: pip install openai")
    OpenAI = None

# --- Validation
try:
    from jsonschema import Draft7Validator
    print("✅ jsonschema imported successfully")
except ImportError as e:
    print(f"⚠️  jsonschema not available: {e}")
    print("📝 Validation will be limited. Install jsonschema: pip install jsonschema")
    Draft7Validator = None

# Read API key from api.txt file
def get_api_key_from_file(file_path: str = r"C:\Users\Leo\AI projects\_api.txt", keyname: str = "RubricParserPrompt") -> str:
    """Read API key from api.txt file for rubriCheck project."""
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                if line.strip().startswith(f'{keyname}:'):
                    return line.strip().split(':', 1)[1].strip()
        raise ValueError("rubriCheck API key not found in file")
    except FileNotFoundError:
        raise FileNotFoundError(f"API file not found at {file_path}")

# Set the API key from file
api_file = r"C:\Users\Leo\AI projects\_api.txt"
keyname = "RubricParserPrompt"
api_key = get_api_key_from_file()
os.environ["OPENAI_API_KEY"] = api_key

# Verify the API key is set
try:
    api_key = os.environ["OPENAI_API_KEY"]
    if api_key == "your-api-key-here":
        print("⚠️  Please replace 'your-api-key-here' with your actual OpenAI API key!")
    else:
        print("✅ API key is set and ready to use!")
        print(f"🔑 Key starts with: {api_key[:8]}...")
except KeyError:
    print("❌ API key not found in environment variables")


⚠️  Unexpected error importing fitz: module 'pymupdf' has no attribute 'fitz'
📝 This might be due to a naming conflict with another 'fitz' package
💡 Try: pip uninstall fitz && pip install PyMuPDF
✅ pdf2image imported successfully
✅ pytesseract imported successfully
✅ OpenCV and NumPy imported successfully
✅ PIL (Pillow) imported successfully
✅ docx2txt imported successfully
✅ OpenAI imported successfully
✅ jsonschema imported successfully
✅ API key is set and ready to use!
🔑 Key starts with: sk-proj-...


In [2]:
# rubric_parser.py
# --------------------------------------------------------------------
# Ingest TXT/DOCX/PDF/Image → extract text → LLM parse (Structured Outputs)
# → local JSON Schema validation → return normalized rubric JSON
# --------------------------------------------------------------------



# =========================
# JSON SCHEMA (Structured)
# =========================

RUBRIC_JSON_SCHEMA: Dict[str, Any] = {
    "$schema": "http://json-schema.org/draft-07/schema#",
    "title": "RubricSchema",
    "type": "object",
    "additionalProperties": False,
    "properties": {
        "title": {"type": ["string", "null"], "maxLength": 200},
        "scale": {
            "type": "object",
            "additionalProperties": False,
            "properties": {
                "type": {"type": "string", "enum": ["categorical", "numeric"]},
                "levels": {
                    "type": ["array", "null"],
                    "items": {"type": "string", "minLength": 1},
                    "minItems": 1
                },
                "min": {"type": ["number", "null"]},
                "max": {"type": ["number", "null"]},
                "original_levels": {
                    "type": ["array", "null"],
                    "items": {"type": "string"}
                },
                "synonyms": {
                    "type": ["object", "null"],
                    "additionalProperties": {"type": "string"}
                }
            },
            "required": ["type"]
        },
        "criteria": {
            "type": "array",
            "minItems": 1,
            "items": {
                "type": "object",
                "additionalProperties": False,
                "properties": {
                    "name": {"type": "string", "minLength": 1, "maxLength": 120},
                    "descriptor_by_level": {
                        "type": "object",
                        "additionalProperties": {"type": "string"}
                    },
                    "weight": {"type": "number", "exclusiveMinimum": 0},
                    "evidence_hint": {"type": ["string", "null"]},
                    "notes": {"type": ["string", "null"]}
                },
                "required": ["name", "descriptor_by_level"]
            }
        },
        "notes": {"type": ["string", "null"]},
        "source_parse": {
            "type": "object",
            "additionalProperties": False,
            "properties": {
                "method": {"type": "string", "enum": ["table", "narrative", "hybrid", "ocr"]},
                "confidence": {"type": "number", "minimum": 0.0, "maximum": 1.0},
                "warnings": {
                    "type": "array",
                    "items": {"type": "string"},
                    "default": []
                }
            },
            "required": ["method", "confidence"]
        }
    },
    "required": ["scale", "criteria", "source_parse"]
}

# Pre-compile validator for speed
RUBRIC_VALIDATOR = Draft7Validator(RUBRIC_JSON_SCHEMA)

# =========================
# File → text extraction
# =========================

IMG_EXT = {".png", ".jpg", ".jpeg", ".webp", ".tif", ".tiff", ".bmp"}

def _deskew_and_binarize(pil_img) -> Image.Image:
    """Basic deskew + binarization to improve OCR."""
    if cv2 is None or np is None:
        print("⚠️  OpenCV/NumPy not available, skipping image preprocessing")
        return pil_img
    
    img = np.array(pil_img.convert("L"))  # grayscale
    # threshold
    _, th = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    # deskew
    coords = np.column_stack(np.where(th == 0))
    angle = 0.0
    if coords.size > 0:
        rect = cv2.minAreaRect(coords)
        angle = rect[-1]
        if angle < -45:
            angle = -(90 + angle)
        else:
            angle = -angle
    (h, w) = th.shape[:2]
    M = cv2.getRotationMatrix2D((w // 2, h // 2), angle, 1.0)
    rotated = cv2.warpAffine(th, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
    return Image.fromarray(rotated)

def _ocr_pil_image(pil_img, lang: str = "eng") -> str:
    if pytesseract is None:
        print("⚠️  pytesseract not available, cannot perform OCR")
        return ""
    
    proc = _deskew_and_binarize(pil_img)
    return pytesseract.image_to_string(proc, lang=lang)

def _extract_from_pdf(path: str) -> Tuple[str, str]:
    """Return (text, method). Try native text first; fallback to OCR if text looks empty."""
    if fitz is None:
        print("⚠️  PyMuPDF not available, cannot extract PDF text")
        return "", "error"
    
    doc = fitz.open(path)
    texts = []
    for p in doc:
        txt = p.get_text("text")
        if txt:
            texts.append(txt)
    native_text = "\n".join(texts).strip()
    if len(native_text) >= 400 or (len(native_text) > 40 and len(texts) >= 1):
        return native_text, "table" if " | " in native_text or re.search(r"\bPoints?\b", native_text, re.I) else "narrative"

    # Fallback to OCR
    if convert_from_path is None:
        print("⚠️  pdf2image not available, cannot perform OCR fallback")
        return native_text, "narrative"  # Return what we have
    
    pages = convert_from_path(path, dpi=300)
    ocr_texts = []
    for pil in pages:
        ocr_texts.append(_ocr_pil_image(pil))
    return "\n".join(ocr_texts).strip(), "ocr"

def _extract_from_image(path: str) -> Tuple[str, str]:
    if Image is None:
        print("⚠️  PIL not available, cannot process images")
        return "", "error"
    
    pil = Image.open(path)
    return _ocr_pil_image(pil), "ocr"

def _extract_from_docx(path: str) -> Tuple[str, str]:
    if docx2txt is None:
        print("⚠️  docx2txt not available, cannot process DOCX files")
        return "", "error"
    
    return docx2txt.process(path) or "", "narrative"

def _extract_from_txt(path: str) -> Tuple[str, str]:
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        return f.read(), "narrative"

def extract_text_from_file(path: str) -> Tuple[str, str]:
    ext = os.path.splitext(path)[1].lower()
    if ext == ".pdf":
        return _extract_from_pdf(path)
    if ext == ".docx":
        return _extract_from_docx(path)
    if ext in IMG_EXT:
        return _extract_from_image(path)
    return _extract_from_txt(path)

# =========================
# OpenAI call (Structured Outputs)
# =========================



def _openai_client():
    if OpenAI is None:
        raise RuntimeError("OpenAI not available")
    return OpenAI()  # reads OPENAI_API_KEY

def parse_rubric_with_llm(raw_text: str, method_hint: str = "narrative", model: str = "gpt-4o-mini") -> Dict[str, Any]:
    """
    Calls the OpenAI Chat Completions API with JSON mode to get a strictly valid rubric JSON.
    """
    if OpenAI is None:
        print("❌ OpenAI not available, cannot parse rubric")
        return {"error": "OpenAI not available"}
    
    client = _openai_client()

    # Build the system message with schema instructions
    system_message = (
        "You are a precise rubric parser. Convert the given rubric into strictly valid JSON that "
        "conforms to the provided JSON Schema. Do not fabricate content. If information is missing, "
        "omit it and write a warning. Preserve original level wording in descriptors; normalize level names.\n\n"
        "IMPORTANT: You must respond with valid JSON only. No additional text or explanations."
    )
    
    user_message = (
        "RAW_RUBRIC_TEXT:\n```\n" + raw_text.strip() + "\n```\n\n"
        "Instructions:\n"
        f"- The extraction method was '{method_hint}'. Set source_parse.method accordingly.\n"
        "- If levels like Excellent/Good/Fair/Poor are present, use them as categorical scale levels in best→worst order.\n"
        "- If a numeric points scale is present (e.g., 0–4), include scale.min/scale.max and set type='numeric'.\n"
        "- Parse weights when explicitly indicated (e.g., 'Clarity (30%)' → weight=0.30); otherwise default to 1.0.\n"
        "- If any descriptor is missing for a level, omit that key and add a warning.\n"
        "- If multiple rubrics are present, parse the first major rubric and add a warning.\n"
        "- Output only the JSON. No extra text.\n\n"
        "Required JSON structure:\n"
        "{\n"
        '  "title": "string or null",\n'
        '  "scale": {\n'
        '    "type": "categorical or numeric",\n'
        '    "levels": ["array of strings for categorical"],\n'
        '    "min": "number for numeric",\n'
        '    "max": "number for numeric"\n'
        '  },\n'
        '  "criteria": [\n'
        '    {\n'
        '      "name": "string",\n'
        '      "descriptor_by_level": {"level": "description"},\n'
        '      "weight": "number"\n'
        '    }\n'
        '  ],\n'
        '  "source_parse": {\n'
        '    "method": "table/narrative/hybrid/ocr",\n'
        '    "confidence": "number 0.0-1.0",\n'
        '    "warnings": ["array of strings"]\n'
        '  }\n'
        "}"
    )

    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": system_message},
                {"role": "user", "content": user_message}
            ],
            response_format={"type": "json_object"},
            max_tokens=2048,
            temperature=0.1
        )
        
        # Extract the JSON response
        json_text = response.choices[0].message.content
        parsed = json.loads(json_text)
        
        return parsed
        
    except json.JSONDecodeError as e:
        print(f"❌ Failed to parse JSON response: {e}")
        print(f"Raw response: {json_text}")
        raise
    except Exception as e:
        print(f"❌ OpenAI API error: {e}")
        raise

# =========================
# Validation & post-checks
# =========================

def validate_rubric(rubric: Dict[str, Any]) -> List[str]:
    errors = []
    
    # Only validate if jsonschema is available
    if Draft7Validator is not None:
        for err in sorted(RUBRIC_VALIDATOR.iter_errors(rubric), key=lambda e: e.path):
            loc = "/".join([str(x) for x in err.path])
            errors.append(f"{loc}: {err.message}")
    else:
        print("⚠️  jsonschema not available, skipping JSON schema validation")
    # Additional semantic checks:
    # 1) categorical: descriptor keys ⊆ levels
    try:
        if rubric["scale"]["type"] == "categorical":
            levels = set(rubric["scale"].get("levels") or [])
            for i, c in enumerate(rubric["criteria"]):
                bad = [k for k in c["descriptor_by_level"].keys() if k not in levels]
                if bad:
                    errors.append(f"criteria[{i}].descriptor_by_level has keys not in scale.levels: {bad}")
    except KeyError:
        pass
    # 2) numeric: min < max (jsonschema also checks types)
    if rubric["scale"]["type"] == "numeric":
        mn = rubric["scale"].get("min"); mx = rubric["scale"].get("max")
        if isinstance(mn, (int,float)) and isinstance(mx, (int,float)) and not (mn < mx):
            errors.append("scale.min must be < scale.max")
    # 3) unique criterion names (case-insensitive)
    names = [c["name"].strip().lower() for c in rubric.get("criteria", [])]
    if len(set(names)) != len(names):
        errors.append("criteria names must be unique (case-insensitive)")
    return errors

# =========================
# Public entry point
# =========================

def parse_rubric_file(path: str, model: str = "gpt-4o-mini") -> Dict[str, Any]:
    raw_text, parse_hint_method = extract_text_from_file(path)
    if not raw_text or len(raw_text.strip()) < 30:
        raise ValueError("Could not extract enough text from the file for parsing.")

    rubric = parse_rubric_with_llm(raw_text, parse_hint_method, model=model)
    problems = validate_rubric(rubric)
    if problems:
        # attach validator findings as warnings
        rubric.setdefault("source_parse", {}).setdefault("warnings", [])
        rubric["source_parse"]["warnings"].extend([f"validation: {p}" for p in problems])
    return rubric

# =========================
# Demo and testing functions
# =========================

def demo_parse_rubric(file_path: str, model: str = "gpt-4o-mini") -> Dict[str, Any]:
    """
    Demo function to parse a rubric file and return the result.
    Use this in Jupyter notebooks instead of command line execution.
    """
    try:
        result = parse_rubric_file(file_path, model=model)
        print("✅ Rubric parsed successfully!")
        print(f"📊 Found {len(result.get('criteria', []))} criteria")
        print(f"📏 Scale type: {result.get('scale', {}).get('type', 'unknown')}")
        return result
    except Exception as e:
        print(f"❌ Error parsing rubric: {e}")
        return {}

def print_rubric_summary(rubric: Dict[str, Any]):
    """Print a formatted summary of the parsed rubric."""
    if not rubric:
        print("No rubric data to display")
        return
    
    print("=" * 50)
    print(f"📋 RUBRIC: {rubric.get('title', 'Untitled')}")
    print("=" * 50)
    
    # Scale info
    scale = rubric.get('scale', {})
    print(f"📏 Scale Type: {scale.get('type', 'unknown')}")
    if scale.get('type') == 'categorical':
        levels = scale.get('levels', [])
        print(f"📊 Levels: {' → '.join(levels) if levels else 'None'}")
    elif scale.get('type') == 'numeric':
        min_val = scale.get('min')
        max_val = scale.get('max')
        print(f"📊 Range: {min_val} - {max_val}")
    
    # Criteria
    criteria = rubric.get('criteria', [])
    print(f"\n📝 Criteria ({len(criteria)}):")
    for i, criterion in enumerate(criteria, 1):
        name = criterion.get('name', 'Unnamed')
        weight = criterion.get('weight', 1.0)
        print(f"  {i}. {name} (weight: {weight})")
    
    # Warnings
    warnings = rubric.get('source_parse', {}).get('warnings', [])
    if warnings:
        print(f"\n⚠️  Warnings ({len(warnings)}):")
        for warning in warnings:
            print(f"  • {warning}")
    
    print("=" * 50)



# CLI functionality (only runs when script is executed directly, not in notebook)
if __name__ == "__main__" and not hasattr(__builtins__, '__IPYTHON__'):
    import argparse, pprint
    ap = argparse.ArgumentParser(description="Parse a rubric file into canonical JSON.")
    ap.add_argument("file", help="Path to rubric: .txt .docx .pdf .png .jpg")
    ap.add_argument("--model", default="gpt-4.1-mini", help="OpenAI model (supports Structured Outputs).")
    args = ap.parse_args()

    result = parse_rubric_file(args.file, model=args.model)
    print(json.dumps(result, indent=2, ensure_ascii=False))


In [3]:
# Test the imports and show what's available
print("🔍 Import Status Check:")
print("=" * 40)

# Check each import
imports_status = {
    "PyMuPDF (fitz)": fitz is not None,
    "pdf2image": convert_from_path is not None,
    "pytesseract": pytesseract is not None,
    "OpenCV": cv2 is not None,
    "NumPy": np is not None,
    "PIL": Image is not None,
    "docx2txt": docx2txt is not None,
    "OpenAI": OpenAI is not None,
    "jsonschema": Draft7Validator is not None
}

for name, status in imports_status.items():
    status_icon = "✅" if status else "❌"
    print(f"{status_icon} {name}")

print("=" * 40)

# Special diagnostic for fitz conflict
if fitz is None:
    print("🔍 Diagnosing fitz import issue...")
    try:
        import sys
        print(f"Python version: {sys.version}")
        
        # Check what packages are installed
        import subprocess
        result = subprocess.run(['pip', 'list'], capture_output=True, text=True)
        if 'fitz' in result.stdout:
            print("⚠️  Found 'fitz' package in pip list - this might be the conflicting package")
            print("💡 Try: pip uninstall fitz")
        if 'PyMuPDF' in result.stdout:
            print("✅ PyMuPDF is installed")
        if 'pymupdf' in result.stdout:
            print("✅ pymupdf is installed")
            
    except Exception as e:
        print(f"Could not run diagnostics: {e}")

print("📝 Note: Some features may be limited if dependencies are missing")
print("💡 Install missing packages with: pip install <package-name>")


🔍 Import Status Check:
❌ PyMuPDF (fitz)
✅ pdf2image
✅ pytesseract
✅ OpenCV
✅ NumPy
✅ PIL
✅ docx2txt
✅ OpenAI
✅ jsonschema
🔍 Diagnosing fitz import issue...
Python version: 3.10.18 | packaged by Anaconda, Inc. | (main, Jun  5 2025, 13:08:55) [MSC v.1929 64 bit (AMD64)]
⚠️  Found 'fitz' package in pip list - this might be the conflicting package
💡 Try: pip uninstall fitz
✅ PyMuPDF is installed
📝 Note: Some features may be limited if dependencies are missing
💡 Install missing packages with: pip install <package-name>


In [4]:
# 🔧 Fix fitz import conflict (run this cell if fitz import failed)
print("🔧 Fitz Import Conflict Resolution:")
print("=" * 50)

print("If you're getting a 'Directory static/ does not exist' error, you have a naming conflict.")
print("Here's how to fix it:")
print()
print("1️⃣  Uninstall the conflicting 'fitz' package:")
print("   pip uninstall fitz")
print()
print("2️⃣  Install PyMuPDF (the correct package):")
print("   pip install PyMuPDF")
print()
print("3️⃣  Restart your kernel and run this notebook again")
print()
print("Alternative: If you want to keep both packages, you can use:")
print("   from pymupdf import fitz")
print("   (instead of: import fitz)")
print()
print("💡 The issue is that there are two different packages with 'fitz' in the name:")
print("   - 'fitz' (conflicting package causing the error)")
print("   - 'PyMuPDF' (the one we need for PDF processing)")
print("=" * 50)


🔧 Fitz Import Conflict Resolution:
If you're getting a 'Directory static/ does not exist' error, you have a naming conflict.
Here's how to fix it:

1️⃣  Uninstall the conflicting 'fitz' package:
   pip uninstall fitz

2️⃣  Install PyMuPDF (the correct package):
   pip install PyMuPDF

3️⃣  Restart your kernel and run this notebook again

Alternative: If you want to keep both packages, you can use:
   from pymupdf import fitz
   (instead of: import fitz)

💡 The issue is that there are two different packages with 'fitz' in the name:
   - 'fitz' (conflicting package causing the error)
   - 'PyMuPDF' (the one we need for PDF processing)


In [5]:
# 🚀 Quick Fix for fitz Import Conflict
# Run this cell to automatically resolve the fitz import issue

import subprocess
import sys

def fix_fitz_conflict():
    """Automatically fix the fitz import conflict."""
    print("🔧 Attempting to fix fitz import conflict...")
    
    try:
        # Check if the conflicting fitz package is installed
        result = subprocess.run([sys.executable, '-m', 'pip', 'list'], 
                              capture_output=True, text=True, timeout=30)
        
        if 'fitz' in result.stdout and 'PyMuPDF' not in result.stdout:
            print("⚠️  Found conflicting 'fitz' package")
            print("🔄 Uninstalling conflicting package...")
            
            # Uninstall the conflicting package
            uninstall_result = subprocess.run([sys.executable, '-m', 'pip', 'uninstall', 'fitz', '-y'], 
                                            capture_output=True, text=True, timeout=30)
            
            if uninstall_result.returncode == 0:
                print("✅ Conflicting 'fitz' package uninstalled")
            else:
                print(f"⚠️  Could not uninstall fitz: {uninstall_result.stderr}")
        
        # Install PyMuPDF
        print("📦 Installing PyMuPDF...")
        install_result = subprocess.run([sys.executable, '-m', 'pip', 'install', 'PyMuPDF'], 
                                      capture_output=True, text=True, timeout=60)
        
        if install_result.returncode == 0:
            print("✅ PyMuPDF installed successfully")
            print("🔄 Please restart your kernel and run the first cell again")
        else:
            print(f"❌ Failed to install PyMuPDF: {install_result.stderr}")
            print("💡 Try running manually: pip install PyMuPDF")
            
    except subprocess.TimeoutExpired:
        print("⏰ Command timed out. Please try running manually:")
        print("   pip uninstall fitz -y")
        print("   pip install PyMuPDF")
    except Exception as e:
        print(f"❌ Error: {e}")
        print("💡 Please run manually:")
        print("   pip uninstall fitz -y")
        print("   pip install PyMuPDF")

# Uncomment the line below to run the automatic fix
# fix_fitz_conflict()

print("📝 To fix manually, run these commands in your terminal:")
print("   pip uninstall fitz -y")
print("   pip install PyMuPDF")
print("   Then restart your kernel and run this notebook again")


📝 To fix manually, run these commands in your terminal:
   pip uninstall fitz -y
   pip install PyMuPDF
   Then restart your kernel and run this notebook again


In [6]:
# Example usage in Jupyter notebook
# Uncomment and modify the path below to test with your rubric file

# Example 1: Parse a rubric file
file_path = "test_file/test_rubric.docx"  # or .docx, .txt, .png, .jpg
result = demo_parse_rubric(file_path)
print_rubric_summary(result)

# Example 2: Just load the functions without running
# print("✅ Rubric parser functions loaded successfully!")
# print("📝 Available functions:")
# print("  • demo_parse_rubric(file_path, model='gpt-4.1-mini') - Parse a rubric file")
# print("  • print_rubric_summary(rubric) - Display formatted summary")
# print("  • parse_rubric_file(file_path, model) - Core parsing function")
# print("  • extract_text_from_file(file_path) - Extract text from various file formats")
# print("\n💡 To use: Uncomment the example code above and provide a valid file path")


✅ Rubric parsed successfully!
📊 Found 6 criteria
📏 Scale type: categorical
📋 RUBRIC: None
📏 Scale Type: categorical
📊 Levels: Excellent → Good → Fair → Poor

📝 Criteria (6):
  1. Thesis & Focus (weight: 3)
  2. Organization (weight: 2)
  3. Evidence & Support (weight: 3)
  4. Analysis & Reasoning (weight: 2)
  5. Style & Clarity (weight: 1)
  6. Grammar & Mechanics (weight: 1)


In [7]:
# Print the result in JSON format
import json

# Pretty print the result as JSON
print("📄 Full JSON Result:")
print("=" * 50)
print(json.dumps(result, indent=2, ensure_ascii=False))
print("=" * 50)


📄 Full JSON Result:
{
  "title": null,
  "scale": {
    "type": "categorical",
    "levels": [
      "Excellent",
      "Good",
      "Fair",
      "Poor"
    ]
  },
  "criteria": [
    {
      "name": "Thesis & Focus",
      "descriptor_by_level": {
        "Excellent": "Clear, original thesis; focused throughout",
        "Good": "Clear thesis, mostly maintained focus",
        "Fair": "Thesis present but vague; focus drifts",
        "Poor": "No clear thesis; unfocused"
      },
      "weight": 3
    },
    {
      "name": "Organization",
      "descriptor_by_level": {
        "Excellent": "Logical structure; smooth transitions",
        "Good": "Mostly logical; some transitions unclear",
        "Fair": "Some organization; transitions weak",
        "Poor": "Disorganized; lacks clear structure"
      },
      "weight": 2
    },
    {
      "name": "Evidence & Support",
      "descriptor_by_level": {
        "Excellent": "Strong, relevant evidence; well-integrated",
        "Good": 