In [1]:
# llm_resume_cfg_single_source.py
import re, os, json, time, argparse, yaml, requests
from pathlib import Path
from typing import Dict, List, Set, Optional

# ===== normalizers (tiny) =====
def _norm(s):
    """Return *s* stripped, lowercased, with whitespace runs collapsed.

    Any run of whitespace (spaces, tabs, newlines) becomes a single space.
    ``None`` or any other falsy value is treated as the empty string.
    """
    return re.sub(r"\s+", " ", (s or "").strip().lower())


def _norm_email(s):
    """Return *s* stripped and lowercased; falsy input becomes ``""``."""
    return (s or "").strip().lower()


def _norm_skills(xs):
    """Return the set of normalized, non-empty skills found in *xs*.

    Each item is passed through ``_norm`` exactly once; items that normalize
    to the empty string are dropped. A falsy *xs* yields an empty set.
    """
    return {skill for skill in map(_norm, xs or []) if skill}

# ===== load config ONCE (single source of truth) =====
def load_cfg(config_path: str) -> Dict:
    """Load and validate the YAML configuration for LLM-based resume extraction.

    Args:
        config_path: Path to the YAML file. Relative paths found inside the
            file are resolved against this file's parent directory.

    Returns:
        A dict with three keys:

        * ``paths``: resolved ``Path`` objects for ``input_dir``,
          ``output_dir`` and ``ground_truth_path``, plus ``aggregate_path``
          (``None`` when the config does not set it).
        * ``options``: ``exts`` (set of lowercase, dot-prefixed file
          extensions) and ``print_each`` (bool).
        * ``llm``: the ``llm`` mapping from the file, passed through as-is.

    Raises:
        ValueError: If the YAML document or its ``llm`` entry is not a
            mapping, or if a required key is missing.
    """
    cfgp = Path(config_path)
    base = cfgp.parent  # anchor for every relative path in the config

    cfg = yaml.safe_load(cfgp.read_text(encoding="utf-8"))
    # An empty file parses to None and a top-level list parses to a list;
    # reject both up front instead of failing later with an opaque TypeError.
    if not isinstance(cfg, dict):
        raise ValueError(f"Config '{cfgp}' must be a YAML mapping")

    # Explicit raises rather than assert: asserts are stripped under `python -O`.
    for k in ["input_dir", "output_dir", "ground_truth_path", "llm"]:
        if k not in cfg:
            raise ValueError(f"Config missing '{k}'")

    llm = cfg["llm"]
    if not isinstance(llm, dict):
        raise ValueError("Config 'llm' must be a mapping")
    for k in ["api_key", "endpoint", "model", "temperature", "timeout_s",
              "max_retries", "min_call_interval_s", "prompt_mode",
              "max_chars", "max_tokens"]:
        if k not in llm:
            raise ValueError(f"Config.llm missing '{k}'")

    # Resolve all paths relative to the config file location.
    aggregate = cfg.get("aggregate_path")
    paths = {
        "input_dir": (base / cfg["input_dir"]).resolve(),
        "output_dir": (base / cfg["output_dir"]).resolve(),
        "ground_truth_path": (base / cfg["ground_truth_path"]).resolve(),
        "aggregate_path": (base / aggregate).resolve() if aggregate else None,
    }

    # Extensions are compared against `Path.suffix.lower()` by the runner, so
    # store them lowercase and dot-prefixed ("PDF" / "pdf" / ".Pdf" -> ".pdf").
    raw_exts = cfg.get("exts")
    if raw_exts is None:
        raw_exts = [".pdf", ".docx", ".txt"]
    elif isinstance(raw_exts, str):
        raw_exts = [raw_exts]  # a bare string would otherwise split into chars
    exts = set()
    for ext in raw_exts:
        ext = str(ext).strip().lower()
        if ext:
            exts.add(ext if ext.startswith(".") else "." + ext)

    options = {
        "exts": exts,
        "print_each": bool(cfg.get("print_each_resume", False)),
    }

    # The LLM section gets no defaults: every required field was checked above.
    return {"paths": paths, "options": options, "llm": llm}

In [2]:
# ===== Class 1: LLM extractor (reads everything from cfg) =====
class LLMResumeExtractor:
    """LLM-based resume parser that uses API calls to extract structured data.

    All settings come from ``cfg["llm"]``: ``endpoint``, ``api_key``, ``model``,
    ``temperature``, ``max_tokens``, ``timeout_s``, ``max_retries``,
    ``min_call_interval_s``, ``prompt_mode`` and ``max_chars``.

    Every failure on the read / call / parse path is reported with a
    ``[WARN]`` line and then degraded to an empty result, so one bad file
    never aborts a batch but problems are no longer invisible.
    """

    def __init__(self, cfg: Dict):
        """Store the LLM settings and initialise the rate-limit clock."""
        self.llm = cfg["llm"]  # Shared config mapping; never mutated here
        self.max_chars = int(self.llm["max_chars"])  # Prompt truncation limit
        self._last_call_at = 0.0  # time.time() of the most recent API attempt

    def _read_text(self, file_path: str) -> str:
        """Extract plain text from a ``.txt``, ``.docx`` or ``.pdf`` file.

        Returns ``""`` for unsupported suffixes and for any read failure
        (including a missing optional parser library); failures are reported
        with a ``[WARN]`` line.
        """
        p = Path(file_path)
        suf = p.suffix.lower()
        try:
            if suf == ".txt":
                # No `newline=` argument: " " is not a legal value, and the
                # keyword does not exist on Path.read_text before Python 3.13.
                return p.read_text(encoding="utf-8", errors="ignore")
            if suf == ".docx":
                from docx import Document  # Lazy import (optional dependency)
                return "\n".join(para.text for para in Document(str(p)).paragraphs)
            if suf == ".pdf":
                import pdfplumber  # Lazy import (optional dependency)
                with pdfplumber.open(str(p)) as pdf:
                    return "\n".join((pg.extract_text() or "") for pg in pdf.pages)
        except Exception as exc:  # best-effort boundary: report, then degrade
            print(f"[WARN] Could not read {p.name}: {exc!r}")
        return ""

    @staticmethod
    def _sys() -> str:
        """Return the system prompt that pins the strict JSON output schema."""
        return ('Extract ONLY what exists. Return STRICT JSON: '
                '{"name":"string","email":"string","skills":["string",...]}. '
                'No prose. Skills lowercase, dedup. Empty when missing.')

    def _usr(self, text: str, mode: Optional[str] = None) -> str:
        """Build the user prompt for *text*, truncated to ``max_chars``.

        Args:
            text: Resume text (``None`` is treated as empty).
            mode: Prompt mode override. ``None`` (default) uses the configured
                ``prompt_mode``; ``"few"`` adds few-shot examples, anything
                else yields the plain single-instruction prompt.
        """
        text = (text or "")[: self.max_chars]
        active_mode = self.llm["prompt_mode"] if mode is None else mode

        if active_mode == "few":
            return ("Example A (output JSON only):\n"
                    "INPUT: 'Jane Doe\\nEmail: jane@ex.com\\nSkills: Python, SQL'\\n"
                    'OUTPUT: {"name":"Jane Doe","email":"jane@ex.com","skills":["python","sql"]}\n'
                    "Example B (missing email):\n"
                    "INPUT: 'John Smith\\nSkills: Docker, Kubernetes'\\n"
                    'OUTPUT: {"name":"John Smith","email":"","skills":["docker","kubernetes"]}\n\n'
                    f"Now extract JSON from RESUME:\n{text}")

        return f"Extract JSON with that schema from this RESUME:\n{text}"

    def _coerce(self, s: str) -> Optional[Dict]:
        """Parse an LLM reply into ``{"name", "email", "skills"}`` or ``None``.

        Markdown code fences and surrounding prose are stripped first. The
        payload is parsed as strict JSON; only if that fails are curly quotes,
        and then single quotes, rewritten to ``"`` as a repair attempt. Trying
        strict JSON first keeps apostrophes inside valid strings intact.

        JSON ``null`` fields become empty values, a string-valued ``skills``
        is split on commas, semicolons and newlines, and blank skills are
        dropped. Returns ``None`` when no JSON object can be recovered.
        """
        if not s or not isinstance(s, str):
            return None

        # Remove markdown code fences, then isolate the outermost {...} span.
        s = re.sub(r"^```(?:json)?\s*|\s*```$", "", s.strip(), flags=re.I | re.S).strip()
        m = re.search(r"\{.*\}", s, flags=re.S)
        s = m.group(0) if m else s

        straight = s.replace("\u201c", '"').replace("\u201d", '"')
        candidates = (s, straight, straight.replace("'", '"'))
        for candidate in candidates:
            try:
                obj = json.loads(candidate)
                break
            except ValueError:  # json.JSONDecodeError subclasses ValueError
                continue
        else:
            return None
        if not isinstance(obj, dict):
            return None

        raw_skills = obj.get("skills") or []
        if isinstance(raw_skills, str):
            raw_skills = re.split(r"[,;\n]", raw_skills)
        elif not isinstance(raw_skills, (list, tuple)):
            raw_skills = []

        return {
            "name": str(obj.get("name") or "").strip(),
            "email": str(obj.get("email") or "").strip(),
            "skills": [str(x).strip() for x in raw_skills
                       if x is not None and str(x).strip()],
        }

    def _call(self, messages: List[Dict[str, str]]) -> Optional[Dict]:
        """Make one rate-limited chat-completion request.

        Sleeps as needed so that at least ``min_call_interval_s`` seconds pass
        between attempts. Returns the coerced reply, or ``None`` on any
        transport, HTTP, or response-shape failure (reported as ``[WARN]``).
        """
        elapsed = time.time() - self._last_call_at
        wait = float(self.llm["min_call_interval_s"]) - elapsed
        if wait > 0:
            time.sleep(wait)

        try:
            r = requests.post(
                self.llm["endpoint"],
                headers={"Authorization": f"Bearer {self.llm['api_key']}",
                         "Content-Type": "application/json"},
                json={
                    "model": self.llm["model"],
                    "messages": messages,
                    "temperature": float(self.llm["temperature"]),
                    "max_tokens": int(self.llm["max_tokens"]),
                },
                # float, not int: int() would truncate a sub-second timeout to 0
                timeout=float(self.llm["timeout_s"]),
            )
            r.raise_for_status()
            txt = r.json()["choices"][0]["message"]["content"]
            return self._coerce(txt)
        except Exception as exc:  # best-effort boundary: report, then degrade
            print(f"[WARN] LLM call failed: {exc!r}")
            return None
        finally:
            # Failed attempts count toward the rate limit too, so retries
            # after an error are spaced out like any other call.
            self._last_call_at = time.time()

    @staticmethod
    def _fallback(text: str) -> Dict:
        """Rule-based extraction used when the LLM yields nothing usable.

        Email: first regex match. Name: first line of 3-80 characters with
        2-5 words made only of letters, dots, apostrophes, hyphens and spaces.
        Skills are left empty.
        """
        obj = {"name": "", "email": "", "skills": []}

        m = re.search(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", text)
        if m:
            obj["email"] = m.group(0)

        for line in text.splitlines():
            line = line.strip()
            if (3 <= len(line) <= 80 and 2 <= len(line.split()) <= 5
                    and re.fullmatch(r"[A-Za-zÀ-ÖØ-öø-ÿ.'\- ]+", line)):
                obj["name"] = line
                break
        return obj

    def parse_one(self, file_path: str) -> Dict:
        """Parse one resume file and return ``{"name", "email", "skills"}``.

        The LLM is tried up to ``max_retries`` times. When ``prompt_mode`` is
        ``"few"`` and more than one attempt is allowed, the final attempt uses
        the simpler single-instruction prompt. The switch is local to this
        call; the shared config is not modified. If the file yields no text,
        the LLM is skipped. When no usable reply is obtained, regex heuristics
        supply name and email. Skills are always normalized, de-duplicated
        and sorted.
        """
        text = self._read_text(file_path) or ""
        attempts = int(self.llm["max_retries"])
        few_shot = self.llm["prompt_mode"] == "few"

        obj = None
        if text.strip():  # nothing to extract from an empty document
            for attempt in range(attempts):
                simplify = few_shot and attempts > 1 and attempt == attempts - 1
                msgs = [
                    {"role": "system", "content": self._sys()},
                    {"role": "user",
                     "content": self._usr(text, "single" if simplify else None)},
                ]
                obj = self._call(msgs)
                if obj is not None:
                    break

        if obj is None:
            obj = self._fallback(text)

        # Normalize and sort skills for consistent output
        obj["skills"] = sorted(_norm_skills(obj.get("skills")))
        return obj

In [3]:
# ===== Class 2: Evaluator/writer (uses cfg) =====
class ResumeEvaluator:
    """Evaluates LLM extraction results against ground truth and writes output files.

    Name and email are scored as exact matches after normalization. Skills
    are scored as micro-averaged precision / recall / F1 over normalized
    skill sets, accumulated across every evaluated resume.
    """

    def __init__(self, cfg: Dict):
        """Load ground truth, create the output directory, and zero the counters.

        Args:
            cfg: Dict with ``paths`` (``ground_truth_path``, ``output_dir`` and
                optionally ``aggregate_path`` as ``Path`` objects) and
                ``options`` (``print_each``).

        Raises:
            ValueError: If the ground-truth JSON is not an object keyed by
                file name.
        """
        self.paths = cfg["paths"]
        self.options = cfg["options"]

        # Ground truth maps resume file name -> expected fields.
        self.gt = json.loads(self.paths["ground_truth_path"].read_text(encoding="utf-8"))
        if not isinstance(self.gt, dict):
            raise ValueError("Ground truth must be a JSON object keyed by file name")

        self.paths["output_dir"].mkdir(parents=True, exist_ok=True)

        # Name/email accuracy counters
        self.n_name = self.n_email = self.ok_name = self.ok_email = 0
        # Skill true positives / false positives / false negatives
        self.tp = self.fp = self.fn = 0

        # Per-file results, written out by finalize() when aggregate_path is set
        self.aggregate = {}

    def _cmp_missing(self, ex: Dict, gt: Dict):
        """Compare extracted data *ex* with ground truth *gt*.

        Returns:
            ``(miss, exs, gts)`` where ``miss`` maps each mismatching field to
            its expected value (``name`` / ``email``) or to the sorted list of
            expected-but-not-extracted skills (``skills``), and ``exs`` /
            ``gts`` are the normalized extracted / expected skill sets.
        """
        miss = {}

        if _norm(ex.get("name")) != _norm(gt.get("name")):
            miss["name"] = gt.get("name")

        if _norm_email(ex.get("email")) != _norm_email(gt.get("email")):
            miss["email"] = gt.get("email")

        exs, gts = _norm_skills(ex.get("skills")), _norm_skills(gt.get("skills"))
        lack = sorted(gts - exs)  # Skills in ground truth but not extracted
        if lack:
            miss["skills"] = lack

        return miss, exs, gts

    def _update(self, exs: Set[str], gts: Set[str], name_ok: bool, email_ok: bool):
        """Fold one resume's comparison results into the running counters."""
        self.n_name += 1
        self.n_email += 1
        if name_ok:
            self.ok_name += 1
        if email_ok:
            self.ok_email += 1

        inter = len(exs & gts)       # Skills both extracted and expected
        self.tp += inter
        self.fp += len(exs) - inter  # Extracted but not expected
        self.fn += len(gts) - inter  # Expected but not extracted

    def process_one(self, file_path: str, extracted: Dict) -> bool:
        """Evaluate one resume, write its JSON result, and update the aggregate.

        Args:
            file_path: Path of the resume; only its file name is used, both as
                the ground-truth key and (stem + ``.json``) as the output name.
            extracted: Dict with ``name``, ``email`` and ``skills``.

        Returns:
            ``False`` if the file has no ground-truth entry (nothing is
            written or counted), ``True`` otherwise.
        """
        fname = Path(file_path).name
        gt = self.gt.get(fname)
        if gt is None:
            return False

        miss, exs, gts = self._cmp_missing(extracted, gt)
        # A field is correct exactly when _cmp_missing did not flag it, so the
        # comparison does not need to be repeated here.
        self._update(exs, gts, "name" not in miss, "email" not in miss)

        out_obj = {"name": extracted.get("name", ""), "email": extracted.get("email", ""),
                   "skills": extracted.get("skills", []), "missing_block": miss}

        output_file = self.paths["output_dir"] / (Path(fname).stem + ".json")
        output_file.write_text(
            json.dumps(out_obj, ensure_ascii=False, indent=2), encoding="utf-8")

        if self.paths.get("aggregate_path"):
            self.aggregate[fname] = out_obj

        if self.options["print_each"]:
            print(f"[OK] {fname}")

        return True

    def finalize(self) -> Dict:
        """Write the aggregate file (if configured) and return the final metrics.

        Returns:
            Dict with ``name_accuracy``, ``email_accuracy``,
            ``skills_precision_micro``, ``skills_recall_micro``,
            ``skills_f1_micro`` (each rounded to 4 decimals, ``0.0`` when the
            denominator is zero) and ``num_evaluated``.
        """
        agg_path = self.paths.get("aggregate_path")
        if agg_path:
            # The aggregate may live outside output_dir; create its directory
            # so a missing folder cannot fail the run at the very last step.
            agg_path.parent.mkdir(parents=True, exist_ok=True)
            agg_path.write_text(
                json.dumps(self.aggregate, ensure_ascii=False, indent=2), encoding="utf-8")

        prec = self.tp / (self.tp + self.fp) if (self.tp + self.fp) else 0.0  # TP/(TP+FP)
        rec = self.tp / (self.tp + self.fn) if (self.tp + self.fn) else 0.0   # TP/(TP+FN)
        f1 = 2 * prec * rec / (prec + rec) if (prec + rec) else 0.0           # Harmonic mean

        return {
            "name_accuracy": round(self.ok_name / self.n_name, 4) if self.n_name else 0.0,
            "email_accuracy": round(self.ok_email / self.n_email, 4) if self.n_email else 0.0,
            "skills_precision_micro": round(prec, 4),
            "skills_recall_micro": round(rec, 4),
            "skills_f1_micro": round(f1, 4),
            "num_evaluated": self.n_name,
        }

In [5]:
# ===== runner (no duplicated config usage) =====
def main(argv=None):
    """Run the LLM-based resume extraction and evaluation pipeline.

    Args:
        argv: Argument list to parse; ``None`` makes argparse read the
            process's command line. Unrecognized arguments are ignored
            because ``parse_known_args`` is used.

    Prints the evaluation metrics as JSON, followed by a note when no input
    file matched a ground-truth entry.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--config", default="config.yaml", help="Path to YAML configuration file")
    args, _ = ap.parse_known_args(argv)

    # Load configuration once and pass it to all components
    cfg = load_cfg(args.config)
    extractor = LLMResumeExtractor(cfg)
    evaluator = ResumeEvaluator(cfg)

    exts = cfg["options"]["exts"]
    input_dir = cfg["paths"]["input_dir"]

    wrote = False
    for p in sorted(input_dir.glob("*")):
        # Skip directories and files with unsupported extensions
        if not p.is_file() or p.suffix.lower() not in exts:
            continue

        # process_one() discards files without a ground-truth entry, so check
        # first and avoid spending rate-limited API calls on them.
        if evaluator.gt.get(p.name) is None:
            continue

        ex = extractor.parse_one(str(p))
        wrote |= evaluator.process_one(str(p), ex)  # Track if any file was written

    print(json.dumps(evaluator.finalize(), indent=2))

    # Warn if no files were processed (likely ground truth mismatch)
    if not wrote:
        print("[Note] No files were written (no filename matched ground truth).")

def run_from_notebook(config_path="config.yaml"):
    """Run the pipeline from a Jupyter notebook.

    Args:
        config_path: Location of the YAML configuration file, as a string or
            a path-like object.

    An explicit argument list is passed to ``main`` so the notebook kernel's
    own ``sys.argv`` is never parsed.
    """
    # argparse indexes argv entries as strings, so coerce path-like objects.
    main(["--config", str(config_path)])

# Standard Python script entry point: run the pipeline only when this file is
# executed directly, not when it is imported. main() is called without an
# argument list, so argparse reads the process's command-line arguments.
if __name__ == "__main__":
    main()

[OK] resume_alice.pdf
[OK] resume_brian.pdf
[OK] resume_carmen.pdf
{
  "name_accuracy": 1.0,
  "email_accuracy": 1.0,
  "skills_precision_micro": 0.0,
  "skills_recall_micro": 0.0,
  "skills_f1_micro": 0.0,
  "num_evaluated": 3
}
