In [32]:
import json
import sys
from pathlib import Path

# Add project root to path for imports (go up one level from examples/).
# When `__file__` is not defined (e.g. an interactive/notebook session), fall
# back to the parent of the current working directory.
project_root = Path(__file__).parent.parent if '__file__' in globals() else Path.cwd().parent

# Re-running this code must not pile up duplicate sys.path entries: drop any
# existing occurrence first, then put the project root at the front so it
# still takes priority over every other import location.
_project_root_str = str(project_root)
sys.path[:] = [entry for entry in sys.path if entry != _project_root_str]
sys.path.insert(0, _project_root_str)

from eval.evaluator import process_resume, extract_text_from_pdf, clean_resume_text, segment_resume

# --------------------------
# CONFIG
# --------------------------
# Input PDF file
input_pdf = Path(r"C:\RESUME-AI-PROJECT\data\uploads\resume1.pdf")

# Output directory for JSONL files
output_dir = Path(r"C:\RESUME-AI-PROJECT\data\processed_texts")
output_dir.mkdir(parents=True, exist_ok=True)

# --------------------------
# MAIN PROCESSING
# --------------------------
# A missing input is reported before the try block starts, so this error is
# not caught by the handler at the bottom and propagates to the caller.
if not input_pdf.exists():
    raise FileNotFoundError(f"❌ Input file not found: {input_pdf}")

print(f"📄 Processing PDF: {input_pdf}")

try:
    # Step 1: raw text straight from the PDF (helper expects a string path)
    print("🔄 Extracting text from PDF...")
    raw_text = extract_text_from_pdf(str(input_pdf))

    # Step 2: normalised text
    print("🔄 Cleaning text...")
    cleaned_text = clean_resume_text(raw_text)

    # Step 3: split into named sections (used below as a section -> text mapping)
    print("🔄 Segmenting resume...")
    segmented_data = segment_resume(cleaned_text)

    # Single record holding every intermediate stage.
    # NOTE: "extraction_method" is a fixed label; it is not derived from the
    # extractor that actually ran.
    output_data = dict(
        source_file=str(input_pdf),
        raw_text=raw_text,
        cleaned_text=cleaned_text,
        segmented_text=segmented_data,
        extraction_method="pdfplumber",
    )

    # One JSON document per line (JSONL); the file is named after the PDF stem
    # and non-ASCII characters are written as-is.
    output_file = output_dir / (input_pdf.stem + ".jsonl")
    with output_file.open('w', encoding='utf-8') as handle:
        record_line = json.dumps(output_data, ensure_ascii=False)
        handle.write(record_line + '\n')

    print(f"✅ Successfully processed and saved to: {output_file}")
    print(f"📊 Segmented sections found: {list(segmented_data.keys())}")

    # Short preview: at most the first 100 characters of each non-blank section
    print("\n📋 Preview of segmented text:")
    for section, content in segmented_data.items():
        if not content.strip():
            continue
        print(f"\n{section}:")
        suffix = '...' if len(content) > 100 else ''
        print(f"  {content[:100]}{suffix}")

except Exception as err:
    # Top-level boundary: report the failure with its full traceback instead
    # of letting the exception propagate.
    print(f"❌ Error processing file: {err}")
    import traceback
    traceback.print_exc()


📄 Processing PDF: C:\RESUME-AI-PROJECT\data\uploads\resume1.pdf
🔄 Extracting text from PDF...
🔄 Cleaning text...
🔄 Segmenting resume...
✅ Successfully processed and saved to: C:\RESUME-AI-PROJECT\data\processed_texts\resume1.jsonl
📊 Segmented sections found: ['PROFILE', 'TECH SKILLS', 'SOFT SKILLS', 'EDUCATION', 'OTHER']

📋 Preview of segmented text:

PROFILE:
  II m a student passionate about AI and BI  exploring how artificial CONTACT intelligence can transfo...

TECH SKILLS:
  Hand sign preedictor c programming It analysis the sign and report what it is. c ++ (basics) it is b...

SOFT SKILLS:
  Certified by Avinash Academy on C C++ python basics Teamwork NPTEL :Buisness fundamentals for entrep...

EDUCATION:
  Leadership school : M.S . Vidyalaya Matriculation school 2011 - 2024 Critical Thinking percentage : ...

OTHER:
  ARJUN HAREESH S AI AND BI DEVELOPER
