In [1]:
from pathlib import Path
from pypdf import PdfReader
import re

DATA_DIR = Path('../data')
PDF_FILES = list(DATA_DIR.glob('*.pdf'))
print(f"Found {len(PDF_FILES)} PDF(s):", [p.name for p in PDF_FILES])

def read_pdf_text(path: Path) -> str:
    reader = PdfReader(str(path))
    pages = []
    for p in reader.pages:
        text = p.extract_text() or ""
        pages.append(text)
    return "\n".join(pages)

def clean_text(text: str) -> str:
    text = re.sub(r'\r', '\n', text)
    text = re.sub(r'\n{2,}', '\n', text)
    text = re.sub(r' {2,}', ' ', text)
    return text.strip()

if PDF_FILES:
    sample_pdf = PDF_FILES[0]
    print(f"Reading: {sample_pdf.name}\n")
    raw_text = read_pdf_text(sample_pdf)
    cleaned_text = clean_text(raw_text)

    print(f"Raw length: {len(raw_text)} chars | Cleaned length: {len(cleaned_text)} chars")

    # Show first few lines
    print("\n--- First 10 lines (cleaned) ---\n")
    for i, line in enumerate(cleaned_text.split('\n')[:10]):
        print(f"{i+1:02d}: {line}")

# --- Optional: Save cleaned output for inspection ---
OUTPUT_DIR = Path('../notebooks/output')
OUTPUT_DIR.mkdir(exist_ok=True)

if PDF_FILES:
    out_file = OUTPUT_DIR / f"{sample_pdf.stem}_cleaned.txt"
    with open(out_file, 'w', encoding='utf-8') as f:
        f.write(cleaned_text)
    print(f"\nSaved cleaned text to {out_file}")


Found 1 PDF(s): ['AI_Training_Document.pdf']
Reading: AI_Training_Document.pdf

Raw length: 3638 chars | Cleaned length: 3637 chars

--- First 10 lines (cleaned) ---

01: User Agreement 
02: 1. Introduction 
03: This User Agreement, the Mobile Application Terms of Use, and all policies and additional terms 
04: posted on and in our sites, applications, tools, and services (collectively "Services") set out the terms 
05: on which eBay offers you access to and use of our Services. You can find an overview of our policies 
06: here. The Mobile Application Terms of Use, all policies, and additional terms posted on and in our 
07: Services are incorporated into this User Agreement. You agree to comply with all terms of this User 
08: Agreement when accessing or using our Services. 
09: The entity you are contracting with is: eBay Inc., 2025 Hamilton Ave., San Jose, CA 95125, if you 
10: reside in the United States; eBay (UK) Limited, 1 More London Place, London, SE1 2AF, United 

Saved clea