In [1]:
!pip install openai pdf2image pillow




In [3]:
from pdf2image import convert_from_path

def pdf_to_images(pdf_path):
    images = convert_from_path(pdf_path, dpi=300)
    return images


In [11]:
def image_to_base64(image):
    buffer = BytesIO()
    image.save(buffer, format="PNG")
    return base64.b64encode(buffer.getvalue()).decode()


In [1]:
!pip install pytesseract pdf2image pillow opencv-python numpy




In [2]:
from pdf2image import convert_from_path
import pytesseract
import numpy as np
import cv2

def preprocess_image(image):
    """
    Improves OCR accuracy:
    - grayscale
    - thresholding
    """
    image = np.array(image)
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    thresh = cv2.threshold(
        gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
    )[1]
    return thresh


def extract_text_with_confidence(pdf_path):
    pages = convert_from_path(pdf_path, dpi=300)

    full_text = ""
    confidences = []

    for page in pages:
        processed_img = preprocess_image(page)

        data = pytesseract.image_to_data(
            processed_img,
            output_type=pytesseract.Output.DICT
        )

        for i in range(len(data["text"])):
            word = data["text"][i].strip()
            conf = int(data["conf"][i])

            if word and conf > 0:
                full_text += word + " "
                confidences.append(conf)

        full_text += "\n"

    avg_confidence = round(
        sum(confidences) / len(confidences), 2
    ) if confidences else 0

    return full_text.strip(), avg_confidence


In [18]:
!pip install --upgrade pip
!pip install numpy==1.26.4 opencv-python pytesseract pdf2image pillow


Collecting pip
  Downloading pip-25.3-py3-none-any.whl.metadata (4.7 kB)
Downloading pip-25.3-py3-none-any.whl (1.8 MB)
   ---------------------------------------- 0.0/1.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.8 MB ? eta -:--:--
   ----------- ---------------------------- 0.5/1.8 MB 1.7 MB/s eta 0:00:01
   ----------------- ---------------------- 0.8/1.8 MB 2.0 MB/s eta 0:00:01
   ----------------------- ---------------- 1.0/1.8 MB 1.5 MB/s eta 0:00:01
   ----------------------------------- ---- 1.6/1.8 MB 1.8 MB/s eta 0:00:01
   ---------------------------------------- 1.8/1.8 MB 1.8 MB/s eta 0:00:00


ERROR: To modify pip, please run the following command:
C:\Users\Bharat\anaconda3\python.exe -m pip install --upgrade pip


Collecting numpy==1.26.4

  You can safely remove it manually.
  You can safely remove it manually.
  You can safely remove it manually.



  Downloading numpy-1.26.4-cp312-cp312-win_amd64.whl.metadata (61 kB)
INFO: pip is looking at multiple versions of opencv-python to determine which version is compatible with other requirements. This could take a while.
Collecting opencv-python
  Downloading opencv_python-4.11.0.86-cp37-abi3-win_amd64.whl.metadata (20 kB)
Downloading numpy-1.26.4-cp312-cp312-win_amd64.whl (15.5 MB)
   ---------------------------------------- 0.0/15.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/15.5 MB ? eta -:--:--
   - -------------------------------------- 0.5/15.5 MB 3.4 MB/s eta 0:00:05
   -- ------------------------------------- 1.0/15.5 MB 2.2 MB/s eta 0:00:07
   ---- ----------------------------------- 1.6/15.5 MB 2.5 MB/s eta 0:00:06
   ------ --------------------------------- 2.4/15.5 MB 2.9 MB/s eta 0:00:05
   -------- ------------------------------- 3.1/15.5 MB 3.0 MB/s eta 0:00:05
   ---------- ----------------------------- 3.9/15.5 MB 3.1 MB/s eta 0:00:04
   ---------

In [3]:
pdf_path = "business-finance-lease-agreement_used-vehicle.pdf"

text, accuracy = extract_text_with_confidence(pdf_path)

print("OCR Accuracy Score:", accuracy, "%")
print("\nExtracted Text Preview:\n")
print(text[:1500])


OCR Accuracy Score: 95.04 %

Extracted Text Preview:

@ TOYOTA BUSINESS FINANCE LEASE AGREEMENT Financial Services LESSOR Legal Name: Trading Name: Address: (‘us”, “we”, “our’) LESSEE Legal Name TAX INVOICE “vou” Address GST NO. 49-809-999 Email Address LEASE NO. GST Registered? If YES, GST No. Vehicle used in taxable activity? At least 75% of your supplies for the last 12 months were subject to GST? GUARANTOR Name Address Email Address SPECIFIC INFORMATION MAKE MODEL VIN NO. REG. NO. YEAR ODOMETER KMS Motor Vehicle ACCESSORIES Insurance Details | !NSURER POLICY NO. DISCLOSURE STATEMENT Full Name & Address of Lessor — see details of the initial Lessor above. This is the person providing you the credit. Note: As this contract will be transferred to Toyota Finance New Zealand Limited (“TFNZ”), TFNZ will be the “Lessor” when the transfer happens so the following information should be used for the purposes of communicating with TFNZ. You may send notices to TFNZ by: Toyota Finance New Zeal

In [1]:
from pdf2image import convert_from_path
import pytesseract

def ocr_pdf(pdf_path):
    pages = convert_from_path(pdf_path, dpi=300)
    text = ""

    for page in pages:
        text += pytesseract.image_to_string(page)

    return text


In [2]:
SLA_FIELDS = {
    "interest_rate_apr": ["interest rate", "apr"],
    "loan_term_months": ["loan term", "term", "duration"],
    "monthly_payment": ["monthly payment"],
    "down_payment": ["down payment"],
    "late_fee_penalty": ["late fee", "penalty"],
    "early_termination": ["early termination"],
    "mileage_limit": ["mileage", "miles per year"],
    "purchase_option": ["purchase option", "buyout"],
    "maintenance_responsibility": ["maintenance"],
    "warranty_insurance": ["warranty", "insurance"]
}


In [3]:
import re

def extract_sla_terms(text):
    sla_output = {}

    for field, keywords in SLA_FIELDS.items():
        found_value = None
        confidence = 0.0

        for kw in keywords:
            pattern = rf"{kw}.{{0,50}}"
            match = re.search(pattern, text, re.IGNORECASE)

            if match:
                found_value = match.group().strip()
                confidence = 0.9
                break

        if not found_value:
            for kw in keywords:
                if kw.lower() in text.lower():
                    found_value = "Mentioned but value unclear"
                    confidence = 0.6
                    break

        sla_output[field] = {
            "value": found_value,
            "confidence_score": confidence
        }

    return sla_output


In [4]:
import json

pdf_path = "business-finance-lease-agreement_used-vehicle.pdf"

# Step 1: OCR
contract_text = ocr_pdf(pdf_path)

# Step 2: SLA Extraction
sla_data = extract_sla_terms(contract_text)

# Step 3: Save as JSON
with open("sla_extracted.json", "w") as f:
    json.dump(sla_data, f, indent=2)

print(json.dumps(sla_data, indent=2))


{
  "interest_rate_apr": {
    "value": "Interest Rate % fixed for the whole term of this contract",
    "confidence_score": 0.9
  },
  "loan_term_months": {
    "value": "Term & Payments Details",
    "confidence_score": 0.9
  },
  "monthly_payment": {
    "value": null,
    "confidence_score": 0.0
  },
  "down_payment": {
    "value": null,
    "confidence_score": 0.0
  },
  "late_fee_penalty": {
    "value": null,
    "confidence_score": 0.0
  },
  "early_termination": {
    "value": "Early Termination",
    "confidence_score": 0.9
  },
  "mileage_limit": {
    "value": null,
    "confidence_score": 0.0
  },
  "purchase_option": {
    "value": null,
    "confidence_score": 0.0
  },
  "maintenance_responsibility": {
    "value": "Maintenance Costs (eg. Service Plan) rental payment",
    "confidence_score": 0.9
  },
  "warranty_insurance": {
    "value": "Insurance Details | !NSURER POLICY NO.",
    "confidence_score": 0.9
  }
}
