In [1]:
# install libraries for pdf parsing, ocr, and optional llm
!pip install --quiet pdfplumber pdf2image pytesseract pillow regex openai
!apt-get update -qq && apt-get install -y -qq poppler-utils tesseract-ocr

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.2/60.2 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m54.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m72.3 MB/s[0m eta [36m0:00:00[0m
[?25hW: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Selecting previously unselected package poppler-utils.
(Reading database ... 126102 files and directories currently installed.)
Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.8_amd64.deb ...
Unpacking popp

In [2]:
# import modules and suppress pdfminer cropbox warnings
import logging
logging.getLogger("pdfminer.pdfpage").setLevel(logging.ERROR)

import pdfplumber
from pdf2image import convert_from_path
import pytesseract
import re
import json
from google.colab import files


In [3]:
# extract text from pdf via pdfplumber, fall back to ocr if empty
def extract_text(pdf_path):
    """Return the text of the PDF at ``pdf_path``, using OCR as a fallback.

    Every page is read with pdfplumber and terminated with a newline (a page
    with no extractable text contributes just the newline). When the combined
    result is empty or whitespace only, the pages are rendered to images at
    300 dpi and passed through pytesseract; the OCR output of the pages is
    concatenated without any separator.
    """
    with pdfplumber.open(pdf_path) as pdf:
        page_texts = [(page.extract_text() or "") + "\n" for page in pdf.pages]
    embedded_text = "".join(page_texts)
    if embedded_text.strip():
        return embedded_text

    # No usable text layer: rasterise the document and OCR each page image.
    page_images = convert_from_path(pdf_path, dpi=300)
    return "".join(pytesseract.image_to_string(image) for image in page_images)

# compiled regex patterns for key fields
# Field name -> compiled regex. Each row is (field, regex source, extra flags);
# every source starts with (?i) where matching should ignore case.
patterns = {
    field: re.compile(source, flags)
    for field, source, flags in (
        # Identifier that follows "from invoice #".
        ("invoice_number", r"(?i)from\s+invoice\s*#\s*([A-Z0-9\-]+)", 0),
        # Text that precedes "invoice date"; ^ only anchors at the start of the
        # searched string, so this is meant to be applied to one line at a time.
        ("vendor", r"(?i)^(.+?)\s+invoice\s+date", 0),
        # Numeric dates separated by "/" or "-".
        ("date", r"(?i)invoice\s+date\s+(\d{1,2}[\/\-]\d{1,2}[\/\-]\d{2,4})", 0),
        ("due_date", r"(?i)due\s+date\s+(\d{1,2}[\/\-]\d{1,2}[\/\-]\d{2,4})", 0),
        # A line that *starts* with "total" (so "Subtotal ..." does not match).
        ("total", r"(?i)^total\s*\$?\s*([\d,]+\.\d{2})", re.MULTILINE),
        # Column header row that precedes the line items.
        ("items_header", r"(?i)^qty\s+description\s+unit price\s+amount", 0),
        # qty, description, unit price and an optional trailing amount.
        ("item_line", r"^(\d+)\s+(.+?)\s+([\d,]+\.\d{2})(?:\s+([\d,]+\.\d{2}))?$", 0),
    )
}

# helper to extract a single field
def extract_field(pattern, text, lines=None):
    """Return the first capture group of the first match of ``pattern``.

    Args:
        pattern: compiled regex with at least one capturing group.
        text: full document text; searched only when ``lines`` is empty or None.
        lines: optional sequence of lines. When non-empty, each line is stripped
            and searched in order and ``text`` is not consulted at all.

    Returns:
        The stripped text of group 1, or None when nothing matches. Thousands
        separators are removed only when the captured value is purely numeric
        (digits, commas and an optional decimal part), so amounts such as
        "1,154.06" become "1154.06" while textual values such as "Acme, Inc."
        keep their commas.
    """
    candidates = (line.strip() for line in lines) if lines else (text,)
    for candidate in candidates:
        match = pattern.search(candidate)
        if not match:
            continue
        value = match.group(1).strip()
        # Only numbers carry thousands separators; stripping commas from every
        # field would corrupt names and other free text.
        if re.fullmatch(r"[\d,]+(?:\.\d+)?", value):
            value = value.replace(",", "")
        return value
    return None

# parse line items by finding stripped header, then collecting until subtotal
def parse_line_items(lines):
    """Parse the line-item table of an invoice into a list of dicts.

    The table starts on the line after the first one matching
    ``patterns["items_header"]`` and ends at the first line beginning with
    "subtotal" (case-insensitive) or at the end of input. Blank lines are
    skipped. All lines are stripped before matching.

    Args:
        lines: iterable of text lines from the invoice.

    Returns:
        A list of dicts with keys "quantity", "description", "unit_price" and
        "amount" (strings, or None when unknown). Rows that do not match
        ``patterns["item_line"]`` are kept with the whole row as the
        description and the other fields set to None. Thousands separators are
        removed from "unit_price" and "amount" so the values can be passed
        straight to ``float``. When a row has no amount column, the amount is
        computed as quantity * unit price. Returns [] when no header is found.
    """
    rows = [line.strip() for line in lines]
    header_idx = next(
        (i for i, row in enumerate(rows) if patterns["items_header"].match(row)),
        None,
    )
    if header_idx is None:
        return []

    items = []
    for row in rows[header_idx + 1:]:
        if not row:
            continue
        if row.lower().startswith("subtotal"):
            break

        match = patterns["item_line"].match(row)
        if match is None:
            # Keep unparseable rows (e.g. wrapped descriptions) instead of dropping them.
            qty, desc, unit_price, amount = None, row, None, None
        else:
            qty, desc, unit_price, amount = match.groups()
            # Normalise "1,200.00" -> "1200.00"; float() rejects the comma form.
            unit_price = unit_price.replace(",", "")
            if amount:
                amount = amount.replace(",", "")
            else:
                try:
                    amount = f"{int(qty) * float(unit_price):.2f}"
                except ValueError:
                    amount = None

        items.append({
            "quantity":    qty,
            "description": desc,
            "unit_price":  unit_price,
            "amount":      amount,
        })
    return items

# extract sales tax info
def extract_tax(text):
    """Find the sales-tax rate and amount in the invoice text.

    Looks for "sales tax <rate>% <amount>" (case-insensitive). Returns a dict
    with string values for "rate_percent" and "amount" (commas removed from
    the amount), or the same dict with both values None when no such text
    is present.
    """
    found = re.search(r"(?i)sales\s+tax\s+([\d\.]+)%\s+([\d,]+\.\d{2})", text)
    if found is None:
        return {"rate_percent": None, "amount": None}
    rate, amount = found.groups()
    return {"rate_percent": rate, "amount": amount.replace(",", "")}

# validate subtotal + tax against extracted total
def validate_invoice(data):
    """Cross-check the extracted total against line items plus tax.

    Args:
        data: invoice dict; reads "line_items" (list of dicts with "amount"),
            "tax" (dict with "amount") and "total". Amounts may be strings,
            with or without thousands separators, numbers, or None.

    Returns:
        A dict of floats "calculated_subtotal", "tax_amount", "expected_total"
        and "extracted_total", plus a bool "match". Unparseable line or tax
        amounts count as 0.0. "extracted_total" is 0.0 when the total is
        missing or unparseable, and in that case "match" is False so that an
        empty extraction is never reported as consistent.
    """
    import math

    def to_f(value):
        """Parse a money value to a finite float, or None when that is impossible."""
        if isinstance(value, str):
            value = value.replace(",", "")
        try:
            parsed = float(value)
        except (TypeError, ValueError):
            return None
        return parsed if math.isfinite(parsed) else None

    def cents(amount):
        """Round a currency amount to a whole number of cents."""
        return round(amount * 100)

    subtotal = sum(
        (to_f(item.get("amount")) or 0.0 for item in data.get("line_items") or []),
        0.0,
    )
    tax_amt = to_f((data.get("tax") or {}).get("amount")) or 0.0
    parsed_total = to_f(data.get("total"))
    total = parsed_total if parsed_total is not None else 0.0
    expected = subtotal + tax_amt

    return {
        "calculated_subtotal": subtotal,
        "tax_amount":          tax_amt,
        "expected_total":      expected,
        "extracted_total":     total,
        # Compare whole cents: with binary floats, abs(154.07 - 154.06) < 0.01
        # is True, so a float tolerance of one cent hides one-cent mismatches.
        "match":               parsed_total is not None and cents(expected) == cents(total),
    }


In [4]:
# Upload an invoice PDF through the Colab file picker and extract its text.
uploaded = files.upload()
if not uploaded:
    # Without this guard an empty upload (e.g. the picker was cancelled) fails
    # with a bare StopIteration from next() below.
    raise RuntimeError("no file uploaded: re-run this cell and select an invoice PDF")
# Only the first uploaded file is processed.
pdf_file = next(iter(uploaded))
print(f"uploaded: {pdf_file}")

text = extract_text(pdf_file)
lines = text.split("\n")


Saving invoice.pdf to invoice.pdf
uploaded: invoice.pdf


In [5]:
# Assemble the invoice record. Vendor and total are matched line by line
# (their patterns are anchored with ^); the other scalar fields are searched
# in the full text.
invoice_data = dict(
    invoice_number=extract_field(patterns["invoice_number"], text),
    vendor=extract_field(patterns["vendor"], text, lines),
    date=extract_field(patterns["date"], text),
    due_date=extract_field(patterns["due_date"], text),
    total=extract_field(patterns["total"], text, lines),
    line_items=parse_line_items(lines),
    tax=extract_tax(text),
)

# Print the record before validation so the raw extraction can be inspected.
print(json.dumps(invoice_data, indent=2))


{
  "invoice_number": "US-001",
  "vendor": "East Repair Inc.",
  "date": "11/02/2019",
  "due_date": "26/02/2019",
  "total": "154.06",
  "line_items": [
    {
      "quantity": "1",
      "description": "Front and rear brake cables",
      "unit_price": "100.00",
      "amount": "100.00"
    },
    {
      "quantity": "2",
      "description": "New set of pedal arms",
      "unit_price": "15.00",
      "amount": "30.00"
    },
    {
      "quantity": "3",
      "description": "Labor 3hrs",
      "unit_price": "5.00",
      "amount": "15.00"
    }
  ],
  "tax": {
    "rate_percent": "6.25",
    "amount": "9.06"
  }
}


In [6]:
# Attach the totals cross-check to the record, then print the final invoice data.
invoice_data.update({"validation": validate_invoice(invoice_data)})
print(json.dumps(invoice_data, indent=2))


{
  "invoice_number": "US-001",
  "vendor": "East Repair Inc.",
  "date": "11/02/2019",
  "due_date": "26/02/2019",
  "total": "154.06",
  "line_items": [
    {
      "quantity": "1",
      "description": "Front and rear brake cables",
      "unit_price": "100.00",
      "amount": "100.00"
    },
    {
      "quantity": "2",
      "description": "New set of pedal arms",
      "unit_price": "15.00",
      "amount": "30.00"
    },
    {
      "quantity": "3",
      "description": "Labor 3hrs",
      "unit_price": "5.00",
      "amount": "15.00"
    }
  ],
  "tax": {
    "rate_percent": "6.25",
    "amount": "9.06"
  },
  "validation": {
    "calculated_subtotal": 145.0,
    "tax_amount": 9.06,
    "expected_total": 154.06,
    "extracted_total": 154.06,
    "match": true
  }
}


In [7]:
# Write the final record to invoice_data.json as indented JSON, then trigger a
# download of that file through Colab.
with open("invoice_data.json", "w") as out_file:
    out_file.write(json.dumps(invoice_data, indent=2))
files.download("invoice_data.json")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>