In [1]:
import os
from dotenv import load_dotenv

# Read the .env file into the process environment (load_dotenv was imported but
# never called, so the file was not actually being loaded), then expose the OpenAI
# key under the upper-case OPENAI_API_KEY name.
# The lower-case variable keeps priority, matching the original lookup.
load_dotenv()
_openai_api_key = os.getenv("openai_api_key") or os.getenv("OPENAI_API_KEY")
if not _openai_api_key:
    # Fail with a readable message instead of the TypeError raised by os.environ[...] = None.
    raise RuntimeError(
        "OpenAI API key not found: define openai_api_key (or OPENAI_API_KEY) "
        "in the environment or in the .env file."
    )
os.environ['OPENAI_API_KEY'] = _openai_api_key

In [2]:
from pydantic import BaseModel, Field
from typing import List, Optional
from datetime import date

# Pydantic models describing the structured invoice data to extract
class InvoiceItem(BaseModel):
    # One line item of an invoice; Invoice.items holds a list of these.
    # NOTE(review): documented with comments rather than a docstring because
    # pydantic presumably copies class docstrings into the generated schema,
    # which would change the format instructions sent to the LLM - confirm.
    item: str = Field(description="Description of the item")
    cost: float = Field(description="Cost per item")
    quantity: int = Field(description="Quantity of items")
    # No validator ties this to cost * quantity; the value is taken as extracted.
    total: float = Field(description="Total cost for this item")

class Address(BaseModel):
    # Postal address block; used by Invoice for both from_address and to_address.
    # All four fields are plain required strings with no format validation.
    name: str = Field(description="Name of the company or individual")
    street: str = Field(description="Street address")
    # City and ZIP code are kept together in one free-form string.
    city_zip: str = Field(description="City and ZIP code")
    country: str = Field(description="Country")

class Invoice(BaseModel):
    # Top-level schema for one extracted invoice. It is handed to
    # PydanticOutputParser to generate the format instructions for the LLM.
    # Comments are used instead of a docstring so the generated schema text
    # stays as it was.

    # Identification and dates (dates are parsed into datetime.date).
    invoice_number: str = Field(description="Invoice number")
    issue_date: date = Field(description="Date the invoice was issued")
    due_date: date = Field(description="Date the invoice is due")

    # Parties.
    from_address: Address = Field(description="Sender's address")
    to_address: Address = Field(description="Recipient's address")

    # Line items and monetary summary; none of the amounts are cross-checked here.
    items: List[InvoiceItem] = Field(description="List of invoice items")
    subtotal: float = Field(description="Subtotal before discount")
    discount_percentage: float = Field(description="Discount percentage")
    discount_amount: float = Field(description="Discount amount")
    total_before_misc: float = Field(description="Total after discount")
    misc_costs: float = Field(description="Miscellaneous costs")
    total: float = Field(description="Final total including misc costs")

    # Payment details.
    iban: str = Field(description="IBAN for payment")

    # Optional free-text fields. An explicit default of None is required for
    # these to be truly optional: under pydantic v2 an Optional[...] annotation
    # without a default is still a required field.
    notes: Optional[str] = Field(default=None, description="Additional notes on the invoice")
    signature: Optional[str] = Field(default=None, description="Signature or name of the issuer")


In [4]:
import PyPDF2
import pytesseract
from PIL import Image
import pdf2image
from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field
from typing import List, Optional
from datetime import date
import json

def extract_text_from_scanned_pdf(pdf_path: str) -> str:
    """OCR a scanned PDF: rasterise it with pdf2image, then read each page with Tesseract.

    Args:
        pdf_path: Path of the PDF file to process.

    Returns:
        The text of all pages concatenated in page order with surrounding
        whitespace stripped, or "" if any step raises (the error is printed).
    """
    try:
        page_images = pdf2image.convert_from_path(pdf_path)
        page_texts = []
        for page_number, page_image in enumerate(page_images, start=1):
            print(f"Processing page {page_number}...")
            page_texts.append(pytesseract.image_to_string(page_image))
        return "".join(page_texts).strip()
    except Exception as exc:
        # Best-effort: report the problem and hand back an empty string.
        print(f"Error processing PDF: {exc!s}")
        return ""

def extract_invoice_data(text: str) -> str:
    """Ask the chat model to extract structured invoice data from OCR text.

    Builds a system + human chat prompt, appends the format instructions that
    PydanticOutputParser generates for the Invoice model, sends the messages to
    ChatOpenAI (temperature 0) and returns the reply.

    Args:
        text: Text extracted from the scanned invoice.

    Returns:
        The content of the model's reply, unparsed. The parser is only used to
        produce format instructions; the reply is not validated here.

    Raises:
        ValueError: If ``text`` is empty or contains only whitespace. Sending an
            empty document to the model could only yield invented invoice data.
    """
    if not text or not text.strip():
        raise ValueError("No invoice text to extract data from (input text is empty).")

    # Define the prompt
    system_template = """You are an expert at extracting invoice information.
    Your task is to accurately extract all details from the invoice including:
    - Invoice number and dates
    - Addresses (both from and to)
    - Line items with costs, quantities, and totals
    - All financial information including subtotals, discounts, and final total
    - Payment information like IBAN
    - Any additional notes or signatures
    
    For dates, use the format YYYY-MM-DD.
    For currency values, only include the numeric amount without currency symbols."""

    human_template = """Here's the text extracted from a scanned invoice:
    {text}
    
    Please extract all invoice information in the specified format.
    {format_instructions}"""

    # Create the chat prompt
    system_message_prompt = SystemMessagePromptTemplate.from_template(system_template)
    human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)

    parser = PydanticOutputParser(pydantic_object=Invoice)
    chat_prompt = ChatPromptTemplate.from_messages([
        system_message_prompt,
        human_message_prompt
    ])

    # Format the prompt; both values are passed as template variables, so any
    # braces inside them are inserted verbatim rather than re-interpreted.
    prompt = chat_prompt.format_prompt(
        text=text,
        format_instructions=parser.get_format_instructions()
    ).to_messages()

    # Get response from LLM
    llm = ChatOpenAI(temperature=0)
    response = llm.invoke(prompt)

    return response.content

# Process the invoice
pdf_path = "../raw-docs/scanned-invoice.pdf"

# Bind these up front so the question-answering code further down can test
# `parsed_data` even when extraction or parsing fails; previously it was only
# assigned inside the try block, so a failure led to a NameError later on.
parsed_data = None
invoice_data = None

# Extract text
print("Extracting text from PDF...")
extracted_text = extract_text_from_scanned_pdf(pdf_path)

if not extracted_text:
    # extract_text_from_scanned_pdf returns "" on failure; calling the LLM with
    # no document text would only produce made-up invoice data.
    print("\nNo text could be extracted from the PDF; skipping invoice data extraction.")
else:
    # Extract data using LangChain
    print("\nExtracting invoice data...")
    invoice_data = extract_invoice_data(extracted_text)

    try:
        # Parse the response
        parsed_data = json.loads(invoice_data)

        # Save to file
        output_file = "../saved-docs/pypdf_invoice_data.json"
        with open(output_file, 'w') as f:
            json.dump(parsed_data, f, indent=2)

        print(f"\nData extracted and saved to {output_file}")

        # Print key information
        print("\nKey Invoice Details:")
        print(f"Invoice Number: {parsed_data['invoice_number']}")
        print(f"Total Amount: ${parsed_data['total']}")
        print(f"Due Date: {parsed_data['due_date']}")

    except Exception as e:
        # Covers invalid JSON from the model, missing keys and file-write errors.
        print(f"Error parsing invoice data: {str(e)}")

# Helper function to ask questions about the invoice
def ask_invoice_question(question: str, parsed_data: dict) -> str:
    """Answer a free-form question using the extracted invoice data as context.

    The invoice dict is serialised to indented JSON, embedded in a prompt
    together with the question, and sent to ChatOpenAI (temperature 0).

    Args:
        question: The question to ask about the invoice.
        parsed_data: Invoice data as a JSON-serialisable dict.

    Returns:
        The content of the model's reply.
    """
    invoice_json = json.dumps(parsed_data, indent=2)
    qa_prompt = f"""Based on the following invoice data:
    {invoice_json}
    
    Please answer this question: {question}
    
    Provide a clear, concise answer with specific numbers when relevant."""

    chat_model = ChatOpenAI(temperature=0)
    return chat_model.invoke(qa_prompt).content

# Example questions put to the model, one LLM call per entry
questions = [
    "What is the total amount of this invoice?",
    "Who is the sender of this invoice?",
    "What items were purchased and what are their quantities?",
    "When is the payment due?",
    "Was any discount applied to this invoice?"
]

# Ask questions about the extracted data.
# `parsed_data` must already be bound by the processing code earlier in this
# cell; the block is skipped when it is falsy (e.g. None or an empty dict).
if parsed_data:
    print("\nAnswering Questions about Invoice:")
    for question in questions:
        # The question is printed before the call so it is visible while waiting.
        print(f"\nQ: {question}")
        answer = ask_invoice_question(question, parsed_data)
        print(f"A: {answer}")

Extracting text from PDF...
Processing page 1...

Extracting invoice data...

Data extracted and saved to ../saved-docs/pypdf_invoice_data.json

Key Invoice Details:
Invoice Number: N2 78493
Total Amount: $7700
Due Date: 2022-01-07

Answering Questions about Invoice:

Q: What is the total amount of this invoice?
A: The total amount of this invoice is $7700.

Q: Who is the sender of this invoice?
A: The sender of this invoice is "American Railcorp" with the address "123 Godly St., Internet, 68493 Miami, 67833, United States".

Q: What items were purchased and what are their quantities?
A: The items purchased were:
1. Travel expenses - Quantity: 1
2. Logistics expenses - Quantity: 1
3. Allowance expenses - Quantity: 1

Q: When is the payment due?
A: The payment is due on January 7, 2022.

Q: Was any discount applied to this invoice?
A: Yes, a discount of 20% was applied to this invoice. The discount amount was $1400, which was deducted from the subtotal of $8000 to get a total before mis