In [1]:
import os
from dotenv import load_dotenv

# Read the variables defined in a local .env file (if any) into the process
# environment. Without this call the imported load_dotenv is never used and the
# lookup below only sees variables that were already exported in the shell.
load_dotenv()

# The original notebook reads the lower-case name, so it keeps precedence; the
# upper-case name is accepted as a fallback.
_openai_api_key = os.getenv("openai_api_key") or os.getenv("OPENAI_API_KEY")
if not _openai_api_key:
    # Assigning None to os.environ raises an opaque TypeError; fail with a
    # message that says what is actually missing instead.
    raise RuntimeError(
        "OpenAI API key not found: define 'openai_api_key' (or 'OPENAI_API_KEY') "
        "in the environment or in a .env file."
    )
os.environ['OPENAI_API_KEY'] = _openai_api_key

In [2]:
import fitz
from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field
from typing import List, Optional
from datetime import date

# Pydantic models describing the structured invoice data requested from the LLM.
# Invoice (which nests the other two models) is later handed to
# PydanticOutputParser to build the prompt's format instructions, so these
# models are documented with comments rather than docstrings.

# One line item of an invoice.
class InvoiceItem(BaseModel):
    item: str = Field(description="Description of the item")
    cost: float = Field(description="Cost per item")
    # Declared as int, so only whole-number quantities validate.
    quantity: int = Field(description="Quantity of items")
    # Presumably cost * quantity; nothing here enforces that relationship.
    total: float = Field(description="Total cost for this item")

# Postal address; Invoice uses it for both from_address and to_address.
# Declared with comments instead of a docstring so the model definition itself
# is left untouched.
class Address(BaseModel):
    name: str = Field(description="Name of the company or individual")
    street: str = Field(description="Street address")
    # City and ZIP code are kept together in one free-text field.
    city_zip: str = Field(description="City and ZIP code")
    country: str = Field(description="Country")

# Top-level model for one extracted invoice. This is the class passed to
# PydanticOutputParser, so its fields define what the LLM is asked to return.
# Comments are used instead of a docstring to leave the model definition as is.
class Invoice(BaseModel):
    invoice_number: str = Field(description="Invoice number")
    # Typed as date; the prompt asks the model for YYYY-MM-DD values.
    issue_date: date = Field(description="Date the invoice was issued")
    due_date: date = Field(description="Date the invoice is due")
    from_address: Address = Field(description="Sender's address")
    to_address: Address = Field(description="Recipient's address")
    items: List[InvoiceItem] = Field(description="List of invoice items")
    # Monetary breakdown. The arithmetic relationship between these fields is
    # described only by their text descriptions; nothing validates it.
    subtotal: float = Field(description="Subtotal before discount")
    discount_percentage: float = Field(description="Discount percentage")
    discount_amount: float = Field(description="Discount amount")
    total_before_misc: float = Field(description="Total after discount")
    misc_costs: float = Field(description="Miscellaneous costs")
    total: float = Field(description="Final total including misc costs")
    iban: str = Field(description="IBAN for payment")
    # Optional[...] alone only allows None as a value; without an explicit
    # default Pydantic v2 still treats the field as required. default=None makes
    # these two fields genuinely optional (and is a no-op under Pydantic v1).
    notes: Optional[str] = Field(default=None, description="Additional notes on the invoice")
    signature: Optional[str] = Field(default=None, description="Signature or name of the issuer")

def extract_text_with_ocr(pdf_path: str, dpi: int = 300, tessdata: Optional[str] = None) -> str:
    """Extract text from a scanned document using PyMuPDF's OCR.

    Args:
        pdf_path: Path of the document handed to ``fitz.open``.
        dpi: Resolution passed to ``Page.get_textpage_ocr`` for each page.
        tessdata: Directory containing Tesseract's language data. When omitted,
            the ``TESSDATA_PREFIX`` environment variable is used; if that is
            unset too, the Homebrew path this notebook was originally written
            against is used so existing behaviour is preserved.

    Returns:
        The OCR text of every page, concatenated in page order. An empty
        document yields an empty string.

    Raises:
        Whatever ``fitz.open`` or the OCR calls raise; the document is closed
        before the exception propagates.
    """
    # Resolve the language-data directory once instead of hard-wiring a single
    # machine-specific install location.
    tessdata_dir = (
        tessdata
        or os.environ.get("TESSDATA_PREFIX")
        or '/opt/homebrew/Cellar/tesseract/5.4.1_1/share/tessdata'
    )

    doc = fitz.open(pdf_path)
    try:
        page_texts = []
        for page in doc:
            # Build an OCR text page, then read the page's text through it.
            tp = page.get_textpage_ocr(
                dpi=dpi,
                full=True,
                tessdata=tessdata_dir
            )
            page_texts.append(page.get_text(textpage=tp))
    finally:
        # Release the document even when OCR fails part-way through.
        doc.close()

    return "".join(page_texts)

def compile_template_and_get_llm_response(preamble, extracted_text, pydantic_object):
    """Build the chat prompt for ``extracted_text`` and return the model's raw reply.

    The system message is ``preamble``. The human message is the parser's
    format instructions for ``pydantic_object``, then the extracted text, then
    a fixed closing instruction. The reply text is printed and returned as-is;
    it is not parsed or validated here.
    """
    output_parser = PydanticOutputParser(pydantic_object=pydantic_object)

    prompt = ChatPromptTemplate.from_messages([
        SystemMessagePromptTemplate.from_template("{preamble}"),
        HumanMessagePromptTemplate.from_template(
            "{format_instructions}\n\n{extracted_text}\n\n{postamble}"
        ),
    ])

    # Values substituted into the two message templates above.
    prompt_values = {
        "preamble": preamble,
        "format_instructions": output_parser.get_format_instructions(),
        "extracted_text": extracted_text,
        "postamble": "Do not include any explanation in the reply. Only include the extracted information in the reply.",
    }
    messages = prompt.format_prompt(**prompt_values).to_messages()

    # The temperature is supplied per call, not on the client constructor.
    llm_reply = ChatOpenAI().invoke(messages, temperature=0.0)
    reply_text = llm_reply.content
    print(f"Response from LLM:\n{reply_text}")
    return reply_text

def extract_invoice_from_pdf(pdf_path: str) -> str:
    """OCR the PDF at ``pdf_path`` and return the LLM's reply for the Invoice schema."""
    # System-prompt sentences, joined with single spaces below.
    instruction_sentences = (
        "You're seeing text extracted from a scanned invoice.",
        "Your job is to accurately extract all invoice details including: "
        "invoice number, dates, addresses, items, costs, and any additional information.",
        "For dates, use the format YYYY-MM-DD.",
        "For currency values, only include the numeric amount without the currency symbol.",
    )
    preamble = " ".join(instruction_sentences)

    # Step 1: OCR the document; step 2: hand the text to the LLM pipeline.
    ocr_text = extract_text_with_ocr(pdf_path)
    return compile_template_and_get_llm_response(preamble, ocr_text, Invoice)



In [3]:
import json
import ast
from datetime import date

def parse_llm_response(response_text):
    """Convert the LLM's reply text into Python data.

    A surrounding Markdown code fence (three backticks, optionally followed by a
    language tag) is removed first. The remainder is parsed as JSON, which is
    the format the prompt asks for; if that fails it is parsed as a Python
    literal, so single-quoted dict-style replies are still accepted.

    Args:
        response_text: The raw string returned by the model.

    Returns:
        The parsed value (normally a dict).

    Raises:
        ValueError: If the text is neither valid JSON nor a Python literal.
    """
    candidate = response_text.strip()
    if candidate.startswith("```"):
        # Drop the opening fence line, including any language tag on it.
        first_newline = candidate.find("\n")
        candidate = candidate[first_newline + 1:] if first_newline != -1 else ""
        # Drop the closing fence when present.
        if candidate.rstrip().endswith("```"):
            candidate = candidate.rstrip()[:-3]
        candidate = candidate.strip()

    try:
        return json.loads(candidate)
    except json.JSONDecodeError:
        pass

    try:
        return ast.literal_eval(candidate)
    except (SyntaxError, ValueError) as exc:
        raise ValueError("LLM response is neither valid JSON nor a Python literal") from exc


# Use the functions
try:
    # Extract and process invoice
    invoice_text = extract_invoice_from_pdf("../raw-docs/scanned-invoice.pdf")

    # Parse the reply; on failure keep the original best-effort behaviour of
    # reporting the problem and saving an empty payload.
    try:
        parsed_data = parse_llm_response(invoice_text)
    except ValueError:
        print("Error: Unable to parse the response string.")
        parsed_data = {}

    # Add metadata
    combined_data = {
        "document_type": "scanned-invoice",
        "extraction_date": str(date.today()),
        "extraction_method": "PyMuPDF OCR + LangChain",
        "extracted_data": parsed_data
    }

    # Save to JSON file, creating the target directory on a fresh checkout
    output_file = "../saved-docs/pymupdf_extracted_invoice_data.json"
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    with open(output_file, 'w') as f:
        json.dump(combined_data, f, indent=2)

    print(f"\nData written to {output_file}")

except Exception as e:
    print(f"Error: {str(e)}")

Response from LLM:
{
	"Invoice Number": "N2 78493",
	"Issue Date": "2022-01-01",
	"Due Date": "2022-01-07",
	"From Address": {
		"Name": "America Rall INC",
		"Street": "123 Godly St.",
		"City Zip": "1234, Bouleward avenue st",
		"Country": "United States"
	},
	"To Address": {
		"Name": "Simon Wilson",
		"Street": "Internet",
		"City Zip": "68493 Miami",
		"Country": "United States"
	},
	"Items": [{
			"Item": "Travel expenses",
			"Cost": 2000,
			"Quantity": 1,
			"Total": 2000
		},
		{
			"Item": "Logistics expenses",
			"Cost": 5000,
			"Quantity": 1,
			"Total": 5000
		},
		{
			"Item": "Allowance expenses",
			"Cost": 1000,
			"Quantity": 1,
			"Total": 1000
		}
	],
	"Subtotal": 8000,
	"Discount Percentage": 20,
	"Discount Amount": 1400,
	"Total Before Misc": 6600,
	"Misc Costs": 1100,
	"Total": 7700,
	"Iban": "ABCD EFGH 0000 0000 0000",
	"Notes": "This invoice is to be paid by wire transfer only, unless agreed otherwise and must be pald before the date due specified above.",
	"

In [4]:
from langchain.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.schema import Document
from langchain import hub
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

# 1. Read back the JSON file that holds the extracted invoice data
with open("../saved-docs/pymupdf_extracted_invoice_data.json", 'r') as f:
    data = json.load(f)

# 2. Wrap the extracted payload in a single Document; the three extraction
#    metadata entries are copied over unchanged
documents = [
    Document(
        page_content=json.dumps(data['extracted_data']),
        metadata={
            key: data[key]
            for key in ("document_type", "extraction_date", "extraction_method")
        },
    )
]

# 3. Embed the document into a FAISS store and expose that store as a retriever
embeddings = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(documents, embeddings)
retriever = vectorstore.as_retriever()

# 4. Assemble the QA chain: chat model + prompt pulled from the hub, combined
#    into a stuff-documents chain that is fed by the retriever
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
retrieval_qa_chat_prompt = hub.pull("langchain-ai/retrieval-qa-chat")
combine_docs_chain = create_stuff_documents_chain(llm, retrieval_qa_chat_prompt)
retrieval_chain = create_retrieval_chain(retriever, combine_docs_chain)

# 5. Helper function for asking questions
def ask_invoice_question(question: str) -> str:
    """Run ``question`` through the retrieval chain and return its 'answer' entry."""
    return retrieval_chain.invoke({"input": question})['answer']

# 6. Sample questions to put to the invoice QA chain
questions = [
    "What is the total amount of this invoice?",
    "Who is the sender of this invoice?",
    "What items are listed in this invoice?",
    "When is the due date for this invoice?",
    "What is the IBAN number for payment?",
]

# Print a banner, then each question immediately followed by the chain's answer
print("\nExample Questions and Answers:", "-" * 50, sep="\n")
for question in questions:
    print(f"\nQ: {question}")
    print("A:", ask_invoice_question(question))




Example Questions and Answers:
--------------------------------------------------

Q: What is the total amount of this invoice?
A: The total amount of this invoice is $7700.

Q: Who is the sender of this invoice?
A: The sender of this invoice is "America Rall INC" as indicated in the "From Address" section.

Q: What items are listed in this invoice?
A: The items listed in this invoice are:
1. Travel expenses
2. Logistics expenses
3. Allowance expenses

Q: When is the due date for this invoice?
A: The due date for this invoice is January 7, 2022.

Q: What is the IBAN number for payment?
A: The IBAN number for payment is "ABCD EFGH 0000 0000 0000".
