In [1]:
from unstract.llmwhisperer import LLMWhispererClientV2
import os
from dotenv import load_dotenv

# Read the .env file into the process environment first; without this call the
# os.getenv lookups below only see variables that were already exported.
# load_dotenv() does not override variables that are already set.
load_dotenv()

# Map the lower-case key name used in .env to the upper-case OPENAI_API_KEY
# variable. os.environ only accepts strings, so a missing key is reported with
# an explicit message instead of an opaque TypeError on assignment.
openai_api_key = os.getenv("openai_api_key")
if openai_api_key is None:
    raise RuntimeError(
        "openai_api_key is not set; add it to the .env file or export it before running this cell."
    )
os.environ['OPENAI_API_KEY'] = openai_api_key

# LLMWhisperer credentials and endpoint. BASE_URL is read but not passed to the
# client below, so the client falls back to its built-in default endpoint.
UNSTRACT_API_KEY = os.getenv("unstract_api_key")
BASE_URL = os.getenv("BASE_URL")

client = LLMWhispererClientV2(api_key=UNSTRACT_API_KEY) # base_url=BASE_URL, api_key=UNSTRACT_API_KEY

2024-12-28 10:48:10,640 - unstract.llmwhisperer.client_v2 - DEBUG - logging_level set to DEBUG
2024-12-28 10:48:10,640 - unstract.llmwhisperer.client_v2 - DEBUG - base_url set to https://llmwhisperer-api.us-central.unstract.com/api/v2


In [2]:
# Submit the scanned invoice PDF to LLMWhisperer and wait for the extraction
# to complete (wait_timeout=200) before continuing.
whisper = client.whisper(file_path="../raw-docs/scanned-invoice.pdf", wait_for_completion=True, wait_timeout=200)

2024-12-28 10:49:12,874 - unstract.llmwhisperer.client_v2 - DEBUG - whisper called
2024-12-28 10:49:12,875 - unstract.llmwhisperer.client_v2 - DEBUG - api_url: https://llmwhisperer-api.us-central.unstract.com/api/v2/whisper
2024-12-28 10:49:12,875 - unstract.llmwhisperer.client_v2 - DEBUG - params: {'mode': 'form', 'output_mode': 'layout_preserving', 'page_seperator': '<<<', 'pages_to_extract': '', 'median_filter_size': 0, 'gaussian_blur_radius': 0, 'line_splitter_tolerance': 0.4, 'horizontal_stretch_factor': 1.0, 'mark_vertical_lines': False, 'mark_horizontal_lines': False, 'line_spitter_strategy': 'left-priority', 'lang': 'eng', 'tag': 'default', 'filename': '', 'webhook_metadata': '', 'use_webhook': ''}
2024-12-28 10:49:13,110 - unstract.llmwhisperer.client_v2 - DEBUG - whisper_status called
2024-12-28 10:49:13,110 - unstract.llmwhisperer.client_v2 - DEBUG - url: https://llmwhisperer-api.us-central.unstract.com/api/v2/whisper-status
2024-12-28 10:49:18,238 - unstract.llmwhisperer.cl

In [3]:
# Pull the extracted text out of the whisper result and show it.
extracted_text = whisper["extraction"]["result_text"]
print(extracted_text)



                                                           -. 

                                                        O=O         American Railcorp 

     NÂº      78493                                     Invoice 
URGENT 

                                                       Issued                   Due 
                                                       01 Jan, 2022             07 Jan, 2022 

                                                       From                     To 
                                                       America Rall INC         Simon Wilson 
                                                       123 Godly St.            1234, Bouleward avenue st 
                                                       Internet, 68493          Miami, 67833 
                                                       United States            United States 

     Item                                              Cost             Qty                       Total 

     Travel

In [4]:
from pydantic import BaseModel, Field
from typing import List, Optional
from datetime import date

# One line item of the invoice: what was billed, unit cost, quantity and line total.
# (Documented with a comment rather than a docstring so the model's generated
# schema text is left exactly as before.)
class InvoiceItem(BaseModel):
    # All four fields are required; `...` marks "no default".
    item: str = Field(..., description="Description of the item")
    cost: float = Field(..., description="Cost per item")
    quantity: int = Field(..., description="Quantity of items")
    total: float = Field(..., description="Total cost for this item")

# Postal address block used for both the sender and the recipient of an invoice.
# (Documented with a comment rather than a docstring so the model's generated
# schema text is left exactly as before.)
class Address(BaseModel):
    # All four fields are required; `...` marks "no default".
    name: str = Field(..., description="Name of the company or individual")
    street: str = Field(..., description="Street address")
    city_zip: str = Field(..., description="City and ZIP code")
    country: str = Field(..., description="Country")

# Top-level schema for one scanned invoice: header data, both addresses,
# the line items, the totals section and the payment/footer details.
# (Documented with comments rather than a docstring so the description text in
# the model's generated schema is not changed.)
class Invoice(BaseModel):
    # --- Header ---
    invoice_number: str = Field(description="Invoice number")
    issue_date: date = Field(description="Date the invoice was issued")
    due_date: date = Field(description="Date the invoice is due")

    # --- Parties ---
    from_address: Address = Field(description="Sender's address")
    to_address: Address = Field(description="Recipient's address")

    # --- Line items and totals ---
    items: List[InvoiceItem] = Field(description="List of invoice items")
    subtotal: float = Field(description="Subtotal before discount")
    discount_percentage: float = Field(description="Discount percentage")
    discount_amount: float = Field(description="Discount amount")
    total_before_misc: float = Field(description="Total after discount")
    misc_costs: float = Field(description="Miscellaneous costs")
    total: float = Field(description="Final total including misc costs")

    # --- Payment / footer ---
    iban: str = Field(description="IBAN for payment")
    # Optional[...] only allows None as a value; an explicit default is needed
    # for the field to be omittable, so invoices without notes or a signature
    # still validate.
    notes: Optional[str] = Field(default=None, description="Additional notes on the invoice")
    signature: Optional[str] = Field(default=None, description="Signature or name of the issuer")

In [5]:
from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain.output_parsers import PydanticOutputParser

def compile_template_and_get_llm_response(preamble, extracted_text, pydantic_object):
    """Build a system + human chat prompt around the extracted text and query the chat model.

    Args:
        preamble: Instruction text used as the system message.
        extracted_text: Document text placed in the human message.
        pydantic_object: Pydantic model class; its parser format instructions
            are placed ahead of the text in the human message.

    Returns:
        The ``content`` of the model's reply. The reply is also printed. It is
        returned as-is; it is not parsed or validated against ``pydantic_object``.
    """
    output_parser = PydanticOutputParser(pydantic_object=pydantic_object)

    # System message carries the instructions; human message carries the
    # format instructions, the document text and a closing reminder.
    prompt = ChatPromptTemplate.from_messages([
        SystemMessagePromptTemplate.from_template("{preamble}"),
        HumanMessagePromptTemplate.from_template("{format_instructions}\n\n{extracted_text}\n\n{postamble}"),
    ])

    prompt_values = {
        "preamble": preamble,
        "format_instructions": output_parser.get_format_instructions(),
        "extracted_text": extracted_text,
        "postamble": "Do not include any explanation in the reply. Only include the extracted information in the reply.",
    }
    messages = prompt.format_prompt(**prompt_values).to_messages()

    reply = ChatOpenAI().invoke(messages, temperature=0.0)
    print(f"Response from LLM:\n{reply.content}")
    return reply.content

def extract_invoice_from_text(extracted_text):
    """Ask the LLM to extract the invoice fields from the given text.

    Builds the invoice-specific instruction preamble and delegates to
    ``compile_template_and_get_llm_response`` with the ``Invoice`` model.

    Args:
        extracted_text: Text of the scanned invoice.

    Returns:
        Whatever ``compile_template_and_get_llm_response`` returns.
    """
    # One entry per sentence; joined with single spaces to form the preamble.
    instruction_sentences = (
        "You're seeing the information from a scanned invoice.",
        "Your job is to accurately extract the following information: "
        "invoice number, issue date, due date, sender's address, recipient's address, "
        "invoice items (including description, cost, quantity, and total for each), "
        "subtotal, discount percentage and amount, total before miscellaneous costs, "
        "miscellaneous costs, final total, IBAN, and any additional notes or signature.",
        "Ensure all fields are filled with the available information.",
        "For dates, use the format YYYY-MM-DD.",
        "For currency values, only include the numeric amount without the currency symbol.",
    )
    preamble = " ".join(instruction_sentences)

    return compile_template_and_get_llm_response(preamble, extracted_text, Invoice)

In [6]:
# Run the LLM extraction on the text obtained from the scanned invoice.
invoice_text = extract_invoice_from_text(extracted_text=extracted_text)

Response from LLM:
{
	"Invoice Number": "78493",
	"Issue Date": "2022-01-01",
	"Due Date": "2022-01-07",
	"From Address": {
		"Name": "America Rall INC",
		"Street": "123 Godly St.",
		"City Zip": "Internet, 68493",
		"Country": "United States"
	},
	"To Address": {
		"Name": "Simon Wilson",
		"Street": "1234, Bouleward avenue st",
		"City Zip": "Miami, 67833",
		"Country": "United States"
	},
	"Items": [{
			"Item": "Travel expenses",
			"Cost": 2000,
			"Quantity": 1,
			"Total": 2000
		},
		{
			"Item": "Logistics expenses",
			"Cost": 5000,
			"Quantity": 1,
			"Total": 5000
		},
		{
			"Item": "Allowance expenses",
			"Cost": 1000,
			"Quantity": 1,
			"Total": 1000
		}
	],
	"Subtotal": 8000,
	"Discount Percentage": 20,
	"Discount Amount": 1400,
	"Total Before Misc": 6600,
	"Misc Costs": 1100,
	"Total": 7700,
	"Iban": "ABCD EFGH 0000 0000 0000",
	"Notes": "This invoice is to be paid by wire transfer only, unless agreed otherwise and must be paid before the date due specified above.

In [7]:
import json
import ast


response = invoice_text

# Parse the LLM reply into a Python dictionary.
# The model is asked for JSON, so the JSON parser goes first: it understands
# JSON-only syntax (true/false/null, \/ and \uXXXX surrogate-pair escapes) that
# ast.literal_eval either rejects or decodes with Python string rules.
try:
    parsed_data = json.loads(response)
except json.JSONDecodeError:
    # Fall back to Python-literal syntax (e.g. a dict printed with single quotes).
    try:
        parsed_data = ast.literal_eval(response)
    except (SyntaxError, ValueError):
        print("Error: Unable to parse the response string.")
        parsed_data = {}

# Wrap the parsed invoice with metadata about the extraction run.
combined_data = {
    "document_type": "scanned-invoice",
    "extraction_date": str(date.today()),  # date this cell was executed
    "extracted_data": parsed_data
}

# Write to a JSON file, creating the output directory on a fresh checkout.
output_file = os.path.join('../saved-docs', 'invoice_extracted_data.json')
os.makedirs(os.path.dirname(output_file), exist_ok=True)
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(combined_data, f, indent=2)

print(f"Data written to {output_file}")

Data written to ../saved-docs/invoice_extracted_data.json


In [8]:
from langchain.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain.schema import Document
from langchain import hub
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

# Read back the extraction results that were saved to disk.
with open("../saved-docs/invoice_extracted_data.json", 'r') as f:
    data = json.load(f)

# Wrap the extracted payload in a single Document; the payload is serialized
# to a JSON string for the page content, and the run metadata rides along.
documents = [
    Document(
        page_content=json.dumps(data['extracted_data']),
        metadata={field: data[field] for field in ("document_type", "extraction_date")},
    )
]

# Embed the document and index it in a FAISS vector store.
embeddings = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(documents, embeddings)

# Expose the store as a retriever for the chain below.
retriever = vectorstore.as_retriever()

# Chat model used to answer questions over the retrieved document.
llm = ChatOpenAI(model_name="gpt-3.5-turbo",temperature=0)

# Retrieval-QA chat prompt pulled from the LangChain hub.
retrieval_qa_chat_prompt = hub.pull("langchain-ai/retrieval-qa-chat")

# Chain: retrieve documents -> stuff them into the prompt -> ask the model.
combine_docs_chain = create_stuff_documents_chain(llm, retrieval_qa_chat_prompt)
retrieval_chain = create_retrieval_chain(retriever, combine_docs_chain)

# Ask a sample question; the answer is the cell's displayed value.
response = retrieval_chain.invoke({"input": "What is the IBAN number?"})
response['answer']


'The IBAN number is "ABCD EFGH 0000 0000 0000".'