[Invoice or Bill Custom Parsing using Kor (LangChain Extension), Generative Language Models & Prompt Engineering](https://blog.gopenai.com/invoice-or-bill-custom-parsing-using-kor-langchain-extension-generative-language-models-prompt-7133193358fa)

The first step is to load the PDF file from Google Drive and use PDFMiner to extract the raw text. Note: you need to download the invoice to your local drive first.

In [None]:
from pdfminer.high_level import extract_text
# Pull the raw text out of the locally stored invoice PDF.
text = extract_text("doc1.pdf")

# Flatten the document onto a single line: every newline becomes one space.
processed_text = text.replace("\n", " ")
print(processed_text)

The second step is to initialize the ChatOpenAI model.

In [None]:
import os

#import langChain ChatOpenAI module
from langchain.chat_models import ChatOpenAI

# Load the GPT-3.5 chat model.
# The API key is read from the OPENAI_API_KEY environment variable (a missing
# variable raises KeyError); rstrip() drops trailing whitespace such as a
# newline left over from exporting the key.
llm_openai = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=0,
    max_tokens=2000,
    openai_api_key=os.environ["OPENAI_API_KEY"].rstrip(),
)

# The extraction chain further down is built from a variable named `llm`.
# Expose this model under that name as well, so the notebook also runs when
# the Azure alternative cell is skipped.
llm = llm_openai

As an alternative, the second step can instead initialize the Azure ChatOpenAI model.

In [None]:
import os
import openai

from langchain.chat_models import AzureChatOpenAI

# Endpoint of the Azure OpenAI resource, taken from the environment
# (os.getenv yields None when the variable is not set).
azure_api_base = os.getenv("AZURE_OPENAI_API_BASE")

# Module-level OpenAI client settings, pointed at Azure.
openai.api_type = "azure"
openai.api_version = "2023-06-01-preview"
openai.api_base = azure_api_base
openai.api_key = os.getenv("AZURE_OPENAI_API_KEY")

# Chat model bound to the "Document-Parsing" deployment; it receives the same
# key, endpoint and API version that were just configured above.
llm = AzureChatOpenAI(
    deployment_name="Document-Parsing",
    openai_api_key=openai.api_key,
    openai_api_base=azure_api_base,
    openai_api_version=openai.api_version,
)



The next step is to create the schema and provide examples.

In [None]:
# import necessary packages from Kor
from kor.extraction import create_extraction_chain
from kor.nodes import Object, Text, Number

# Schema for the header fields of the purchase order.
#
# Each Kor example is an (input text, expected extraction) pair: the second
# element is the value the model should produce for the first. Every sample
# input therefore gets its own pair; putting two sample inputs in one tuple
# would teach the model to answer one sample with the other.
header_schema = Object(
    id="header",
    description="header details",
    attributes=[
        Text(
            id="number",
            description="unique number (identifier) of given purchase order",
            examples=[
                ("PO NUMBER 12345", "12345"),
                ("PO NUMBER 35658", "35658"),
            ],
        ),
        Text(
            id="quote",
            description="quote number",
            examples=[
                ("QUOTE # 1-1604415497 (Bosch)", "1-1604415497 (Bosch)"),
                (
                    "QUOTE # \"1-1708903259, 109815 (1-1611484010), 108832 (1-1609457261), 108878\"",
                    "1-1708903259, 109815 (1-1611484010), 108832 (1-1609457261), 108878",
                ),
            ],
        ),
        Text(
            id="date",
            description="date",
        ),
        Text(
            id="payment_terms",
            description="get payment terms",
            examples=[
                ("PAYMENT TERMS Net 45", "Net 45"),
                ("PAYMENT TERMS Net 30", "Net 30"),
            ],
        ),
        Text(
            id="shipping_terms",
            description="get shipping terms",
        ),
        Text(
            id="currency",
            description="currency",
            examples=[
                ("CURRENCY EUR", "EUR"),
                ("CURRENCY USD", "USD"),
            ],
        ),
        Text(
            id="contract",
            description="contract",
        ),
        Text(
            id="contact",
            description="primary contact",
            # The whole "name e-mail" span is the contact, so the expected
            # extraction repeats the input verbatim.
            examples=[
                (
                    "Racel Madamba  Racel_Rey.Madamba@genesys.com",
                    "Racel Madamba  Racel_Rey.Madamba@genesys.com",
                ),
                (
                    "Claire Wood claire.wood@genesys.com",
                    "Claire Wood claire.wood@genesys.com",
                ),
            ],
        ),
    ],
    many=False,
)

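Next, define the schema for the line items of the purchase order, together with worked examples. The prompt passed to the GPT model is generated by **Kor** from these object definitions.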

In [None]:
# Schema for the line items of the purchase order (many=True: one entry per
# line). Each example pairs a raw line of extracted text with the structured
# record expected for it. In the raw text the two amounts (unit price and
# total) sit between the first and the second part of the description, so
# "description2" holds only the text after the amounts, never the amounts.
products_schema = Object(
    id="lines",
    description="the details of bill",
    attributes=[
        Number(id="line", description="line item number"),
        Text(id="description", description="the description of the product or service"),
        Text(id="description2", description="extra description of the product or service"),
        Text(id="price", description="price per unit"),
        Text(id="total", description="the total price, which is number of units * unit_price"),
    ],
    examples=[
        (
            "1 1-1708903259_CAS Renewal_01/01/2020 to 30/06/2021_NXO 2,789.50 2,789.50 FRANCE_La Poste_Solution Name: WDE Plugin for MS Dynamics CRM - add-on 130 seats",
            {
                "line": 1,
                "description": "1-1708903259_CAS Renewal_01/01/2020 to 30/06/2021_NXO",
                "description2": "FRANCE_La Poste_Solution Name: WDE Plugin for MS Dynamics CRM - add-on 130 seats",
                "price": "2,789.50",
                "total": "2,789.50",
            },
        ),
        (
            "4 108878_CAS Renewal_01/01/2020 to 30/06/2021_NXO 10,225.00 10,225.00 FRANCE_La Poste_Solution Name:MSCRM Dynamics Connector (730 seats) ",
            {
                "line": 4,
                "description": "108878_CAS Renewal_01/01/2020 to 30/06/2021_NXO",
                "description2": "FRANCE_La Poste_Solution Name:MSCRM Dynamics Connector (730 seats)",
                "price": "10,225.00",
                "total": "10,225.00",
            },
        ),
        (
            "6 78822 (73377)_Premise CAS Renewal_01/01/2022 to 1,838.55 1,838.55 12/31/2022_IREN S.p.A._IREN S.p.A._SipVR Licensing - additional 50 licenses CAS Renewal ",
            {
                "line": 6,
                "description": "78822 (73377)_Premise CAS Renewal_01/01/2022 to",
                # The amounts belong to "price"/"total" only, as in the two
                # examples above; they are not part of the description text.
                "description2": "12/31/2022_IREN S.p.A._IREN S.p.A._SipVR Licensing - additional 50 licenses CAS Renewal",
                "price": "1,838.55",
                "total": "1,838.55",
            },
        ),
    ],
    many=True,
)




Create a Purchase Order schema containing all previously defined sub-schemas.

In [None]:

# ------------------------------------------------------------------
# Purchase-order schema: one top-level object wrapping the sub-schemas
# ------------------------------------------------------------------
# NOTE(review): address_schema is not defined in any cell visible here; it
# has to exist before this cell runs — confirm where it comes from.
po_schema = Object(
    id="po",
    description="extraction of relevant information from purchase order",
    many=False,
    attributes=[header_schema, address_schema, products_schema],
)

# Extraction chain that runs the model in `llm` against the schema, using the
# JSON encoder for the examples and the model output.
po_chain = create_extraction_chain(
    llm,
    po_schema,
    encoder_or_encoder_class="json",
    input_formatter=None,
)


Print the **prompt**  generated by **Kor** from our object definition to pass it to the GPT model.

In [None]:
# Fill the chain's prompt template with the flattened invoice text, then
# print the resulting prompt as a plain string.
rendered_prompt = po_chain.prompt.format_prompt(text=processed_text)
print(rendered_prompt.to_string())

Provide the extracted text to the LLM chain and print the parsed result.

In [None]:
# Run the extraction chain on the flattened invoice text and keep the result.
# NOTE(review): predict_and_parse may be deprecated or removed in newer Kor
# releases — confirm against the installed version.
parse_result = po_chain.predict_and_parse(text=processed_text)

# Bare expression as the last statement, so the notebook shows the value as
# the cell output.
parse_result