[Invoice or Bill Custom Parsing using Kor (LangChain Extension), Generative Language Models & Prompt Engineering](https://blog.gopenai.com/invoice-or-bill-custom-parsing-using-kor-langchain-extension-generative-language-models-prompt-7133193358fa)

The first step is to load PDF file from google drive and use PDF Miner to extract raw text. Note: you need to download invoice to your local drive.

In [4]:
from pdfminer.high_level import extract_text
text = extract_text('doc1.pdf')

#perform basic processing to remove \n
processed_text = " ".join(text.split("\n"))
print(processed_text)

4/15/2019  https://genesys.coupahost.com/order_headers/print_view?id=34438&version=1  25026610-8325 Smart Communications Techn Ltd 9 Seagrave Road London, SW6 7RP United Kingdom  Ship To Genesys Telecommunications Laboratories B.V. Gooimeer 6-02,1411 DD Naarden Naarden, 1411 Netherlands Attn: Racel Madamba  Genesys  PURCHASE ORDER  PO NUMBER QUOTE # DATE PAYMENT TERMS SHIPPING TERMS CURRENCY CONTRACT CONTACT  34438  1-1604415497 (Bosch)  02/11/2019  Net 45   EUR  4429  Racel Madamba  Racel_Rey.Madamba@genesys.com  Bill To Genesys Telecommunications Laboratories B.V. Gooimeer 6-02 Naarden, 1411 DD Netherlands Attn: ACCOUNTS PAYABLE Accounts.payableEMEA@genesys.com  Line  Description  1 1-1604415497 (Bosch) - (CAS) 1 year Sept 1 2018 - Aug 31 2019  Qty  Unit  Price 640.50  Total 640.50  DACH  Note to Supplier:  0 Units   640.50 EUR  IMPORTANT. Unless there is an existing contract between Vendor and Genesys applicable to the transaction subject matter of this Purchase Order, this Purchase

The second step is to initialize ChatOpenAI model.

In [5]:
import os

#import langChain ChatOpenAI module
from langchain.chat_models import ChatOpenAI

#load GPT 3.5 model
llm_openai = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=0,
    max_tokens=2000,
    openai_api_key=os.environ["OPENAI_API_KEY"].rstrip()
)

As alternative, the second step is to initialize Azure ChatOpenAI model.

In [6]:
import os
import openai

from langchain.chat_models import AzureChatOpenAI

azure_api_base =  os.getenv("AZURE_OPENAI_API_BASE")
# Configure OpenAI API
openai.api_type = "azure"
openai.api_version = "2023-06-01-preview"
openai.api_base = azure_api_base
openai.api_key = os.getenv("AZURE_OPENAI_API_KEY")

llm = AzureChatOpenAI(deployment_name="Document-Parsing", 
                      openai_api_key= os.getenv("AZURE_OPENAI_API_KEY"),
                      openai_api_base=azure_api_base, 
                      openai_api_version="2023-06-01-preview")



The next step is to create schema and provide examples

In [7]:
# import neccessary packages from korr
from kor.extraction import create_extraction_chain
from kor.nodes import Object, Text, Number

#object creation for invoice_number
po_header = Object(
    id="po_header",
    description="extraction of relevant information from purchase order",
    attributes=[
        Text(
            id="po_number",
            description= "unique number (identifier) of given purchase order",
        examples=[
            ( "PO NUMBER 12345", "PO NUMBER 35658")
        ]),
        Text(
            id="po_quote",
            description= "quote number",
        examples=[
            ( "QUOTE # 1-1604415497 (Bosch)", "QUOTE # \"1-1708903259, 109815 (1-1611484010), 108832 (1-1609457261), 108878\"")
        ]),
        Text(
            id="po_date",
            description= "date",
        ),
        Text(
            id="po_payment_terms",
            description= "get payment terms",
        examples=[
            ( "PAYMENT TERMS Net 45", "PAYMENT TERMS Net 30")
        ]),
        Text(
            id="po_shipping_terms",
            description= "get shipping terms",
        ),
        Text(
            id="po_currency",
            description= "currency",
        examples=[
            ( "CURRENCY EUR", "CURRENCY USD")
        ]),
        Text(
            id="po_contract",
            description= "contract",
        ),
        Text(
            id="po_contact",
            description= "primary contact",
            examples=[
                ("Racel Madamba  Racel_Rey.Madamba@genesys.com", 
                "Claire Wood claire.wood@genesys.com")
            ]
        ),
    ],

    many=False,
)

prompt generated by **Kor** from our object definition to pass it to the GPT model.

In [8]:
# print(po_chain.prompt.format_prompt(text=processed_text).to_string())

In [9]:
address_schema = Object(
    id="address",
    description="address details",
    attributes=[
        Text(id="name", description="the name of person and organization"),
        Text(id="address_line", description=""),
        Text(id="attn", description=""),
    ],
    examples=[
        (
            "Genesys Telecommunications Laboratories B.V. Gooimeer 6-02 Naarden, 1411 DD Netherlands Attn: ACCOUNTS PAYABLE Accounts.payableEMEA@genesys.com",
            {
                "name": "Genesys Telecommunications Laboratories B.V.",
                "address_line": "Gooimeer 6-02 Naarden, 1411 DD Netherlands",
                "attn": "Attn: ACCOUNTS PAYABLE Accounts.payableEMEA@genesys.com",
            },
        ),
        (
            "Genesys Telecommunications Laboratories Asia Pte Ltd. 9 Raffles Place,#18-02, Republic Plaza Republic Plaza, 048619 Singapore Attn: AP-AU@genesys.com ACCOUNTS PAYABLE",
            {
                "name": "Genesys Telecommunications Laboratories Asia Pte Ltd.",
                "address_line": "9 Raffles Place,#18-02, Republic Plaza Republic Plaza, 048619 Singapore",
                "attn": "Attn: AP-AU@genesys.com ACCOUNTS PAYABLE",
            },
        ),
        
       (
            "Genesys Laboratories Australasia Pty Ltd Level 20,141 Walker Street North Sydney, NSW 2060 Australia Attn: AP-AU@genesys.com ACCOUNTS PAYABLE",
            {
                "name": "Genesys Laboratories Australasia Pty Ltd",
                "address_line": "PLevel 20,141 Walker Street North Sydney, NSW 2060 Australia",
                "attn": "Attn: AP-AU@genesys.com ACCOUNTS PAYABLE",
            },
        ) 
    ],
    many=True,
) 

In [10]:
products_schema = Object(
    id="lines",
    description="the details of bill",
    attributes=[
        Number(id="product_line", description=""),
        Text(id="product_description", description="the description of the product or service"),
        Text(id="product_description2", description="extra description of the product or service"),
        Text(id="product_price", description="price per unit"),
        Text(id="product_total", description="the total price, which is number of units * unit_price"),
    ],
    examples=[
        (
            "1 1-1708903259_CAS Renewal_01/01/2020 to 30/06/2021_NXO 2,789.50 2,789.50 FRANCE_La Poste_Solution Name: WDE Plugin for MS Dynamics CRM - add-on 130 seats",
            {
               "product_line": 1, 
               "product_description": "1-1708903259_CAS Renewal_01/01/2020 to 30/06/2021_NXO",
               "product_description2": "FRANCE_La Poste_Solution Name: WDE Plugin for MS Dynamics CRM - add-on 130 seats",
               "product_price": "2,789.50", 
               "product_total": "2,789.50",
            },
        ),
        (
            "4 108878_CAS Renewal_01/01/2020 to 30/06/2021_NXO 10,225.00 10,225.00 FRANCE_La Poste_Solution Name:MSCRM Dynamics Connector (730 seats) ",
            {
               "product_line": 4,
               "product_description": "108878_CAS Renewal_01/01/2020 to 30/06/2021_NXO",
               "product_description2": "FRANCE_La Poste_Solution Name:MSCRM Dynamics Connector (730 seats)",
               "product_price": "10,225.00", 
               "product_total": "10,225.00",
            },
        ),
        (
            "6 78822 (73377)_Premise CAS Renewal_01/01/2022 to 1,838.55 1,838.55 12/31/2022_IREN S.p.A._IREN S.p.A._SipVR Licensing - additional 50 licenses CAS Renewal ",
            {
               "product_line": 6,
               "product_description": "78822 (73377)_Premise CAS Renewal_01/01/2022 to",
               "product_description2": "12/31/2022_IREN S.p.A._IREN S.p.A._SipVR Licensing - additional 50 licenses CAS Renewal 1,838.55 1,838.55",
               "product_price": "1,838.55", 
               "product_total": "1,838.55",
            },
        ),
    ],
    many=True
)




In [11]:
####################
# HEADER CHAIN
####################
header_chain = create_extraction_chain(llm, po_header)
parse_result = header_chain.predict_and_parse(text=processed_text)

parse_result['data']['po_header'][0]

####################
# ADDRESS CHAIN
####################
address_chain = create_extraction_chain(llm, address_schema)

parse_result = address_chain.predict_and_parse(text=processed_text)

addresses = parse_result['data']['address']
print( f"number of addressed found: {len(addresses)}" )

for address in addresses:
    print(address)

####################
# PRODUCT CHAIN
####################
products_chain = create_extraction_chain(llm, products_schema)

parse_result = products_chain.predict_and_parse(text=processed_text)

lines = parse_result['data']['lines']
print( f"number of lines found: {len(lines)}" )

for line in lines:
    print(line)




number of addressed found: 3
{'name': '25026610-8325 Smart Communications Techn Ltd', 'address_line': '9 Seagrave Road London, SW6 7RP United Kingdom', 'attn': ''}
{'name': 'Genesys Telecommunications Laboratories B.V.', 'address_line': 'Gooimeer 6-02,1411 DD Naarden Naarden, 1411 Netherlands', 'attn': 'Attn: Racel Madamba'}
{'name': 'Genesys Telecommunications Laboratories B.V.', 'address_line': 'Gooimeer 6-02 Naarden, 1411 DD Netherlands', 'attn': 'Attn: ACCOUNTS PAYABLE Accounts.payableEMEA@genesys.com'}




number of lines found: 1
{'product_line': '1', 'product_description': '1-1604415497 (Bosch) - (CAS) 1 year Sept 1 2018 - Aug 31 2019', 'product_description2': 'DACH', 'product_price': '640.50', 'product_total': '640.50'}
