[Invoice or Bill Custom Parsing using Kor (LangChain Extension), Generative Language Models & Prompt Engineering](https://blog.gopenai.com/invoice-or-bill-custom-parsing-using-kor-langchain-extension-generative-language-models-prompt-7133193358fa)

Initialize the ChatOpenAI model.

In [None]:
import os

# Switch between the Azure OpenAI deployment (True) and the public OpenAI API (False).
__USE_AZURE_OPENAI = True

if not __USE_AZURE_OPENAI:
    from langchain.chat_models import ChatOpenAI

    # Public OpenAI API: gpt-3.5-turbo with temperature 0 and a 2000-token
    # completion limit. The key is read from OPENAI_API_KEY (a missing
    # variable raises KeyError) and trailing whitespace is stripped.
    llm = ChatOpenAI(
        openai_api_key=os.environ["OPENAI_API_KEY"].rstrip(),
        model_name="gpt-3.5-turbo",
        max_tokens=2000,
        temperature=0,
    )
else:
    import openai
    from langchain.chat_models import AzureChatOpenAI

    # Azure OpenAI: the key and API version come from the environment;
    # os.getenv yields None when a variable is not set.
    llm = AzureChatOpenAI(
        deployment_name="document-parser",
        openai_api_key=os.getenv("AZURE_OPENAI_API_KEY"),
        openai_api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
    )



The next step is to create the schema and provide examples.

In [None]:
# import the necessary packages from Kor
from kor.extraction import create_extraction_chain
from kor.nodes import Object, Text, Number

# Schema for the header fields of a purchase order.
#
# Each Text example is an (input text, expected extraction) pair, so every
# sample snippet is paired with the value that should be extracted from it.
# NOTE(review): the expected values below drop the field label (e.g.
# "PO NUMBER"); confirm this is the output format wanted downstream.
header_schema = Object(
    id="header",
    description="header details",
    attributes=[
        Text(
            id="number",
            description="unique number (identifier) of given purchase order",
            examples=[
                ("PO NUMBER 12345", "12345"),
                ("PO NUMBER 35658", "35658"),
            ],
        ),
        Text(
            id="quote",
            description="quote number",
            examples=[
                ("QUOTE # 1-1604415497 (Bosch)", "1-1604415497 (Bosch)"),
                (
                    "QUOTE # \"1-1708903259, 109815 (1-1611484010), 108832 (1-1609457261), 108878\"",
                    "1-1708903259, 109815 (1-1611484010), 108832 (1-1609457261), 108878",
                ),
            ],
        ),
        Text(
            id="date",
            description="date",
        ),
        Text(
            id="payment_terms",
            description="get payment terms",
            examples=[
                ("PAYMENT TERMS Net 45", "Net 45"),
                ("PAYMENT TERMS Net 30", "Net 30"),
            ],
        ),
        Text(
            id="shipping_terms",
            description="get shipping terms",
        ),
        Text(
            id="currency",
            description="currency",
            examples=[
                ("CURRENCY EUR", "EUR"),
                ("CURRENCY USD", "USD"),
            ],
        ),
        Text(
            id="contract",
            description="contract",
        ),
        Text(
            id="contact",
            description="primary contact",
            # The sample snippets carry no label, so the expected value is
            # the snippet itself.
            examples=[
                (
                    "Racel Madamba  Racel_Rey.Madamba@genesys.com",
                    "Racel Madamba  Racel_Rey.Madamba@genesys.com",
                ),
                (
                    "Claire Wood claire.wood@genesys.com",
                    "Claire Wood claire.wood@genesys.com",
                ),
            ],
        ),
    ],
    many=False,
)

In [None]:
# Schema for address blocks. many=True because a document can contain more
# than one address. Each example pairs a raw address string with the fields
# expected to be extracted from it; every expected value must appear
# verbatim in its input text.
address_schema = Object(
    id="address",
    description="address details",
    attributes=[
        Text(id="name", description="the name of person and organization"),
        Text(id="address_line", description=""),
        Text(id="attn", description=""),
    ],
    examples=[
        (
            "Genesys Telecommunications Laboratories B.V. Gooimeer 6-02 Naarden, 1411 DD Netherlands Attn: ACCOUNTS PAYABLE Accounts.payableEMEA@genesys.com",
            {
                "name": "Genesys Telecommunications Laboratories B.V.",
                "address_line": "Gooimeer 6-02 Naarden, 1411 DD Netherlands",
                "attn": "Attn: ACCOUNTS PAYABLE Accounts.payableEMEA@genesys.com",
            },
        ),
        (
            "Genesys Telecommunications Laboratories Asia Pte Ltd. 9 Raffles Place,#18-02, Republic Plaza Republic Plaza, 048619 Singapore Attn: AP-AU@genesys.com ACCOUNTS PAYABLE",
            {
                "name": "Genesys Telecommunications Laboratories Asia Pte Ltd.",
                "address_line": "9 Raffles Place,#18-02, Republic Plaza Republic Plaza, 048619 Singapore",
                "attn": "Attn: AP-AU@genesys.com ACCOUNTS PAYABLE",
            },
        ),
        (
            "Genesys Laboratories Australasia Pty Ltd Level 20,141 Walker Street North Sydney, NSW 2060 Australia Attn: AP-AU@genesys.com ACCOUNTS PAYABLE",
            {
                "name": "Genesys Laboratories Australasia Pty Ltd",
                # Was "PLevel 20,141 ...", which did not match the input text.
                "address_line": "Level 20,141 Walker Street North Sydney, NSW 2060 Australia",
                "attn": "Attn: AP-AU@genesys.com ACCOUNTS PAYABLE",
            },
        ),
    ],
    many=True,
)

The prompt passed to the GPT model is generated by **Kor** from our object definitions.

In [None]:
# Schema for the line items of a purchase order; many=True because a page
# can list several lines.
# NOTE(review): the sample input texts below contain no price or total
# amounts, although the expected outputs do -- confirm the samples are not
# truncated.
products_schema = Object(
    id="lines",
    description="the details of bill",
    many=True,
    attributes=[
        Number(id="line", description=""),
        Text(
            id="description",
            description="the description of the product or service",
        ),
        Text(id="price", description="price per unit"),
        Text(
            id="total",
            description="the total price, which is number of units * unit_price",
        ),
    ],
    examples=[
        # Line 4: subscription licence.
        (
            "4 EMEA/100446/CUSTOMER: NTT Germany AG & Co. KG/FINAL CUSTOMER: Deutsche Börse AG /WDE Connector for SAP C/4HANA Licenses (subscription)/Payment upon delivery/Dec 1 2023 to Dec 31 2024 Subscription",
            {
                "line": 4,
                "description": "EMEA/100446/CUSTOMER: NTT Germany AG & Co. KG/FINAL CUSTOMER: Deutsche Börse AG /WDE Connector for SAP C/4HANA Licenses (subscription)/Payment upon delivery/Dec 1 2023 to Dec 31 2024 Subscription",
                "price": "10,225.00",
                "total": "10,225.00",
            },
        ),
        # Line 7: CAS renewal.
        (
            "7 EMEA/75440/COVERAGE:01/01/2024 to 07/31/2024/Customer:IREN S.p.A./End User:IREN S.p.A./Standalone CAS - SipVR 75 licenses CAS Renewal",
            {
                "line": 7,
                "description": "EMEA/75440/COVERAGE:01/01/2024 to 07/31/2024/Customer:IREN S.p.A./End User:IREN S.p.A./Standalone CAS - SipVR 75 licenses CAS Renewal",
                "price": "1,838.55",
                "total": "1,838.55",
            },
        ),
    ],
)




Create a purchase order schema containing all previously defined sub-schemas.

In [None]:

# Top-level purchase order schema: a single object that nests the header,
# address and line-item schemas.
po_schema = Object(
    id="po",
    description="extraction of relevant information from purchase order",
    many=False,
    attributes=[header_schema, address_schema, products_schema],
)

# Chain that extracts the whole purchase order (header, addresses, lines).
po_chain = create_extraction_chain(
    llm,
    po_schema,
    encoder_or_encoder_class="json",
    input_formatter=None,
)

# Chain that extracts only the line items.
po_chain_products = create_extraction_chain(
    llm,
    products_schema,
    encoder_or_encoder_class="json",
    input_formatter=None,
)


Load the PDF file from Google Drive and use PDFMiner to extract the raw text. Note: you need to download the invoice to your local drive first.

In [None]:
from pdfminer.layout import LAParams, LTTextBox, LTTextLine
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.high_level import extract_text

def extract_text_by_page(pdf_path):
    """Yield ``(page_number, text)`` for every page of the PDF at *pdf_path*.

    Page numbers start at 1. A page's text is the concatenation, in layout
    order, of ``get_text()`` for each top-level ``LTTextBox`` or
    ``LTTextLine`` object found in the page layout.
    """
    with open(pdf_path, 'rb') as pdf_file:
        resource_manager = PDFResourceManager()
        aggregator = PDFPageAggregator(resource_manager, laparams=LAParams())
        page_interpreter = PDFPageInterpreter(resource_manager, aggregator)

        for page_num, page in enumerate(PDFPage.get_pages(pdf_file), start=1):
            page_interpreter.process_page(page)
            # The aggregator holds the layout of the page just processed.
            page_layout = aggregator.get_result()
            page_text = "".join(
                element.get_text()
                for element in page_layout
                if isinstance(element, (LTTextBox, LTTextLine))
            )
            yield page_num, page_text

def extract_whole_text(pdf_path):
    """Return the text of the PDF at *pdf_path* with each newline replaced by a space."""
    raw_text = extract_text(pdf_path)
    return raw_text.replace("\n", " ")



In [None]:
import json, re

def parse_page(page_num, page):
    """Extract structured purchase-order data from the text of one page.

    Page 1 is run through ``po_chain`` (the full purchase order schema);
    every other page is run through ``po_chain_products`` (line items only).

    Args:
        page_num: Page number; ``1`` selects the full-schema chain.
        page: Raw text of the page.

    Returns:
        For page 1, the ``['data']['po']`` entry of the chain result;
        otherwise the ``['data']['lines']`` entry.

    Raises:
        ValueError: If the result for a page other than page 1 has no
            ``lines`` entry.
    """
    # To inspect the prompt Kor builds from the schema, uncomment:
    ## print(po_chain.prompt.format_prompt(text=page).to_string())
    print(f"PAGE# {page_num}")
    if page_num == 1:
        parse_result_header = po_chain.run(text=page)
        return parse_result_header['data']['po']

    parse_result_lines = po_chain_products.run(text=page)
    lines = parse_result_lines['data'].get('lines')
    if lines is None:
        # Raise a real exception: raising a plain string is itself a
        # TypeError and loses this message.
        # Possible fallback (not implemented): recover the JSON found between
        # <json>...</json> tags in parse_result_lines['raw'].
        raise ValueError(f"error parsing page {page_num}")
    return lines

def parse_document(file_path):
    """Yield the parsed result of each page of the PDF at *file_path*.

    Each page gets up to two parsing attempts. A failed attempt prints the
    error; a page that fails both attempts yields nothing and processing
    moves on to the next page.
    """
    max_attempts = 2
    for page_num, page in extract_text_by_page(file_path):
        for _attempt in range(max_attempts):
            try:
                yield parse_page(page_num, page)
                break  # page parsed, no further attempts needed
            except Exception as e:
                print( f"Error parsing page {page_num}: {e}" )

        
# PDF files to parse, given relative to ../docs/PO/.
file_paths = ['PO_63466.pdf']

for file_path in file_paths:
    print(f"Parsing file: {file_path}")
    # Print each page's result as indented JSON, followed by its Python type.
    for result in parse_document(f"../docs/PO/{file_path}"):
        print(json.dumps(result, indent=4), f"\n{type(result)}")

    # Separator between files.
    print("-----------------------------------\n\n")




