[Invoice or Bill Custom Parsing using Kor (LangChain Extension), Generative Language Models & Prompt Engineering](https://blog.gopenai.com/invoice-or-bill-custom-parsing-using-kor-langchain-extension-generative-language-models-prompt-7133193358fa)

Iinitialize ChatOpenAI model.

In [23]:
import os

__USE_AZURE_OPENAI = True

if __USE_AZURE_OPENAI:
    import openai
    from langchain.chat_models import AzureChatOpenAI

    # Configure OpenAI API
    llm = AzureChatOpenAI(deployment_name="Document-Parsing", 
                        openai_api_key= os.getenv("AZURE_OPENAI_API_KEY"),
                        openai_api_version="2023-06-01-preview")
else:
    from langchain.chat_models import ChatOpenAI

    #load GPT 3.5 model
    llm = ChatOpenAI(
        model_name="gpt-3.5-turbo",
        temperature=0,
        max_tokens=2000,
        openai_api_key=os.environ["OPENAI_API_KEY"].rstrip()
    )



The next step is to create schema and provide examples

In [24]:
# import neccessary packages from korr
from kor.extraction import create_extraction_chain
from kor.nodes import Object, Text, Number

#object creation for invoice_number
header_schema = Object(
    id="header",
    description="header details",
    attributes=[
        Text(
            id="number",
            description= "unique number (identifier) of given purchase order",
        examples=[
            ( "PO NUMBER 12345", "PO NUMBER 35658")
        ]),
        Text(
            id="quote",
            description= "quote number",
        examples=[
            ( "QUOTE # 1-1604415497 (Bosch)", "QUOTE # \"1-1708903259, 109815 (1-1611484010), 108832 (1-1609457261), 108878\"")
        ]),
        Text(
            id="date",
            description= "date",
        ),
        Text(
            id="payment_terms",
            description= "get payment terms",
        examples=[
            ( "PAYMENT TERMS Net 45", "PAYMENT TERMS Net 30")
        ]),
        Text(
            id="shipping_terms",
            description= "get shipping terms",
        ),
        Text(
            id="currency",
            description= "currency",
        examples=[
            ( "CURRENCY EUR", "CURRENCY USD")
        ]),
        Text(
            id="contract",
            description= "contract",
        ),
        Text(
            id="contact",
            description= "primary contact",
            examples=[
                ("Racel Madamba  Racel_Rey.Madamba@genesys.com", 
                "Claire Wood claire.wood@genesys.com")
            ]
        ),
    ],

    many=False,
)

In [25]:
address_schema = Object(
    id="address",
    description="address details",
    attributes=[
        Text(id="name", description="the name of person and organization"),
        Text(id="address_line", description=""),
        Text(id="attn", description=""),
    ],
    examples=[
        (
            "Genesys Telecommunications Laboratories B.V. Gooimeer 6-02 Naarden, 1411 DD Netherlands Attn: ACCOUNTS PAYABLE Accounts.payableEMEA@genesys.com",
            {
                "name": "Genesys Telecommunications Laboratories B.V.",
                "address_line": "Gooimeer 6-02 Naarden, 1411 DD Netherlands",
                "attn": "Attn: ACCOUNTS PAYABLE Accounts.payableEMEA@genesys.com",
            },
        ),
        (
            "Genesys Telecommunications Laboratories Asia Pte Ltd. 9 Raffles Place,#18-02, Republic Plaza Republic Plaza, 048619 Singapore Attn: AP-AU@genesys.com ACCOUNTS PAYABLE",
            {
                "name": "Genesys Telecommunications Laboratories Asia Pte Ltd.",
                "address_line": "9 Raffles Place,#18-02, Republic Plaza Republic Plaza, 048619 Singapore",
                "attn": "Attn: AP-AU@genesys.com ACCOUNTS PAYABLE",
            },
        ),
        
       (
            "Genesys Laboratories Australasia Pty Ltd Level 20,141 Walker Street North Sydney, NSW 2060 Australia Attn: AP-AU@genesys.com ACCOUNTS PAYABLE",
            {
                "name": "Genesys Laboratories Australasia Pty Ltd",
                "address_line": "PLevel 20,141 Walker Street North Sydney, NSW 2060 Australia",
                "attn": "Attn: AP-AU@genesys.com ACCOUNTS PAYABLE",
            },
        ) 
    ],
    many=True,
) 

prompt generated by **Kor** from our object definition to pass it to the GPT model.

In [26]:
products_schema = Object(
    id="lines",
    description="the details of bill",
    attributes=[
        Number(id="line", description=""),
        Text(id="description", description="the description of the product or service"),
        Text(id="description2", description="extra description of the product or service"),
        Text(id="price", description="price per unit"),
        Text(id="total", description="the total price, which is number of units * unit_price"),
    ],
    examples=[
        (
            "1 1-1708903259_CAS Renewal_01/01/2020 to 30/06/2021_NXO 2,789.50 2,789.50 FRANCE_La Poste_Solution Name: WDE Plugin for MS Dynamics CRM - add-on 130 seats",
            {
               "line": 1, 
               "description": "1-1708903259_CAS Renewal_01/01/2020 to 30/06/2021_NXO",
               "description2": "FRANCE_La Poste_Solution Name: WDE Plugin for MS Dynamics CRM - add-on 130 seats",
               "price": "2,789.50", 
               "total": "2,789.50",
            },
        ),
        (
            "4 108878_CAS Renewal_01/01/2020 to 30/06/2021_NXO 10,225.00 10,225.00 FRANCE_La Poste_Solution Name:MSCRM Dynamics Connector (730 seats) ",
            {
               "line": 4,
               "description": "108878_CAS Renewal_01/01/2020 to 30/06/2021_NXO",
               "description2": "FRANCE_La Poste_Solution Name:MSCRM Dynamics Connector (730 seats)",
               "price": "10,225.00", 
               "total": "10,225.00",
            },
        ),
        (
            "6 78822 (73377)_Premise CAS Renewal_01/01/2022 to 1,838.55 1,838.55 12/31/2022_IREN S.p.A._IREN S.p.A._SipVR Licensing - additional 50 licenses CAS Renewal ",
            {
               "line": 6,
               "description": "78822 (73377)_Premise CAS Renewal_01/01/2022 to",
               "description2": "12/31/2022_IREN S.p.A._IREN S.p.A._SipVR Licensing - additional 50 licenses CAS Renewal 1,838.55 1,838.55",
               "price": "1,838.55", 
               "total": "1,838.55",
            },
        ),
    ],
    many=True
)




Create a Purchase Order schema containing all previous defined sub schema 

In [27]:

po_schema = Object(
    id="po",
    description="extraction of relevant information from purchase order",
    attributes=[
        header_schema,
        address_schema,
        products_schema
        
    ],
    many=False,
)

po_chain = create_extraction_chain(llm, po_schema, encoder_or_encoder_class="json", input_formatter=None)
po_chain_products = create_extraction_chain(llm, products_schema, encoder_or_encoder_class="json", input_formatter=None)


Load PDF file from google drive and use PDF Miner to extract raw text. Note: you need to download invoice to your local drive.

In [28]:
from pdfminer.layout import LAParams, LTTextBox, LTTextLine
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.high_level import extract_text

def extract_text_by_page(pdf_path):
    with open(pdf_path, 'rb') as fp:
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        pages = PDFPage.get_pages(fp)

        for page_num, page in enumerate(pages, start=1):
            interpreter.process_page(page)
            layout = device.get_result()
            text = ''
            for lt_obj in layout:
                if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine):
                    text += lt_obj.get_text()
            yield page_num, text

def extract_whole_text(pdf_path):
    text = extract_text(pdf_path)
    # perform basic processing to remove \n
    return " ".join(text.split("\n")) 



In [29]:
import json

def parse_document(file_path):
    for page_num, page in extract_text_by_page(file_path):
        # Print the prompt  generated by Kor from our object definition to pass it to the GPT model.
        ## print(po_chain.prompt.format_prompt(text=page).to_string())
        print( f"PAGE# {page_num}" )
        if page_num == 1:
            parse_result_header = po_chain.run(text=page)
            print( json.dumps(parse_result_header['data']['po'], indent=4) )
        else:
            parse_result_lines = po_chain_products.run(text=page)
            if parse_result_lines['data'].get('lines') is not None:
                print( json.dumps(parse_result_lines['data']['lines'], indent=4) )
            else:
                print( parse_result_lines )
        
file_paths = ['2023_12_04_PO_62956_H&M_replace_57579_GENNAIO_2024.pdf']

for file_path in file_paths:
    print( f"Parsing file: {file_path}" )
    parse_document( f"../docs/PO/{file_path}")
    print( "-----------------------------------\n\n" )






Parsing file: 2023_12_04_PO_62956_H&M_replace_57579_GENNAIO_2024.pdf
PAGE# 1
{
    "header": {
        "number": "62956",
        "quote": "QUOTE # \"92689\"",
        "date": "12/04/2023",
        "payment_terms": "PAYMENT TERMS Net 45",
        "shipping_terms": "SHIPPING TERMS",
        "currency": "CURRENCY EUR",
        "contract": "CONTRACT",
        "contact": "Rick Estrellado Rick.Estrellado@genesys.com"
    },
    "address": [
        {
            "name": "Genesys Cloud Services B.V.",
            "address_line": "Prins Bernhardplein 200 Amsterdam, 1097JB Netherlands",
            "attn": "Attn: Maikel Beerens"
        }
    ],
    "lines": [
        {
            "line": 1,
            "description": "EMEA/94744/CUSTOMER: H&M Hennes & Mauritz GBC AB/RESOURCE: N/A/ 5 concurrent (1st year \u2013 expected from 06-Feb-23 to 05-Feb-24) : Milestone 1 after 5 months from the Subscription start Subscription",
            "description2": "",
            "price": "420.00",
           