[Invoice or Bill Custom Parsing using Kor (LangChain Extension), Generative Language Models & Prompt Engineering](https://blog.gopenai.com/invoice-or-bill-custom-parsing-using-kor-langchain-extension-generative-language-models-prompt-7133193358fa)

Initialize the ChatOpenAI model.

In [None]:
import os

# Switch between an Azure OpenAI deployment (True) and the public OpenAI API (False).
__USE_AZURE_OPENAI = True

if __USE_AZURE_OPENAI:
    import openai
    from langchain.chat_models import AzureChatOpenAI

    # Azure OpenAI chat model; the key and API version are read from the environment.
    # temperature=0 matches the OpenAI branch below: field extraction should be
    # repeatable from run to run rather than sampled.
    llm = AzureChatOpenAI(
        deployment_name="document-parser-16k",
        temperature=0,
        openai_api_key=os.getenv("AZURE_OPENAI_API_KEY"),
        openai_api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
    )
else:
    from langchain.chat_models import ChatOpenAI

    # GPT-3.5 chat model; rstrip() drops trailing whitespace from the key value.
    llm = ChatOpenAI(
        model_name="gpt-3.5-turbo",
        temperature=0,
        max_tokens=2000,
        openai_api_key=os.environ["OPENAI_API_KEY"].rstrip(),
    )



The next step is to create the schema and provide examples.

In [None]:
# import necessary packages from Kor
from kor.extraction import create_extraction_chain
from kor.nodes import Object, Text, Number

# Schema for the purchase-order header fields.
# Every example is an (input text, expected extraction) pair, so the expected
# value has to come from that same input text; an unrelated value would teach
# the model to invent data.
header_schema = Object(
    id="header",
    description="header details",
    attributes=[
        Text(
            id="number",
            description="unique number (identifier) of given purchase order",
            examples=[
                ("PO NUMBER 12345", "12345"),
            ],
        ),
        Text(
            id="quote",
            description="quote number",
            examples=[
                ("QUOTE # 1-1604415497 (Bosch)", "1-1604415497 (Bosch)"),
            ],
        ),
        Text(
            id="date",
            description="date",
        ),
        Text(
            id="payment_terms",
            description="get payment terms",
            # The label is stripped, in line with the "number" and "quote" examples.
            examples=[
                ("PAYMENT TERMS Net 45", "Net 45"),
                ("PAYMENT TERMS Net 30", "Net 30"),
            ],
        ),
        Text(
            id="shipping_terms",
            description="get shipping terms",
        ),
        Text(
            id="currency",
            description="currency",
            examples=[
                ("CURRENCY EUR", "EUR"),
                ("CURRENCY USD", "USD"),
            ],
        ),
        Text(
            id="contract",
            description="contract",
        ),
        Text(
            id="contact",
            description="primary contact",
            # Each contact is extracted exactly as it appears in the text.
            examples=[
                (
                    "Racel Madamba  Racel_Rey.Madamba@genesys.com",
                    "Racel Madamba  Racel_Rey.Madamba@genesys.com",
                ),
                (
                    "Claire Wood claire.wood@genesys.com",
                    "Claire Wood claire.wood@genesys.com",
                ),
            ],
        ),
        # NOTE(review): the description says "Softphone identifier" for a field
        # named vendor_id -- confirm the wording is intended.
        Number(
            id="vendor_id",
            description="Softphone identifier",
        ),
    ],
    many=False,
)

In [None]:
# Schema for the address blocks of a purchase order (many=True: several
# addresses may be extracted). Each example pairs a raw address block with the
# expected split into name / address_line / attn; every expected value is a
# verbatim slice of the example text.
address_schema = Object(
    id="address",
    description="address details",
    attributes=[
        Text(id="name", description="the name of person and organization"),
        Text(id="address_line", description=""),
        Text(id="attn", description=""),
    ],
    examples=[
        (
            "Genesys Telecommunications Laboratories B.V. Gooimeer 6-02 Naarden, 1411 DD Netherlands Attn: ACCOUNTS PAYABLE Accounts.payableEMEA@genesys.com",
            {
                "name": "Genesys Telecommunications Laboratories B.V.",
                "address_line": "Gooimeer 6-02 Naarden, 1411 DD Netherlands",
                "attn": "Attn: ACCOUNTS PAYABLE Accounts.payableEMEA@genesys.com",
            },
        ),
        (
            "Genesys Telecommunications Laboratories Asia Pte Ltd. 9 Raffles Place,#18-02, Republic Plaza Republic Plaza, 048619 Singapore Attn: AP-AU@genesys.com ACCOUNTS PAYABLE",
            {
                "name": "Genesys Telecommunications Laboratories Asia Pte Ltd.",
                "address_line": "9 Raffles Place,#18-02, Republic Plaza Republic Plaza, 048619 Singapore",
                "attn": "Attn: AP-AU@genesys.com ACCOUNTS PAYABLE",
            },
        ),
        (
            "Genesys Laboratories Australasia Pty Ltd Level 20,141 Walker Street North Sydney, NSW 2060 Australia Attn: AP-AU@genesys.com ACCOUNTS PAYABLE",
            {
                "name": "Genesys Laboratories Australasia Pty Ltd",
                # Must match the input text exactly ("Level 20,...").
                "address_line": "Level 20,141 Walker Street North Sydney, NSW 2060 Australia",
                "attn": "Attn: AP-AU@genesys.com ACCOUNTS PAYABLE",
            },
        ),
    ],
    many=True,
)

The prompt is generated by **Kor** from our object definitions and passed to the GPT model.

In [None]:
# Descriptions used by the few-shot examples. Each example input is the line
# number followed by a space and the description, so the text is written once.
_subscription_description = "EMEA/100446/CUSTOMER: NTT Germany AG & Co. KG/FINAL CUSTOMER: Deutsche Börse AG /WDE Connector for SAP C/4HANA Licenses (subscription)/Payment upon delivery/Dec 1 2023 to Dec 31 2024 Subscription"
_renewal_description = "EMEA/75440/COVERAGE:01/01/2024 to 07/31/2024/Customer:IREN S.p.A./End User:IREN S.p.A./Standalone CAS - SipVR 75 licenses CAS Renewal"

# Fields extracted for every order line.
_line_attributes = [
    Number(id="line", description=""),
    Text(id="description", description="the description of the product or service"),
    Text(id="price", description="price per unit"),
    Text(id="total", description="the total price, which is number of units * unit_price"),
]

# NOTE(review): the example inputs contain no price, yet the expected outputs
# carry "price"/"total" values -- confirm this is intended.
_line_examples = [
    (
        "4 " + _subscription_description,
        {
            "line": 4,
            "description": _subscription_description,
            "price": "10,225.00",
            "total": "10,225.00",
        },
    ),
    (
        "7 " + _renewal_description,
        {
            "line": 7,
            "description": _renewal_description,
            "price": "1,838.55",
            "total": "1,838.55",
        },
    ),
]

# Schema for the order lines of a purchase order (many=True: one entry per line).
products_schema = Object(
    id="lines",
    description="the details of bill",
    attributes=_line_attributes,
    examples=_line_examples,
    many=True,
)




Create a Purchase Order schema containing all previously defined sub-schemas.

In [None]:

# Top-level purchase-order schema: header, addresses and order lines together.
po_schema = Object(
    id="po",
    description="extraction of relevant information from purchase order",
    attributes=[header_schema, address_schema, products_schema],
    many=False,
)

# Wrapper object around the order-line schema (no chain below is built from it).
po_lines = Object(
    id="lines",
    description="extraction order lines from purchase order",
    attributes=[products_schema],
    many=False,
)

# Both chains use the same encoder settings.
_chain_options = {"encoder_or_encoder_class": "json", "input_formatter": None}

# Chain built from the full purchase-order schema.
po_chain = create_extraction_chain(llm, po_schema, **_chain_options)
# Chain built from the order-line schema only.
po_chain_products = create_extraction_chain(llm, products_schema, **_chain_options)


Load the PDF file from Google Drive and use PDFMiner to extract the raw text. Note: you need to download the invoice to your local drive first.

In [None]:
from pdfminer.layout import LAParams, LTTextBox, LTTextLine
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.high_level import extract_text as pdf_extract_text

def pdf_extract_text_by_page(pdf_path):
    """Yield ``(page_number, text)`` for every page of the PDF at *pdf_path*.

    Page numbers start at 1. The text of a page is the concatenation of the
    text of its ``LTTextBox`` / ``LTTextLine`` layout objects, in layout order.
    The file stays open while the generator is being consumed.
    """
    text_types = (LTTextBox, LTTextLine)
    with open(pdf_path, 'rb') as pdf_file:
        resource_manager = PDFResourceManager()
        aggregator = PDFPageAggregator(resource_manager, laparams=LAParams())
        page_interpreter = PDFPageInterpreter(resource_manager, aggregator)

        for page_num, page in enumerate(PDFPage.get_pages(pdf_file), start=1):
            page_interpreter.process_page(page)
            page_text = ''.join(
                item.get_text()
                for item in aggregator.get_result()
                if isinstance(item, text_types)
            )
            yield page_num, page_text

def pdf_extract_whole_text(pdf_path):
    """Return the text of the whole PDF with every newline turned into a space."""
    raw_text = pdf_extract_text(pdf_path)
    return raw_text.replace("\n", " ")

def pdf_parse_page(page_num, page):
    """Run the extraction chain that fits the page and return the parsed data.

    Page 1 goes through ``po_chain`` and yields the ``'po'`` entry of the
    chain result; every other page goes through ``po_chain_products`` and
    yields the ``'lines'`` entry.

    Args:
        page_num: 1-based page number.
        page: text of the page.

    Returns:
        The value stored under ``'po'`` (page 1) or ``'lines'`` (other pages)
        in the ``'data'`` part of the chain result.

    Raises:
        RuntimeError: if the expected entry is missing or ``None``, for any
            page. Page 1 previously surfaced a bare ``KeyError``.
    """
    # To inspect the prompt Kor generates from the schema:
    ## print(po_chain.prompt.format_prompt(text=page).to_string())
    print( f"PAGE# {page_num}" )
    if page_num == 1:
        chain, result_key = po_chain, 'po'
    else:
        chain, result_key = po_chain_products, 'lines'

    parsed = chain.run(text=page)['data'].get(result_key)
    if parsed is None:
        raise RuntimeError(f"error parsing page {page_num}")
    return parsed

def pdf_parse_document(file_path):
    """Yield the parsed result of each page of the PDF at *file_path*.

    A page whose parsing raises is reported with ``print`` and skipped, so
    the remaining pages are still processed.
    """
    for number, content in pdf_extract_text_by_page(file_path):
        try:
            yield pdf_parse_page(number, content)
        except Exception as err:
            # Best effort: report the failure and move on to the next page.
            print( err )


## Extract text from HTML

In [None]:
from bs4 import BeautifulSoup
import json 

def html_parse_document( file_path ):
    """Extract the visible text of an HTML file and run ``po_chain`` on it.

    Args:
        file_path: path of the HTML document to parse.

    Returns:
        The result of ``po_chain.run`` for the cleaned-up text.
    """
    # Read the document named by the caller.
    with open(file_path, 'r') as f:
        html = f.read()

    soup = BeautifulSoup(html, features="html.parser")

    # kill all script and style elements
    for script in soup(["script", "style"]):
        script.extract()    # rip it out

    # get text
    text = soup.get_text()

    # break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in text.splitlines())
    # break multi-headlines into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # drop blank lines
    text = '\n'.join(chunk for chunk in chunks if chunk)

    return po_chain.run(text=text)

# Same document the function used to open through its hard-coded path.
result = html_parse_document( '../docs/PO/PO_63466.html' )

print( json.dumps(result,indent=4) )


In [None]:
import json
        
# Purchase-order PDFs to parse, looked up under ../docs/PO/.
file_paths = ['PO_63466.pdf', 'PO_63563.pdf']
separator = "-----------------------------------\n\n"

for file_path in file_paths:
    print(f"Parsing file: {file_path}")
    pdf_location = f"../docs/PO/{file_path}"
    # One result per successfully parsed page, printed with its Python type.
    for result in pdf_parse_document(pdf_location):
        rendered = json.dumps(result, indent=4)
        print(rendered, f"\n{type(result)}")

    print(separator)






## Parse PO line description

In [None]:
from langchain_core.prompts import ChatPromptTemplate
from langchain.schema import AIMessage, HumanMessage, SystemMessage
from langchain_core.output_parsers import StrOutputParser
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field
import json

# Fixed system instruction for the description-parsing chain.
system = SystemMessage(content="You are an AI assistant that helps find valuable business information from the purchase order")

# Fixed human message describing the expected reply. It is a plain message,
# not a template, so the braces below are sent literally. The members are
# comma-separated so the shape shown to the model is the shape of a JSON
# object, which is what the reply is parsed as.
human = HumanMessage(content="""
extract all possible informations from  the provided description

I need result formatted as

{
"region": <end customer region>,
"service": <service type. example 'CAS Renewal' or 'Subscription'>,
"service_period": <service time period>,
"service_start": <the start period of the service>,
"service_end": <the end period of the service>,
"customer": <the customer name>,
"end_user": <the end user, null if not presented>,
"product": <the refereed product, null if not presented>,
"license": <the licence information usually related to a number of concurrent or seats, null if not presented>,
"note": <the service note>,
"other": <all information not recognized>
}

DESCRIPTION:

""")


# The only templated part: the description text is appended after the two
# fixed messages.
prompt = ( system + human + "{description}")

# Sample PO line descriptions, each wrapped as {"description": text}.
inputs = [
    {"description": raw_description}
    for raw_description in (
        """
EMEA/94744/CUSTOMER: H&M Hennes & Mauritz GBC AB/RESOURCE: N/A/ 5 concurrent (2nd year – expected from 06-Feb-24 to 05-Feb-25) : Milestone 1 after 17 months from the Subscription start
""",
        """
EMEA/69411/COVERAGE:01/01/2024 to 12/31/2024/Customer:IT Core SPA/End User:Baxi SPA/Workspace Connector for MSFT USD
CAS Renewal
""",
        """
EMEA/ID 110582 (1-1705570257)/COVERAGE:01/01/2024 to 12/31/2024/Customer:IT Core SPA/End User:Baxi SPA
CAS Renewal
""",
    )
]

# Pydantic model passed to JsonOutputParser as its pydantic_object.
# It declares two string fields: region and service.
class DescInfo(BaseModel):
    # Region of the end customer.
    region: str = Field(description="end customer region")
    # Type of service named in the description.
    service: str = Field(description="service type could be CAS or Subscrition")

# Parse the model reply as JSON so it can be pretty-printed below.
output_parser = JsonOutputParser(pydantic_object=DescInfo)

# Alternative parser that returns the reply as plain text:
# output_parser = StrOutputParser()

# prompt -> model -> parser
chain = prompt | llm | output_parser

# The loop variable is deliberately not called "input": at notebook scope that
# would rebind the builtin input() for every later cell.
for sample in inputs:
    print( sample["description"] )
    print( "-----------------------")
    print( json.dumps(chain.invoke(sample), indent=4) )
    print( "-----------------------")