Objectives

- Dataset
    - use dummy data and complete 1 round -> train + eval
    - use better competitive/challenging datasets - below is for inspiration
        - https://colab.research.google.com/drive/1o8-Km-kVHtOcdEd7i9uPY6vXg7MyCN0B?usp=sharing

        - https://medium.com/analytics-vidhya/extracting-structured-data-from-invoice-96cf5e548e40
        
        - https://www.kaggle.com/discussions/general/263960

- Extraction
    - simple extraction chain
    - better prompt engg

- Model training

- Eval

Steps
1. create documents
2. define the desired extracted output schema
3. construct prompt
4. define llm
5. define output parser
6. construct chain
7. batch invoke
8. visualize data
9. save as .csv

In [None]:
# ! pip install -qU "amazon-textract-caller>=0.2.0" amazon-textract-textractor

### create documents

In [1]:
# test for one file

from langchain_community.document_loaders import AmazonTextractPDFLoader
from textractor.data.text_linearization_config import TextLinearizationConfig

# loader = AmazonTextractPDFLoader(
#     file_path="data/invoice2.pdf",
#     textract_features=["LAYOUT"],
#     linearization_config=TextLinearizationConfig(
#         hide_header_layout=True,
#         hide_footer_layout=True,
#         hide_figure_layout=True,
#     ),
# )

# docs = loader.load()
# print(docs[0].page_content)

In [2]:
%%time
# for all pdfs in a folder

import os
invoices_dir_path = "data/"
files = []
for file in os.listdir(invoices_dir_path):
    file_path = os.path.join(invoices_dir_path, file)
    files.append(file_path)
print(f"invoices_dir_path: {invoices_dir_path}\nfiles_count: {len(files)}")


docs = []
error_files = []
for file in files:
    try:
        loader = AmazonTextractPDFLoader(
            file_path=file,
            textract_features=["LAYOUT"],
            linearization_config=TextLinearizationConfig(
                hide_header_layout=True,
                hide_footer_layout=True,
                hide_figure_layout=True,
            ),
        )
        file_docs = loader.load()
        docs.extend(file_docs)
    except Exception as e:
        print(f"Failed for file: {file}")
        print(f"error: {e}")
        error_files.append(file)

print(f"error_files count: {len(error_files)}")
print(f"docs completed count: {len(docs)}")

invoices_dir_path: data/
files_count: 7
error_files count: 0
docs completed count: 7
CPU times: user 581 ms, sys: 45.8 ms, total: 627 ms
Wall time: 25.1 s


### define schema

In [12]:
from typing import Optional, List
from langchain_core.pydantic_v1 import BaseModel, Field


# INVOICE_NUMBER, INVOICE_DATE, ACCOUNT_NUMBER, PO_NUMBER, VENDOR_NAME, COMPANY_NAME
# REMIT_TO_ADDRESS: [STREET, CITY, STATE, COUNTRY, ZIP, OTHERS]
# SHIP_TO_ADDRESS: [STREET, CITY, STATE, COUNTRY, ZIP, OTHERS]
# NET_AMOUNT, FREIGHT_AMOUNT, MISCELLANEOUS_CHARGES, PICKUP_CHARGES, SAMPLES_CHARGES, OTHER_CHARGES, INVOICE_AMOUNT, CURRENCY

# class Invoice(BaseModel):
#     """Information about an invoice document"""
#     invoice_number: Optional[str] = Field(..., description="The unique number code of invoice")
#     invoice_date: Optional[str] = Field(..., description="Date of invoice issue. Extract in this form %Year-%Month-%Day")
#     company_name:  Optional[str] = Field(..., description="Name of the company")
#     remit_to_address: Optional[str] = Field(..., description="Remittance address to send invoice or payment instructions")
#     ship_to_address: Optional[str] = Field(..., description="Delivery Address to send the items/products mentioned in invoice")
#     invoice_amount:  Optional[str] = Field(..., description="the total sum of money specified on an invoice including GST, shipping etc. Extract only the number no other symbol")
#     currency:  Optional[str] = Field(..., description="he currency in which each item/product will be invoiced and paid")

class Address(BaseModel):
    street: Optional[str] = Field(..., description="Details of street")
    city: Optional[str] = Field(..., description="Name of city")
    state: Optional[str] = Field(..., description="Name of state")
    country: Optional[str] = Field(..., description="Name of the country")
    zip_code: Optional[str] = Field(..., description="zip code details")
    others:  Optional[str] = Field(..., description="Everything else that's part of address like phone number, etc")

class LineItem(BaseModel):
    """Information about a line item in an invoice"""
    name:  Optional[str] = Field(..., description="The name of the line item")
    description:  Optional[str] = Field(..., description="The description of the line item")
    quantity:  Optional[str] = Field(..., description="Unit quantity or count of the line item ordered")
    unit_price: Optional[str] = Field(..., description="Unit price of single line item")
    total_amount:   Optional[str] = Field(..., description="Total amount. Its the product of unit price and quantity")
    

class Invoice(BaseModel):
    """Information about an invoice document"""
    invoice_number: Optional[str] = Field(..., description="The unique number code of invoice")
    invoice_date: Optional[str] = Field(..., description="Date of invoice issued")
    company_name:  Optional[str] = Field(..., description="Name of the company")
    remit_to_address: Optional[Address] = Field(..., description="Remittance address to send invoice or payment instructions")
    ship_to_address: Optional[Address] = Field(..., description="Delivery Address to send the items/products mentioned in invoice")
    invoice_amount:  Optional[str] = Field(..., description="the total sum of money specified on an invoice including GST, shipping etc. Extract only the number no other symbol")
    currency:  Optional[str] = Field(..., description="he currency in which each item/product will be invoiced and paid")
    items: List[LineItem]

### construct prompt

In [13]:
from langchain_core.prompts import ChatPromptTemplate

prompt = ChatPromptTemplate.from_messages([
    (
        "system", 
        "You are an expert at extracting relevant information from text. "
        "If you do not know the value of an attribute asked to extract, "
        "return null for the attribute's value."
        "\n{format_instructions}",
    ),
    (
        "human",
        "Make sure to wrap the answer in ```json and ``` tags. \n"
        "{text}",
    )
])

prompt.pretty_print()


You are an expert at extracting relevant information from text. If you do not know the value of an attribute asked to extract, return null for the attribute's value.
[33;1m[1;3m{format_instructions}[0m


Make sure to wrap the answer in ```json and ``` tags. 
[33;1m[1;3m{text}[0m


### define llm

In [14]:
import boto3
from langchain_community.chat_models import BedrockChat

llm = BedrockChat(
    model_id="anthropic.claude-3-haiku-20240307-v1:0",
    # model_id="anthropic.claude-3-sonnet-20240229-v1:0",
    client=boto3.client("bedrock-runtime"),
    model_kwargs={"temperature": 0.0, "max_tokens":512}
)

### define output parser

In [15]:
# 2. c. define the output parser
from langchain.output_parsers import PydanticOutputParser

output_parser = PydanticOutputParser(pydantic_object=Invoice)
print(output_parser.get_format_instructions())

prompt = prompt.partial(format_instructions=output_parser.get_format_instructions())
# prompt.pretty_print()

The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}
the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.

Here is the output schema:
```
{"description": "Information about an invoice document", "properties": {"invoice_number": {"title": "Invoice Number", "description": "The unique number code of invoice", "type": "string"}, "invoice_date": {"title": "Invoice Date", "description": "Date of invoice issued", "type": "string"}, "company_name": {"title": "Company Name", "description": "Name of the company", "type": "string"}, "remit_to_address": {"title": "Remit To Address", "description": "Remittance address to send invoice or payment instructions", "allOf": [{"$ref": "#/definition

### construct chain

In [16]:
extract_chain = prompt | llm | output_parser

### batch invoke

#### test for one file

In [17]:
# test for one file
from langchain_community.document_loaders import AmazonTextractPDFLoader
from textractor.data.text_linearization_config import TextLinearizationConfig

# file_path="data/invoice1.pdf"
file_path="data/invoice2.pdf"

loader = AmazonTextractPDFLoader(
    file_path=file_path,
    textract_features=["LAYOUT"],
    linearization_config=TextLinearizationConfig(
        hide_header_layout=True,
        hide_footer_layout=True,
        hide_figure_layout=True,
    ),
)

file_docs = loader.load()

In [19]:
%%time
text = file_docs[0].page_content
response = extract_chain.invoke({"text": text})
response = extract_chain.invoke({"text": text})
response = extract_chain.invoke({"text": text})

OutputParserException: Failed to parse Invoice from completion {'invoice_number': 'CI-19-00017', 'invoice_date': '09/24/2019', 'company_name': 'Release agency', 'remit_to_address': {'street': '2020 Gnatty Creek Road', 'city': 'New York', 'state': None, 'country': 'United States', 'zip_code': '11716', 'others': None}, 'ship_to_address': {'street': '3247 Apple Lane', 'city': 'Dunlap', 'state': None, 'country': 'United States', 'zip_code': '61525', 'others': 'Company ID 5 VAT ID'}, 'invoice_amount': '19819.80', 'currency': 'EUR', 'items': [{'name': 'Creative work', 'description': None, 'quantity': '8.00', 'unit_price': '90.00', 'total_amount': '720.00'}, {'name': 'Account management', 'description': None, 'quantity': '10.00', 'unit_price': '70.00', 'total_amount': '700.00'}, {'name': 'Production services', 'description': None, 'quantity': '7.00', 'unit_price': '80.00', 'total_amount': '560.00'}, {'name': 'External development', 'description': None, 'quantity': '1.00', 'unit_price': '1500.00', 'total_amount': '1500.00'}, {'name': 'Other services', 'description': None, 'quantity': '6.00', 'unit_price': '2000.00', 'total_amount': '12000.00'}, {}]}. Got: 5 validation errors for Invoice
items -> 5 -> name
  field required (type=value_error.missing)
items -> 5 -> description
  field required (type=value_error.missing)
items -> 5 -> quantity
  field required (type=value_error.missing)
items -> 5 -> unit_price
  field required (type=value_error.missing)
items -> 5 -> total_amount
  field required (type=value_error.missing)

In [20]:
response

NameError: name 'response' is not defined

#### for all

In [21]:
%%time

extractions = extract_chain.batch(
    [{"text": doc.page_content} for doc in docs],
    {"max_concurrency": 5},  # limit the concurrency by passing max concurrency!
    return_exceptions=True,
)

CPU times: user 158 ms, sys: 0 ns, total: 158 ms
Wall time: 7.03 s


In [22]:
extractions

[Invoice(invoice_number='181000001348', invoice_date='18-Apr-2019', company_name='Ace Mobile Manufacturer Pvt Ltd', remit_to_address=Address(street='B-209, Park Plaza', city='Lucknow', state='Uttar Pradesh', country='India', zip_code=None, others='GSTIN/UIN: 09AABCS1429B1ZS'), ship_to_address=Address(street='Krishana Palace Road', city='Faizabad', state='Uttar Pradesh', country='India', zip_code=None, others='GSTIN/UIN : 09AAGCA1654H1ZQ'), invoice_amount='96,32,000.00', currency='INR', items=[LineItem(name='Ace A1-Smartphone', description='Batch : Batch 1', quantity='500 Nos', unit_price='6,000.00', total_amount='30,00,000.00'), LineItem(name='Ace A1 Plus-Smartphone', description='Batch : Batch 01', quantity='800 Nos', unit_price='7,000.00', total_amount='56,00,000.00')]),
 langchain_core.exceptions.OutputParserException("Failed to parse Invoice from completion {'invoice_number': 'CI-19-00017', 'invoice_date': '09/24/2019', 'company_name': 'Release agency', 'remit_to_address': {'street

In [23]:
files

['data/invoice3.pdf',
 'data/invoice2.pdf',
 'data/invoice1.pdf',
 'data/invoice6.pdf',
 'data/invoice0.pdf',
 'data/invoice4.pdf',
 'data/invoice5.pdf']

In [24]:
import json

json_data = []
errors_json = []

for i, ex in enumerate(extractions):
    if isinstance(ex, Invoice):
        try:
            json_item = json.loads(ex.json())
            json_item["file_path"] = files[i]
            json_data.append(json_item)
        except Exception as e:
            errors_json.append(ex)
            print(f"Error: {e}")
    else:
        errors_json.append(ex)

print(f"error count: {len(errors_json)}")

print(json_data[0])

error count: 1
{'invoice_number': '181000001348', 'invoice_date': '18-Apr-2019', 'company_name': 'Ace Mobile Manufacturer Pvt Ltd', 'remit_to_address': {'street': 'B-209, Park Plaza', 'city': 'Lucknow', 'state': 'Uttar Pradesh', 'country': 'India', 'zip_code': None, 'others': 'GSTIN/UIN: 09AABCS1429B1ZS'}, 'ship_to_address': {'street': 'Krishana Palace Road', 'city': 'Faizabad', 'state': 'Uttar Pradesh', 'country': 'India', 'zip_code': None, 'others': 'GSTIN/UIN : 09AAGCA1654H1ZQ'}, 'invoice_amount': '96,32,000.00', 'currency': 'INR', 'items': [{'name': 'Ace A1-Smartphone', 'description': 'Batch : Batch 1', 'quantity': '500 Nos', 'unit_price': '6,000.00', 'total_amount': '30,00,000.00'}, {'name': 'Ace A1 Plus-Smartphone', 'description': 'Batch : Batch 01', 'quantity': '800 Nos', 'unit_price': '7,000.00', 'total_amount': '56,00,000.00'}], 'file_path': 'data/invoice3.pdf'}


In [None]:
# extractions # json_data 

# json_data[1] # invoice2.pdf

In [26]:
import pandas as pd

# data = pd.DataFrame(json_data)

data = pd.json_normalize(json_data, sep='_')
data.head()

Unnamed: 0,invoice_number,invoice_date,company_name,invoice_amount,currency,items,file_path,remit_to_address_street,remit_to_address_city,remit_to_address_state,remit_to_address_country,remit_to_address_zip_code,remit_to_address_others,ship_to_address_street,ship_to_address_city,ship_to_address_state,ship_to_address_country,ship_to_address_zip_code,ship_to_address_others,ship_to_address
0,181000001348,18-Apr-2019,Ace Mobile Manufacturer Pvt Ltd,9632000.0,INR,"[{'name': 'Ace A1-Smartphone', 'description': ...",data/invoice3.pdf,"B-209, Park Plaza",Lucknow,Uttar Pradesh,India,,GSTIN/UIN: 09AABCS1429B1ZS,Krishana Palace Road,Faizabad,Uttar Pradesh,India,,GSTIN/UIN : 09AAGCA1654H1ZQ,
1,124567AB,04/05/2022,BROUR,7000.0,SGD,"[{'name': 'Mitsubishi Electric Asia Pte Ltd', ...",data/invoice1.pdf,12 Commonwealth Drive,Singapore,,Singapore,S143023,Phone: 123452 Fax: 123453,WeCan Halt Road #28-01,Singapore,,Singapore,S123456,Phone: 123445,
2,YOUR LOGO HERE 500 i 70,,Your Company,6250.0,,"[{'name': 'Website arreframe for 5 pages', 'de...",data/invoice6.pdf,123 Man Street,Anytown,,USA,,,Address Line 1,,,,,Address Line 2,
3,1582745827,7/15/22,Aristris sethren Limited,105.0,US,"[{'name': 'CROSS DOCK PRDC', 'description': No...",data/invoice0.pdf,5900 N. GOLDEN STATE BLVD,TURLOCK,CA,US,95382,,7062 PACIFIC AVE,PLEASANT GROVE,CA,US,95668,,
4,INV-42532622,2021-05-23,"Unit 1, Lingkaran Syed Putra Mid Valley City 5...",35.33,USD,"[{'name': 'Lorems ipsum', 'description': None,...",data/invoice4.pdf,"Unit 999, Lingkaran Syed Putra Mid Valley City",Kuala Lumpur,Wilayah Persekutuan Kuala Lumpur,Malaysia,59200,Tel: 03-1234 5678,"Unit 999, Lingkaran Syed Putra Mid Valley City",Kuala Lumpur,Wilayah Persekutuan Kuala Lumpur,Malaysia,59200,Tel: 03-1234 5678,


In [27]:
print(f"before columns drop data.shape: {data.shape}")
data.drop(columns=["items", "ship_to_address"], axis=1, inplace=True)
print(f"after columns drop data.shape: {data.shape}")

before columns drop data.shape: (6, 20)
after columns drop data.shape: (6, 18)


In [None]:
# json_0 = json.loads(data.iloc[0].to_json())

### create ner dataset

In [38]:
import numpy as np

dataset_ner = []
error_dataset_ner = []
for i, item in data.iterrows():
    try:
        output_json = json.loads(item.to_json())
        for key, value in output_json.items():
            if key != "file_path":
                key = key.upper()
                key = key.replace(" ", "_")
                if (value) and (value != "nan"):
                    dataset_ner.append({
                        "file": item["file_path"],
                        "type": key,
                        "value": value
                    })
    except Exception as e:
        error_dataset_ner.append({"id": i, "error": e})
print(f"error count: {len(error_dataset_ner)}")
print(f"total records: {len(dataset_ner)}")

error count: 0
total records: 74


In [39]:
error_dataset_ner

[]

### visualize

In [46]:
ner_data = pd.DataFrame(dataset_ner)
ner_data.head()

Unnamed: 0,file,type,value
0,data/invoice3.pdf,INVOICE_NUMBER,181000001348
1,data/invoice3.pdf,INVOICE_DATE,18-Apr-2019
2,data/invoice3.pdf,COMPANY_NAME,Ace Mobile Manufacturer Pvt Ltd
3,data/invoice3.pdf,INVOICE_AMOUNT,9632000.00
4,data/invoice3.pdf,CURRENCY,INR


In [47]:
ner_data['type'].value_counts()

type
INVOICE_NUMBER               6
INVOICE_AMOUNT               6
INVOICE_DATE                 5
COMPANY_NAME                 5
REMIT_TO_ADDRESS_STREET      5
REMIT_TO_ADDRESS_COUNTRY     5
REMIT_TO_ADDRESS_CITY        5
SHIP_TO_ADDRESS_STREET       5
CURRENCY                     4
SHIP_TO_ADDRESS_OTHERS       4
REMIT_TO_ADDRESS_OTHERS      4
SHIP_TO_ADDRESS_CITY         4
SHIP_TO_ADDRESS_COUNTRY      4
REMIT_TO_ADDRESS_STATE       3
SHIP_TO_ADDRESS_STATE        3
REMIT_TO_ADDRESS_ZIP_CODE    3
SHIP_TO_ADDRESS_ZIP_CODE     3
Name: count, dtype: int64

In [48]:
# IMP: make sure to update utils.entities.py with the above type values.

In [49]:
entity_type_values_series = ner_data.groupby('type')['value'].unique()
for key, value in entity_type_values_series.items():
    print(f"Key: {key}")
    print(f"Values: {value}")
    print("-"*10)

Key: COMPANY_NAME
Values: ['Ace Mobile Manufacturer Pvt Ltd' 'BROUR' 'Your Company'
 'Aristris sethren Limited'
 'Unit 1, Lingkaran Syed Putra Mid Valley City 59200 Kuala Lumpur Wilayah Persekutuan Kuala Lumpur']
----------
Key: CURRENCY
Values: ['INR' 'SGD' 'US' 'USD']
----------
Key: INVOICE_AMOUNT
Values: ['96,32,000.00' '7000' '6250' '105.00' '35.33' '12244.57']
----------
Key: INVOICE_DATE
Values: ['18-Apr-2019' '04/05/2022' '7/15/22' '2021-05-23' '27-Feb-2020']
----------
Key: INVOICE_NUMBER
Values: ['181000001348' '124567AB' 'YOUR LOGO HERE 500 i 70' '1582745827'
 'INV-42532622' 'XE4289']
----------
Key: REMIT_TO_ADDRESS_CITY
Values: ['Lucknow' 'Singapore' 'Anytown' 'TURLOCK' 'Kuala Lumpur']
----------
Key: REMIT_TO_ADDRESS_COUNTRY
Values: ['India' 'Singapore' 'USA' 'US' 'Malaysia']
----------
Key: REMIT_TO_ADDRESS_OTHERS
Values: ['GSTIN/UIN: 09AABCS1429B1ZS' 'Phone: 123452 Fax: 123453'
 'Tel: 03-1234 5678'
 'Mr. Venkateswara Rao 9652886877 Hyderabad, ,telangana']
----------
Key

### save

In [51]:
ner_data.to_csv("simple-ner.csv", index=False)