Steps:
1. move pdfs from local to s3
2. create textract blocks for each pdf
3. create annotations for each pdf
4. create output.manifest

### init

In [1]:
%load_ext autoreload

In [2]:
import os, boto3, json
import pandas as pd

data = pd.read_csv("simple-ner.csv")
data.head()

Unnamed: 0,file,type,value
0,data/invoice3.pdf,INVOICE_NUMBER,181000001348
1,data/invoice3.pdf,INVOICE_DATE,18-Apr-2019
2,data/invoice3.pdf,COMPANY_NAME,Ace Mobile Manufacturer Pvt Ltd
3,data/invoice3.pdf,INVOICE_AMOUNT,9632000.00
4,data/invoice3.pdf,CURRENCY,INR


In [3]:
workspace_bucket = "applied-agi"
local_pdfs_folder_path = "data/"
s3_pdfs_folder_path = "ner-comprehend/pdfs"
blocks_folder_path="ner-comprehend/blocks"
annotations_folder_path = "ner-comprehend/annotations"
annotation_job_name = "pdf-ner-annotation"
manifest_folder_path="ner-comprehend/manifests"
output_manifest_file_name="output.manifest"
output_manifest_file_s3_path = f"s3://{workspace_bucket}/{manifest_folder_path}/{output_manifest_file_name}"

### upload ( pdfs + textract_blocks (after creating from pdfs) ) to s3 

In [4]:
# thought: should I separate the 2 activities into 2 functions

In [4]:
# ! aws s3 rm --recursive s3://applied-agi/ner-comprehend/pdfs 
# ! aws s3 rm --recursive s3://applied-agi/ner-comprehend/blocks

delete: s3://applied-agi/ner-comprehend/pdfs/invoice0.pdf
delete: s3://applied-agi/ner-comprehend/pdfs/invoice1.pdf
delete: s3://applied-agi/ner-comprehend/pdfs/invoice3.pdf
delete: s3://applied-agi/ner-comprehend/pdfs/invoice6.pdf
delete: s3://applied-agi/ner-comprehend/pdfs/invoice2.pdf
delete: s3://applied-agi/ner-comprehend/pdfs/invoice4.pdf
delete: s3://applied-agi/ner-comprehend/pdfs/invoice5.pdf
delete: s3://applied-agi/ner-comprehend/blocks/invoice0.pdf.json
delete: s3://applied-agi/ner-comprehend/blocks/invoice1.pdf.json
delete: s3://applied-agi/ner-comprehend/blocks/invoice3.pdf.json
delete: s3://applied-agi/ner-comprehend/blocks/invoice6.pdf.json
delete: s3://applied-agi/ner-comprehend/blocks/invoice2.pdf.json
delete: s3://applied-agi/ner-comprehend/blocks/invoice4.pdf.json
delete: s3://applied-agi/ner-comprehend/blocks/invoice5.pdf.json


In [5]:
from utils import block_helper

s3_client = boto3.client("s3")

files_not_uploaded = []
for file in os.listdir(local_pdfs_folder_path):
    try:
        # part 1: upload pdf (local -> s3)
        local_pdf_file_path=os.path.join(local_pdfs_folder_path, file)
        s3_pdf_file_path = f"{s3_pdfs_folder_path}/{file}"
        s3_blocks_file_path = f"{blocks_folder_path}/{file}.json"
        
        s3_client.upload_file(
            Filename=local_pdf_file_path, 
            Bucket=workspace_bucket, 
            Key=s3_pdf_file_path
        )
        
        # part 2: create blocks
        text_response = block_helper.blocks_from_scanned_pdf(
            bucket=workspace_bucket,
            key=s3_pdf_file_path,
            page_number=1
        ) # for now only single-page PDFs # TODO: think how to do for multi-page pdfs
        blocks = block_helper.JSONHandler(text_response)
        
        # part 3: put blocks in s3
        s3_client.put_object(
            Body=json.dumps(blocks),
            Bucket=workspace_bucket,
            Key=s3_blocks_file_path,
        )
    except Exception as e:
        files_not_uploaded.append({
            "file": file,
            "error": e
        })

print(f"count(files_not_uploaded): {len(files_not_uploaded)}")
print(f"Lower the count better !")

count(files_not_uploaded): 0
Lower the count better !


### create annotations

In [6]:
files_not = [item["file"] for item in files_not_uploaded]
print(f"count(files not uploaded): {len(files_not)}")

a = os.listdir(local_pdfs_folder_path)
b = files_not
uploaded_files = list(set(a)-set(b))
print(f"count(files uploaded): {len(uploaded_files)}")

count(files not uploaded): 0
count(files uploaded): 7


In [7]:
from utils import annotations_helper

s3_resource = boto3.resource("s3")

files_not_annotated = []
for file in uploaded_files:
    try:
        print(f"++++++++++ Processing file: {file}")
        local_pdf_file_path=os.path.join(local_pdfs_folder_path, file) # ends with .pdf
        s3_pdf_file_path = f"{s3_pdfs_folder_path}/{file}" # ends with .pdf
        s3_blocks_file_path = f"{blocks_folder_path}/{file}.json" # ends with .pdf.json
        s3_annotations_file_path = f"{annotations_folder_path}/{file}.json" # ends with .pdf.json

        # part 1: get blocks from s3
        blocks = s3_resource.Object(workspace_bucket, s3_blocks_file_path)
        blocks = json.load(blocks.get()["Body"])
        blocks = annotations_helper.clean_blocks(blocks)

        # part 2: create annotations
        ners_per_file_df = data[data["file"]==local_pdf_file_path]
        annotations = annotations_helper.get_annotations(
            df=ners_per_file_df,
            blocks=blocks,
            s3_blocks_path=s3_blocks_file_path,
            ann_file_key=f"{file}.json",
        )

        # part 3: put annotations obj to s3
        s3_client.put_object(
            Body=json.dumps(annotations),
            Bucket=workspace_bucket,
            Key=s3_annotations_file_path,
        )
    except Exception as e:
        files_not_annotated.append({
            "file": file,
            "error": e
        })

print(f"count(files_not_annotated): {len(files_not_annotated)}")
print(f"Lower the count better !")    

++++++++++ Processing file: invoice4.pdf
word=USD not found!
word=Malaysia not found!
word=Malaysia not found!
word=USD not found!
word=Malaysia not found!
word=Malaysia not found!
word=USD not found!
word=Malaysia not found!
word=Malaysia not found!
word=USD not found!
word=Malaysia not found!
word=Malaysia not found!
word=USD not found!
word=Malaysia not found!
word=Malaysia not found!
word=USD not found!
word=Malaysia not found!
word=Malaysia not found!
word=USD not found!
word=Malaysia not found!
word=Malaysia not found!
word=USD not found!
word=Malaysia not found!
word=Malaysia not found!
word=USD not found!
word=Malaysia not found!
word=Malaysia not found!
word=USD not found!
word=Malaysia not found!
word=Malaysia not found!
word=USD not found!
word=Malaysia not found!
word=Malaysia not found!
word=USD not found!
word=Malaysia not found!
word=Malaysia not found!
word=USD not found!
word=Malaysia not found!
word=Malaysia not found!
word=USD not found!
word=Malaysia not found!
word

word=USD not found!
word=Malaysia not found!
word=Malaysia not found!
word=USD not found!
word=Malaysia not found!
word=Malaysia not found!
word=USD not found!
word=Malaysia not found!
word=Malaysia not found!
word=USD not found!
word=Malaysia not found!
word=Malaysia not found!
word=USD not found!
word=Malaysia not found!
word=Malaysia not found!
word=USD not found!
word=Malaysia not found!
word=Malaysia not found!
word=USD not found!
word=Malaysia not found!
word=Malaysia not found!
word=USD not found!
word=Malaysia not found!
word=Malaysia not found!
word=USD not found!
word=Malaysia not found!
word=Malaysia not found!
word=USD not found!
word=Malaysia not found!
word=Malaysia not found!
word=USD not found!
word=Malaysia not found!
word=Malaysia not found!
word=USD not found!
word=Malaysia not found!
word=Malaysia not found!
word=USD not found!
word=Malaysia not found!
word=Malaysia not found!
word=USD not found!
word=Malaysia not found!
word=Malaysia not found!
word=USD not found!


In [8]:
files_not_annotated

[]

### create output.manifest file

In [9]:
files_not = [item["file"] for item in files_not_annotated]
print(f"count(files not annotated): {len(files_not)}")

a = os.listdir(local_pdfs_folder_path)
b = files_not
annotated_files = list(set(a)-set(b))
print(f"count(files annotated_files): {len(annotated_files)}")

count(files not annotated): 0
count(files annotated_files): 7


In [16]:
from utils.entities import entity_names_list
manifest_jsonl = []

annotation_job_name = "pdf-ner-annotation"
for file in annotated_files:
    pdf_s3_path = f"s3://{workspace_bucket}/{s3_pdfs_folder_path}/{file}" # ends with .pdf
    annotation_s3_path = f"s3://{workspace_bucket}/{annotations_folder_path}/{file}.json"  # ends with .pdf.json
    
    manifest_item = {
        "source-ref": pdf_s3_path,
        "page": "1",
        "metadata": {
            "pages": "1",
            "use-textract-only": False,
            "labels": entity_names_list
        }
    }
    manifest_item[annotation_job_name] = {
        "annotation-ref": annotation_s3_path
    }
    metadata_key = f"{annotation_job_name}-metadata"
    
    manifest_item[metadata_key] = {
            "type": "groundtruth/custom",
            "job-name": annotation_job_name,
            "human-annotated": "yes",
            "creation-date": "2023-08-14T07:33:56.556000"
        }

    manifest_jsonl.append(manifest_item)
        
print(f"len(manifest_jsonl): {len(manifest_jsonl)}")

manifest_jsonl[:1]

len(manifest_jsonl): 7


[{'source-ref': 's3://applied-agi/ner-comprehend/pdfs/invoice4.pdf',
  'page': '1',
  'metadata': {'pages': '1',
   'use-textract-only': False,
   'labels': ['INVOICE_NUMBER',
    'INVOICE_DATE',
    'COMPANY_NAME',
    'INVOICE_AMOUNT',
    'CURRENCY',
    'REMIT_TO_ADDRESS_STREET',
    'REMIT_TO_ADDRESS_CITY',
    'REMIT_TO_ADDRESS_STATE',
    'REMIT_TO_ADDRESS_COUNTRY',
    'REMIT_TO_ADDRESS_ZIP_CODE',
    'REMIT_TO_ADDRESS_OTHERS',
    'SHIP_TO_ADDRESS_STREET',
    'SHIP_TO_ADDRESS_CITY',
    'SHIP_TO_ADDRESS_STATE',
    'SHIP_TO_ADDRESS_COUNTRY',
    'SHIP_TO_ADDRESS_ZIP_CODE',
    'SHIP_TO_ADDRESS_OTHERS']},
  'pdf-ner-annotation': {'annotation-ref': 's3://applied-agi/ner-comprehend/annotations/invoice4.pdf.json'},
  'pdf-ner-annotation-metadata': {'type': 'groundtruth/custom',
   'job-name': 'pdf-ner-annotation',
   'human-annotated': 'yes',
   'creation-date': '2023-08-14T07:33:56.556000'}}]

In [17]:
# in order to complete the training with minm 250 samples  
# TODO: use fairly complex & large dataset to not be doing below
manifest_jsonl = manifest_jsonl + manifest_jsonl
manifest_jsonl = manifest_jsonl + manifest_jsonl
manifest_jsonl = manifest_jsonl + manifest_jsonl
manifest_jsonl = manifest_jsonl + manifest_jsonl
manifest_jsonl = manifest_jsonl + manifest_jsonl
manifest_jsonl = manifest_jsonl + manifest_jsonl

print(f"len(manifest_jsonl): {len(manifest_jsonl)}")

len(manifest_jsonl): 448


In [18]:
with open(output_manifest_file_name, "w") as f:
    for item in manifest_jsonl:
        f.write(json.dumps(item) + "\n")

print(f"Created file: {output_manifest_file_name}")

Created file: output.manifest


In [19]:
! aws s3 cp $output_manifest_file_name $output_manifest_file_s3_path
! rm $output_manifest_file_name

# print(f"{output_manifest_file_name} uploaded to s3 path: {output_manifest_file_s3_path}")

upload: ./output.manifest to s3://applied-agi/ner-comprehend/manifests/output.manifest
