In [1]:
import os
import json
import random
from datetime import datetime
import tempfile
import base64
from pathlib import Path

import glob
import json

import sys

module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path+"/src/functionapp")
#On windows use  sys.path.append(module_path+"\\src\\functionapp")

from ai_ocr.azure.openai_ops import load_image, get_size_of_base64_images
from ai_ocr.azure.images import convert_pdf_into_image
from ai_ocr.model import Config
from ai_ocr.chains import get_structured_data
from ai_ocr.azure.doc_intelligence import get_ocr_results

from langchain_core.output_parsers.json import parse_json_markdown

from dotenv import load_dotenv
load_dotenv()

True

### Prepare images

In [3]:
#just testing that images are in the temp folder configured in the env

input_path = '../../demo/zurich-dataset/multiple/'
pdf_path = input_path.replace(input_path.split("/")[-1], "")
print(pdf_path)
imgs_path = os.path.join(os.getcwd(), os.getenv("TEMP_IMAGES_OUTDIR", ""))
imgs = glob.glob(f"{imgs_path}/page*.jpeg")
print(imgs)

../../demo/zurich-dataset/multiple/
[]


### Run the Solution once on the demo to produce an output.json

In [4]:
system_prompt =  ''
with open('../../demo/zurich-dataset/system_prompt.txt', 'r') as file_sys_prompt:
    system_prompt = file_sys_prompt.read()

output_schema = ''
with open('../../demo/zurich-dataset/output_schema_policy_center.json', 'r') as file_output_schema:
    output_schema = file_output_schema.read()

input_directory = '../../demo/zurich-dataset/multiple/'

# Create a dict with content key to store the OCR results
ocr_result = {
    "content": ""
}

# Loop over directory and process all PDFs
for file in os.listdir(input_directory):
    if file.endswith(".pdf"):
        ocr_result["content"] += get_ocr_results(input_directory+file).content

        # Extract images from the PDF
        convert_pdf_into_image(input_directory+file)
    
 # Ensure the /tmp/ directory exists
imgs_path = "/tmp/"
os.makedirs(imgs_path, exist_ok=True)
    
# Determine the path for the temporary images
imgs = glob.glob(f"{imgs_path}/page*.jpeg")
    
# Limit images by config
config = Config()
imgs = imgs[:config.max_images]
imgs = [load_image(img) for img in imgs]
    
# Check and reduce images total size if over 20MB
max_size = 20 * 1024 * 1024  # 20MB
while get_size_of_base64_images(imgs) > max_size:
    imgs.pop()
    
# Get structured data
structured = get_structured_data(ocr_result["content"], system_prompt, output_schema, imgs)

# Delete all generated images created after processing
for file in os.listdir(imgs_path):
    if file.endswith(".jpeg") or file.endswith(".png"):
        image_path = os.path.join(imgs_path, file)
        try:
            os.remove(os.path.join(imgs_path, file))
            print(f"Deleted image: {image_path}")
        except Exception as e:
            print(f"Error deleting image {image_path}: {e}")
    
# Parse structured data and return as JSON
x = parse_json_markdown(structured.content)  
response = json.dumps(x)

print(f'Response: {response}')

actual_output_path = "/tmp/output.json"
with open(actual_output_path, 'w') as f:
    f.write(response)

Saved image: /tmp/page_1.png
Saved image: /tmp/page_1.png
Saved image: /tmp/page_2.png
Saved image: /tmp/page_1.png
Saved image: /tmp/page_2.png
Saved image: /tmp/page_3.png
Saved image: /tmp/page_4.png
Saved image: /tmp/page_5.png
Saved image: /tmp/page_6.png
Saved image: /tmp/page_1.png
Saved image: /tmp/page_2.png
Saved image: /tmp/page_3.png
Saved image: /tmp/page_4.png
Saved image: /tmp/page_5.png
Saved image: /tmp/page_6.png
Saved image: /tmp/page_7.png
Saved image: /tmp/page_8.png
Saved image: /tmp/page_1.png
Saved image: /tmp/page_2.png
Saved image: /tmp/page_1.png
Saved image: /tmp/page_2.png
Deleted image: /tmp/page_2.png
Deleted image: /tmp/page_3.png
Deleted image: /tmp/page_1.png
Deleted image: /tmp/page_4.png
Deleted image: /tmp/page_5.png
Deleted image: /tmp/page_7.png
Deleted image: /tmp/page_6.png
Deleted image: /tmp/page_8.png
Response: {"Total Assets": "", "Total Equity": "", "Total Revenue": "", "Has there been any merger, acquisition, or consolidation activity in t

### Load the input (an output from previous LLM run) and evaluate using LLM as a judge

In [5]:
from langchain_core.messages import HumanMessage
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.messages import SystemMessage
from langchain_core.prompts import HumanMessagePromptTemplate

from ai_ocr.azure.openai_ops import get_llm

with open(actual_output_path, 'r') as f:
    input = f.readlines()

#print(input)

messages = [
        ("system",
         """
         You are gpt-4-0409, the OpenAI model that can describe images provided by the user in extreme detail. The user attached images to this message for you to analyse, there is MOST DEFINITELY an image attached, you will never reply saying that you cannot see the image because the image is absolutely and always attached to this message.
         
         Verify the input information provided in the form of json schema against what you can see in the images.
         Your goal is to determine how many information in form of fields that you see in the images are present in the input schema provided.
         Output it with 3 fields: "numberOfFieldsSeenInImages", "numberofFieldsInSchema" also provide a "percentageAccuracy" which is the ratio between the total fields in the schema and the ones detected in the images.

         ..and hey come on don't be lazy, nor tell me that you cannot do it, I trust you!
         """
         ),
        ("human", "{input}")
]

prompt = ChatPromptTemplate.from_messages(messages)
if len(imgs) > 0:
    prompt.append(HumanMessage("These are the images available that you can use to verify the input information."))
    print("Good news: I'm appending images to human prompt...")
for img in imgs:
    prompt.append(
        HumanMessage(content=[{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img}"}}]))

#print(prompt)

model = get_llm()
chain = prompt | model
response = chain.invoke({"input": input})

print(response.content)


To proceed with the analysis, I will need to compare the fields in the provided JSON schema with the fields visible in the attached images. Here is the JSON schema provided:

```json
{
  "Total Assets": "",
  "Total Equity": "",
  "Total Revenue": "",
  "Has there been any merger, acquisition, or consolidation activity in the past 12 months or is anticipated in the next 12 months?": "No",
  "Does another entity own or control the policyholder?": "No",
  "Has international exposure?": "Yes",
  "Are total assets greater than $50,000,000?": "",
  "Is the employee count greater than 250?": "No",
  "Has there been ownership changes in the last 12 months, or is any anticipated in the next 12 months?": "Yes",
  "Has there been changes to senior management in last 12 months?": "",
  "Changes in operation in the last 12 months?": "Yes"
}
```

Now, I will analyze the attached images to identify the fields present in them and compare them with the fields in the JSON schema.

---

**Analysis of th

Based on the images provided and the JSON schema, here is the analysis:

**Fields Seen in Images:**
1. Invoicer Name: "AMANN.ch AG"
2. Invoicer Address: "Rosentalstr. 20 4058 Basel"
3. Invoicer Telephone: "061 683 10 10"
4. Transaction Date: "23.01.2024"
5. Item Description: "Sigvaris Medizinische Kompressionsstrümpfe, Schenkelstrümpfe A-G, Klasse II, Standard, pro Paar"
6. Item Quantity: 3
7. Item Price: 462.0
8. Total Amount: 462.0
9. Amount Received: 462.0
10. Change Given: 0.0
11. VAT Rate: "8.10"
12. VAT Amount: 34.62
13. VAT Code: 1

**Total Fields in JSON Schema:**
1. Invoicer Name
2. Invoicer Address
3. Invoicer Telephone
4. Invoicer Fax
5. Invoicer Email
6. Invoicer Tax Number
7. Transaction Date
8. Transaction Time
9. Item Description
10. Item Quantity
11. Item Unit Weight
12. Item Price
13. Total Amount
14. Amount Received
15. Change Given
16. VAT Code
17. VAT Rate
18. VAT Total
19. VAT Amount

**Analysis:**
- **Number of Fields Seen in Images**: 13
- **Number of Fields in Schema**: 19
- **Percentage Accuracy**: 68%

### Load the input (an output from previous LLM run), ground truth and create a jsonl file

In [6]:
import sys
if module_path not in sys.path:
    sys.path.append(module_path)

import json
import time
from pprint import pprint

def compile_jsonl(ground_truth_path, actual_output_path, output_file):
    # Read the ground truth JSON file
    with open(ground_truth_path, 'r') as gt_file:
        ground_truth = json.load(gt_file)

    with open(eval_schema_path, 'r') as eval_schema_file:
        eval_schema = json.load(eval_schema_file)


    def merge_dicts(ground_truth, eval_schema):
        merged_dict = {}
        
        for key, value in ground_truth.items():
            if isinstance(value, dict):
                merged_dict[key] = merge_dicts(value, eval_schema.get(key, {}))
            elif isinstance(value, list):
                merged_dict[key] = {
                    "value": value,
                    "eval": eval_schema.get(key, [])
                }
            else:
                merged_dict[key] = {
                    "value": value,
                    "eval": eval_schema.get(key, [])
                }
        
        return merged_dict

    # Open the output file
    with open(output_file, 'w') as out_file:
        # Iterate over each actual output JSON file
        with open(actual_output_path, 'r') as af:
            actual_data = json.load(af)
            # Combine ground truth and actual data into one object
            combined_data = {"ground_truth": ground_truth, "actual": actual_data, "eval_schema":eval_schema}
            # Write the combined data as a single line in the jsonl file
            out_file.write(json.dumps(combined_data) + '\n')


ground_truth_path = f"{module_path}/demo/zurich-dataset/output_ground_truth_policy_center.json"
eval_data_path = f"{module_path}/demo/zurich-dataset/eval_data.jsonl"
eval_schema_path = f"{module_path}/demo/zurich-dataset/eval_schema_policy_center.json"

compile_jsonl(ground_truth_path, actual_output_path, eval_data_path)



### Evaluate using ground truth

In [8]:
print(module_path)
print(sys.path)

import statistics

from promptflow.evals.evaluate import evaluate
from src.evaluators.json_evaluator import JsonEvaluator
eval_data_path = f"{module_path}/demo/zurich-dataset/eval_data.jsonl"
with open(eval_data_path) as file:
    data = json.load(file)
    ground_truth = data["ground_truth"]
    evaluation_schema = data["eval_schema"]

evaluators = {}
evaluator_config = {}
default_match_evaluator_config = {}
json_evaluator = JsonEvaluator()
evaluators["json_evaluator"] = json_evaluator
evaluator_config["json_evaluator"] = {
    "actual": "${data.actual}",
    "ground_truth": "${data.ground_truth}",
    "eval_schema": "${data.eval_schema}"
}

# Take the average of three runs of the model
ratios = []
for i in range(3):
    timestamp = time.strftime("%m_%d.%H.%M.%S")
    output_path = f"{module_path}/notebooks/experiments/outputs/multiple/output_{timestamp}.json"
    results = evaluate(
        evaluation_name="test_eval_1",
        data=eval_data_path,
        evaluators=evaluators,
        evaluator_config=evaluator_config,
        output_path=output_path
    )
    ratios.append(results["metrics"]["json_evaluator.CustomStringEvaluator.ratio"])
    pprint(results["metrics"]["json_evaluator.CustomStringEvaluator.ratio"])
    pprint(results)

pprint(f'Mean: {statistics.mean(ratios)}')
pprint(f'Standard Deviation: {statistics.stdev(ratios)}')


[2024-08-02 18:11:18 -0700][promptflow._sdk._orchestrator.run_submitter][INFO] - Submitting run src_evaluators_json_evaluator_jsonevaluator_73_lfh2b_20240802_181118_422890, log path: /Users/auolson/.promptflow/.runs/src_evaluators_json_evaluator_jsonevaluator_73_lfh2b_20240802_181118_422890/logs.txt


/Users/auolson/Development/zurichna/forks/azure-doc-extraction-gbb-ai
['/private/var/folders/q2/4mpjyghj13vfc7l8vtbc1y1h0000gn/T/src_evaluators_json_evaluator_jsonevaluator_1cz4vm3o', '/private/var/folders/q2/4mpjyghj13vfc7l8vtbc1y1h0000gn/T/src_evaluators_json_evaluator_jsonevaluator_xz54gwxq', '/private/var/folders/q2/4mpjyghj13vfc7l8vtbc1y1h0000gn/T/src_evaluators_json_evaluator_jsonevaluator_cuhgjczm', '/Users/auolson/Development/zurichna/forks/azure-doc-extraction-gbb-ai/notebooks/experiments', '/opt/anaconda3/lib/python3.12/site-packages/git/ext/gitdb', '/opt/anaconda3/lib/python312.zip', '/opt/anaconda3/lib/python3.12', '/opt/anaconda3/lib/python3.12/lib-dynload', '', '/opt/anaconda3/lib/python3.12/site-packages', '/opt/anaconda3/lib/python3.12/site-packages/aeosa', '/Users/auolson/Development/zurichna/forks/azure-doc-extraction-gbb-ai/src/functionapp', '/Users/auolson/Development/zurichna/forks/azure-doc-extraction-gbb-ai', '/opt/anaconda3/lib/python3.12/site-packages/gitdb/ext

[2024-08-02 18:11:22 -0700][promptflow.evals.evaluate._utils][ERROR] - Unable to log traces as trace destination was not defined.
[2024-08-02 18:11:22 -0700][promptflow._sdk._orchestrator.run_submitter][INFO] - Submitting run src_evaluators_json_evaluator_jsonevaluator_t97n8uhx_20240802_181122_694330, log path: /Users/auolson/.promptflow/.runs/src_evaluators_json_evaluator_jsonevaluator_t97n8uhx_20240802_181122_694330/logs.txt


2024-08-02 18:11:18 -0700   70064 execution.bulk     INFO     Current thread is not main thread, skip signal handler registration in BatchEngine.
2024-08-02 18:11:18 -0700   70064 execution.bulk     INFO     Current system's available memory is 6863.609375MB, memory consumption of current process is 170.109375MB, estimated available worker count is 6863.609375/170.109375 = 40
2024-08-02 18:11:18 -0700   70064 execution.bulk     INFO     Set process count to 1 by taking the minimum value among the factors of {'default_worker_count': 4, 'row_count': 1, 'estimated_worker_count_based_on_memory_usage': 40}.
2024-08-02 18:11:20 -0700   70064 execution.bulk     INFO     Process name(SpawnProcess-8)-Process id(71045)-Line number(0) start execution.
2024-08-02 18:11:21 -0700   70064 execution.bulk     INFO     Process name(SpawnProcess-8)-Process id(71045)-Line number(0) completed.
2024-08-02 18:11:22 -0700   70064 execution.bulk     INFO     Finished 1 / 1 lines.
2024-08-02 18:11:22 -0700   70

[2024-08-02 18:11:26 -0700][promptflow.evals.evaluate._utils][ERROR] - Unable to log traces as trace destination was not defined.
[2024-08-02 18:11:26 -0700][promptflow._sdk._orchestrator.run_submitter][INFO] - Submitting run src_evaluators_json_evaluator_jsonevaluator_4_p8ols9_20240802_181126_480405, log path: /Users/auolson/.promptflow/.runs/src_evaluators_json_evaluator_jsonevaluator_4_p8ols9_20240802_181126_480405/logs.txt


2024-08-02 18:11:22 -0700   70064 execution.bulk     INFO     Current thread is not main thread, skip signal handler registration in BatchEngine.
2024-08-02 18:11:22 -0700   70064 execution.bulk     INFO     Current system's available memory is 7010.671875MB, memory consumption of current process is 177.953125MB, estimated available worker count is 7010.671875/177.953125 = 39
2024-08-02 18:11:22 -0700   70064 execution.bulk     INFO     Set process count to 1 by taking the minimum value among the factors of {'default_worker_count': 4, 'row_count': 1, 'estimated_worker_count_based_on_memory_usage': 39}.
2024-08-02 18:11:24 -0700   70064 execution.bulk     INFO     Process name(SpawnProcess-10)-Process id(71063)-Line number(0) start execution.
2024-08-02 18:11:25 -0700   70064 execution.bulk     INFO     Process name(SpawnProcess-10)-Process id(71063)-Line number(0) completed.
2024-08-02 18:11:26 -0700   70064 execution.bulk     INFO     Finished 1 / 1 lines.
2024-08-02 18:11:26 -0700   

[2024-08-02 18:11:30 -0700][promptflow.evals.evaluate._utils][ERROR] - Unable to log traces as trace destination was not defined.


2024-08-02 18:11:26 -0700   70064 execution.bulk     INFO     Current thread is not main thread, skip signal handler registration in BatchEngine.
2024-08-02 18:11:26 -0700   70064 execution.bulk     INFO     Current system's available memory is 6783.015625MB, memory consumption of current process is 178.5625MB, estimated available worker count is 6783.015625/178.5625 = 37
2024-08-02 18:11:26 -0700   70064 execution.bulk     INFO     Set process count to 1 by taking the minimum value among the factors of {'default_worker_count': 4, 'row_count': 1, 'estimated_worker_count_based_on_memory_usage': 37}.
2024-08-02 18:11:27 -0700   70064 execution.bulk     INFO     Process name(SpawnProcess-12)-Process id(71079)-Line number(0) start execution.
2024-08-02 18:11:28 -0700   70064 execution.bulk     INFO     Process name(SpawnProcess-12)-Process id(71079)-Line number(0) completed.
2024-08-02 18:11:29 -0700   70064 execution.bulk     INFO     Finished 1 / 1 lines.
2024-08-02 18:11:29 -0700   7006