In [None]:
# Standard library
import base64
import json
import os
from io import BytesIO

# Third-party
import requests
import tqdm
from fuzzywuzzy import fuzz
from PIL import Image

In [None]:
def getImage(JsonFilePath):
    """Load the image embedded in the first page of a JSON document.

    The file is read as UTF-8 JSON; the base64 string stored under
    ``pages[0].image.content`` is decoded and handed to ``Image.open``.

    Args:
        JsonFilePath: Path to the JSON file to read.

    Returns:
        The object returned by ``Image.open`` for the decoded bytes.
    """
    with open(JsonFilePath, 'r', encoding='utf-8') as handle:
        document = json.load(handle)

    first_page = document["pages"][0]
    raw_bytes = base64.b64decode(first_page["image"]["content"])
    return Image.open(BytesIO(raw_bytes))
def getJsonFiles(DataFolder, Type):
    """Collect the source JSON files found one level below ``DataFolder/Type``.

    Every sub-directory of ``DataFolder/Type`` is scanned for files ending in
    ``.json``; files ending in ``_processed.json`` are excluded.

    Args:
        DataFolder: Root folder of the dataset.
        Type: Name of the sub-folder to scan (e.g. ``'test'``).

    Returns:
        A list of file paths, sorted by sub-directory name and then by file
        name so the result is the same on every run and every filesystem.
    """
    split_dir = os.path.join(DataFolder, Type)
    json_paths = []
    # os.listdir order is arbitrary; sort so that Files[0] is reproducible.
    for folder in sorted(os.listdir(split_dir)):
        folder_path = os.path.join(split_dir, folder)
        # Stray files next to the document folders (e.g. .DS_Store) would make
        # the inner os.listdir raise NotADirectoryError.
        if not os.path.isdir(folder_path):
            continue
        for name in sorted(os.listdir(folder_path)):
            if name.endswith(".json") and not name.endswith("_processed.json"):
                json_paths.append(os.path.join(folder_path, name))
    return json_paths
# Field names to extract from the OCR output. This list is passed to
# process_document as its `labels` argument, where each entry becomes a key of
# the returned dict (empty string when the service reports no such entity).
labels = [
    "Adresse-prescripteur",
    "Date-de-la-prescription",
    "Nom-du-medecin",
    "Numero-ADELI",
    "Numero-AM-Finess",
    "Numero-RPPS",
    "Signature",
    "Texte-manuscrit",
    "Texte-Signature",
    "Texte-soin-ALD",
    "Texte-soin-sans-ALD",
]


In [5]:
# Collect the JSON files of the 'test' split under ../Data
# (getJsonFiles leaves out the *_processed.json files).
Files = getJsonFiles("../Data",'test')

### Calling DocumentAI API

In [None]:

def process_document(img_pil, labels, url='http://localhost:3000/ocr/', timeout=120):
    """Send an image to the OCR service and extract the requested labels.

    The image is serialised to PNG in memory and uploaded as the multipart
    field ``file``. The JSON body of the reply is passed to
    ``processResponse`` together with ``labels``.

    Args:
        img_pil: Image object exposing ``save(buffer, format='PNG')``.
        labels: Entity types to keep from the service response.
        url: Endpoint of the OCR service.
        timeout: Seconds to wait for the service before giving up; forwarded
            to ``requests.post``. Without it a stalled service would block
            the caller forever.

    Returns:
        The value returned by ``processResponse``, or ``None`` when the
        request fails, times out, returns an HTTP error status, or the body
        is not valid JSON (the error is printed in those cases).
    """
    # Serialise the image to an in-memory PNG and rewind for upload.
    img_buffer = BytesIO()
    img_pil.save(img_buffer, format='PNG')
    img_buffer.seek(0)

    files = {'file': ('image.png', img_buffer, 'image/png')}
    try:
        response = requests.post(url, files=files, timeout=timeout)
        response.raise_for_status()
        # A non-JSON body raises ValueError, which older requests releases do
        # not wrap in RequestException.
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        print(f'Error processing document: {e}')
        return None

    # Kept outside the try block so errors in response handling are not
    # mistaken for transport failures.
    return processResponse(data, labels)

def processResponse(response, labels):
    """Reduce an OCR response to a ``{label: text}`` mapping.

    Args:
        response: Iterable of entities, each indexable by ``'type'`` and
            ``'mentionText'``, or ``None``.
        labels: Entity types to keep.

    Returns:
        ``None`` when ``response`` is ``None``. Otherwise a dict with one key
        per label, holding the ``mentionText`` of the last entity of that
        type, or ``""`` when no entity of that type is present.
    """
    if response is None:
        return None

    # Start with every requested label mapped to an empty string.
    extracted = dict.fromkeys(labels, "")
    for entity in response:
        kind = entity['type']
        if kind in labels:
            extracted[kind] = entity['mentionText']
    return extracted


In [None]:
# Run the OCR pipeline on the first collected file only.
image = getImage(Files[0])
result = process_document(image,labels)

In [None]:

def benchmark_json(true_json, pred_json):
    """Score a prediction against the ground truth, label by label.

    Iterates over the keys of ``true_json``. For each key the two values are
    stripped of surrounding whitespace; missing, ``None`` and empty values
    are replaced by the literal string ``"None"`` before comparison.

    Args:
        true_json: Ground-truth mapping of label to text.
        pred_json: Predicted mapping of label to text.

    Returns:
        A dict mapping each label to ``{"hard_match": 0 or 1,
        "fuzzy_match": fuzz.ratio / 100 rounded to 4 decimals}``.
    """
    def _normalise(raw):
        # Falsy values and whitespace-only strings all collapse to "None".
        text = (raw or "").strip()
        return text if text != "" else "None"

    results = {}
    for label in true_json:
        expected = _normalise(true_json.get(label))
        predicted = _normalise(pred_json.get(label))
        results[label] = {
            "hard_match": int(expected == predicted),
            "fuzzy_match": round(fuzz.ratio(expected, predicted) / 100.0, 4),
        }
    return results

    
def merge_benchmarks(all_step_jsons):
    """Average per-document benchmark scores into one score pair per label.

    Args:
        all_step_jsons: Iterable of dicts mapping a label to a dict that may
            hold ``"hard_match"`` and ``"fuzzy_match"``; a missing score
            counts as 0.

    Returns:
        A dict mapping each label seen to its mean ``"hard_match"`` and mean
        ``"fuzzy_match"``, both rounded to 4 decimals. Empty input yields an
        empty dict.
    """
    # Gather every score per label, in order of first appearance.
    collected = {}
    for step in all_step_jsons:
        for label, scores in step.items():
            bucket = collected.setdefault(label, {"hard_match": [], "fuzzy_match": []})
            bucket["hard_match"].append(scores.get("hard_match", 0))
            bucket["fuzzy_match"].append(scores.get("fuzzy_match", 0.0))

    def _mean(values):
        return round(sum(values) / len(values), 4)

    return {
        label: {
            "hard_match": _mean(bucket["hard_match"]),
            "fuzzy_match": _mean(bucket["fuzzy_match"]),
        }
        for label, bucket in collected.items()
    }


In [None]:
def benchmark_model(Files):
    """Benchmark the OCR service over a list of JSON documents.

    For each path, the ground truth is read from the sibling file whose name
    carries a ``_processed`` suffix before the extension. The document image
    is sent through ``process_document`` with the module-level ``labels``,
    and the prediction is scored with ``benchmark_json``. Files without
    ground truth, and files for which the OCR call returns ``None``, are
    reported and skipped.

    Args:
        Files: Iterable of paths to source JSON documents.

    Returns:
        The per-label averages produced by ``merge_benchmarks`` over all
        successfully scored documents.
    """
    all_step_jsons = []
    for file in tqdm.tqdm(Files):
        # Only touch the extension: str.replace('.json', ...) would also
        # rewrite any '.json' occurring in a directory name.
        stem, extension = os.path.splitext(file)
        true_json_path = f"{stem}_processed{extension}"
        if not os.path.exists(true_json_path):
            print(f"Processed JSON file not found for {file}")
            continue
        with open(true_json_path, 'r', encoding="utf-8") as f:
            true_json = json.load(f)

        pred_json = process_document(getImage(file), labels)
        if pred_json is None:
            print(f"Failed to process document for {file}")
            continue

        all_step_jsons.append(benchmark_json(true_json, pred_json))

    return merge_benchmarks(all_step_jsons)


In [80]:
# Benchmark every collected file; the recorded run below took about 15 minutes for 87 files.
benchmark_model(Files)

100%|██████████| 87/87 [14:50<00:00, 10.23s/it]


{'Adresse-prescripteur': {'hard_match': 0.2069, 'fuzzy_match': 0.2733},
 'Date-de-la-prescription': {'hard_match': 0.8276, 'fuzzy_match': 0.936},
 'Nom-du-medecin': {'hard_match': 0.8276, 'fuzzy_match': 0.8985},
 'Numero-ADELI': {'hard_match': 0.8966, 'fuzzy_match': 0.8983},
 'Numero-AM-Finess': {'hard_match': 0.3793, 'fuzzy_match': 0.4033},
 'Numero-RPPS': {'hard_match': 0.8966, 'fuzzy_match': 0.9353},
 'Signature': {'hard_match': 0.0805, 'fuzzy_match': 0.0834},
 'Texte-manuscrit': {'hard_match': 0.5402, 'fuzzy_match': 0.5728},
 'Texte-Signature': {'hard_match': 0.5862, 'fuzzy_match': 0.6118},
 'Texte-soin-ALD': {'hard_match': 0.4828, 'fuzzy_match': 0.787},
 'Texte-soin-sans-ALD': {'hard_match': 0.5287, 'fuzzy_match': 0.7209}}