# EU MRL Compliance Pipeline 

## Imports & loading data

In [1]:
import pandas as pd
import json
import os 
    
from openai import OpenAI

from utils import create_and_dump_data, pdf_to_base64_images
from utils import EXAMPLES_PATH, JSON_PATH, system_prompt_json

In [2]:
if not os.path.exists("eu_products.csv") or not os.path.exists("eu_pesticides.csv"):
    create_and_dump_data()

In [3]:
pest = pd.read_csv("eu_pesticides.csv", sep="|")
prod = pd.read_csv("eu_products.csv", sep="|")

In [4]:
def analyze_lab_report(pdf_path, api_key, model="gpt-4o"):
    """
    Analyse a base64-encoded PDF laboratory report using OpenAI Vision API and extract structured information as JSON.
    """

    # Init
    client = OpenAI(api_key=api_key)
    
    # Convertir le PDF en images
    images_base64 = pdf_to_base64_images(pdf_path)
    
    # Préparer le contenu avec toutes les images
    content = [
        {"type": "text", "text": "Analyze this laboratory report and extract the information as JSON."}
    ]
    
    # There can be multiple images (one per page)
    for img_b64 in images_base64:
        content.append({
            "type": "image_url",
            "image_url": {
                "url": f"data:image/png;base64,{img_b64}",
                "detail": "high"
            }
        })
    
    
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt_json},
            {"role": "user", "content": content}
        ],
        max_tokens=4096,
        temperature=0
    )
    
    result_text = response.choices[0].message.content
    
    # Parsing
    try:
        # Clean potential markdown formatting
        if "```json" in result_text:
            result_text = result_text.split("```json")[1].split("```")[0].strip()
        elif "```" in result_text:
            result_text = result_text.split("```")[1].split("```")[0].strip()
        
        result_json = json.loads(result_text)
        return result_json
    except json.JSONDecodeError:
        print("Erreur lors du parsing JSON. Réponse brute:")
        print(result_text)
        return {"error": "JSON parsing failed", "raw_response": result_text}

In [None]:
for pdf_file in os.listdir(EXAMPLES_PATH):
    
    pdf_path = os.path.join(EXAMPLES_PATH, pdf_file)
    # DEBUG
    images_base64 = pdf_to_base64_images(pdf_path)
    print(f"{len(images_base64[0])}")
    
    # Pipeline
    # result = analyze_lab_report(pdf_path, api_key, model="gpt-4o")
    # print("\n" + "="*50)
    # print("RÉSULTAT DE L'ANALYSE:")
    # print("="*50)
    # print(result)

    # Saving JSON outputs
    json_filename = os.path.splitext(pdf_file)[0] + ".json"
    output_file = os.path.join(JSON_PATH, json_filename)
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=2, ensure_ascii=False)


Converting 1 page(s) in base64 images...
  Page 1/1 done
635632
Analysis-Example-1_B.json
E:\Taf\Tests\relasuitedetacandidatureseniordatascientistt\json_outputs\Analysis-Example-1_B.json


NameError: name 'fdssd' is not defined