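# EU MRL Compliance Pipeline

This notebook extracts the product and the listed substances (with their reported MRL, maximum residue level, values) from PDF laboratory reports using an OpenAI vision model, saves each extraction as a JSON file, and then checks every extracted value against the EU MRL data through `fetch_mrl_data`.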

## Imports & loading data

In [1]:
import pandas as pd
import json
import os 
    
from openai import OpenAI

from utils import create_and_dump_data, pdf_to_base64_images, fetch_mrl_data
from utils import EXAMPLES_PATH, JSON_PATH, system_prompt_json
from api_key import OPENAI_API_KEY

In [2]:
# Rebuild the EU reference CSV dumps as soon as one of them is missing on disk.
if not all(os.path.exists(csv_name) for csv_name in ("eu_products.csv", "eu_pesticides.csv")):
    create_and_dump_data()

In [3]:
# Both reference dumps are pipe-delimited text files.
pest = pd.read_csv(filepath_or_buffer="eu_pesticides.csv", delimiter="|")
prod = pd.read_csv(filepath_or_buffer="eu_products.csv", delimiter="|")

## Main Analysis Function

In [4]:
def _strip_code_fence(reply_text: str) -> str:
    """Return the text inside a Markdown code fence, or the reply unchanged if it has none."""
    if "```json" in reply_text:
        # Prefer an explicitly tagged JSON fence when one is present.
        return reply_text.split("```json")[1].split("```")[0].strip()
    if "```" in reply_text:
        return reply_text.split("```")[1].strip()
    return reply_text


def _parse_model_json(result_text):
    """Parse the model reply as JSON; on failure print the reply and return an error dict."""
    if result_text is not None:
        result_text = _strip_code_fence(result_text)
        try:
            return json.loads(result_text)
        except json.JSONDecodeError:
            pass  # fall through to the shared error report below

    # Reached for invalid JSON and for a reply that carries no text at all.
    print("Error parsing JSON :")
    print(result_text)
    return {"error": "JSON parsing failed", "raw_response": result_text}


def analyze_lab_report(pdf_path: str, model: str = "gpt-4o"):
    """
    Extract structured information from a PDF laboratory report with an OpenAI chat model.

    The PDF is converted to base64 PNG images by ``pdf_to_base64_images`` (one image
    per page) and sent, together with ``system_prompt_json``, in a single chat
    completion request. The text reply is then parsed as JSON.

    Args:
        pdf_path: Path of the PDF report to analyse.
        model: Name of the OpenAI chat model to call.

    Returns:
        The parsed JSON value returned by the model, or
        ``{"error": "JSON parsing failed", "raw_response": <reply>}`` when the reply
        is missing or is not valid JSON (``raw_response`` is the reply after any
        Markdown code fence has been stripped, or ``None`` if there was no reply text).
    """
    client = OpenAI(api_key=OPENAI_API_KEY)

    # One text instruction followed by one image part per page.
    images_base64 = pdf_to_base64_images(pdf_path)
    content = [
        {"type": "text", "text": "Analyze this laboratory report and extract the information as JSON."}
    ]
    content.extend(
        {
            "type": "image_url",
            "image_url": {
                "url": f"data:image/png;base64,{img_b64}",
                "detail": "high"
            }
        }
        for img_b64 in images_base64
    )

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt_json},
            {"role": "user", "content": content}
        ],
        max_tokens=4096,
        temperature=0
    )

    # The message content is optional in the OpenAI SDK, so it may be None here;
    # _parse_model_json reports that case through the same error dict.
    return _parse_model_json(response.choices[0].message.content)

## Pipeline 

In [5]:
# Make sure the output folder exists before the first result is written.
os.makedirs(JSON_PATH, exist_ok=True)

# Sorted for a reproducible processing order. Entries that are not PDF files
# (e.g. hidden system files or sub-folders) are skipped instead of being sent to the API.
for pdf_file in sorted(os.listdir(EXAMPLES_PATH)):
    if not pdf_file.lower().endswith(".pdf"):
        continue

    pdf_path = os.path.join(EXAMPLES_PATH, pdf_file)

    # Pipeline
    result = analyze_lab_report(pdf_path)
    print("\n" + "="*50)
    print("RÉSULTAT DE L'ANALYSE:")
    print("="*50)
    print(result)

    # Save the extraction next to the others, named after the source PDF.
    # Failed extractions (error dicts) are saved as well, so they stay inspectable.
    json_filename = os.path.splitext(pdf_file)[0] + ".json"
    output_file = os.path.join(JSON_PATH, json_filename)
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=2, ensure_ascii=False)


Converting 1 page(s) in base64 images...
  Page 1/1 done

RÉSULTAT DE L'ANALYSE:
{'Product': 'Mango', 'Product_EU': 'Mango', 'Substances': [{'Name': 'Azoxystrobine', 'Name_EU': 'Azoxystrobin', 'MRL': '4.0'}]}
Converting 2 page(s) in base64 images...
  Page 1/2 done
  Page 2/2 done

RÉSULTAT DE L'ANALYSE:
{'Product': 'piñas - fruta', 'Product_EU': 'pineapples - fruit', 'Substances': [{'Name': 'etefon', 'Name_EU': 'ethephon', 'MRL': '2 mg/kg'}]}
Converting 1 page(s) in base64 images...
  Page 1/1 done

RÉSULTAT DE L'ANALYSE:
{'Product': 'Ananas', 'Product_EU': 'Pineapple', 'Substances': [{'Name': 'Ethephon', 'Name_EU': 'Ethephon', 'MRL': '2'}, {'Name': 'Fludioxonil', 'Name_EU': 'Fludioxonil', 'MRL': '7'}]}


## MRL Compliance Analysis

In [6]:
def _parse_mrl_value(raw_value):
    """Convert an extracted MRL field to a float; return None when it cannot be interpreted.

    Accepts JSON numbers as well as strings such as "4.0", "2 mg/kg" or "0,5".
    """
    if raw_value is None or isinstance(raw_value, bool):
        return None
    if isinstance(raw_value, (int, float)):
        return float(raw_value)

    text = str(raw_value).lower().replace("mg/kg", "").strip()
    # Treat a lone comma as a decimal separator ("0,5" -> "0.5").
    if "," in text and "." not in text:
        text = text.replace(",", ".")
    try:
        return float(text)
    except ValueError:
        return None


# Sorted for a reproducible order; only the JSON files are read.
for json_file in sorted(os.listdir(JSON_PATH)):
    if not json_file.lower().endswith(".json"):
        continue

    json_path = os.path.join(JSON_PATH, json_file)
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Failed extractions are stored as {"error": ...}; report them instead of
    # silently producing no compliance result for that report.
    if not isinstance(data, dict) or "error" in data:
        print(f"Skipping '{json_file}': no usable extraction result.")
        continue

    product_name = data.get("Product_EU", "")

    for substance in data.get("Substances", []):

        substance_name = substance.get("Name_EU", "")
        # NOTE(review): a missing "MRL" key is still treated as "0", as before — confirm this is intended.
        raw_mrl = substance.get("MRL", "0")
        mrl_value = _parse_mrl_value(raw_mrl)

        if mrl_value is None:
            # One unreadable value must not abort the check of the remaining substances.
            print(f"Skipping substance '{substance_name}' in '{json_file}': unreadable MRL value {raw_mrl!r}.")
        else:
            fetch_mrl_data(product_name, substance_name, mrl_value)
        print("\n" + "-"*50)
    

MRL data found for product 'Mango' and substance 'Azoxystrobin'.
MRL options from EU database: No MRL required
MRL value from report: 4.0

COMPLIANCE RESULT:
CONFORME

--------------------------------------------------

--------------------------------------------------
MRL data found for product 'Pineapple' and substance 'Ethephon'.
MRL options from EU database: 0.03*
MRL value from report: 2.0

COMPLIANCE RESULT:
NON CONFORME

--------------------------------------------------
MRL data found for product 'Pineapple' and substance 'Fludioxonil'.
MRL options from EU database: 0.01*
MRL value from report: 7.0

COMPLIANCE RESULT:
NON CONFORME

--------------------------------------------------
