In [2]:
import os
import pickle
import shutil
import tempfile
from glob import glob

from dotenv import load_dotenv

from pipeline import GPT, OCR, LabelStorage, analyze
from pipeline.inspection import FertilizerInspection


In [4]:
# Load environment variables
load_dotenv()

# Define label folder numbers dynamically from 1 to 35
label_folders = range(1, 36)
# label_folders = [1, 6, 8]

# Define possible image extensions
image_extensions = [".jpg", ".png"]

# Mock environment setup for OCR and GPT
api_endpoint_ocr = os.getenv("AZURE_API_ENDPOINT")
api_key_ocr = os.getenv("AZURE_API_KEY")
api_endpoint_gpt = os.getenv("AZURE_OPENAI_ENDPOINT")
api_key_gpt = os.getenv("AZURE_OPENAI_KEY")
api_deployment_gpt = os.getenv("AZURE_OPENAI_DEPLOYMENT")

# Initialize OCR and GPT objects (reusable)
ocr = OCR(api_endpoint=api_endpoint_ocr, api_key=api_key_ocr)
gpt = GPT(
    api_endpoint=api_endpoint_gpt, api_key=api_key_gpt, deployment_id=api_deployment_gpt
)

# Create a temporary directory for image copies
with tempfile.TemporaryDirectory() as backup_dir:
    print(f"Backup directory created at: {backup_dir}")

    # Dictionary to store inspection results for all labels
    all_inspections = {}

    # Loop through each label folder
    for label_num in label_folders:
        label_folder = f"test_data/labels/label_{label_num:03d}"  # Format as label_001, label_002, etc.
        label_storage = (
            LabelStorage()
        )  # Initialize a new LabelStorage for each label folder

        # Find relevant image files in the label folder
        for ext in image_extensions:
            pattern = os.path.join(label_folder, f"img_*{ext}")
            image_files = glob(pattern)

            # Copy found images to the backup directory and use copies for processing
            for image_path in image_files:
                backup_path = os.path.join(backup_dir, os.path.basename(image_path))
                shutil.copy(image_path, backup_path)
                print(f"Copied {image_path} to {backup_path}")

                # Add the copied image to the label storage
                label_storage.add_image(backup_path)

        if label_storage.images:
            # Run the analyze function using the copied images
            inspection = analyze(label_storage, ocr, gpt)

            # Store the result in the dictionary with the label number as the key
            all_inspections[f"label_{label_num:03d}"] = inspection

            # Print the inspection results as they are processed
            print(f"\nLabel: label_{label_num:03d}")
            print(f"  Company website: {inspection.company_website}")
            print(f"  Manufacturer website: {inspection.manufacturer_website}")

    # Pickle all the results in a single file
    pickle.dump(all_inspections, open("all_inspections.pkl", "wb"))

print("\nAll inspections have been processed and saved to all_inspections.pkl")


Backup directory created at: /var/folders/43/4zssf0gs6sj_j_kzmzgdzx0m0000gn/T/tmpuwj3w81v
Copied test_data/labels/label_001/img_001.png to /var/folders/43/4zssf0gs6sj_j_kzmzgdzx0m0000gn/T/tmpuwj3w81v/img_001.png

Label: label_001
  Company website: www.soil-aid.com
  Manufacturer website: www.soil-aid.com
Copied test_data/labels/label_002/img_001.png to /var/folders/43/4zssf0gs6sj_j_kzmzgdzx0m0000gn/T/tmpuwj3w81v/img_001.png

Label: label_002
  Company website: None
  Manufacturer website: None
Copied test_data/labels/label_003/img_001.png to /var/folders/43/4zssf0gs6sj_j_kzmzgdzx0m0000gn/T/tmpuwj3w81v/img_001.png

Label: label_003
  Company website: None
  Manufacturer website: None
Copied test_data/labels/label_004/img_001.png to /var/folders/43/4zssf0gs6sj_j_kzmzgdzx0m0000gn/T/tmpuwj3w81v/img_001.png

Label: label_004
  Company website: None
  Manufacturer website: None
Copied test_data/labels/label_005/img_001.png to /var/folders/43/4zssf0gs6sj_j_kzmzgdzx0m0000gn/T/tmpuwj3w81v/img_

HttpResponseError: (InvalidRequest) Invalid request.
Code: InvalidRequest
Message: Invalid request.
Inner error: {
    "code": "InvalidContent",
    "message": "The file is corrupted or format is unsupported. Refer to documentation for the list of supported formats."
}

## Analysis

Do not mind the error. It's probably due to using `tempfile`. Will investigate later.

Before the fix, the main problems were: 
- `www.` missing in some websites
- mix of lowercase and uppercase chars

After the fix, from the results, we can see that the llm is handling pretty well the website fields formatting.

My concern is that, we can't be sure how it will react when all the other fields are loaded with descriptions (prompts). Will it be cognitively overloaded and hallucinate?

Should we just code the formatting? The lower case is already coded (post processing) and we can add `www.` when it is missing, but how cool is it to let the llm do it!

In [None]:
# # Load the pickled data
# with open("all_inspections.pkl", "rb") as f:
#     all_inspections: dict[str, FertilizerInspection] = pickle.load(f)

# for label, inspection in all_inspections.items():
#     print(f"Label: {label}")
#     print(f"  Company website: {inspection.company_website}")
#     print(f"  Manufacturer website: {inspection.manufacturer_website}")
#     print()
