In [44]:
import os
import json
import csv

def create_data_list():
    # Set the directory path
    # base_dir = "test_data/labels"
    # base_dir = os.path.join(os.path.dirname(__file__), "test_data/labels")
    base_dir = os.path.join(os.getcwd(), "../test_data/labels")
    
    # Initialize a list to hold the rows for the CSV file
    rows = []
    # Iterate over each instance directory in `all_labels`
    for instance_dir in os.listdir(base_dir):
        instance_path = os.path.join(base_dir, instance_dir)
        
        # Ensure we're looking at a directory
        if os.path.isdir(instance_path):
            # Get the list of image files in the directory
            image_files = [file for file in os.listdir(instance_path) if file.endswith(('.png', '.jpg', '.jpeg'))]
            
            # Read the JSON object from expected_output.json
            json_path = os.path.join(instance_path, "expected_output.json")
            if os.path.exists(json_path):
                with open(json_path, "r") as json_file:
                    expected_output = json.load(json_file)
            else:
                expected_output = None  # If no expected_output.json file is found, set to None or handle as needed
            # Append the row to the list with the image files list and JSON object
            rows.append([instance_dir, image_files, json.dumps(expected_output, ensure_ascii=False)])
    return rows

def create_output_csv():
    output_csv = "test_data.csv"
    # Write to the CSV file
    with open(output_csv, "w", newline="") as csvfile:
        writer = csv.writer(csvfile)
        # Write the header
        writer.writerow(["instance_path", "image_files", "expected_output"])
        # Write each row
        rows = create_data_list()
        for row in rows:
            writer.writerow(row)
    print(f"CSV file '{output_csv}' created successfully.")


In [45]:
create_output_csv()

CSV file 'test_data.csv' created successfully.


In [46]:
import pandas as pd

df = pd.read_csv('test_data.csv')

devset = df.sample(frac=0.3, random_state=2)

In [47]:
devset.head()

Unnamed: 0,instance_path,image_files,expected_output
13,label_011,['img_001.png'],"{""company_name"": ""Advancing Eco Agriculture, L..."
14,label_010,['img_001.png'],"{""company_name"": ""Synagri L.P./S.E.C."", ""compa..."
26,label_001,['img_001.png'],"{""company_name"": ""Nature's aid"", ""company_addr..."
9,label_018,"['img_001.jpg', 'img_002.jpg']","{""company_name"": ""PREMIER TECH HOME & GARDEN I..."
28,label_024,['img_001.png'],"{""company_name"": ""Matériaux Paysagers Savaria ..."


In [10]:
devset.to_csv('devset.csv', index=False)

In [49]:
# import sys
# import os
# sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))  # Adjust path to the project root

In [50]:
import os
from dotenv import load_dotenv
import dspy
from pipeline_new.components.label import LabelStorage
from pipeline_new.components.ocr import OCR
from pipeline_new.schemas.inspection import FertilizerInspection

In [74]:
SUPPORTED_MODELS = {
    "gpt-3.5-turbo": {
        "max_tokens": 12000,
        "api_version": "2024-02-01",
        "response_format": {"type": "json_object"},
    },
    "gpt-4o": {
        "max_tokens": None,
        "api_version": "2024-02-15-preview",
        "response_format": {"type": "json_object"},
    },
}

REQUIREMENTS = """
The content of keys with the suffix _en must be in English.
The content of keys with the suffix _fr must be in French.
Translation of the text is prohibited.
You are prohibited from generating any text that is not part of the JSON.
The JSON must contain exclusively keys specified in "keys".
"""

# Signatures
class Inspector(dspy.Signature):
    """
    You are a fertilizer label inspector working for the Canadian Food Inspection Agency.
    Your task is to classify all information present in the provided text using the specified keys.
    Your response should be accurate, intelligible, information in JSON, and contain all the text from the provided text.
    """

    text: str = dspy.InputField(
        desc="The text of the fertilizer label extracted using OCR."
    )

    # TODO remove the depency on the pseudo prompt engineering
    requirements: str = dspy.InputField(
        desc="The instructions and guidelines to follow."
    )

    inspection: FertilizerInspection = dspy.OutputField(desc="The inspection results.")

class LanguageProgram(dspy.Module):
    def __init__(self, azure_openai_key, azure_openai_endpoint, azure_ocr_key, azure_ocr_endpoint, deployment_id):
        # initialize all the components to be used in the forward method
        lm = dspy.LM(
            model=f"azure/{deployment_id}",
            api_base=azure_openai_endpoint,
            api_key=azure_openai_key,
            max_tokens=SUPPORTED_MODELS.get(deployment_id)["max_tokens"],
            api_version=SUPPORTED_MODELS.get(deployment_id)["api_version"],
        )
        dspy.configure(lm=lm)
        self.ocr = OCR(azure_ocr_endpoint, azure_ocr_key)
        self.label_storage = LabelStorage()
        self.inspector = dspy.TypedChainOfThought(Inspector)

    def forward(self, image_paths):
        print("loading images...")
        for image_path in image_paths:
            self.label_storage.add_image(image_path)

        print("turning images into pdfs...")
        document = self.label_storage.get_document()
        
        print("sending the pdf to ocr...")
        ocr_results = self.ocr.extract_text(document=document)

        print("sending the text to llm...")
        inspection = self.inspector(text=ocr_results.content, requirements=REQUIREMENTS)

        print("done")
        # self.label_storage.clear()

        return inspection

In [71]:
from ast import literal_eval

def call_dspy(images, instance):
    load_dotenv()    

    required_vars = [
        "AZURE_API_ENDPOINT",
        "AZURE_API_KEY",
        "AZURE_OPENAI_ENDPOINT",
        "AZURE_OPENAI_KEY",
        "AZURE_OPENAI_DEPLOYMENT",
    ]
    missing_vars = [var for var in required_vars if not os.getenv(var)]
    if missing_vars:
        raise RuntimeError(f"Missing required environment variables: {', '.join(missing_vars)}")

    AZURE_API_ENDPOINT = os.getenv('AZURE_API_ENDPOINT')
    AZURE_API_KEY = os.getenv('AZURE_API_KEY')
    AZURE_OPENAI_ENDPOINT = os.getenv('AZURE_OPENAI_ENDPOINT')
    AZURE_OPENAI_KEY = os.getenv('AZURE_OPENAI_KEY')
    AZURE_OPENAI_DEPLOYMENT = os.getenv('AZURE_OPENAI_DEPLOYMENT')

    images_path = []
    images = literal_eval(images)
    for image in images:
        file_path = "../test_data/labels/" + instance + "/" + image
        full_path = os.path.join(os.getcwd(), file_path)
        images_path.append(full_path)
        
    # print(images_path)
    
    # test_image = os.path.join(os.getcwd(), "../test_data/labels/label_011/")
    # test_image = os.path.join('/Users/saratavakoli/Documents/sara/CFIA/fertiscan-pipeline/notebooks/../test_data/labels/label_011')
    # print(test_image)

    language_program = LanguageProgram(AZURE_OPENAI_KEY, AZURE_OPENAI_ENDPOINT,AZURE_API_KEY,AZURE_API_ENDPOINT, AZURE_OPENAI_DEPLOYMENT)

    inspection = language_program.forward(images_path)

    return inspection.inspection

In [60]:
inspection = call_dspy(['img_001.png'], "label_011")

['/Users/saratavakoli/Documents/sara/CFIA/fertiscan-pipeline/notebooks/../test_data/labels/label_011/img_001.png']
/Users/saratavakoli/Documents/sara/CFIA/fertiscan-pipeline/notebooks/../test_data/labels/label_011
loading images...
turning images into pdfs...
sending the pdf to ocr...
sending the text to llm...
done


In [68]:
df.head()

Unnamed: 0,instance_path,image_files,expected_output
0,label_004,['img_001.png'],"{""company_name"": ""Evergreen liquid plant food ..."
1,label_003,['img_001.png'],"{""company_name"": ""Novago Coopérative"", ""compan..."
2,label_035,"['img_001.png', 'img_002.png']","{""company_name"": ""Gloco"", ""company_address"": ""..."
3,label_032,"['img_001.png', 'img_003.png', 'img_002.png']","{""company_name"": ""Novago Coopérative"", ""compan..."
4,label_033,"['img_001.png', 'img_002.png']","{""company_name"": ""La Coop fédérée"", ""company_a..."


In [75]:
def process_row(row):
    try:
        print(row['instance_path'])
        # Call your function and process the row
        inspection_dict = call_dspy(row['image_files'], row['instance_path']).dict()
        return json.dumps(inspection_dict, ensure_ascii=False)
    except Exception as e:
        # Print the error for debugging and return None
        print(f"Error processing row with instance_path {row['instance_path']}: {e}")
        return None
    
df['dspy_output'] = df.apply(process_row, axis=1)

label_004
loading images...
turning images into pdfs...
sending the pdf to ocr...
sending the text to llm...
done
label_003
loading images...
turning images into pdfs...
sending the pdf to ocr...
sending the text to llm...
done
label_035
loading images...
turning images into pdfs...
sending the pdf to ocr...
sending the text to llm...
done
label_032
loading images...
turning images into pdfs...
sending the pdf to ocr...
sending the text to llm...
done
label_033
loading images...
turning images into pdfs...
sending the pdf to ocr...
sending the text to llm...
done
label_034
loading images...
turning images into pdfs...
sending the pdf to ocr...
sending the text to llm...
done
label_002
loading images...
turning images into pdfs...
sending the pdf to ocr...
sending the text to llm...
done
label_005
loading images...
turning images into pdfs...
sending the pdf to ocr...
sending the text to llm...
done
label_027
loading images...
turning images into pdfs...
sending the pdf to ocr...
sendin

In [76]:
df.head()

Unnamed: 0,instance_path,image_files,expected_output,dspy_output
0,label_004,['img_001.png'],"{""company_name"": ""Evergreen liquid plant food ...","{""company_name"": ""Evergreen Liquid Plant Food ..."
1,label_003,['img_001.png'],"{""company_name"": ""Novago Coopérative"", ""compan...","{""company_name"": ""Novago Coopérative"", ""compan..."
2,label_035,"['img_001.png', 'img_002.png']","{""company_name"": ""Gloco"", ""company_address"": ""...","{""company_name"": ""9288-3354 Quebec inc."", ""com..."
3,label_032,"['img_001.png', 'img_003.png', 'img_002.png']","{""company_name"": ""Novago Coopérative"", ""compan...","{""company_name"": ""Sollio Agriculture"", ""compan..."
4,label_033,"['img_001.png', 'img_002.png']","{""company_name"": ""La Coop fédérée"", ""company_a...","{""company_name"": ""La Coop fédérée"", ""company_a..."


In [77]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35 entries, 0 to 34
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   instance_path    35 non-null     object
 1   image_files      35 non-null     object
 2   expected_output  35 non-null     object
 3   dspy_output      35 non-null     object
dtypes: object(4)
memory usage: 1.2+ KB


In [79]:
df.describe()

Unnamed: 0,instance_path,image_files,expected_output,dspy_output
count,35,35,35,35
unique,35,8,35,35
top,label_004,['img_001.png'],"{""company_name"": ""Evergreen liquid plant food ...","{""company_name"": ""Evergreen Liquid Plant Food ..."
freq,1,17,1,1


In [80]:
import re

def preprocess_string(input_string):
    """
    Preprocesses a string by:
    1. Converting to lowercase.
    2. Removing extra whitespace, punctuation, and special characters.

    Args:
        input_string (str): The input string to preprocess.

    Returns:
        str: The preprocessed string.
    """
    # Step 1: Convert to lowercase
    lowercased = input_string.lower()
    
    # Step 2: Remove punctuation and special characters using regex
    cleaned = re.sub(r'[^a-z0-9\s]', '', lowercased)
    
    # Step 3: Remove extra whitespace
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    
    return cleaned

In [None]:
def process_row(row):
    try:
        print(row['instance_path'])
        # Call your function and process the row
        inspection_dict = call_dspy(row['image_files'], row['instance_path']).dict()
        return json.dumps(inspection_dict, ensure_ascii=False)
    except Exception as e:
        # Print the error for debugging and return None
        print(f"Error processing row with instance_path {row['instance_path']}: {e}")
        return None
    
df['dspy_output'] = df.apply(process_row, axis=1)

# Test

In [29]:
string1_lower = inspection.dict().get("company_name")
out = json.loads(devset.iloc[0].expected_output)
string2_lower = out.get("company_name")

similarity_score = jellyfish.levenshtein_distance(string1_lower, string2_lower)
print(f"levenshtein_distance score: {similarity_score}")

similarity_score = jellyfish.damerau_levenshtein_distance(string1_lower, string2_lower)
print(f"damerau_levenshtein_distance score: {similarity_score}")

similarity_score = jellyfish.hamming_distance(string1_lower, string2_lower)
print(f"hamming_distance score: {similarity_score}")

similarity_score = jellyfish.jaro_similarity(string1_lower, string2_lower)
print(f"Jaro similarity score: {similarity_score}")

similarity_score = jellyfish.jaro_winkler_similarity(string1_lower, string2_lower)
print(f"Jaro-Winkler similarity score: {similarity_score}")

similarity_score = jellyfish.match_rating_comparison(string1_lower, string2_lower)
print(f"match_rating_comparison score: {similarity_score}")

levenshtein_distance score: 0
damerau_levenshtein_distance score: 0
hamming_distance score: 0
Jaro similarity score: 1.0
Jaro-Winkler similarity score: 1.0
match_rating_comparison score: None


In [21]:
out = json.loads(devset.iloc[0].expected_output)

In [24]:
out.get("company_name")

'Advancing Eco Agriculture, LLC'

In [27]:
print(string1_lower, string2_lower)

Advancing Eco Agriculture, LLC Advancing Eco Agriculture, LLC


In [43]:
os.getcwd()

'/Users/saratavakoli/Documents/sara/CFIA/fertiscan-pipeline/notebooks'

In [64]:
inspection.dict(

FertilizerInspection(company_name='Advancing Eco Agriculture, LLC', company_address='4551 Parks West Road, Middlefield, OH 44062 USA', company_website='www.advancingecoag.com', company_phone_number='+18004956603', manufacturer_name=None, manufacturer_address=None, manufacturer_website=None, manufacturer_phone_number=None, fertiliser_name='Accelerate™M', registration_number='2020134A', lot_number=None, weight=[Value(value=3.78, unit='L'), Value(value=4.23, unit='kg'), Value(value=208.2, unit='L'), Value(value=233.18, unit='kg'), Value(value=18.92, unit='L'), Value(value=21.19, unit='kg'), Value(value=113.56, unit='L'), Value(value=127.18, unit='kg'), Value(value=1041.0, unit='L'), Value(value=1165.9, unit='kg')], density=Value(value=1.12, unit='kg/L'), volume=None, npk='0-1-1', guaranteed_analysis_en=GuaranteedAnalysis(title='Guaranteed Minimum Analysis', nutrients=[NutrientValue(nutrient='Available Phosphate (P2Os)', value=1.0, unit='%'), NutrientValue(nutrient='Soluble Potash (K2O)', 