In [58]:
import os
import json
import csv

def create_data_list(base_directory, expected_output_file_name):
    base_dir = os.path.join(os.getcwd(), base_directory)
    
    rows = []
    for instance_dir in os.listdir(base_dir):
        instance_path = os.path.join(base_dir, instance_dir)
        
        if os.path.isdir(instance_path):
            image_files = [file for file in os.listdir(instance_path) if file.endswith(('.png', '.jpg', '.jpeg'))]
            json_path = os.path.join(instance_path,expected_output_file_name)
            if os.path.exists(json_path):
                with open(json_path, "r") as json_file:
                    expected_output = json.load(json_file)
            else:
                expected_output = None
            rows.append([instance_dir, image_files, json.dumps(expected_output, ensure_ascii=False)])
    return rows

def create_output_csv(data_source_file):
    output_csv = data_source_file
    with open(output_csv, "w", newline="") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["instance_path", "image_files", "expected_output"])
        rows = create_data_list("../test_data/labels", "expected_output.json")
        for row in rows:
            writer.writerow(row)
    print(f"CSV file '{output_csv}' created successfully.")


In [59]:
create_output_csv("test_data.csv")

CSV file 'test_data.csv' created successfully.


# Importing DSPy related modules.
The first cell after this cell is just for notebook use, not in source code.

In [60]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

In [61]:
from dotenv import load_dotenv
import dspy
from pipeline_new.components.label import LabelStorage
from pipeline_new.components.ocr import OCR
from pipeline_new.schemas.inspection import FertilizerInspection

In [62]:
SUPPORTED_MODELS = {
    "gpt-3.5-turbo": {
        "max_tokens": 12000,
        "api_version": "2024-02-01",
        "response_format": {"type": "json_object"},
    },
    "gpt-4o": {
        "max_tokens": None,
        "api_version": "2024-02-15-preview",
        "response_format": {"type": "json_object"},
    },
}

REQUIREMENTS = """
The content of keys with the suffix _en must be in English.
The content of keys with the suffix _fr must be in French.
Translation of the text is prohibited.
You are prohibited from generating any text that is not part of the JSON.
The JSON must contain exclusively keys specified in "keys".
"""

# Signatures
class Inspector(dspy.Signature):
    """
    You are a fertilizer label inspector working for the Canadian Food Inspection Agency.
    Your task is to classify all information present in the provided text using the specified keys.
    Your response should be accurate, intelligible, information in JSON, and contain all the text from the provided text.
    """

    text: str = dspy.InputField(
        desc="The text of the fertilizer label extracted using OCR."
    )

    # TODO remove the depency on the pseudo prompt engineering
    requirements: str = dspy.InputField(
        desc="The instructions and guidelines to follow."
    )

    inspection: FertilizerInspection = dspy.OutputField(desc="The inspection results.")

class LanguageProgram(dspy.Module):
    def __init__(self, azure_openai_key, azure_openai_endpoint, azure_ocr_key, azure_ocr_endpoint, deployment_id):
        # initialize all the components to be used in the forward method
        lm = dspy.LM(
            model=f"azure/{deployment_id}",
            api_base=azure_openai_endpoint,
            api_key=azure_openai_key,
            max_tokens=SUPPORTED_MODELS.get(deployment_id)["max_tokens"],
            api_version=SUPPORTED_MODELS.get(deployment_id)["api_version"],
        )
        dspy.configure(lm=lm)
        self.ocr = OCR(azure_ocr_endpoint, azure_ocr_key)
        self.label_storage = LabelStorage()
        self.inspector = dspy.TypedChainOfThought(Inspector)

    def forward(self, image_paths):
        print("loading images...")
        for image_path in image_paths:
            self.label_storage.add_image(image_path)

        print("turning images into pdfs...")
        document = self.label_storage.get_document()
        
        print("sending the pdf to ocr...")
        ocr_results = self.ocr.extract_text(document=document)

        print("sending the text to llm...")
        inspection = self.inspector(text=ocr_results.content, requirements=REQUIREMENTS)

        print("done")
        # self.label_storage.clear()

        return inspection

In [63]:
SUPPORTED_MODELS = {
    "gpt-3.5-turbo": {
        "max_tokens": 12000,
        "api_version": "2024-02-01",
        "response_format": {"type": "json_object"},
    },
    "gpt-4o": {
        "max_tokens": None,
        "api_version": "2024-02-15-preview",
        "response_format": {"type": "json_object"},
    },
}

REQUIREMENTS = """
The content of keys with the suffix _en must be in English.
The content of keys with the suffix _fr must be in French.
Translation of the text is prohibited.
You are prohibited from generating any text that is not part of the JSON.
The JSON must contain exclusively keys specified in "keys".
"""

# Signatures
class Inspector(dspy.Signature):
    """
    You are a fertilizer label inspector working for the Canadian Food Inspection Agency.
    Your task is to classify all information present in the provided text using the specified keys.
    Your response should be accurate, intelligible, information in JSON, and contain all the text from the provided text.
    """

    text: str = dspy.InputField(
        desc="The text of the fertilizer label extracted using OCR."
    )

    # TODO remove the depency on the pseudo prompt engineering
    requirements: str = dspy.InputField(
        desc="The instructions and guidelines to follow."
    )

    inspection: FertilizerInspection = dspy.OutputField(desc="The inspection results.")

class LanguageProgram(dspy.Module):
    def __init__(self, azure_openai_key, azure_openai_endpoint, azure_ocr_key, azure_ocr_endpoint, deployment_id):
        # initialize all the components to be used in the forward method
        lm = dspy.LM(
            model=f"azure/{deployment_id}",
            api_base=azure_openai_endpoint,
            api_key=azure_openai_key,
            max_tokens=SUPPORTED_MODELS.get(deployment_id)["max_tokens"],
            api_version=SUPPORTED_MODELS.get(deployment_id)["api_version"],
        )
        dspy.configure(lm=lm)
        self.ocr = OCR(azure_ocr_endpoint, azure_ocr_key)
        self.label_storage = LabelStorage()
        self.inspector = dspy.TypedChainOfThought(Inspector)

    def forward(self, image_paths):
        print("loading images...")
        for image_path in image_paths:
            self.label_storage.add_image(image_path)

        print("turning images into pdfs...")
        document = self.label_storage.get_document()
        
        print("sending the pdf to ocr...")
        ocr_results = self.ocr.extract_text(document=document)

        print("sending the text to llm...")
        inspection = self.inspector(text=ocr_results.content, requirements=REQUIREMENTS)

        print("done")
        # self.label_storage.clear()

        return inspection

In [64]:
from ast import literal_eval

def call_dspy(images, instance):
    load_dotenv()    

    required_vars = [
        "AZURE_API_ENDPOINT",
        "AZURE_API_KEY",
        "AZURE_OPENAI_ENDPOINT",
        "AZURE_OPENAI_KEY",
        "AZURE_OPENAI_DEPLOYMENT",
    ]
    missing_vars = [var for var in required_vars if not os.getenv(var)]
    if missing_vars:
        raise RuntimeError(f"Missing required environment variables: {', '.join(missing_vars)}")

    AZURE_API_ENDPOINT = os.getenv('AZURE_API_ENDPOINT')
    AZURE_API_KEY = os.getenv('AZURE_API_KEY')
    AZURE_OPENAI_ENDPOINT = os.getenv('AZURE_OPENAI_ENDPOINT')
    AZURE_OPENAI_KEY = os.getenv('AZURE_OPENAI_KEY')
    AZURE_OPENAI_DEPLOYMENT = os.getenv('AZURE_OPENAI_DEPLOYMENT')

    images_path = []
    images = literal_eval(images)
    for image in images:
        file_path = "../test_data/labels/" + instance + "/" + image
        full_path = os.path.join(os.getcwd(), file_path)
        images_path.append(full_path)

    language_program = LanguageProgram(AZURE_OPENAI_KEY, AZURE_OPENAI_ENDPOINT,AZURE_API_KEY,AZURE_API_ENDPOINT, AZURE_OPENAI_DEPLOYMENT)

    inspection = language_program.forward(images_path)

    return inspection.inspection

In [65]:
import pandas as pd

df = pd.read_csv('test_data.csv')

In [66]:
df.head()

Unnamed: 0,instance_path,image_files,expected_output
0,label_004,['img_001.png'],"{""company_name"": ""Evergreen liquid plant food ..."
1,label_003,['img_001.png'],"{""company_name"": ""Novago Coopérative"", ""compan..."
2,label_035,"['img_001.png', 'img_002.png']","{""company_name"": ""Gloco"", ""company_address"": ""..."
3,label_032,"['img_001.png', 'img_003.png', 'img_002.png']","{""company_name"": ""Novago Coopérative"", ""compan..."
4,label_033,"['img_001.png', 'img_002.png']","{""company_name"": ""La Coop fédérée"", ""company_a..."


In [67]:
def process_row(row):
    try:
        print(row['instance_path'])
        # Call your function and process the row
        inspection_dict = call_dspy(row['image_files'], row['instance_path']).dict()
        return json.dumps(inspection_dict, ensure_ascii=False)
    except Exception as e:
        # Print the error for debugging and return None
        print(f"Error processing row with instance_path {row['instance_path']}: {e}")
        return None
    
df['dspy_output'] = df.apply(process_row, axis=1)

label_004
loading images...
turning images into pdfs...
sending the pdf to ocr...
sending the text to llm...
done
label_003
loading images...
turning images into pdfs...
sending the pdf to ocr...
sending the text to llm...
done
label_035
loading images...
turning images into pdfs...
sending the pdf to ocr...
sending the text to llm...
done
label_032
loading images...
turning images into pdfs...
sending the pdf to ocr...


KeyboardInterrupt: 

In [None]:
df.head()

Unnamed: 0,instance_path,image_files,expected_output,dspy_output
0,label_004,['img_001.png'],"{""company_name"": ""Evergreen liquid plant food ...","{""company_name"": ""Evergreen Liquid Plant Food ..."
1,label_003,['img_001.png'],"{""company_name"": ""Novago Coopérative"", ""compan...","{""company_name"": ""Novago Coopérative"", ""compan..."
2,label_035,"['img_001.png', 'img_002.png']","{""company_name"": ""Gloco"", ""company_address"": ""...","{""company_name"": ""9288-3354 Quebec inc."", ""com..."
3,label_032,"['img_001.png', 'img_003.png', 'img_002.png']","{""company_name"": ""Novago Coopérative"", ""compan...","{""company_name"": ""Sollio Agriculture"", ""compan..."
4,label_033,"['img_001.png', 'img_002.png']","{""company_name"": ""La Coop fédérée"", ""company_a...","{""company_name"": ""La Coop fédérée"", ""company_a..."


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35 entries, 0 to 34
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   instance_path    35 non-null     object
 1   image_files      35 non-null     object
 2   expected_output  35 non-null     object
 3   dspy_output      35 non-null     object
dtypes: object(4)
memory usage: 1.2+ KB


In [None]:
df.to_csv('test_data_output.csv', index=False)

In [None]:
def faltten_jsons(df, json_columns):
    for col in json_columns:
        df[col] = df[col].apply(json.loads)
        flattened_col1 = pd.json_normalize(df[col])
        df = pd.concat([df, flattened_col1.add_prefix(col + "_")], axis=1)
    return df

In [None]:
df_flatten = df.copy()

df_flatten = faltten_jsons(df_flatten, ['expected_output', 'dspy_output'])

In [None]:
df_flatten.head()

Unnamed: 0,instance_path,image_files,expected_output,dspy_output,expected_output_company_name,expected_output_company_address,expected_output_company_website,expected_output_company_phone_number,expected_output_manufacturer_name,expected_output_manufacturer_address,...,dspy_output_guaranteed_analysis_en.nutrients,dspy_output_guaranteed_analysis_en.is_minimal,dspy_output_guaranteed_analysis_en,dspy_output_density.value,dspy_output_density.unit,dspy_output_guaranteed_analysis_fr.title,dspy_output_guaranteed_analysis_fr.nutrients,dspy_output_guaranteed_analysis_fr.is_minimal,dspy_output_volume.value,dspy_output_volume.unit
0,label_004,['img_001.png'],{'company_name': 'Evergreen liquid plant food ...,{'company_name': 'Evergreen Liquid Plant Food ...,Evergreen liquid plant food ltd,"790 st charles street x. rr1, Breslau, Ontario...",,,,,...,"[{'nutrient': 'Total Nitrogen (N)', 'value': 0...",True,,,,,,,,
1,label_003,['img_001.png'],"{'company_name': 'Novago Coopérative', 'compan...","{'company_name': 'Novago Coopérative', 'compan...",Novago Coopérative,"839, rue papineau, Joliette, QC J6E 2L6",,,,,...,,,,97.46,kg/hl,ANALYSE GARANTIE MINIMALE,"[{'nutrient': 'AZOTE TOTALE (N)', 'value': 15....",True,,
2,label_035,"['img_001.png', 'img_002.png']","{'company_name': 'Gloco', 'company_address': '...","{'company_name': '9288-3354 Quebec inc.', 'com...",Gloco,"26, route Maritime, Forestville, Quebec G0T 1E0",http://bionik.ca,,,,...,"[{'nutrient': 'Total nitrogen (N)', 'value': 0...",True,,,,ANALYSE MINIMUM GARANTIE,"[{'nutrient': 'Azote total (N)', 'value': 0.6,...",True,,
3,label_032,"['img_001.png', 'img_003.png', 'img_002.png']","{'company_name': 'Novago Coopérative', 'compan...","{'company_name': 'Sollio Agriculture', 'compan...",Novago Coopérative,"60 rue Fenne, St-Jacques, Québec, JOK 2R0",,,Sollio Agriculture,"Montréal, Québec H4N 3H7",...,,,,107.39,KG/HL,ANALYSE MINIMALE GARANTIE,"[{'nutrient': 'AZOTE TOTAL (N)', 'value': 9.0,...",True,,
4,label_033,"['img_001.png', 'img_002.png']","{'company_name': 'La Coop fédérée', 'company_a...","{'company_name': 'La Coop fédérée', 'company_a...",La Coop fédérée,"4225, rue Saint-Joseph, Trois-Rivières (Québec...",,,,,...,"[{'nutrient': 'Total nitrogen (N)', 'value': 2...",True,,,,ANALYSE MINIMALE GARANTIE,"[{'nutrient': 'Azote total (N)', 'value': 2.0,...",True,,


In [None]:
df_flatten.isna().sum()

instance_path                                     0
image_files                                       0
expected_output                                   0
dspy_output                                       0
expected_output_company_name                      2
                                                 ..
dspy_output_guaranteed_analysis_fr.is_minimal    11
dspy_output_volume.value                         26
dspy_output_volume.unit                          26
expected_output_company_name_cleaned              2
dspy_output_company_name_cleaned                  0
Length: 66, dtype: int64

# company_name

In [None]:
exact_match_num = len(df_flatten[df_flatten['expected_output_company_name'] == df_flatten['dspy_output_company_name']])
print("There are " + str(exact_match_num) + " instances with the exact same company name")
print("Percentage of exact matches: " + str(exact_match_num/len(df_flatten[df_flatten['expected_output_company_name'].notna()]) * 100) + "%")

There are 15 instances with the exact same company name
Percentage of exact matches: 45.45454545454545%


In [None]:
not_match = df_flatten[df_flatten['expected_output_company_name'] != df_flatten['dspy_output_company_name']]

In [None]:
not_match[['instance_path', 'expected_output_company_name', 'dspy_output_company_name']]

Unnamed: 0,instance_path,expected_output_company_name,dspy_output_company_name
0,label_004,Evergreen liquid plant food ltd,Evergreen Liquid Plant Food Ltd.
2,label_035,Gloco,9288-3354 Quebec inc.
3,label_032,Novago Coopérative,Sollio Agriculture
6,label_002,synagri,synAgri
7,label_005,"K.T Sunny, inc","K.T. Sunny, Inc."
8,label_027,Synagri s.e.c./l.p,Synagri s.e.c./1.p.
9,label_018,PREMIER TECH HOME & GARDEN INC.,Premier Tech Home & Garden Inc.
12,label_029,Synagri s.e.c./l.p.,Synagri s.e.c./I.p.
15,label_017,PREMIER TECH HOME & GARDEN INC.,Premier Tech Home & Garden Inc.
16,label_028,"Cameron Chemicals, Inc","Cameron Chemicals, Inc."


In [None]:
import re

def preprocess_string(input_string):
    """
    Preprocesses a string by:
    1. Converting to lowercase.
    2. Removing extra whitespace, punctuation, and special characters.

    Args:
        input_string (str): The input string to preprocess.

    Returns:
        str: The preprocessed string.
    """
    if input_string is None:
        return None
    lowercased = input_string.lower()
    cleaned = re.sub(r'[^a-z0-9\s]', '', lowercased)
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    return cleaned

In [None]:
df_flatten['expected_output_company_name_cleaned'] = df_flatten['expected_output_company_name'].apply(preprocess_string)
df_flatten['dspy_output_company_name_cleaned'] = df_flatten['dspy_output_company_name'].apply(preprocess_string)

In [None]:
exact_match_num = len(df_flatten[df_flatten['expected_output_company_name_cleaned'] == df_flatten['dspy_output_company_name_cleaned']])
print("There are " + str(exact_match_num) + " instances with the exact same company name")
print("Percentage of exact matches: " + str(exact_match_num/len(df_flatten[df_flatten['expected_output_company_name'].notna()]) * 100) + "%")

There are 22 instances with the exact same company name
Percentage of exact matches: 66.66666666666666%


In [None]:
df_flatten[df_flatten['expected_output_company_name_cleaned'] != df_flatten['dspy_output_company_name_cleaned']][['instance_path', 'expected_output_company_name_cleaned', 'dspy_output_company_name_cleaned']]

Unnamed: 0,instance_path,expected_output_company_name_cleaned,dspy_output_company_name_cleaned
2,label_035,gloco,92883354 quebec inc
3,label_032,novago cooprative,sollio agriculture
8,label_027,synagri seclp,synagri sec1p
12,label_029,synagri seclp,synagri secip
17,label_021,big blue,lep
18,label_026,,eti maden mines and products
24,label_030,amp advanced micronutrient product inc,advanced micronutrient products inc
26,label_001,natures aid,diamond fertilizers inc
28,label_024,matriaux paysagers savaria lte,savaria matriaux paysagers savaria lte
29,label_012,contrecoeur synagri,synagri lpsec


Score

In [68]:
!pip install jellyfish



In [74]:
import jellyfish

company_name = df_flatten[df_flatten['expected_output_company_name'].notna()]
print(len(company_name))

company_name = company_name[['instance_path', 'expected_output_company_name', 'dspy_output_company_name', 'expected_output_company_name_cleaned', 'dspy_output_company_name_cleaned']]

company_name.head()

33


Unnamed: 0,instance_path,expected_output_company_name,dspy_output_company_name,expected_output_company_name_cleaned,dspy_output_company_name_cleaned
0,label_004,Evergreen liquid plant food ltd,Evergreen Liquid Plant Food Ltd.,evergreen liquid plant food ltd,evergreen liquid plant food ltd
1,label_003,Novago Coopérative,Novago Coopérative,novago cooprative,novago cooprative
2,label_035,Gloco,9288-3354 Quebec inc.,gloco,92883354 quebec inc
3,label_032,Novago Coopérative,Sollio Agriculture,novago cooprative,sollio agriculture
4,label_033,La Coop fédérée,La Coop fédérée,la coop fdre,la coop fdre


In [None]:
company_name['company_name_jw_score'] = company_name.apply(lambda x: jellyfish.jaro_winkler_similarity(x['expected_output_company_name'], x['dspy_output_company_name']), axis=1)
company_name.head()

Unnamed: 0,instance_path,expected_output_company_name,dspy_output_company_name,expected_output_company_name_cleaned,dspy_output_company_name_cleaned,company_name_jw_score
0,label_004,Evergreen liquid plant food ltd,Evergreen Liquid Plant Food Ltd.,evergreen liquid plant food ltd,evergreen liquid plant food ltd,0.920721
1,label_003,Novago Coopérative,Novago Coopérative,novago cooprative,novago cooprative,1.0
2,label_035,Gloco,9288-3354 Quebec inc.,gloco,92883354 quebec inc,0.0
3,label_032,Novago Coopérative,Sollio Agriculture,novago cooprative,sollio agriculture,0.546296
4,label_033,La Coop fédérée,La Coop fédérée,la coop fdre,la coop fdre,1.0


In [None]:
company_name['company_name_jw_score'].unique()

numpy.ndarray