In [None]:
!pip install -r requirements.txt
!pip install numpy --upgrade


In [None]:
!pip install tenacity

In [None]:
from src.extraction.llm_extractor import LLMExtractor
from src.extraction.output_parser import LLMOutputParser
from src.extraction.extraction_template import template
from src.extraction.extraction_responses import LLMResponse
import numpy as np 

In [None]:
import os

# Default output locations. error_log_path is the file the extract_and_save*
# helpers in this notebook append their failures to.
results_path = "./results/llama_multimodal/test"
error_log_path = f"{results_path}/error_log.txt"
results_name = "results_gpt_4_1_llama_multimodal.json"

# exist_ok=True keeps the cell re-runnable and avoids the check-then-create race.
os.makedirs(results_path, exist_ok=True)

In [None]:
# Short aliases -> pinned model snapshot identifiers handed to ChatOpenAI.
# NOTE(review): the "gpt-4-mini" alias points at a gpt-4o-mini snapshot while the
# experiment base names in this notebook say "gpt-4-1-mini" — confirm the intended model.
model_mapping = {
    "gpt-4-nano" : "gpt-4.1-nano-2025-04-14",
    "gpt-4-mini" : "gpt-4o-mini-2024-07-18",
}

In [None]:
from langchain_openai import ChatOpenAI
from core.settings import settings
# from src.llms.llama import LLama3_2_11B_V
# SECURITY: never commit an API key in the notebook. The key is read from the
# OPENAI_API_KEY environment variable (a missing variable fails fast with KeyError).
# The key that used to be hardcoded here has been exposed and should be revoked.
llm = ChatOpenAI(
    model=model_mapping["gpt-4-mini"],
    temperature=0,
    openai_api_key=os.environ["OPENAI_API_KEY"],
    max_retries=3
)

# llm_llama = LLama3_2_11B_V(temperature=1.5, top_p=0.9)

# Output parser configured with LLMResponse as its target type; it receives the
# same chat model instance (presumably to repair malformed outputs — TODO confirm).
output_parser = LLMOutputParser(
    serializable=LLMResponse,
    llm=llm,
)


# Extractor used by the text-only helpers (called with context + question only).
llm_extractor = LLMExtractor(
    llm=llm,
    output_parser=output_parser,
    extraction_template=template,
)

In [None]:
import pandas as pd

# NOTE(review): unpickling can execute arbitrary code — only load this file
# when it comes from a trusted source.
df = pd.read_pickle("./data/df_concat_with_text.pkl")

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Rows whose data_split is "val", as an independent copy of that slice of df.
df_val = df[df["data_split"]== "val"].copy()

In [None]:
# Display the validation subset.
df_val

In [None]:
# Rows whose data_split is "test", as an independent copy of that slice of df.
df_test = df[df["data_split"]== "test"].copy()

In [None]:

import os
async def extract_and_save(context,question, question_id, results_path=None):
    """Run the text-only extraction for one question and persist it as JSON.

    The result is written to ``{results_path}/{question_id}.json``. When that
    file already exists the question is skipped, so an interrupted run can be
    resumed. Exceptions are not propagated: they are appended to
    ``error_log_path`` and printed.
    """
    try:
        target_file = f"{results_path}/{question_id}.json"
        if not os.path.exists(target_file):
            extraction = await llm_extractor.aextract(
                context=context,
                question=question,
            )
            with open(target_file, "w") as sink:
                sink.write(extraction.model_dump_json())
    except Exception as err:
        with open(error_log_path, "a") as log:
            log.write(f"Error processing question {question_id}: {err}\n")
        print(f"Error processing question {question_id}: {err}")

In [None]:
import asyncio
from tqdm import tqdm


async def process_df(df, results_path=None):
    """Schedule one text-only extraction task per row of ``df`` and await them all.

    For every row the ``questionId``, ``question`` and ``text`` columns are
    passed to ``extract_and_save``, which writes one JSON file per question
    under ``results_path``.

    Args:
        df: frame with ``questionId``, ``question`` and ``text`` columns.
        results_path: directory the per-question JSON files are written to.

    Raises:
        ValueError: if ``results_path`` is empty or None.
    """
    if not results_path:
        raise ValueError("results_path must be provided")
    tasks_list = []
    for i, row in tqdm(df.iterrows(), total=len(df)):
        try:
            task = asyncio.create_task(
                extract_and_save(
                    context=row["text"],
                    question=row["question"],
                    question_id=row["questionId"],
                    results_path=results_path,
                )
            )
        except Exception as e:
            # Report the row instead of dropping it silently; keep going with the rest.
            print(f"Error scheduling row {i}: {e}")
            continue
        tasks_list.append(task)
        # Yield to the event loop for 0.1 s so already scheduled tasks can run
        # and task creation is spaced out.
        await asyncio.sleep(0.1)
    await asyncio.gather(*tasks_list)
            

In [None]:
# Preview the first five rows of the test subset.
df_test.head(5)

In [None]:
# await process_df(df_test)

In [None]:
# Count every entry in results_path (this includes non-JSON files such as the error log).
list_of_files = os.listdir(results_path)
len(list_of_files)

In [None]:
import json

def format_results_from_path(results_path, name):
    """Merge the per-question JSON files in ``results_path`` into one JSON list.

    Each ``<questionId>.json`` file is loaded and reduced to
    ``{"questionId": int, "answer": ...}``; entries without a ``.json``
    extension are ignored. The list is written to ``name`` ordered by
    ``questionId`` so the output is reproducible (``os.listdir`` returns
    entries in arbitrary order).

    Args:
        results_path: directory containing the per-question JSON files.
        name: path of the merged JSON file to write.

    Raises:
        Exception: any error while reading or parsing a file is printed with
            the file name and re-raised.
    """
    list_of_results = []
    for file in os.listdir(results_path):
        if not file.endswith(".json"):
            continue
        try:
            with open(os.path.join(results_path, file), "r") as f:
                data = json.load(f)
            list_of_results.append({
                # The file stem is the question id.
                "questionId": int(file.split(".")[0]),
                "answer": data["answer"],
            })
        except Exception as e:
            print(f"Error loading file {file}: {e}")
            raise
    list_of_results.sort(key=lambda item: item["questionId"])
    with open(name, "w") as f:
        json.dump(list_of_results, f, indent=4)
        

In [None]:
from src.extraction.llm_extractor import LLMExtractorMultimodal
from src.extraction.extraction_template import create_image_only_template

# Extractor used by every helper that passes an image; it shares the chat model,
# parser and text template with llm_extractor and additionally receives
# create_image_only_template (presumably for image-only prompts — TODO confirm).
llm_extractor_multimodal = LLMExtractorMultimodal(
    llm=llm,
    output_parser=output_parser,
    extraction_template=template,
    image_template_func=create_image_only_template,
)

In [None]:
async def extract_and_save_multimodal(context,question, question_id, image_base64, results_path=None):
    """Run the text + image extraction for one question and store it as JSON.

    Nothing is done when ``{results_path}/{question_id}.json`` already exists.
    Exceptions are not propagated; they are appended to ``error_log_path``.
    """
    try:
        output_file = f"{results_path}/{question_id}.json"
        if not os.path.exists(output_file):
            extraction = await llm_extractor_multimodal.aextract(
                context=context,
                question=question,
                image=image_base64,
            )
            with open(output_file, "w") as sink:
                sink.write(extraction.model_dump_json())
    except Exception as err:
        with open(error_log_path, "a") as log:
            log.write(f"Error processing question {question_id}: {err}\n")

In [None]:
import cv2
import base64
async def process_df_multimodal(df, results_path=None):
    """Schedule one text + image extraction task per row of ``df`` and await them all.

    Rows whose ``{results_path}/{questionId}.json`` already exists are skipped
    before the image is read. Otherwise the image at ``image_path`` is loaded
    with OpenCV, re-encoded as JPEG, base64-encoded and handed to
    ``extract_and_save_multimodal``. A failure while preparing a row is
    printed and the row is skipped.

    Args:
        df: frame with ``questionId``, ``question``, ``text`` and ``image_path`` columns.
        results_path: directory the per-question JSON files are written to.

    Returns:
        The list produced by ``asyncio.gather`` (one entry per scheduled task).

    Raises:
        ValueError: if ``results_path`` is empty or None.
    """
    if not results_path:
        raise ValueError("results_path must be provided")
    tasks_list = []
    for i, row in tqdm(df.iterrows(), total=len(df)):
        # Fallback label so the except branch never reads an unbound or stale id.
        question_id = f"<row {i}>"
        try:
            question_id = row["questionId"]
            question = row["question"]
            context = row["text"]
            image_path = row["image_path"]
            saving_path = f"{results_path}/{question_id}.json"
            if os.path.exists(saving_path):
                continue
            image = cv2.imread(image_path)
            if image is None:
                # cv2.imread reports a missing/unreadable file by returning None.
                raise FileNotFoundError(f"could not read image {image_path}")
            # encode the image as base64
            _, buffer = cv2.imencode('.jpg', image)
            image_base64 = base64.b64encode(buffer).decode('utf-8')
            task = asyncio.create_task(
                extract_and_save_multimodal(
                    context=context,
                    question=question,
                    question_id=question_id,
                    image_base64=image_base64,
                    results_path=results_path,
                )
            )
            tasks_list.append(task)
            # Yield to the event loop for 0.1 s so scheduled tasks can progress.
            await asyncio.sleep(0.1)
        except Exception as e:
            print(f"Error processing question {question_id}: {e}")
            continue
    return await asyncio.gather(*tasks_list)
    

In [None]:
# results= await process_df_multimodal(df_test)

In [None]:
async def extract_and_save_only_image(question, question_id, image_base64, results_path=None):
    """Run the image-only extraction (no text context) for one question and store it.

    The question is skipped when ``{results_path}/{question_id}.json`` already
    exists. Exceptions are swallowed after being appended to ``error_log_path``.
    """
    try:
        destination = f"{results_path}/{question_id}.json"
        if os.path.exists(destination):
            return None
        response = await llm_extractor_multimodal.aextract(
            question=question,
            image=image_base64,
        )
        serialized = response.model_dump_json()
        with open(destination, "w") as handle:
            handle.write(serialized)
    except Exception as exc:
        with open(error_log_path, "a") as log_file:
            log_file.write(f"Error processing question {question_id}: {exc}\n")

In [None]:

async def process_df_only_image(df, results_path=None):
    """Schedule one image-only extraction task per row of ``df`` and await them all.

    Rows whose ``{results_path}/{questionId}.json`` already exists are skipped
    before the image is read. Otherwise the image at ``image_path`` is loaded
    with OpenCV, re-encoded as JPEG, base64-encoded and handed to
    ``extract_and_save_only_image``. A failure while preparing a row is
    printed and the row is skipped.

    Args:
        df: frame with ``questionId``, ``question`` and ``image_path`` columns.
        results_path: directory the per-question JSON files are written to.

    Returns:
        The list produced by ``asyncio.gather`` (one entry per scheduled task).

    Raises:
        ValueError: if ``results_path`` is empty or None.
    """
    if not results_path:
        raise ValueError("results_path must be provided")
    tasks_list = []
    for i, row in tqdm(df.iterrows(), total=len(df)):
        # Fallback label so the except branch never reads an unbound or stale id.
        question_id = f"<row {i}>"
        try:
            question_id = row["questionId"]
            question = row["question"]
            image_path = row["image_path"]
            saving_path = f"{results_path}/{question_id}.json"
            if os.path.exists(saving_path):
                continue
            image = cv2.imread(image_path)
            if image is None:
                # cv2.imread reports a missing/unreadable file by returning None.
                raise FileNotFoundError(f"could not read image {image_path}")
            # encode the image as base64
            _, buffer = cv2.imencode('.jpg', image)
            image_base64 = base64.b64encode(buffer).decode('utf-8')
            task = asyncio.create_task(
                extract_and_save_only_image(
                    question=question,
                    question_id=question_id,
                    image_base64=image_base64,
                    results_path=results_path,
                )
            )
            tasks_list.append(task)
            # Yield to the event loop for 0.1 s so scheduled tasks can progress.
            await asyncio.sleep(0.1)
        except Exception as e:
            print(f"Error processing question {question_id}: {e}")
            continue
    return await asyncio.gather(*tasks_list)
    

In [None]:
def transform_local_path_to_drive_path(path: str):
  """Return ``path`` with every ``./images/`` occurrence replaced by ``../images/``.

  Note: the replacement also matches inside ``../images/``, so applying the
  function twice yields ``.../images/``.
  """
  return path.replace("./images/", "../images/")

In [None]:
# df["image_path"] = df["image_path"].apply(transform_local_path_to_drive_path)

In [None]:
# Preview the first rows of df again.
df.head()

In [None]:
# df_test["image_path"] = df_test["image_path"].apply(transform_local_path_to_drive_path)

In [None]:
# Smoke test: run the multimodal extractor once, synchronously, on the first row of df.
first_row = df.iloc[0]
image = cv2.imread(first_row["image_path"])
if image is None:
    # cv2.imread reports a missing/unreadable file by returning None.
    raise FileNotFoundError(f"could not read image {first_row['image_path']}")
# encode the image as base64
_, buffer = cv2.imencode('.jpg', image)
image_base64 = base64.b64encode(buffer).decode('utf-8')

# Keyword arguments (as at every other call site) so the question and the text
# context cannot be swapped by positional order.
llm_extractor_multimodal.extract(
    context=first_row["text"],
    question=first_row["question"],
    image=image_base64,
)

In [None]:


def extract_and_save_multimodal_local(context,question, question_id, image_base64, results_path=None):
    """Blocking variant of the text + image extraction for one question.

    Writes ``{results_path}/{question_id}.json`` unless it already exists.
    Exceptions are not propagated; they are appended to ``error_log_path``.
    """
    try:
        output_file = f"{results_path}/{question_id}.json"
        if not os.path.exists(output_file):
            extraction = llm_extractor_multimodal.extract(
                context=context,
                question=question,
                image=image_base64,
            )
            with open(output_file, "w") as sink:
                sink.write(extraction.model_dump_json())
    except Exception as err:
        with open(error_log_path, "a") as log:
            log.write(f"Error processing question {question_id}: {err}\n")
            
def extract_and_save_only_image_local(context,question, question_id, image_base64,results_path=None):
    """Blocking variant of the image-only extraction for one question.

    ``context`` is accepted but not forwarded to the extractor. Writes
    ``{results_path}/{question_id}.json`` unless it already exists; exceptions
    are appended to ``error_log_path`` instead of being raised.
    """
    try:
        destination = f"{results_path}/{question_id}.json"
        if os.path.exists(destination):
            return None
        response = llm_extractor_multimodal.extract(
            question=question,
            image=image_base64,
        )
        serialized = response.model_dump_json()
        with open(destination, "w") as handle:
            handle.write(serialized)
    except Exception as exc:
        with open(error_log_path, "a") as log_file:
            log_file.write(f"Error processing question {question_id}: {exc}\n")
            
def extract_and_save_local(context,question, question_id, image_base64=None, results_path=None):
    """Blocking variant of the text-only extraction for one question.

    ``image_base64`` is accepted but not forwarded; the multimodal extractor is
    called with context and question only. Writes
    ``{results_path}/{question_id}.json`` unless it already exists; exceptions
    are appended to ``error_log_path`` instead of being raised.
    """
    try:
        json_file = f"{results_path}/{question_id}.json"
        if not os.path.exists(json_file):
            outcome = llm_extractor_multimodal.extract(
                context=context,
                question=question,
            )
            with open(json_file, "w") as writer:
                writer.write(outcome.model_dump_json())
    except Exception as problem:
        with open(error_log_path, "a") as log_handle:
            log_handle.write(f"Error processing question {question_id}: {problem}\n")


def process_df_multimodal_local(df, results_path=None):
    """Run the blocking text + image extraction for every row of ``df``.

    Rows whose ``{results_path}/{questionId}.json`` already exists are skipped.
    Otherwise the image at ``image_path`` is loaded with OpenCV, re-encoded as
    JPEG, base64-encoded and passed to ``extract_and_save_multimodal_local``.
    A failure while preparing a row is printed and the row is skipped.

    Args:
        df: frame with ``questionId``, ``question``, ``text`` and ``image_path`` columns.
        results_path: directory the per-question JSON files are written to.

    Returns:
        A copy of ``df`` with ``extraction`` and ``extraction_error`` columns
        added, both left as None (results are only written to disk).

    Raises:
        ValueError: if ``results_path`` is empty or None.
    """
    if not results_path:
        raise ValueError("results_path must be provided")
    df = df.copy()
    df["extraction"] = None
    df["extraction_error"] = None
    for i, row in tqdm(df.iterrows(), total=len(df)):
        # Fallback label so the except branch never reads an unbound or stale id.
        question_id = f"<row {i}>"
        try:
            question_id = row["questionId"]
            question = row["question"]
            context = row["text"]
            image_path = row["image_path"]
            saving_path = f"{results_path}/{question_id}.json"
            if os.path.exists(saving_path):
                continue
            image = cv2.imread(image_path)
            if image is None:
                # cv2.imread reports a missing/unreadable file by returning None.
                raise FileNotFoundError(f"could not read image {image_path}")
            # encode the image as base64
            _, buffer = cv2.imencode('.jpg', image)
            image_base64 = base64.b64encode(buffer).decode('utf-8')
            extract_and_save_multimodal_local(
                context=context,
                question=question,
                question_id=question_id,
                image_base64=image_base64,
                results_path=results_path,
            )
        except Exception as e:
            print(f"Error processing question {question_id}: {e}")
            continue
    return df
    
    
def process_df_only_text_local(df,results_path=None):
    """Run the blocking text-only extraction for every row of ``df``.

    Rows whose ``{results_path}/{questionId}.json`` already exists are skipped;
    the others are passed to ``extract_and_save_local`` together with
    ``results_path`` so the result lands in the experiment folder. A failure
    while preparing a row is printed and the row is skipped.

    Args:
        df: frame with ``questionId``, ``question`` and ``text`` columns.
        results_path: directory the per-question JSON files are written to.

    Returns:
        A copy of ``df`` with ``extraction`` and ``extraction_error`` columns
        added, both left as None (results are only written to disk).

    Raises:
        ValueError: if ``results_path`` is empty or None.
    """
    if not results_path:
        raise ValueError("results_path must be provided")
    df = df.copy()
    df["extraction"] = None
    df["extraction_error"] = None
    for i, row in tqdm(df.iterrows(), total=len(df)):
        # Fallback label so the except branch never reads an unbound or stale id.
        question_id = f"<row {i}>"
        try:
            question_id = row["questionId"]
            question = row["question"]
            context = row["text"]
            saving_path = f"{results_path}/{question_id}.json"
            if os.path.exists(saving_path):
                continue
            extract_and_save_local(
                context=context,
                question=question,
                question_id=question_id,
                image_base64=None,
                # Without this the helper falls back to results_path=None and
                # targets "None/<id>.json".
                results_path=results_path,
            )
        except Exception as e:
            print(f"Error processing question {question_id}: {e}")
            continue
    return df

def process_df_only_image_local(df,results_path=None):
    """Run the blocking image-only extraction for every row of ``df``.

    Rows whose ``{results_path}/{questionId}.json`` already exists are skipped.
    Otherwise the image at ``image_path`` is loaded with OpenCV, re-encoded as
    JPEG, base64-encoded and passed to ``extract_and_save_only_image_local``.
    A failure while preparing a row is printed and the row is skipped.

    Args:
        df: frame with ``questionId``, ``question``, ``text`` and ``image_path`` columns.
        results_path: directory the per-question JSON files are written to.

    Returns:
        A copy of ``df`` with ``extraction`` and ``extraction_error`` columns
        added, both left as None (results are only written to disk).

    Raises:
        ValueError: if ``results_path`` is empty or None.
    """
    if not results_path:
        raise ValueError("results_path must be provided")
    df = df.copy()
    df["extraction"] = None
    df["extraction_error"] = None
    for i, row in tqdm(df.iterrows(), total=len(df)):
        # Fallback label so the except branch never reads an unbound or stale id.
        question_id = f"<row {i}>"
        try:
            question_id = row["questionId"]
            question = row["question"]
            context = row["text"]
            image_path = row["image_path"]
            saving_path = f"{results_path}/{question_id}.json"
            if os.path.exists(saving_path):
                continue
            image = cv2.imread(image_path)
            if image is None:
                # cv2.imread reports a missing/unreadable file by returning None.
                raise FileNotFoundError(f"could not read image {image_path}")
            # encode the image as base64
            _, buffer = cv2.imencode('.jpg', image)
            image_base64 = base64.b64encode(buffer).decode('utf-8')
            # Use the synchronous helper: calling the async variant here would only
            # create a coroutine that is never awaited, so nothing would run.
            extract_and_save_only_image_local(
                context=context,
                question=question,
                question_id=question_id,
                image_base64=image_base64,
                results_path=results_path,
            )
        except Exception as e:
            print(f"Error processing question {question_id}: {e}")
            continue
    return df

In [None]:
def format_results_from_path_with_errors(results_path, name):
    """Merge per-question JSON files into one JSON list, tolerating empty answers.

    Each ``<questionId>.json`` file in ``results_path`` is reduced to
    ``{"questionId": int, "answer": ...}``. When ``answer`` is empty the
    ``unparsed_output`` field is used instead, and ``""`` if that is empty or
    missing too. Entries without a ``.json`` extension are ignored. The list is
    written to ``name`` ordered by ``questionId`` so the output is reproducible
    (``os.listdir`` returns entries in arbitrary order).

    Args:
        results_path: directory containing the per-question JSON files.
        name: path of the merged JSON file to write.

    Raises:
        Exception: any error while reading or parsing a file is printed with
            the file name and re-raised.
    """
    list_of_results = []
    for file in os.listdir(results_path):
        if not file.endswith(".json"):
            continue
        try:
            with open(os.path.join(results_path, file), "r") as f:
                data = json.load(f)
            # Fall back to the raw model output, then to "", when the answer is empty.
            answer = data["answer"] or data.get("unparsed_output") or ""
            list_of_results.append({
                # The file stem is the question id.
                "questionId": int(file.split(".")[0]),
                "answer": answer,
            })
        except Exception as e:
            print(f"Error loading file {file}: {e}")
            raise
    list_of_results.sort(key=lambda item: item["questionId"])
    with open(name, "w") as f:
        json.dump(list_of_results, f, indent=4)




In [None]:
async def process_experiment_async(base_name, process_df_func, df, data_type="test"):
    """Run one async experiment and merge its per-question files into one JSON.

    Results go to ``./results/{base_name}/{data_type}``; the merged file is
    written to ``results_{base_name}_{data_type}.json`` in the working
    directory. The number of entries in the results folder is printed.

    NOTE(review): the extract_and_save* helpers in this notebook append
    failures to the module-level ``error_log_path``, not to this experiment's
    folder.

    Args:
        base_name: experiment name used in the folder and file names.
        process_df_func: coroutine function called as
            ``process_df_func(df, results_path=...)``.
        df: frame handed to ``process_df_func``.
        data_type: split label used in the folder and file names.
    """
    results_path = f"./results/{base_name}/{data_type}"
    results_name = f"results_{base_name}_{data_type}.json"
    # exist_ok=True replaces the racy exists()-then-makedirs() sequence.
    os.makedirs(results_path, exist_ok=True)
    await process_df_func(df, results_path=results_path)
    print(len(os.listdir(results_path)))
    format_results_from_path_with_errors(results_path, results_name)

In [None]:
# Experiments for the async pipeline; each entry supplies the arguments of
# process_experiment_async. All three entries run on the validation split.
# NOTE(review): base names say "gpt-4-1-mini" while llm is built from
# model_mapping["gpt-4-mini"], which maps to a gpt-4o-mini snapshot — confirm.
EXPERIMENTS_ASYNC = [
    {
        "base_name": "gpt-4-1-mini",
        "process_df_func": process_df,
        "df": df_val,
        "data_type": "val",
    },
    # gpt only images val
    { 
        "base_name": "gpt-4-1-mini-only-image",
        "process_df_func": process_df_only_image,
        "df": df_val,
        "data_type": "val",
    },
    # gpt multimodal val
    {
        "base_name": "gpt-4-1-mini-multimodal",
        "process_df_func": process_df_multimodal,
        "df": df_val,
        "data_type": "val",
    },
]

In [None]:
# await process_experiment_async(EXPERIMENTS_ASYNC[2]["base_name"],
#                         EXPERIMENTS_ASYNC[2]["process_df_func"],
#                         EXPERIMENTS_ASYNC[2]["df"].head(5),
#                         data_type=EXPERIMENTS_ASYNC[2]["data_type"])

for experiment in EXPERIMENTS_ASYNC:
    print("--"*20)
    print(f"Processing experiment: {experiment['base_name']}")
    print(f"Data type: {experiment['data_type']}")
    print(f"Processing function: {experiment['process_df_func'].__name__}")
    print("--"*20)
    await process_experiment_async(
        experiment["base_name"],
        experiment["process_df_func"],
        experiment["df"],
        data_type=experiment["data_type"]
    )

In [None]:



def process_experiment(base_name, process_df_func, df, data_type="test"):
    """Run one blocking experiment and merge its per-question files into one JSON.

    Results go to ``./results/{base_name}/{data_type}``; the merged file is
    written to ``results_{base_name}.json`` in the working directory (no
    ``data_type`` suffix, unlike ``process_experiment_async``). The number of
    entries in the results folder is printed.

    Args:
        base_name: experiment name used in the folder and file names.
        process_df_func: callable invoked as ``process_df_func(df, results_path=...)``.
        df: frame handed to ``process_df_func``.
        data_type: split label used in the folder name.
    """
    results_path = f"./results/{base_name}/{data_type}"
    results_name = f"results_{base_name}.json"
    # exist_ok=True replaces the racy exists()-then-makedirs() sequence.
    os.makedirs(results_path, exist_ok=True)
    process_df_func(df, results_path=results_path)
    print(len(os.listdir(results_path)))
    format_results_from_path_with_errors(results_path, results_name)

In [None]:
# Experiments for the blocking (*_local) pipeline; each entry supplies the
# arguments of process_experiment. Test-split entries come first, then validation.
# NOTE(review): base names say "llama_finetuned", but the *_local helpers call
# llm_extractor_multimodal, which is built on the ChatOpenAI llm in this notebook
# (the LLama lines are commented out) — confirm which model these runs should use.
EXPERIMENTS = [
    {
        "base_name": "llama_finetuned_multimodal_2_test",
        "process_df_func": process_df_multimodal_local,
        "df": df_test,
        "data_type": "test",
    },
    {
        "base_name": "llama_finetuned_text_only_2_test",
        "process_df_func": process_df_only_text_local,
        "df": df_test,
         "data_type": "test",
    },
    {
        "base_name": "llama_finetuned_image_only_2_test",
        "process_df_func": process_df_only_image_local,
        "df": df_test,
        "data_type": "test",
    },
    {
        "base_name": "llama_finetuned_multimodal_2_val",
        "process_df_func": process_df_multimodal_local,
        "df": df_val,
        "data_type": "val",
    },
    {
        "base_name": "llama_finetuned_text_only_2_val",
        "process_df_func": process_df_only_text_local,
        "df": df_val,
        "data_type": "val",
    },
    {
        "base_name": "llama_finetuned_image_only_2_val",
        "process_df_func": process_df_only_image_local,
        "df": df_val,
        "data_type": "val",
    },
]