In [1]:
from src.extraction.llm_extractor import LLMExtractor
from src.extraction.output_parser import LLMOutputParser
from src.extraction.extraction_template import template
from src.extraction.extraction_responses import LLMResponse



In [2]:
import os

# Directory that receives one "<questionId>.json" file per processed question;
# the error log is kept inside the same directory.
results_path = "./results/gpt-4-1-nano-only-image/test"
error_log_path = f"{results_path}/error_log.txt"

# exist_ok=True makes the cell idempotent and avoids the check-then-create race
# of a separate os.path.exists() test.
os.makedirs(results_path, exist_ok=True)

In [3]:
# Short alias -> pinned model identifier used when building the chat model.
model_mapping = {"gpt-4-nano": "gpt-4.1-nano-2025-04-14"}

In [4]:
from langchain_openai import ChatOpenAI
from core.settings import settings

# Chat model shared by the output parser and the extractor below.
# The model name is resolved through model_mapping; the API key comes from
# project settings rather than being hard-coded in the notebook.
llm = ChatOpenAI(
    model=model_mapping["gpt-4-nano"],
    temperature=0,
    openai_api_key=settings.OPENAI_API_KEY,
    max_retries=3
)

# Parser configured with the LLMResponse schema; it also receives the llm
# (presumably to repair malformed outputs — TODO confirm in LLMOutputParser).
output_parser = LLMOutputParser(
    serializable=LLMResponse,
    llm=llm,
)


# Text-only extractor: combines the llm, the parser and the extraction template.
llm_extractor = LLMExtractor(
    llm=llm,
    output_parser=output_parser,
    extraction_template=template,
)

In [5]:
import pandas as pd

# NOTE(review): unpickling can execute arbitrary code; only load this file if it
# comes from a trusted source.
df = pd.read_pickle("df_concat_with_text.pkl")

In [6]:
df.head()

Unnamed: 0,questionId,question,question_types,image,docId,ucsf_document_id,ucsf_document_page_no,answers,data_split,image_path,text,llm_tokens_count,text_length,words_count
0,337,what is the date mentioned in this letter?,"[handwritten, form]",documents/xnbl0037_1.png,279,xnbl0037,1,[1/8/93],train,./images/spdocvqa_images/xnbl0037_1.png,<Page 1> Confidential .. .. RJRT PR APPROVAL D...,112,389,39
1,338,what is the contact person name mentioned in l...,"[handwritten, form]",documents/xnbl0037_1.png,279,xnbl0037,1,"[P. Carter, p. carter]",train,./images/spdocvqa_images/xnbl0037_1.png,<Page 1> Confidential .. .. RJRT PR APPROVAL D...,112,389,39
2,339,Which corporation's letterhead is this?,[layout],documents/mxcj0037_1.png,280,mxcj0037,1,[Brown & Williamson Tobacco Corporation],train,./images/spdocvqa_images/mxcj0037_1.png,<Page 1> B&W BROWN & WILLIAMSON TOBACCO CORPOR...,451,2183,246
3,340,Who is in cc in this letter?,"[form, layout]",documents/mxcj0037_1.png,280,mxcj0037,1,[T.F. Riehl],train,./images/spdocvqa_images/mxcj0037_1.png,<Page 1> B&W BROWN & WILLIAMSON TOBACCO CORPOR...,451,2183,246
4,341,what is the subject of this letter?,"[form, layout]",documents/mxcj0037_1.png,280,mxcj0037,1,[Review of existing Brainstorming Ideas/483],train,./images/spdocvqa_images/mxcj0037_1.png,<Page 1> B&W BROWN & WILLIAMSON TOBACCO CORPOR...,451,2183,246


In [7]:
# Independent copy of the rows whose data_split is "val".
df_val = df[df["data_split"]== "val"].copy()

In [8]:
df_val

Unnamed: 0,questionId,question,question_types,image,docId,ucsf_document_id,ucsf_document_page_no,answers,data_split,image_path,text,llm_tokens_count,text_length,words_count
39463,49153,"What is the ‘actual’ value per 1000, during th...",[figure/diagram],documents/pybv0228_81.png,14465,pybv0228,81,[0.28],val,./images/spdocvqa_images/pybv0228_81.png,<Page 1> FIGURE C. 2. AGE ADJUSTED MOTOR VEHIC...,90,186,12
39464,24580,What is name of university?,[others],documents/nkbl0226_1.png,7027,nkbl0226,1,"[university of california, University of Calif...",val,./images/spdocvqa_images/nkbl0226_1.png,"<Page 1> UNIVERSITY OF CALIFORNIA, SAN DIEGO T...",78,278,30
39465,57349,What is the name of the company?,[layout],documents/snbx0223_22.png,4733,snbx0223,22,"[itc limited, ITC Limited]",val,./images/spdocvqa_images/snbx0223_22.png,<Page 1> ITC Limited REPORT AND ACCOUNTS 2013 ...,283,1253,153
39466,24581,Where is the university located ?,[others],documents/nkbl0226_1.png,7027,nkbl0226,1,"[san diego, San Diego]",val,./images/spdocvqa_images/nkbl0226_1.png,"<Page 1> UNIVERSITY OF CALIFORNIA, SAN DIEGO T...",78,278,30
39467,24582,To whom is the document sent?,"[handwritten, form]",documents/nkbl0226_1.png,7027,nkbl0226,1,[Paul],val,./images/spdocvqa_images/nkbl0226_1.png,"<Page 1> UNIVERSITY OF CALIFORNIA, SAN DIEGO T...",78,278,30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44807,24564,who is the vice president and secretary ?,[table/list],documents/ntnk0226_15.png,7020,ntnk0226,15,"[Raymond C. Guth, Raymond c. Guth]",val,./images/spdocvqa_images/ntnk0226_15.png,<Page 1> AMSTAR'S SENIOR REPRESENTATIVES Rober...,144,635,49
44808,49146,What is the ‘title’ of the plot?,[layout],documents/pybv0228_81.png,14465,pybv0228,81,[Age adjusted motor vehicle accident mortality...,val,./images/spdocvqa_images/pybv0228_81.png,<Page 1> FIGURE C. 2. AGE ADJUSTED MOTOR VEHIC...,90,186,12
44809,49147,What is ‘figure C.2.’?,[layout],documents/pybv0228_81.png,14465,pybv0228,81,[age adjusted motor vehicle accident mortality...,val,./images/spdocvqa_images/pybv0228_81.png,<Page 1> FIGURE C. 2. AGE ADJUSTED MOTOR VEHIC...,90,186,12
44810,49150,What is the maximum value at x axis?,[figure/diagram],documents/pybv0228_81.png,14465,pybv0228,81,[1980],val,./images/spdocvqa_images/pybv0228_81.png,<Page 1> FIGURE C. 2. AGE ADJUSTED MOTOR VEHIC...,90,186,12


In [9]:
# Independent copy of the rows whose data_split is "test".
df_test = df[df["data_split"]== "test"].copy()

In [10]:

import os
async def extract_and_save(context, question, question_id):
    """Run the text-only extractor for one question and persist the result.

    The result is written to ``{results_path}/{question_id}.json``. If that
    file already exists the question is skipped, so interrupted runs can be
    resumed.

    Args:
        context: Context passed to ``llm_extractor.aextract``.
        question: Question passed to ``llm_extractor.aextract``.
        question_id: Identifier used to build the output file name.

    Returns:
        None. Any exception is appended to ``error_log_path`` and printed; it
        is not re-raised, so one failing question does not abort the batch.
    """
    try:
        saving_path = f"{results_path}/{question_id}.json"
        if os.path.exists(saving_path):
            return
        result = await llm_extractor.aextract(
            context=context,
            question=question,
        )
        # Serialize BEFORE opening the file: opening with "w" creates/truncates
        # it immediately, so a serialization error would otherwise leave an
        # empty JSON file that later runs skip as "already done".
        payload = result.model_dump_json()
        with open(saving_path, "w") as f:
            f.write(payload)
    except Exception as e:
        with open(error_log_path, "a") as f:
            f.write(f"Error processing question {question_id}: {e}\n")
        print(f"Error processing question {question_id}: {e}")

In [11]:
import asyncio
from tqdm import tqdm


async def process_df(df, delay=0.07):
    """Schedule a text-only extraction task for every row of ``df``.

    Each row must provide the ``questionId``, ``question`` and ``text``
    columns. Tasks are started one at a time with a pause of ``delay`` seconds
    between them, then all of them are awaited together.

    Args:
        df: DataFrame of questions to process. It is only read, never modified.
        delay: Seconds to sleep after scheduling each task (default 0.07, the
            value previously hard-coded).

    Returns:
        None. Results are written to disk by ``extract_and_save``.
    """
    tasks_list = []
    for i, row in tqdm(df.iterrows(), total=len(df)):
        try:
            task = asyncio.create_task(
                extract_and_save(
                    context=row["text"],
                    question=row["question"],
                    question_id=row["questionId"],
                )
            )
        except Exception as e:
            # Report the row instead of dropping it silently, then keep going.
            print(f"Error scheduling row {i}: {e}")
            continue
        tasks_list.append(task)
        await asyncio.sleep(delay)
    await asyncio.gather(*tasks_list)
            

In [12]:
df_test.head(5)

Unnamed: 0,questionId,question,question_types,image,docId,ucsf_document_id,ucsf_document_page_no,answers,data_split,image_path,text,llm_tokens_count,text_length,words_count
44812,57344,What is the dividend payout in 2012?,,documents/rnbx0223_193.png,4720,rnbx0223,193,,test,./images/spdocvqa_images/rnbx0223_193.png,<Page 1> ITC Limited . Report and Accounts 201...,352,960,78
44813,16384,What is the name of the person in the CC field ?,,documents/lflm0081_1.png,5160,lflm0081,1,,test,./images/spdocvqa_images/lflm0081_1.png,<Page 1> MEMORANDUM TO : W. C. Combs Product R...,107,443,52
44814,57346,What is the % of Employees in 2012 based on gr...,,documents/rnbx0223_191.png,4753,rnbx0223,191,,test,./images/spdocvqa_images/rnbx0223_191.png,<Page 1> FINANCIAL HIGHLIGHTS 40000 18000 3600...,445,1140,74
44815,61870,What is the personnel costs in the 4th year?,,documents/hrfw0227_24.png,8103,hrfw0227,24,,test,./images/spdocvqa_images/hrfw0227_24.png,<Page 1> SECTION II - PRIVILEGED COMMUNICATION...,495,1723,162
44816,57348,What is the % of 'Providers of Capital' in the...,,documents/rnbx0223_191.png,4753,rnbx0223,191,,test,./images/spdocvqa_images/rnbx0223_191.png,<Page 1> FINANCIAL HIGHLIGHTS 40000 18000 3600...,445,1140,74


In [13]:
# await process_df(df_test)

In [14]:
# Count the entries currently in the results directory. error_log_path lives in
# the same directory, so the count includes error_log.txt once it exists.
list_of_files = os.listdir(results_path)
len(list_of_files)

0

In [15]:
import json

def format_results_from_path(results_path, name):
    """Collect the per-question JSON results into one JSON list file.

    Every ``*.json`` file in ``results_path`` is expected to be named
    ``<questionId>.json`` and to contain an ``"answer"`` key. Other files
    (for example the error log) are ignored.

    Args:
        results_path: Directory containing the per-question JSON files.
        name: Path of the combined JSON file to write.

    Returns:
        None. The combined list is written to ``name``, sorted by
        ``questionId`` so the output does not depend on directory listing
        order.

    Raises:
        Exception: Any error raised while reading or parsing a result file is
            printed and re-raised unchanged.
    """
    list_of_results = []
    for file in os.listdir(results_path):
        if not file.endswith(".json"):
            continue
        try:
            with open(os.path.join(results_path, file), "r") as f:
                data = json.load(f)
            list_of_results.append(
                {
                    "questionId": int(file.split(".")[0]),
                    "answer": data["answer"],
                }
            )
        except Exception as e:
            print(f"Error loading file {file}: {e}")
            # Bare raise keeps the original traceback intact.
            raise
    # os.listdir returns entries in arbitrary order; sort for reproducibility.
    list_of_results.sort(key=lambda entry: entry["questionId"])
    with open(name, "w") as f:
        json.dump(list_of_results, f, indent=4)
        

In [16]:
from src.extraction.llm_extractor import LLMExtractorMultimodal
from src.extraction.extraction_template import create_image_only_template

# Multimodal extractor: reuses the same llm, output parser and text template as
# the text-only extractor, and adds create_image_only_template as the function
# used to build the image part of the prompt.
llm_extractor_multimodal = LLMExtractorMultimodal(
    llm=llm,
    output_parser=output_parser,
    extraction_template=template,
    image_template_func=create_image_only_template,
)

In [17]:
async def extract_and_save_multimodal(context, question, question_id, image_base64):
    """Run the multimodal extractor (text + image) for one question and save it.

    The result is written to ``{results_path}/{question_id}.json``. If that
    file already exists the question is skipped.

    Args:
        context: Context passed to ``llm_extractor_multimodal.aextract``.
        question: Question passed to ``llm_extractor_multimodal.aextract``.
        question_id: Identifier used to build the output file name.
        image_base64: Base64-encoded image passed as ``image``.

    Returns:
        None. Any exception is appended to ``error_log_path`` and printed; it
        is not re-raised.
    """
    try:
        saving_path = f"{results_path}/{question_id}.json"
        if os.path.exists(saving_path):
            return
        result = await llm_extractor_multimodal.aextract(
            context=context,
            question=question,
            image=image_base64,
        )
        # Serialize BEFORE opening the file: opening with "w" creates/truncates
        # it immediately, so a serialization error would otherwise leave an
        # empty JSON file that later runs skip as "already done".
        payload = result.model_dump_json()
        with open(saving_path, "w") as f:
            f.write(payload)
    except Exception as e:
        with open(error_log_path, "a") as f:
            f.write(f"Error processing question {question_id}: {e}\n")
        # Also surface the failure in the notebook, like extract_and_save does;
        # otherwise a fully failed run looks identical to a successful one.
        print(f"Error processing question {question_id}: {e}")

In [18]:
import cv2
import base64
async def process_df_multimodal(df, delay=0.5):
    """Schedule a multimodal (text + image) extraction task for every row.

    Each row must provide the ``questionId``, ``question``, ``text`` and
    ``image_path`` columns. Rows whose result file already exists are skipped
    before the image is loaded. The image is read with OpenCV, re-encoded as
    JPEG and base64-encoded before being handed to
    ``extract_and_save_multimodal``.

    Args:
        df: DataFrame of questions to process. It is only read, never modified.
        delay: Seconds to sleep after scheduling each task (default 0.5, the
            value previously hard-coded).

    Returns:
        The list returned by ``asyncio.gather`` for the scheduled tasks.
    """
    tasks_list = []
    for i, row in tqdm(df.iterrows(), total=len(df)):
        # Reset per row so the except handler never reports an unbound name or
        # the id of the previous row.
        question_id = None
        try:
            question_id = row["questionId"]
            question = row["question"]
            context = row["text"]
            image_path = row["image_path"]
            saving_path = f"{results_path}/{question_id}.json"
            if os.path.exists(saving_path):
                continue
            image = cv2.imread(image_path)
            # cv2.imread signals a missing/unreadable file by returning None
            # rather than raising; fail with a message that names the path.
            if image is None:
                raise FileNotFoundError(f"could not read image {image_path}")
            # encode the image as base64
            encoded_ok, buffer = cv2.imencode('.jpg', image)
            if not encoded_ok:
                raise ValueError(f"could not encode image {image_path} as JPEG")
            image_base64 = base64.b64encode(buffer).decode('utf-8')
            task = asyncio.create_task(
                extract_and_save_multimodal(
                    context=context,
                    question=question,
                    question_id=question_id,
                    image_base64=image_base64,
                )
            )
            tasks_list.append(task)
            await asyncio.sleep(delay)
        except Exception as e:
            print(f"Error processing question {question_id}: {e}")
            continue
    return await asyncio.gather(*tasks_list)
    

In [19]:
# results= await process_df_multimodal(df_test)

In [20]:
async def extract_and_save_only_image(question, question_id, image_base64):
    """Run the multimodal extractor with only the image and save the result.

    No ``context`` argument is passed to ``aextract``. The result is written
    to ``{results_path}/{question_id}.json``. If that file already exists the
    question is skipped.

    Args:
        question: Question passed to ``llm_extractor_multimodal.aextract``.
        question_id: Identifier used to build the output file name.
        image_base64: Base64-encoded image passed as ``image``.

    Returns:
        None. Any exception is appended to ``error_log_path`` and printed; it
        is not re-raised.
    """
    try:
        saving_path = f"{results_path}/{question_id}.json"
        if os.path.exists(saving_path):
            return
        result = await llm_extractor_multimodal.aextract(
            question=question,
            image=image_base64,
        )
        # Serialize BEFORE opening the file: opening with "w" creates/truncates
        # it immediately, so a serialization error would otherwise leave an
        # empty JSON file that later runs skip as "already done".
        payload = result.model_dump_json()
        with open(saving_path, "w") as f:
            f.write(payload)
    except Exception as e:
        with open(error_log_path, "a") as f:
            f.write(f"Error processing question {question_id}: {e}\n")
        # Also surface the failure in the notebook, like extract_and_save does;
        # otherwise a fully failed run looks identical to a successful one.
        print(f"Error processing question {question_id}: {e}")

In [21]:

async def process_df_only_image(df, delay=0.5):
    """Schedule an image-only extraction task for every row of ``df``.

    Each row must provide the ``questionId``, ``question`` and ``image_path``
    columns. Rows whose result file already exists are skipped before the
    image is loaded. The image is read with OpenCV, re-encoded as JPEG and
    base64-encoded before being handed to ``extract_and_save_only_image``.

    Args:
        df: DataFrame of questions to process. It is only read, never modified.
        delay: Seconds to sleep after scheduling each task (default 0.5, the
            value previously hard-coded).

    Returns:
        The list returned by ``asyncio.gather`` for the scheduled tasks.
    """
    tasks_list = []
    for i, row in tqdm(df.iterrows(), total=len(df)):
        # Reset per row so the except handler never reports an unbound name or
        # the id of the previous row.
        question_id = None
        try:
            question_id = row["questionId"]
            question = row["question"]
            image_path = row["image_path"]
            saving_path = f"{results_path}/{question_id}.json"
            if os.path.exists(saving_path):
                continue
            image = cv2.imread(image_path)
            # cv2.imread signals a missing/unreadable file by returning None
            # rather than raising; fail with a message that names the path.
            if image is None:
                raise FileNotFoundError(f"could not read image {image_path}")
            # encode the image as base64
            encoded_ok, buffer = cv2.imencode('.jpg', image)
            if not encoded_ok:
                raise ValueError(f"could not encode image {image_path} as JPEG")
            image_base64 = base64.b64encode(buffer).decode('utf-8')
            task = asyncio.create_task(
                extract_and_save_only_image(
                    question=question,
                    question_id=question_id,
                    image_base64=image_base64,
                )
            )
            tasks_list.append(task)
            await asyncio.sleep(delay)
        except Exception as e:
            print(f"Error processing question {question_id}: {e}")
            continue
    return await asyncio.gather(*tasks_list)
    

In [25]:
# Smoke-run the image-only pipeline on the first five test rows.
# The displayed list has one entry per scheduled task; extract_and_save_only_image
# never returns a value, so every entry is None whether or not the extraction
# succeeded. Check the results directory and the error log for the outcome.
await process_df_only_image(df_test.head(5))

  0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 5/5 [00:02<00:00,  1.76it/s]


[None, None, None, None, None]

In [22]:
# Count the entries currently in the results directory. error_log_path lives in
# the same directory, so the count includes error_log.txt once it exists.
list_of_files = os.listdir(results_path)
len(list_of_files)

0

In [23]:
# Merge the per-question JSON files into a single results file.
# NOTE(review): the output name says "mini_multimodal", but results_path points
# at "gpt-4-1-nano-only-image" — confirm the file name is the intended one.
format_results_from_path(results_path, "results_gpt_4_1_mini_multimodal_test.json")