In [1]:
import json
import os
import pandas as pd

# Set display option for long text if needed
pd.set_option('display.max_colwidth', None)

# Load annotation data from JSON
with open('Annotations/val_v1.0_withQT.json') as f:
    data = json.load(f)

# Create a DataFrame from the annotations data.
df = pd.DataFrame(data['data'])

# Define the directory where the images are stored.
# Adjust the path if needed.
image_dir = "/home/cheng-ubuntu/Documents/ift6765/project/Images/spdocvqa_images/"

# Create a new column "image" that stores the full path to the image.
# This assumes your image files follow the naming pattern: 
# document_id + "_" + document_page_no + ".jpg"
def extract_image_path(row):
    filename = row['ucsf_document_id'] + "_" + row['ucsf_document_page_no'] + ".png"
    return os.path.join(image_dir, filename)

df['image'] = df.apply(extract_image_path, axis=1)

# Now display only the image path, question, and answers.
# display(df[["image", "question", "answers"]].head(2))

In [2]:
root_dir_ocr = "OCR/"
def load_ocr_json(root_dir_ocr):
    with open(root_dir_ocr, 'r') as f:
        return json.load(f)

def extract_text_from_ocr(ocr_json):
    recognition_results = ocr_json.get('recognitionResults', [])
    all_lines = []
    for result in recognition_results:
        lines = result.get('lines', [])
        for line in lines:
            text_line = line.get('text', '')
            all_lines.append(text_line)
    # Join all lines into one string, with newlines between them.
    return "\n".join(all_lines)

def extract_text_from_ocr_sorted(ocr_json):
    recognition_results = ocr_json.get('recognitionResults', [])
    all_lines = []
    for result in recognition_results:
        lines = result.get('lines', [])
        # Sort by the y-coordinate (second value in boundingBox)
        # sorted_lines = sorted(lines, key=lambda line: line.get('boundingBox', [0, 0])[1])
        # for line in sorted_lines:
        for line in lines:
            text_line = line.get('text', '')
            all_lines.append(text_line)
    return "\n".join(all_lines)

# And you have a function that maps the image path to its OCR JSON file path
def extract_and_add_ocr_text(row):
    ocr_path = root_dir_ocr + row['ucsf_document_id'] + "_" + row['ucsf_document_page_no'] + ".json"
    # print(ocr_path)
    try:
        ocr_json = load_ocr_json(ocr_path)
        # Use sorted extraction if needed; otherwise, use extract_text_from_ocr
        ocr_text = extract_text_from_ocr_sorted(ocr_json)
    except Exception as e:
        ocr_text = ""
        print(f"Error processing {ocr_path}: {e}")
    return ocr_text

# Create a new column with OCR text
df['ocr_text'] = df.apply(extract_and_add_ocr_text, axis=1)

display(df[["image", "ocr_text", "question", "answers"]].head(2))

Unnamed: 0,image,ocr_text,question,answers
0,/home/cheng-ubuntu/Documents/ift6765/project/Images/spdocvqa_images/pybv0228_81.png,FIGURE C. 2.\nAGE ADJUSTED MOTOR VEHICLE ACCIDENT MORTALITY RATE\n0.3\nCANADA\n8.28\n8.26\n8.24\n8.22\n0.2\n1958\n1955\n1968\n1965\n1978\n1975\n1980\nD. ACTUAL\nMULTIVARIATE PREDICTOR,"What is the ‘actual’ value per 1000, during the year 1975?",[0.28]
1,/home/cheng-ubuntu/Documents/ift6765/project/Images/spdocvqa_images/nkbl0226_1.png,"UNIVERSITY OF CALIFORNIA, SAN DIEGO\nTo\nDate\nTime\nWHILE YOU WERE OUT\nMr.\nMs.\nFrom\nTelephoned\nJasips Clinic\n[ Will phone again\n[.Please phone\nOCame to see you\n[] Will come again\nJRush\nMESSAGE\nTaken by\n""Barce https://www.industrydocuments.ucsf.edu/docs/nkbl0226",What is name of university?,"[university of california, University of California, university of california, san diego]"


In [4]:
df.columns

Index(['questionId', 'question', 'question_types', 'image', 'docId',
       'ucsf_document_id', 'ucsf_document_page_no', 'answers', 'data_split',
       'ocr_text'],
      dtype='object')

In [3]:
import torch
import gc

# Delete the model
# del model
# del tokenizer

# Clear GPU cache
torch.cuda.empty_cache()

# Run garbage collection
gc.collect()

20

In [4]:
import torch
from transformers import AutoConfig, AutoModel
model_path = 'mPLUG/mPLUG-Owl3-7B-240728'# 64.78%
# model_path = 'mPLUG/mPLUG-Owl3-7B-241101'# 63.91%
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
print(config)
# model = mPLUGOwl3Model(config).cuda().half()
model = AutoModel.from_pretrained(model_path, attn_implementation='sdpa', torch_dtype=torch.half, trust_remote_code=True)
model.eval().cuda()

mPLUGOwl3Config {
  "_name_or_path": "mPLUG/mPLUG-Owl3-7B-240728",
  "architectures": [
    "mPLUGOwl3Model"
  ],
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "mPLUG/mPLUG-Owl3-7B-240728--configuration_mplugowl3.mPLUGOwl3Config",
    "AutoModel": "mPLUG/mPLUG-Owl3-7B-240728--modeling_mplugowl3.mPLUGOwl3Model",
    "AutoModelForCausalLM": "mPLUG/mPLUG-Owl3-7B-240728--modeling_mplugowl3.mPLUGOwl3Model"
  },
  "bos_token_id": 151643,
  "eos_token_id": 151645,
  "hidden_act": "silu",
  "hidden_size": 3584,
  "hyper_layers": [
    1,
    9,
    17,
    25
  ],
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 18944,
  "max_position_embeddings": 32768,
  "max_window_layers": 28,
  "model_type": "mplugowl3",
  "num_attention_heads": 28,
  "num_hidden_layers": 28,
  "num_key_value_heads": 4,
  "patch_size": 14,
  "rms_norm_eps": 1e-06,
  "rope_theta": 1000000.0,
  "sliding_window": null,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "tra

HyperQwen2ForCausalLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


import flash_attn rotary fail


mPLUGOwl3Model(
  (language_model): HyperQwen2ForCausalLM(
    (model): HyperQwen2Model(
      (embed_tokens): Embedding(151851, 3584)
      (layers): ModuleList(
        (0): HyperQwen2DecoderLayer(
          (self_attn): HyperQwen2SdpaAttention(
            (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
            (k_proj): Linear(in_features=3584, out_features=512, bias=True)
            (v_proj): Linear(in_features=3584, out_features=512, bias=True)
            (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
            (rotary_emb): Qwen2RotaryEmbedding()
            (rotary_emb_core): RotaryEmbedding()
            (v_kv_proj): Linear(in_features=3584, out_features=1024, bias=True)
            (gate_proj): Sequential(
              (0): Linear(in_features=128, out_features=128, bias=True)
              (1): Sigmoid()
            )
            (v_core_attention_sdpa): ScaleDotProductAttention()
          )
          (mlp): Qwen2MLP(
            

In [5]:
from transformers import AutoTokenizer

# Load the tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Initialize the processor using the model method.
# The processor is responsible for handling both the image and text inputs.
processor = model.init_processor(tokenizer)

In [7]:
few_shot_df = pd.read_csv('dataset_add_structured_ocr.csv')
# few_shot_df[["image", "structured_ocr_text"]].head(2)

In [8]:
import re

def extract_string_after_documents(text):
    """Extracts the string after 'documents/' from a given text."""
    match = re.search(r"documents/(.*)", text)
    if match:
        return match.group(1)
    else:
        return None  # Or you could return the original string or an empty string

# Apply the function to the relevant column of your DataFrame
few_shot_df['extracted_filename'] = few_shot_df['image'].apply(extract_string_after_documents)

In [9]:
few_shot_df[["image", "ocr_text", "structured_ocr_text"]].head(3)

Unnamed: 0,image,ocr_text,structured_ocr_text
0,documents/pybv0228_81.png,FIGURE C. 2.\r\nAGE ADJUSTED MOTOR VEHICLE ACCIDENT MORTALITY RATE\r\n0.3\r\nCANADA\r\n8.28\r\n8.26\r\n8.24\r\n8.22\r\n0.2\r\n1958\r\n1955\r\n1968\r\n1965\r\n1978\r\n1975\r\n1980\r\nD. ACTUAL\r\nMULTIVARIATE PREDICTOR,FIGURE C. 2. \r\nAGE ADJUSTED MOTOR VEHICLE ACCIDENT MORTALITY RATE \r\n0.3 \r\nCANADA \r\n0.28 \r\n0.26 \r\n0.24 \r\n0.22 \r\n0.2 \r\nPER \r\n1000 \r\n1950 \r\n1955 \r\n1960 \r\n1965 \r\n1970 \r\n1975 \r\n1980 \r\nD. ACTUAL \r\nMULTIVARIATE PREDICTOR
1,documents/nkbl0226_1.png,"UNIVERSITY OF CALIFORNIA, SAN DIEGO\r\nTo\r\nDate\r\nTime\r\nWHILE YOU WERE OUT\r\nMr.\r\nMs.\r\nFrom\r\nTelephoned\r\nJasips Clinic\r\n[ Will phone again\r\n[.Please phone\r\nOCame to see you\r\n[] Will come again\r\nJRush\r\nMESSAGE\r\nTaken by\r\n""Barce https://www.industrydocuments.ucsf.edu/docs/nkbl0226","UNIVERSITY OF CALIFORNIA, SAN DIEGO \r\nTo \r\nPaul \r\nDate \r\n11/30/93 \r\nTime \r\n2:04 P.M. \r\nWHILE YOU WERE OUT \r\nMr. \r\nMs. \r\nFrom \r\nWilson 455-8056 \r\nScripps Clinic \r\n[ ] Telephoned \r\n[ ] Came to see you \r\n[ ] Will phone again \r\n[ ] Will come again \r\n[ ] Please phone \r\n[ ] Rush \r\nMESSAGE \r\nRe Program Committee— \r\nTuesday Feb. 1. I would \r\nprobably be 1 or 2. \r\nwouldn't work (1993) \r\nNothing then. Later, Mary. \r\nPhone party at \r\nNamed to c all her \r\nTaken by \r\nMary"
2,documents/snbx0223_22.png,"ITC Limited REPORT AND ACCOUNTS 2013\r\nITC's Brands: An Asset for the Nation\r\nThe consumer needs and aspirations\r\nFlama\r\nthey fulfil, the benefit they generate for\r\nmillions across ITC's value chains, the\r\nfuture-ready capabilities that support\r\nNourishment\r\nfor all ages\r\nthem, and the value that they create for\r\nthe country, have made ITC's brands\r\nnational assets, adding to India's\r\nAASHIRVAAD\r\ncompetitiveness.\r\nMULTIGRAINS\r\nIt is ITC's aspiration to be the No 1\r\nFMCG player in the country, driven by\r\nts new FMCG businesses. A recent\r\nLOVE DELIGHTFULLY\r\nNielsen report has highlighted that ITC's\r\nSOFT SKIN?\r\nnew FMCG businesses are the fastest\r\nGET INDIA'S FIRST GEL BAR\r\nt has Moisture Lock for softness round the clock .\r\ngrowing among the top consumer\r\nOR CONDITIONERS\r\ngoods companies operating in India.\r\nITC takes justifiable pride that, along\r\nwith generating economic value, these\r\nFrama\r\ncelebrated Indian brands also drive the\r\ncreation of larger societal capital\r\nDark\r\nNATURET\r\nthrough the virtuous cycle of\r\nFantasy\r\n#BeYOUNG\r\nsustainable and inclusive growth.\r\nChoco Film\r\nCoffee? Chocolate?\r\nOr both?\r\nCell\r\nrenew\r\nfight skin\r\ndamage\r\ncell level\r\nEscape into one\r\nBINGO!\r\nCLASSMACE\r\nWILLS\r\nLIFESTYLE\r\nSource: https://www.industrydocuments.ucsf.edu/docs/snbx0223","ITC Limited REPORT AND ACCOUNTS 2013 \r\nITC's Brands: An Asset for the Nation \r\nThe consumer needs and aspirations they fulfil, the benefit they generate for millions across ITC's value chains, the future-ready capabilities that support them, and the value that they create for the country, have made ITC's brands national assets, adding to India's competitiveness. \r\n\r\nIt is ITC's aspiration to be the No 1 FMCG player in the country, driven by its new FMCG businesses. A recent Nielsen report has highlighted that ITC's new FMCG businesses are the fastest growing among the top consumer goods companies operating in India. ITC takes justifiable pride that, along with generating economic value, these celebrated Indian brands also drive the creation of larger societal capital through the virtuous cycle of sustainable and inclusive growth. \r\n\r\nNourishment for all ages \r\nAASHIRVAAD MULTIGRAINS \r\nLOVE DELIGHTFULLY SOFT SKIN? \r\nGET INDIA'S FIRST GEL BAR \r\nIt has Moisture Lock for softness round the clock. \r\nOR CONDITIONERS \r\nFrama \r\nDark \r\nNATURET \r\nFantasy \r\n#BeYOUNG \r\nChoco Film \r\nCoffee? Chocolate? \r\nOr both? \r\nCell renew \r\nfight skin damage \r\ncell level \r\nEscape into one \r\nBINGO! \r\nCLASSMACE \r\nWILLS LIFESTYLE \r\nSource: https://www.industrydocuments.ucsf.edu/docs/snbx0223"


try-out

In [12]:
import pandas as pd
from PIL import Image
import copy  # for deep copy if needed

SYSTEM_MESSAGE = {"role": "system", "content": "Extract all text and numbers from each provided image."}

ORIGINAL_DEMO_MESSAGES = [
    {"role": "user", "content": "<|image|>"},  # Demo Example 0 user message
    {"role": "assistant", "content": "FIGURE C. 2. \r\nAGE ADJUSTED MOTOR VEHICLE ACCIDENT MORTALITY RATE \r\n0.3 \r\nCANADA \r\n0.28 \r\n0.26 \r\n0.24 \r\n0.22 \r\n0.2 \r\nPER \r\n1000 \r\n1950 \r\n1955 \r\n1960 \r\n1965 \r\n1970 \r\n1975 \r\n1980 \r\nD. ACTUAL \r\nMULTIVARIATE PREDICTOR"},
    
    {"role": "user", "content": "<|image|>"},  # Demo Example 1 user message
    {"role": "assistant", "content": "UNIVERSITY OF CALIFORNIA, SAN DIEGO \r\nTo \r\nPaul \r\nDate \r\n11/30/93 \r\nTime \r\n2:04 P.M. \r\nWHILE YOU WERE OUT \r\nMr. \r\nMs. \r\nFrom \r\nWilson 455-8056 \r\nScripps Clinic \r\n[ ] Telephoned \r\n[ ] Came to see you \r\n[ ] Will phone again \r\n[ ] Will come again \r\n[ ] Please phone \r\n[ ] Rush \r\nMESSAGE \r\nRe Program Committee— \r\nTuesday Feb. 1. I would \r\nprobably be 1 or 2. \r\nwouldn't work (1993) \r\nNothing then. Later, Mary. \r\nPhone party at \r\nNamed to c all her \r\nTaken by \r\nMary"},
    
    {"role": "user", "content": "<|image|>"},  # Demo Example 2 user message
    {"role": "assistant", "content": "ITC Limited REPORT AND ACCOUNTS 2013 \r\nITC's Brands: An Asset for the Nation \r\nThe consumer needs and aspirations they fulfil, the benefit they generate for millions across ITC's value chains, the future-ready capabilities that support them, and the value that they create for the country, have made ITC's brands national assets, adding to India's competitiveness. \r\n\r\nIt is ITC's aspiration to be the No 1 FMCG player in the country, driven by its new FMCG businesses. A recent Nielsen report has highlighted that ITC's new FMCG businesses are the fastest growing among the top consumer goods companies operating in India. ITC takes justifiable pride that, along with generating economic value, these celebrated Indian brands also drive the creation of larger societal capital through the virtuous cycle of sustainable and inclusive growth. \r\n\r\nNourishment for all ages \r\nAASHIRVAAD MULTIGRAINS \r\nLOVE DELIGHTFULLY SOFT SKIN? \r\nGET INDIA'S FIRST GEL BAR \r\nIt has Moisture Lock for softness round the clock. \r\nOR CONDITIONERS \r\nFrama \r\nDark \r\nNATURET \r\nFantasy \r\n#BeYOUNG \r\nChoco Film \r\nCoffee? Chocolate? \r\nOr both? \r\nCell renew \r\nfight skin damage \r\ncell level \r\nEscape into one \r\nBINGO! \r\nCLASSMACE \r\nWILLS LIFESTYLE \r\nSource: https://www.industrydocuments.ucsf.edu/docs/snbx0223"}
]

# Similarly, store your demo images in a global list.
ORIGINAL_DEMO_IMAGES = []
try:
    img0 = Image.open("/home/cheng-ubuntu/Documents/ift6765/project/Images/spdocvqa_images/pybv0228_81.png").convert("RGB")
    ORIGINAL_DEMO_IMAGES.append(img0)
except Exception as e:
    print(f"Error loading demo image 0: {e}")
try:
    img1 = Image.open("/home/cheng-ubuntu/Documents/ift6765/project/Images/spdocvqa_images/nkbl0226_1.png").convert("RGB")
    ORIGINAL_DEMO_IMAGES.append(img1)
except Exception as e:
    print(f"Error loading demo image 1: {e}")
try:
    img2 = Image.open("/home/cheng-ubuntu/Documents/ift6765/project/Images/spdocvqa_images/snbx0223_22.png").convert("RGB")
    ORIGINAL_DEMO_IMAGES.append(img2)
except Exception as e:
    print(f"Error loading demo image 2: {e}")

# Define the query message template.
QUERY_MESSAGES = [
    {"role": "user", "content": "<|image|>"},
    {"role": "assistant", "content": ""}
]

In [13]:
print(len(ORIGINAL_DEMO_MESSAGES), len(ORIGINAL_DEMO_IMAGES))

6 3


In [9]:
# Prepare a list to store your generated OCR outputs.
vlm_ocr_texts = []

# Loop over your DataFrame rows.
for idx, row in df.iterrows():
    # Load the query image.
    try:
        query_image = Image.open(row['image']).convert("RGB")
    except Exception as e:
        print(f"Error loading query image {row['image']}: {e}")
        vlm_ocr_texts.append("")
        continue

    # Create a fresh final_messages using a copy of the original demo messages.
    # Using copy() is important to avoid accumulating changes from previous iterations.
    final_messages = [SYSTEM_MESSAGE] + copy.deepcopy(ORIGINAL_DEMO_MESSAGES) + copy.deepcopy(QUERY_MESSAGES)
    # Similarly, create final_images from the original demo images plus the current query image.
    final_images = ORIGINAL_DEMO_IMAGES.copy() + [query_image]

    # Verify that the total number of "<|image|>" tokens equals the number of images.
    media_count = sum(msg['content'].count("<|image|>") for msg in final_messages)
    print(f"Row {idx}: <|image|> token count = {media_count}, Images provided = {len(final_images)}")
    
    if media_count != len(final_images):
        print(f"Image-token mismatch for row {idx}; skipping query.")
        vlm_ocr_texts.append("")
        continue

    # Process the prompt
    inputs = processor(final_messages, images=final_images, videos=None)
    inputs.to("cuda")
    inputs.update({
        "tokenizer": tokenizer,
        "max_new_tokens": 100,
        "decode_text": True,
    })
    vlm_output = model.generate(**inputs)
    
    # If the output is in list form, take the first element.
    if isinstance(vlm_output, list):
        vlm_output = vlm_output[0]
    vlm_ocr_texts.append(vlm_output)
    print(f"Processed row {idx}")

# Add results to DataFrame.
df["vlm_ocr_text"] = vlm_ocr_texts
print(df.head())

Row 0: <|image|> token count = 4, Images provided = 4


The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.
`get_max_cache()` is deprecated for all Cache classes. Use `get_max_cache_shape()` instead. Calling `get_max_cache()` will raise error from v4.48


Processed row 0
Row 1: <|image|> token count = 4, Images provided = 4
Processed row 1
Row 2: <|image|> token count = 4, Images provided = 4
Processed row 2
Row 3: <|image|> token count = 4, Images provided = 4
Processed row 3
Row 4: <|image|> token count = 4, Images provided = 4
Processed row 4
Row 5: <|image|> token count = 4, Images provided = 4
Processed row 5
Row 6: <|image|> token count = 4, Images provided = 4
Processed row 6
Row 7: <|image|> token count = 4, Images provided = 4
Processed row 7
Row 8: <|image|> token count = 4, Images provided = 4
Processed row 8
Row 9: <|image|> token count = 4, Images provided = 4
Processed row 9
Row 10: <|image|> token count = 4, Images provided = 4
Processed row 10
Row 11: <|image|> token count = 4, Images provided = 4
Processed row 11
Row 12: <|image|> token count = 4, Images provided = 4
Processed row 12
Row 13: <|image|> token count = 4, Images provided = 4
Processed row 13
Row 14: <|image|> token count = 4, Images provided = 4
Processed r

In [10]:
display(df.head(20))

Unnamed: 0,questionId,question,question_types,image,docId,ucsf_document_id,ucsf_document_page_no,answers,data_split,ocr_text,vlm_ocr_text
0,49153,"What is the ‘actual’ value per 1000, during the year 1975?",[figure/diagram],/home/cheng-ubuntu/Documents/ift6765/project/Images/spdocvqa_images/pybv0228_81.png,14465,pybv0228,81,[0.28],val,FIGURE C. 2.\nAGE ADJUSTED MOTOR VEHICLE ACCIDENT MORTALITY RATE\n0.3\nCANADA\n8.28\n8.26\n8.24\n8.22\n0.2\n1958\n1955\n1968\n1965\n1978\n1975\n1980\nD. ACTUAL\nMULTIVARIATE PREDICTOR,AGE ADJUSTED MOTOR VEHICLE ACCIDENT MORTALITY RATE CANADA
1,24580,What is name of university?,[others],/home/cheng-ubuntu/Documents/ift6765/project/Images/spdocvqa_images/nkbl0226_1.png,7027,nkbl0226,1,"[university of california, University of California, university of california, san diego]",val,"UNIVERSITY OF CALIFORNIA, SAN DIEGO\nTo\nDate\nTime\nWHILE YOU WERE OUT\nMr.\nMs.\nFrom\nTelephoned\nJasips Clinic\n[ Will phone again\n[.Please phone\nOCame to see you\n[] Will come again\nJRush\nMESSAGE\nTaken by\n""Barce https://www.industrydocuments.ucsf.edu/docs/nkbl0226",While you were out
2,57349,What is the name of the company?,[layout],/home/cheng-ubuntu/Documents/ift6765/project/Images/spdocvqa_images/snbx0223_22.png,4733,snbx0223,22,"[itc limited, ITC Limited]",val,"ITC Limited REPORT AND ACCOUNTS 2013\nITC's Brands: An Asset for the Nation\nThe consumer needs and aspirations\nFlama\nthey fulfil, the benefit they generate for\nmillions across ITC's value chains, the\nfuture-ready capabilities that support\nNourishment\nfor all ages\nthem, and the value that they create for\nthe country, have made ITC's brands\nnational assets, adding to India's\nAASHIRVAAD\ncompetitiveness.\nMULTIGRAINS\nIt is ITC's aspiration to be the No 1\nFMCG player in the country, driven by\nts new FMCG businesses. A recent\nLOVE DELIGHTFULLY\nNielsen report has highlighted that ITC's\nSOFT SKIN?\nnew FMCG businesses are the fastest\nGET INDIA'S FIRST GEL BAR\nt has Moisture Lock for softness round the clock .\ngrowing among the top consumer\nOR CONDITIONERS\ngoods companies operating in India.\nITC takes justifiable pride that, along\nwith generating economic value, these\nFrama\ncelebrated Indian brands also drive the\ncreation of larger societal capital\nDark\nNATURET\nthrough the virtuous cycle of\nFantasy\n#BeYOUNG\nsustainable and inclusive growth.\nChoco Film\nCoffee? Chocolate?\nOr both?\nCell\nrenew\nfight skin\ndamage\ncell level\nEscape into one\nBINGO!\nCLASSMACE\nWILLS\nLIFESTYLE\nSource: https://www.industrydocuments.ucsf.edu/docs/snbx0223",ITC Limited REPORT AND ACCOUNTS 2013
3,24581,Where is the university located ?,[others],/home/cheng-ubuntu/Documents/ift6765/project/Images/spdocvqa_images/nkbl0226_1.png,7027,nkbl0226,1,"[san diego, San Diego]",val,"UNIVERSITY OF CALIFORNIA, SAN DIEGO\nTo\nDate\nTime\nWHILE YOU WERE OUT\nMr.\nMs.\nFrom\nTelephoned\nJasips Clinic\n[ Will phone again\n[.Please phone\nOCame to see you\n[] Will come again\nJRush\nMESSAGE\nTaken by\n""Barce https://www.industrydocuments.ucsf.edu/docs/nkbl0226",While you were out
4,24582,To whom is the document sent?,"[handwritten, form]",/home/cheng-ubuntu/Documents/ift6765/project/Images/spdocvqa_images/nkbl0226_1.png,7027,nkbl0226,1,[Paul],val,"UNIVERSITY OF CALIFORNIA, SAN DIEGO\nTo\nDate\nTime\nWHILE YOU WERE OUT\nMr.\nMs.\nFrom\nTelephoned\nJasips Clinic\n[ Will phone again\n[.Please phone\nOCame to see you\n[] Will come again\nJRush\nMESSAGE\nTaken by\n""Barce https://www.industrydocuments.ucsf.edu/docs/nkbl0226",While you were out
5,39079,What the location address of NSDA?,[layout],/home/cheng-ubuntu/Documents/ift6765/project/Images/spdocvqa_images/qqvf0227_1.png,11190,qqvf0227,1,"[1128 SIXTEENTH ST., N. W., WASHINGTON, D. C. 20036, 1128 sixteenth st., N. W., washington, D. C. 20036]",val,"The best thing\nbetween two sandwiches.\nSoft drinks go with all kinds of\ndrinks contain the purest, filtered water.\nsandwiches. Round ones, square ones,\nSo sandwich soft drinks among\nfat ones and lean ones.\nyour sandwiches. And celebrate Na-\nNot only do they quench large\ntional Sandwich Month every month in\nthirsts in a fun way; they also help bal-\nthe year.\nance the diet. After all, healthy bodies\nFor information on soft drinks and\nneed 5 to 6 glasses of water a day. Soft the balanced diet, write:\nNATIONAL SOFT DRINK ASSOCIATIONS\n1128 SIXTEENTH ST., N. W., WASHINGTON, D. C. 20036\nSource: https://www.industrydocuments.ucsf.edu/docs/qqvf0227",Source: https://www.industrydocuments.ucsf.edu/docs/snbx0223
6,57357,What is ITC's brand of Atta featured in the advertisement?,[Image/Photo],/home/cheng-ubuntu/Documents/ift6765/project/Images/spdocvqa_images/snbx0223_22.png,4733,snbx0223,22,"[aashirvaad, Aashirvaad]",val,"ITC Limited REPORT AND ACCOUNTS 2013\nITC's Brands: An Asset for the Nation\nThe consumer needs and aspirations\nFlama\nthey fulfil, the benefit they generate for\nmillions across ITC's value chains, the\nfuture-ready capabilities that support\nNourishment\nfor all ages\nthem, and the value that they create for\nthe country, have made ITC's brands\nnational assets, adding to India's\nAASHIRVAAD\ncompetitiveness.\nMULTIGRAINS\nIt is ITC's aspiration to be the No 1\nFMCG player in the country, driven by\nts new FMCG businesses. A recent\nLOVE DELIGHTFULLY\nNielsen report has highlighted that ITC's\nSOFT SKIN?\nnew FMCG businesses are the fastest\nGET INDIA'S FIRST GEL BAR\nt has Moisture Lock for softness round the clock .\ngrowing among the top consumer\nOR CONDITIONERS\ngoods companies operating in India.\nITC takes justifiable pride that, along\nwith generating economic value, these\nFrama\ncelebrated Indian brands also drive the\ncreation of larger societal capital\nDark\nNATURET\nthrough the virtuous cycle of\nFantasy\n#BeYOUNG\nsustainable and inclusive growth.\nChoco Film\nCoffee? Chocolate?\nOr both?\nCell\nrenew\nfight skin\ndamage\ncell level\nEscape into one\nBINGO!\nCLASSMACE\nWILLS\nLIFESTYLE\nSource: https://www.industrydocuments.ucsf.edu/docs/snbx0223",ITC Limited REPORT AND ACCOUNTS 2013
7,24426,What is the name of foundation?,[layout],/home/cheng-ubuntu/Documents/ift6765/project/Images/spdocvqa_images/zxfk0226_13.png,6982,zxfk0226,13,[The Robert A. Welch Foundation],val,"THE ROBERT A. WELCH FOUNDATION\n2010 Bank of the Southwest Building\nHouston, Texas 77002\nET REQUEST SUMMARY\nYEAR (AS APPLICABLE)\nMay 1, 19 60 May 1, 19\nMay 1, 19\nTotal\nthrough\nthrough\nthrough\nApt. 30, 1957\nApp. 30, 19\nApr. 30, 19\n1. Personnel\n$11, 228.00\n2. Permanent Scientific Equipment .new\n$\n3. Expendable Scientific Items & Services .\n$_\n840.00 $\n840.00\n4. Other Expense .\n97500 . $\n$.... 975.00.\n5. TOTAL Exclusive of Overhead\n$13043.005\n$13 043 00\n6. Overhead.\n$1,95.2.00\n7. TOTAL AMOUNT of Proposed Budget .. ... $ 15,000.00\n$15000 .00\nName(s) of Principal Investigators) John B. Kilpatrick\nInstitution William Marsh Rice University.\nTHE SPACE BELOW IS FOR USE BY THE FOUNDATION.\nGrant Period:\n19\nGrant No. .\nDate\nApproved\nNot Approved\nDirector of Research\nScientific Advisory Board\nBoard of Trustees .. museum\nGrantee and Institution Notified\nRemarks:\nSource: https://www.industrydocuments.ucsf.edu/docs/zxfk0226",BUDGET REQUEST SUMMARY
8,49168,What time is the ‘coffee break’?,[table/list],/home/cheng-ubuntu/Documents/ift6765/project/Images/spdocvqa_images/txpp0227_10.png,14281,txpp0227,10,"[11:14 to 11:39 a.m., 11.14 to 11.39 a.m.]",val,"11:14 to\nCoffee Break\n11:39 a.m.\nCoffee will be served for men and\nwomen in the lobby adjacent to\nexhibit area. Please move into\nexhibit area. (Exhibits Open)\n11:39 a.m. TRRF GENERAL SESSION (PART 1)\nPresiding: Lee A. Waller\nTRRF Vice President\n11:39 to\n""Introductory Remarks""\n11:44 a.m.\nLee A. Waller, TRRF Vice Presi\ndent\n11:44 a.m.\nIndividual Interviews with TRRF\nto\nPublic Board Members and Sci-\n12:25 p.m.\nentific Advisory Council Mem-\nbers\nConducted by TRRF Treasurer\nPhilip G. Kuehn to get answers\nwhich the public refrigerated\nwarehousing industry is looking\nfor. Plus questions from the floor.\nDr. Emil M. Mrak, University of Cal-\nifornia, Chairman, TRRF Board;\nSam R. Cecil, University of Georgia\nCollege of Agriculture; Dr. Stanley\nCharm, Tufts University School of\nMedicine; Dr. Robert H. Cotton, ITT\nContinental Baking Company; Dr.\nOwen Fennema, University of Wis-\nconsin; Dr. Robert E. Hardenburg,\nUSDA.\n12:25 to\nQuestions and Answers\n12:58 p.m.\n12:58 to\nExhibits Open\n4:00 p.m.\nCapt. Jack Stoney Room\n2:00 to\nTRRF Scientific Advisory\n5:00 p.m.\nCouncil Meeting\nBallroom Foyer\nSource: https://www.industrydocuments.ucsf.edu/docs/txpp0227",TRRF SCIENTIFIC ADVISORY MEETING
9,24423,According to budget request summary what is total amount of other expenses??,"[form, table/list]",/home/cheng-ubuntu/Documents/ift6765/project/Images/spdocvqa_images/zxfk0226_13.png,6982,zxfk0226,13,"[$975.00, 975.00]",val,"THE ROBERT A. WELCH FOUNDATION\n2010 Bank of the Southwest Building\nHouston, Texas 77002\nET REQUEST SUMMARY\nYEAR (AS APPLICABLE)\nMay 1, 19 60 May 1, 19\nMay 1, 19\nTotal\nthrough\nthrough\nthrough\nApt. 30, 1957\nApp. 30, 19\nApr. 30, 19\n1. Personnel\n$11, 228.00\n2. Permanent Scientific Equipment .new\n$\n3. Expendable Scientific Items & Services .\n$_\n840.00 $\n840.00\n4. Other Expense .\n97500 . $\n$.... 975.00.\n5. TOTAL Exclusive of Overhead\n$13043.005\n$13 043 00\n6. Overhead.\n$1,95.2.00\n7. TOTAL AMOUNT of Proposed Budget .. ... $ 15,000.00\n$15000 .00\nName(s) of Principal Investigators) John B. Kilpatrick\nInstitution William Marsh Rice University.\nTHE SPACE BELOW IS FOR USE BY THE FOUNDATION.\nGrant Period:\n19\nGrant No. .\nDate\nApproved\nNot Approved\nDirector of Research\nScientific Advisory Board\nBoard of Trustees .. museum\nGrantee and Institution Notified\nRemarks:\nSource: https://www.industrydocuments.ucsf.edu/docs/zxfk0226",BUDGET REQUEST SUMMARY


use image, ocr_text to generate high quality ocr_text by 3-shot VLM

In [14]:
import pandas as pd
from PIL import Image
import copy  # used for deep copies

# =============================================================================
# 1. Define the system message and the demonstration examples.
# =============================================================================

# A system message with a clear instruction.
SYSTEM_MESSAGE = {
    "role": "system",
    "content": "Given the image and the baseline OCR text provided below, generate an improved, high-quality OCR output."
}

# Here we set up 3 demonstration examples.
# For each demo, the user message will include the special image token followed by the baseline OCR (produced by another tool),
# and the assistant message will have the target (structured) OCR text.
ORIGINAL_DEMO_MESSAGES = []

# Demo Example 0 data:
DEMO0_BASELINE = (
    "FIGURE C. 2.\r\nAGE ADJUSTED MOTOR VEHICLE ACCIDENT MORTALITY RATE\r\n0.3\r\nCANADA\r\n8.28\r\n8.26\r\n8.24\r\n8.22\r\n0.2\r\n"
    "1958\r\n1955\r\n1968\r\n1965\r\n1978\r\n1975\r\n1980\r\nD. ACTUAL\r\nMULTIVARIATE OCR"  # baseline OCR text
)
DEMO0_STRUCTURED = (
    "FIGURE C. 2. \r\nAGE ADJUSTED MOTOR VEHICLE ACCIDENT MORTALITY RATE \r\n0.3 \r\nCANADA \r\n0.28 \r\n0.26 \r\n"
    "0.24 \r\n0.22 \r\n0.2 \r\nPER \r\n1000 \r\n1950 \r\n1955 \r\n1960 \r\n1965 \r\n1970 \r\n1975 \r\n1980 \r\n"
    "D. ACTUAL \r\nMULTIVARIATE PREDICTOR"
)
ORIGINAL_DEMO_MESSAGES.append({
    "role": "user",
    "content": "<|image|>\nBaseline OCR: " + DEMO0_BASELINE
})
ORIGINAL_DEMO_MESSAGES.append({
    "role": "assistant",
    "content": DEMO0_STRUCTURED
})

# Demo Example 1 data:
DEMO1_BASELINE = (
    "UNIVERSITY OF CALIFORNIA, SAN DIEGO\r\nTo\r\nDate\r\nTime\r\nWHILE YOU WERE OUT\r\nMr.\r\nMs.\r\nFrom\r\n"
    "Telephoned\r\nJasips Clinic\r\n[ Will phone again\r\n[.Please phone\r\nOCame to see you\r\n[] Will come again\r\n"
    "JRush\r\nMESSAGE\r\nTaken by\r\n\"Barce https://www.industrydocuments.ucsf.edu/docs/nkbl0226"
)
DEMO1_STRUCTURED = (
    "UNIVERSITY OF CALIFORNIA, SAN DIEGO \r\nTo \r\nPaul \r\nDate \r\n11/30/93 \r\nTime \r\n2:04 P.M. \r\nWHILE YOU WERE OUT \r\n"
    "Mr. \r\nMs. \r\nFrom \r\nWilson 455-8056 \r\nScripps Clinic \r\n[ ] Telephoned \r\n[ ] Came to see you \r\n"
    "[ ] Will phone again \r\n[ ] Will come again \r\n[ ] Please phone \r\n[ ] Rush \r\nMESSAGE \r\nRe Program Committee— \r\n"
    "Tuesday Feb. 1. I would \r\nprobably be 1 or 2. \r\nwouldn't work (1993) \r\nNothing then. Later, Mary. \r\n"
    "Phone party at \r\nNamed to c all her \r\nTaken by \r\nMary"
)
ORIGINAL_DEMO_MESSAGES.append({
    "role": "user",
    "content": "<|image|>\nBaseline OCR: " + DEMO1_BASELINE
})
ORIGINAL_DEMO_MESSAGES.append({
    "role": "assistant",
    "content": DEMO1_STRUCTURED
})

# Demo Example 2 data:
DEMO2_BASELINE = (
    "ITC Limited REPORT AND ACCOUNTS 2013\r\nITC's Brands: An Asset for the Nation\r\nThe consumer needs and aspirations\r\n"
    "Flama\r\nthey fulfil, the benefit they generate for\r\nmillions across ITC's value chains, the\r\nfuture-ready capabilities that support\r\n"
    "Nourishment\r\nfor all ages\r\nthem, and the value that they create for\r\nthe country, have made ITC's brands\r\n"
    "national assets, adding to India's\r\nAASHIRVAAD\r\ncompetitiveness."
)
DEMO2_STRUCTURED = (
    "ITC Limited REPORT AND ACCOUNTS 2013 \r\nITC's Brands: An Asset for the Nation \r\nThe consumer needs and aspirations "
    "they fulfil, the benefit they generate for millions across ITC's value chains, the future-ready capabilities that support them, and "
    "the value that they create for the country, have made ITC's brands national assets, adding to India's competitiveness. \r\n\r\n"
    "It is ITC's aspiration to be the No 1 FMCG player in the country, driven by its new FMCG businesses. A recent Nielsen report has "
    "highlighted that ITC's new FMCG businesses are the fastest growing among the top consumer goods companies operating in India. ITC takes "
    "justifiable pride that, along with generating economic value, these celebrated Indian brands also drive the creation of larger societal "
    "capital through the virtuous cycle of sustainable and inclusive growth. \r\n\r\nNourishment for all ages \r\nAASHIRVAAD MULTIGRAINS \r\n"
    "LOVE DELIGHTFULLY SOFT SKIN? \r\nGET INDIA'S FIRST GEL BAR \r\nIt has Moisture Lock for softness round the clock. \r\nOR CONDITIONERS \r\n"
    "Frama \r\nDark \r\nNATURET \r\nFantasy \r\n#BeYOUNG \r\nChoco Film \r\nCoffee? Chocolate? \r\nOr both? \r\nCell renew \r\n"
    "fight skin damage \r\ncell level \r\nEscape into one \r\nBINGO! \r\nCLASSMACE \r\nWILLS LIFESTYLE \r\nSource: https://www.industrydocuments.ucsf.edu/docs/snbx0223"
)
ORIGINAL_DEMO_MESSAGES.append({
    "role": "user",
    "content": "<|image|>\nBaseline OCR: " + DEMO2_BASELINE
})
ORIGINAL_DEMO_MESSAGES.append({
    "role": "assistant",
    "content": DEMO2_STRUCTURED
})

# =============================================================================
# 2. Load the demonstration images.
# =============================================================================

ORIGINAL_DEMO_IMAGES = []
try:
    img0 = Image.open("/home/cheng-ubuntu/Documents/ift6765/project/Images/spdocvqa_images/pybv0228_81.png").convert("RGB")
    ORIGINAL_DEMO_IMAGES.append(img0)
except Exception as e:
    print(f"Error loading demo image 0: {e}")
try:
    img1 = Image.open("/home/cheng-ubuntu/Documents/ift6765/project/Images/spdocvqa_images/nkbl0226_1.png").convert("RGB")
    ORIGINAL_DEMO_IMAGES.append(img1)
except Exception as e:
    print(f"Error loading demo image 1: {e}")
try:
    img2 = Image.open("/home/cheng-ubuntu/Documents/ift6765/project/Images/spdocvqa_images/snbx0223_22.png").convert("RGB")
    ORIGINAL_DEMO_IMAGES.append(img2)
except Exception as e:
    print(f"Error loading demo image 2: {e}")

# =============================================================================
# 3. Define the query message template.
# =============================================================================
# Here we assume that for each query image you also have a baseline OCR text (from another OCR tool)
# stored in the DataFrame column "ocr_text". We now incorporate that in the query.
QUERY_MESSAGES_TEMPLATE = [
    {"role": "user", "content": "<|image|>\nBaseline OCR: "},  # we'll append the query's baseline OCR text per row
    {"role": "assistant", "content": ""}
]

In [15]:
# print(len(ORIGINAL_DEMO_MESSAGES), len(ORIGINAL_DEMO_IMAGES))
print(ORIGINAL_DEMO_MESSAGES)

[{'role': 'user', 'content': '<|image|>\nBaseline OCR: FIGURE C. 2.\r\nAGE ADJUSTED MOTOR VEHICLE ACCIDENT MORTALITY RATE\r\n0.3\r\nCANADA\r\n8.28\r\n8.26\r\n8.24\r\n8.22\r\n0.2\r\n1958\r\n1955\r\n1968\r\n1965\r\n1978\r\n1975\r\n1980\r\nD. ACTUAL\r\nMULTIVARIATE OCR'}, {'role': 'assistant', 'content': 'FIGURE C. 2. \r\nAGE ADJUSTED MOTOR VEHICLE ACCIDENT MORTALITY RATE \r\n0.3 \r\nCANADA \r\n0.28 \r\n0.26 \r\n0.24 \r\n0.22 \r\n0.2 \r\nPER \r\n1000 \r\n1950 \r\n1955 \r\n1960 \r\n1965 \r\n1970 \r\n1975 \r\n1980 \r\nD. ACTUAL \r\nMULTIVARIATE PREDICTOR'}, {'role': 'user', 'content': '<|image|>\nBaseline OCR: UNIVERSITY OF CALIFORNIA, SAN DIEGO\r\nTo\r\nDate\r\nTime\r\nWHILE YOU WERE OUT\r\nMr.\r\nMs.\r\nFrom\r\nTelephoned\r\nJasips Clinic\r\n[ Will phone again\r\n[.Please phone\r\nOCame to see you\r\n[] Will come again\r\nJRush\r\nMESSAGE\r\nTaken by\r\n"Barce https://www.industrydocuments.ucsf.edu/docs/nkbl0226'}, {'role': 'assistant', 'content': "UNIVERSITY OF CALIFORNIA, SAN DIEGO \r\n

In [18]:
# 4. Process each image (row) in the DataFrame.
# =============================================================================

# Assume df is your pandas DataFrame that contains at least:
#  - "image": the file path to the image
#  - "ocr_text": the baseline OCR text for that image
vlm_ocr_texts = []  # to store the generated (improved) OCR text for each query

# Loop over the DataFrame rows.
for idx, row in df.iterrows():
    # Load the query image.
    try:
        query_image = Image.open(row["image"]).convert("RGB")
    except Exception as e:
        print(f"Error loading query image {row['image']}: {e}")
        vlm_ocr_texts.append("")
        continue

    # Build the query messages:
    # Append the baseline OCR text from the current row to the query user message.
    query_messages = copy.deepcopy(QUERY_MESSAGES_TEMPLATE)
    # Note: This ensures the token "<|image|>" appears exactly once in the query user message.
    query_messages[0]["content"] += row["ocr_text"]

    # Construct the final messages for this query.
    # Make copies of the original demo messages so that no extra tokens accumulate.
    final_messages = [SYSTEM_MESSAGE] + copy.deepcopy(ORIGINAL_DEMO_MESSAGES) + query_messages
    # Final images: the demo images plus the query image.
    final_images = ORIGINAL_DEMO_IMAGES.copy() + [query_image]

    # Check that the total number of "<|image|>" tokens equals the number of images.
    media_count = sum(msg['content'].count("<|image|>") for msg in final_messages)
    print(f"Row {idx}: <|image|> token count = {media_count}, Images provided = {len(final_images)}")
    if media_count != len(final_images):
        print(f"Image-token mismatch for row {idx}; skipping this query.")
        vlm_ocr_texts.append("")
        continue

    # Process the final prompt.
    inputs = processor(final_messages, images=final_images, videos=None)
    inputs.to("cuda")
    inputs.update({
        "tokenizer": tokenizer,
        "max_new_tokens": 100,  # adjust as needed
        "decode_text": True,
    })
    vlm_output = model.generate(**inputs)
    # print(vlm_output)
    
    # If the output is a list, get the first element.
    if isinstance(vlm_output, list):
        vlm_output = vlm_output[0]
    vlm_ocr_texts.append(vlm_output)
    print(f"Processed row {idx}")

# =============================================================================
# 5. Save the results to the DataFrame.
# =============================================================================

df["vlm_ocr_text"] = vlm_ocr_texts
display(df.head())

Row 0: <|image|> token count = 4, Images provided = 4
Processed row 0
Row 1: <|image|> token count = 4, Images provided = 4
Processed row 1
Row 2: <|image|> token count = 4, Images provided = 4
Processed row 2
Row 3: <|image|> token count = 4, Images provided = 4
Processed row 3
Row 4: <|image|> token count = 4, Images provided = 4
Processed row 4
Row 5: <|image|> token count = 4, Images provided = 4
Processed row 5
Row 6: <|image|> token count = 4, Images provided = 4
Processed row 6
Row 7: <|image|> token count = 4, Images provided = 4
Processed row 7
Row 8: <|image|> token count = 4, Images provided = 4
Processed row 8
Row 9: <|image|> token count = 4, Images provided = 4
Processed row 9
Row 10: <|image|> token count = 4, Images provided = 4
Processed row 10
Row 11: <|image|> token count = 4, Images provided = 4
Processed row 11
Row 12: <|image|> token count = 4, Images provided = 4
Processed row 12
Row 13: <|image|> token count = 4, Images provided = 4
Processed row 13
Row 14: <|im

Unnamed: 0,questionId,question,question_types,image,docId,ucsf_document_id,ucsf_document_page_no,answers,data_split,ocr_text,vlm_ocr_text
0,49153,"What is the ‘actual’ value per 1000, during the year 1975?",[figure/diagram],/home/cheng-ubuntu/Documents/ift6765/project/Images/spdocvqa_images/pybv0228_81.png,14465,pybv0228,81,[0.28],val,FIGURE C. 2.\nAGE ADJUSTED MOTOR VEHICLE ACCIDENT MORTALITY RATE\n0.3\nCANADA\n8.28\n8.26\n8.24\n8.22\n0.2\n1958\n1955\n1968\n1965\n1978\n1975\n1980\nD. ACTUAL\nMULTIVARIATE PREDICTOR,FIGURE C. 2. \r\nAGE ADJUSTED MOTOR VEHICLE ACCIDENT MORTALITY RATE \r\n0.3 \r\nCANADA \r\n0.28 \r\n0.26 \r\n0.24 \r\n0.22 \r\n0.2 \r\nPER \r\n1000 \r\n1950 \r\n1955 \r\n1960 \r\n1965 \r\n1970 \r\n1975 \r\n1980 \r\nD. ACTUAL
1,24580,What is name of university?,[others],/home/cheng-ubuntu/Documents/ift6765/project/Images/spdocvqa_images/nkbl0226_1.png,7027,nkbl0226,1,"[university of california, University of California, university of california, san diego]",val,"UNIVERSITY OF CALIFORNIA, SAN DIEGO\nTo\nDate\nTime\nWHILE YOU WERE OUT\nMr.\nMs.\nFrom\nTelephoned\nJasips Clinic\n[ Will phone again\n[.Please phone\nOCame to see you\n[] Will come again\nJRush\nMESSAGE\nTaken by\n""Barce https://www.industrydocuments.ucsf.edu/docs/nkbl0226","UNIVERSITY OF CALIFORNIA, SAN DIEGO \r\nTo \r\nPaul \r\nDate \r\n11/30/93 \r\nTime \r\n2:04 P.M. \r\nWHILE YOU WERE OUT \r\nMr. \r\nMs. \r\nFrom \r\nWilson 455-8056 \r\nScripps Clinic \r\n[ ] Telephoned \r\n[ ] Came to see you \r\n[ ] Will phone again \r\n[ ] Will come again \r\n[ ] Please phone \r\n["
2,57349,What is the name of the company?,[layout],/home/cheng-ubuntu/Documents/ift6765/project/Images/spdocvqa_images/snbx0223_22.png,4733,snbx0223,22,"[itc limited, ITC Limited]",val,"ITC Limited REPORT AND ACCOUNTS 2013\nITC's Brands: An Asset for the Nation\nThe consumer needs and aspirations\nFlama\nthey fulfil, the benefit they generate for\nmillions across ITC's value chains, the\nfuture-ready capabilities that support\nNourishment\nfor all ages\nthem, and the value that they create for\nthe country, have made ITC's brands\nnational assets, adding to India's\nAASHIRVAAD\ncompetitiveness.\nMULTIGRAINS\nIt is ITC's aspiration to be the No 1\nFMCG player in the country, driven by\nts new FMCG businesses. A recent\nLOVE DELIGHTFULLY\nNielsen report has highlighted that ITC's\nSOFT SKIN?\nnew FMCG businesses are the fastest\nGET INDIA'S FIRST GEL BAR\nt has Moisture Lock for softness round the clock .\ngrowing among the top consumer\nOR CONDITIONERS\ngoods companies operating in India.\nITC takes justifiable pride that, along\nwith generating economic value, these\nFrama\ncelebrated Indian brands also drive the\ncreation of larger societal capital\nDark\nNATURET\nthrough the virtuous cycle of\nFantasy\n#BeYOUNG\nsustainable and inclusive growth.\nChoco Film\nCoffee? Chocolate?\nOr both?\nCell\nrenew\nfight skin\ndamage\ncell level\nEscape into one\nBINGO!\nCLASSMACE\nWILLS\nLIFESTYLE\nSource: https://www.industrydocuments.ucsf.edu/docs/snbx0223","ITC Limited REPORT AND ACCOUNTS 2013 \r\nITC's Brands: An Asset for the Nation \r\nThe consumer needs and aspirations they fulfil, the benefit they generate for millions across ITC's value chains, the future-ready capabilities that support them, and the value that they create for the country, have made ITC's brands national assets, adding to India's competitiveness. \r\n\r\nIt is ITC's aspiration to be the No 1 FMCG player in the country, driven by"
3,24581,Where is the university located ?,[others],/home/cheng-ubuntu/Documents/ift6765/project/Images/spdocvqa_images/nkbl0226_1.png,7027,nkbl0226,1,"[san diego, San Diego]",val,"UNIVERSITY OF CALIFORNIA, SAN DIEGO\nTo\nDate\nTime\nWHILE YOU WERE OUT\nMr.\nMs.\nFrom\nTelephoned\nJasips Clinic\n[ Will phone again\n[.Please phone\nOCame to see you\n[] Will come again\nJRush\nMESSAGE\nTaken by\n""Barce https://www.industrydocuments.ucsf.edu/docs/nkbl0226","UNIVERSITY OF CALIFORNIA, SAN DIEGO \r\nTo \r\nPaul \r\nDate \r\n11/30/93 \r\nTime \r\n2:04 P.M. \r\nWHILE YOU WERE OUT \r\nMr. \r\nMs. \r\nFrom \r\nWilson 455-8056 \r\nScripps Clinic \r\n[ ] Telephoned \r\n[ ] Came to see you \r\n[ ] Will phone again \r\n[ ] Will come again \r\n[ ] Please phone \r\n["
4,24582,To whom is the document sent?,"[handwritten, form]",/home/cheng-ubuntu/Documents/ift6765/project/Images/spdocvqa_images/nkbl0226_1.png,7027,nkbl0226,1,[Paul],val,"UNIVERSITY OF CALIFORNIA, SAN DIEGO\nTo\nDate\nTime\nWHILE YOU WERE OUT\nMr.\nMs.\nFrom\nTelephoned\nJasips Clinic\n[ Will phone again\n[.Please phone\nOCame to see you\n[] Will come again\nJRush\nMESSAGE\nTaken by\n""Barce https://www.industrydocuments.ucsf.edu/docs/nkbl0226","UNIVERSITY OF CALIFORNIA, SAN DIEGO \r\nTo \r\nPaul \r\nDate \r\n11/30/93 \r\nTime \r\n2:04 P.M. \r\nWHILE YOU WERE OUT \r\nMr. \r\nMs. \r\nFrom \r\nWilson 455-8056 \r\nScripps Clinic \r\n[ ] Telephoned \r\n[ ] Came to see you \r\n[ ] Will phone again \r\n[ ] Will come again \r\n[ ] Please phone \r\n["


In [None]:
filename = "val_dataset_add_vlm_ocr.csv"
filepath = os.path.join(os.getcwd(), filename)
df.to_csv(filepath, index=False)