In [1]:
!pip install datasets transformers accelerate peft trl bitsandbytes python-dotenv packaging

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting transformers
  Downloading transformers-4.53.0-py3-none-any.whl.metadata (39 kB)
Collecting accelerate
  Downloading accelerate-1.8.1-py3-none-any.whl.metadata (19 kB)
Collecting peft
  Downloading peft-0.15.2-py3-none-any.whl.metadata (13 kB)
Collecting trl
  Downloading trl-0.19.0-py3-none-any.whl.metadata (10 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.46.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting python-dotenv
  Downloading python_dotenv-1.1.1-py3-none-any.whl.metadata (24 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025

In [2]:
# Import libraries after installing all dependencies
import os
import torch
import json
from datasets import load_dataset
from datetime import datetime
from dateutil import parser
from transformers import AutoProcessor, Gemma3ForConditionalGeneration, BitsAndBytesConfig
from peft import LoraConfig
from trl import SFTConfig, SFTTrainer

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [3]:
# Load environment variables and log in to the Hugging Face Hub
from dotenv import load_dotenv
from huggingface_hub import login

load_dotenv("env.txt")
token = os.getenv("HUGGINGFACE_TOKEN")
if not token:
    # Fail early with a clear message instead of attempting a login with the literal string "None".
    raise RuntimeError("HUGGINGFACE_TOKEN is not set; add it to env.txt or the environment.")

# Log in through the Python API so the secret is never interpolated into a shell command line.
login(token=token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
The token `maujadiaiengineer` has been saved to /home/ec2-user/.cache/huggingface/stored_tokens
Your token has been saved to /home/ec2-user/.cache/huggingface/token
Login successful.
The current active token is: `maujadiaiengineer`


In [4]:
# Load dataset from HuggingFace
dataset = load_dataset("cindyliang/receipts-v1", split="train")

# Check every sample of the split for a non-RGB image. Using the real length
# (instead of a fixed count) avoids an IndexError on smaller splits and
# unchecked samples on larger ones.
for i in range(len(dataset)):
    img = dataset[i]['image']
    if img.mode != "RGB":
        print("This image is not RGB")

# Show one raw sample (image plus its JSON ground truth)
dataset[2]

README.md:   0%|          | 0.00/705 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/16.8M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/2.40M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/2.91M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/100 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/12 [00:00<?, ? examples/s]

{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=960x1280>,
 'ground_truth': '{"gt_parses": [{"question": "extract store name, date, total", "answer": ["Tokyo Sundubu", "07/02/2024", "$40.77"]}]}'}

In [5]:
def format_data(sample):
    """Turn one raw dataset sample into the normalized training target.

    Args:
        sample: Mapping with a "ground_truth" key holding a JSON string shaped like
            {"gt_parses": [{"question": ..., "answer": [store_name, date, total]}]}.

    Returns:
        {"text": (STORE_NAME, "DD-MM-YYYY", "$TOTAL")}, or None when the annotation
        is missing, malformed, blank, or carries a date that cannot be parsed.

    Raises:
        json.JSONDecodeError: If "ground_truth" is not valid JSON.
    """
    # Parsing "ground_truth" from HuggingFace dataset
    ground_truth_parsed = json.loads(sample["ground_truth"])

    # Get answer from "gt_parses"; a missing or empty annotation marks an invalid
    # sample instead of aborting the whole preprocessing run.
    try:
        answer = ground_truth_parsed["gt_parses"][0]["answer"]
    except (KeyError, IndexError, TypeError):
        return None

    # The three fields are read positionally, so all of them must be present.
    if not isinstance(answer, (list, tuple)) or len(answer) < 3:
        return None

    # Define store_name, date and total
    store_name, date, total = answer[0], answer[1], answer[2]

    # Null or non-text fields (e.g. JSON null) cannot be normalized.
    if not all(isinstance(field, str) for field in (store_name, date, total)):
        return None

    # Strip once and reuse the stripped values, so the emptiness check and the
    # "$" prefix check below see the same text that ends up in the output.
    store_name, date, total = store_name.strip(), date.strip(), total.strip()

    # Check if store_name, date and total have "" or not
    if not store_name or not date or not total:
        return None

    # All store name must capital using upper()
    store_name = store_name.upper()

    # Parsing date to "%d-%m-%Y" format. Ambiguous dates are read month-first by
    # default, e.g. "07/02/2024" becomes "02-07-2024".
    try:
        date = parser.parse(date).strftime("%d-%m-%Y")
    except (ValueError, OverflowError):
        # Unparseable date text is treated like any other invalid field.
        return None

    # Check if total have $ or not, if total don't have $, add $
    if not total.startswith("$"):
        total = "$" + total

    # Define assistant answer
    assistant = (
        f"{store_name}", f"{date}", f"{total}"
    )

    return {
        "text": assistant,
    }

# Create result based on format_data function.
# Validate before mapping: filter hands every example to its predicate as a dict,
# so a "not None" check placed after map can never remove a row. Only the
# "ground_truth" column is read here, so images are not touched while validating.
valid_samples = dataset.filter(
    lambda ground_truth: format_data({"ground_truth": ground_truth}) is not None,
    input_columns=["ground_truth"],
)
result = valid_samples.map(format_data, remove_columns="ground_truth")

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Filter:   0%|          | 0/97 [00:00<?, ? examples/s]

In [6]:
# Peek at one formatted sample: its column names first, then the sample itself.
formatted_sample = result[2]
print(formatted_sample.keys())
formatted_sample

dict_keys(['image', 'text'])


{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=960x1280>,
 'text': ['TOKYO SUNDUBU', '02-07-2024', '$40.77']}

In [7]:
def create_prompt(sample):
    """Wrap one formatted sample in a system/user/assistant chat structure.

    Args:
        sample: Mapping with "image" (the receipt picture) and "text" (the expected answer).

    Returns:
        Dict with the original "image" and a three-turn "messages" list: a system
        instruction, a user turn holding the request text followed by the image,
        and an assistant turn holding sample["text"].
    """
    receipt_image = sample["image"]

    # NOTE(review): these two strings are part of the training data; inference
    # prompts presumably have to match them exactly - confirm before rewording.
    system_turn = {
        "role": "system",
        "content": [
            {
                "type": "text",
                "text": "You are a expert at extract information from receipts and you must identify accurately store name, date of purchase and total amount from receipts.",
            }
        ],
    }

    user_turn = {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "Extract store name, date and total information based this receipt photo",
            },
            {"type": "image", "image": receipt_image},
        ],
    }

    # The expected answer is passed through unchanged, in whatever form the sample holds it.
    assistant_turn = {
        "role": "assistant",
        "content": [{"type": "text", "text": sample['text']}],
    }

    return {
        "image": receipt_image,
        "messages": [system_turn, user_turn, assistant_turn],
    }

# Build the chat-formatted training set: one prompt dict per formatted sample.
finalresult = list(map(create_prompt, result))

In [8]:
# Peek at one chat-formatted sample: its top-level keys first, then the full structure.
prompt_sample = finalresult[2]
print(prompt_sample.keys())
prompt_sample

dict_keys(['image', 'messages'])


{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=960x1280>,
 'messages': [{'role': 'system',
   'content': [{'type': 'text',
     'text': 'You are a expert at extract information from receipts and you must identify accurately store name, date of purchase and total amount from receipts.'}]},
  {'role': 'user',
   'content': [{'type': 'text',
     'text': 'Extract store name, date and total information based this receipt photo'},
    {'type': 'image',
     'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=960x1280>}]},
  {'role': 'assistant',
   'content': [{'type': 'text',
     'text': ['TOKYO SUNDUBU', '02-07-2024', '$40.77']}]}]}

In [9]:
# QLoRA quantization settings for loading the base model
bnbconfig = BitsAndBytesConfig(
    # Load the weights in 4-bit, using the "nf4" quantization type
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    # Enable double quantization
    bnb_4bit_use_double_quant=True,
    # Compute in bfloat16
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [10]:
# Base checkpoint downloaded from the Hugging Face Hub.
model_name = "google/gemma-3-4b-it"
# Local folder that receives the fine-tuned model files and the processor.
new_model = "gemma-finetuning-receipts-extraction"

# Load the base model with the quantization config defined above, in bfloat16,
# letting device_map="auto" decide the device placement.
model = Gemma3ForConditionalGeneration.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    quantization_config=bnbconfig,
)

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/90.6k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.64G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

In [11]:
# Load the processor published with the base checkpoint. The collator uses it to
# apply the chat template and to turn text plus images into model inputs.
processing_class = AutoProcessor.from_pretrained(model_name)

processor_config.json:   0%|          | 0.00/70.0 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/1.61k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

In [12]:
# LoRA adapter configuration used for parameter-efficient fine-tuning
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    # Attach adapters to every linear layer
    target_modules="all-linear",
    # Adapter rank, scaling factor and dropout
    r=4,
    lora_alpha=16,
    lora_dropout=0.1,
    # Bias terms are not trained
    bias="none",
)

In [13]:
# Create data collator
def collate_fn(examples):
    """Build one training batch from chat-formatted samples.

    Args:
        examples: List of dicts, each with "messages" (the chat turns) and "image".

    Returns:
        The processor output for the batch with an added "labels" entry: a copy of
        "input_ids" in which padding and image placeholder tokens are replaced by
        -100 so they are excluded from the loss.
    """
    texts = [
        processing_class.apply_chat_template(
            example["messages"], tokenize=False, add_generation_prompt=False
        ).strip()
        for example in examples
    ]
    # One image list per sample keeps every image paired with its own text when
    # the batch holds more than one sample.
    images = [[example["image"]] for example in examples]
    batch = processing_class(text=texts, images=images, return_tensors="pt", padding=True)
    labels = batch["input_ids"].clone()

    tokenizer = processing_class.tokenizer
    # Keep the id as a plain int: comparing the label tensor with a one-element
    # list does not produce an element-wise mask, which left this token unmasked.
    boi_token_id = tokenizer.convert_tokens_to_ids(tokenizer.special_tokens_map["boi_token"])
    # Hard-coded id kept from the original; presumably Gemma 3's image soft
    # token - TODO confirm against the tokenizer.
    image_soft_token_id = 262144

    for ignored_id in (tokenizer.pad_token_id, boi_token_id, image_soft_token_id):
        labels[labels == ignored_id] = -100

    batch["labels"] = labels
    return batch

In [14]:
# Supervised fine-tuning configuration, grouped by concern
sft_config = SFTConfig(
    # Output, checkpointing and logging
    output_dir="./results",
    save_steps=15,
    logging_steps=15,
    report_to="none",
    # Schedule and batching
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    group_by_length=False,
    # Optimization
    learning_rate=0.002,
    weight_decay=0.001,
    warmup_ratio=0.03,
    max_grad_norm=0.3,
    # Precision and memory
    bf16=True,
    gradient_checkpointing=True,
    # Skip dataset preparation and keep all columns, so the custom collator
    # receives the "messages" and "image" entries of every sample.
    dataset_kwargs={"skip_prepare_dataset": True},
    remove_unused_columns=False,
)

# Wire the model, data, adapter config, processor and collator into one trainer
trainer = SFTTrainer(
    model=model,
    processing_class=processing_class,
    peft_config=peft_config,
    train_dataset=finalresult,
    data_collator=collate_fn,
    args=sft_config,
)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [15]:
# Run the fine-tuning loop
trainer.train()

# Persist the processor first, then the trained model, into the same output folder
for artifact in (trainer.processing_class, trainer.model):
    artifact.save_pretrained(new_model)

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
15,1.7292
30,0.3751
45,0.2322
60,0.217
75,0.1491
90,0.1148




In [16]:
# Upload the saved model files and processor to the Hugging Face model repository.
from huggingface_hub import HfApi

api = HfApi(token=os.getenv("HUGGINGFACE_TOKEN"))
api.upload_folder(
    # Reuse the folder the model and processor were saved to, instead of a
    # duplicated literal that can drift from it.
    folder_path=new_model,
    repo_id="budionosan/gemma-finetuning-receipts-extraction",
    repo_type="model",
)

adapter_model.safetensors:   0%|          | 0.00/38.6M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/budionosan/gemma-finetuning-receipts-extraction/commit/7500d8df3812d0f2af5ac91fded67b2c84920a19', commit_message='Upload folder using huggingface_hub', commit_description='', oid='7500d8df3812d0f2af5ac91fded67b2c84920a19', pr_url=None, repo_url=RepoUrl('https://huggingface.co/budionosan/gemma-finetuning-receipts-extraction', endpoint='https://huggingface.co', repo_type='model', repo_id='budionosan/gemma-finetuning-receipts-extraction'), pr_revision=None, pr_num=None)