In [1]:
from transformers import AutoProcessor, BitsAndBytesConfig, Idefics3ForConditionalGeneration
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from transformers import TrainingArguments, Trainer
from datasets import Dataset, DatasetDict
from torch.utils.data import DataLoader
from datasets import load_from_disk
from datasets import load_dataset
from PIL import Image
from tqdm import tqdm
from io import BytesIO
import numpy as np
import base64
import torch
import json
import os

Using TensorFlow backend.





In [2]:
NUMBER_EPOCHS = 30

In [None]:

if torch.cuda.is_available()==True:
    DEVICE = "cuda"
else:
    DEVICE = "cpu"


model_id = "ds4sd/SmolDocling-256M-preview" 
print("Loading processor ...")
processor = AutoProcessor.from_pretrained(
    model_id
)
print("Processor Loaded.")


In [3]:
def collate_fn(examples):
    texts = []
    images = []

    for example in examples:
        image = example["image"]
        if image.mode != 'RGB':
            image = image.convert('RGB')

        output_text = example["output"]

        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Convert this page to docling."},
                    {"type": "image"}
                ]
            },
            {
                "role": "assistant",
                "content": [
                    {"type": "text", "text": output_text}
                ]
            }
        ]

        chat = processor.apply_chat_template(messages, add_generation_prompt=False)
        texts.append(chat.strip())
        images.append(image)  # enlever la liste si pas besoin

    batch = processor(text=texts, images=images, return_tensors="pt", padding=True)

    labels = batch["input_ids"].clone()
    labels[labels == processor.tokenizer.pad_token_id] = -100
    labels[labels == image_token_id] = -100
    batch["labels"] = labels

    return batch


In [None]:
print("Loading configurations and Model ....")

lora_config = LoraConfig(
    r=8,
    lora_alpha=8,
    lora_dropout=0.1,
    target_modules=['down_proj','o_proj','k_proj','q_proj','gate_proj','up_proj','v_proj'],
    use_dora= True,
    init_lora_weights="gaussian"
)
lora_config.inference_mode = False


model = Idefics3ForConditionalGeneration.from_pretrained(
    model_id,
    quantization_config= None,
    _attn_implementation="flash_attention_2",
    device_map="auto"
)
model.add_adapter(lora_config)
model.enable_adapters()
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)
print(model.get_nb_trainable_parameters())

print("Done.")


Loading configurations and Model ....


You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour
Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in Idefics3VisionTransformer is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)`
Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in LlamaModel is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash

(3067776, 259552704)
Done.


In [6]:
print("Loading Dataset")
dataset = load_from_disk("SmolDoclingDataNoLoc")
train_loader = DataLoader(dataset["train"], batch_size=4, shuffle=True, collate_fn=collate_fn)
print("Done.")

Loading Dataset
Done.


In [7]:
from transformers import TrainerCallback

class ClearCacheCallback(TrainerCallback):
    def on_epoch_end(self, args, state, control, **kwargs):
        import torch
        torch.cuda.empty_cache()
        print("Cleared CUDA cache at epoch end.")


In [None]:
model.config.use_cache = False
image_token_id = processor.tokenizer.additional_special_tokens_ids[
            processor.tokenizer.additional_special_tokens.index("<image>")]
model_name = model_id.split("/")[-1]


training_args = TrainingArguments(
    num_train_epochs=NUMBER_EPOCHS,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    warmup_steps=50,
    learning_rate=1e-4,
    weight_decay=0.01,
    logging_steps=25,
    save_strategy="steps",
    save_steps=250,
    save_total_limit=1,
    optim="paged_adamw_8bit", 
    bf16=True, 
    output_dir=f"./{model_name}-{NUMBER_EPOCHS}",
    hub_model_id=f"./{model_name}-{NUMBER_EPOCHS}",
    report_to="tensorboard",
    remove_unused_columns=False,
    gradient_checkpointing=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
)

trainer.add_callback(ClearCacheCallback())


No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [10]:
print("Start trainning")
trainer.train()
print("Done training")

Start trainning


The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.bfloat16.


Step,Training Loss
25,1.4631
50,1.035
75,0.6318
100,0.4086
125,0.325
150,0.309
175,0.2922
200,0.2792
225,0.2572
250,0.2578


Cleared CUDA cache at epoch end.
Cleared CUDA cache at epoch end.
Cleared CUDA cache at epoch end.
Cleared CUDA cache at epoch end.
Cleared CUDA cache at epoch end.
Cleared CUDA cache at epoch end.
Cleared CUDA cache at epoch end.
Cleared CUDA cache at epoch end.
Cleared CUDA cache at epoch end.
Cleared CUDA cache at epoch end.
Cleared CUDA cache at epoch end.
Cleared CUDA cache at epoch end.
Cleared CUDA cache at epoch end.
Cleared CUDA cache at epoch end.
Cleared CUDA cache at epoch end.
Cleared CUDA cache at epoch end.
Cleared CUDA cache at epoch end.
Cleared CUDA cache at epoch end.
Cleared CUDA cache at epoch end.
Cleared CUDA cache at epoch end.




Cleared CUDA cache at epoch end.
Cleared CUDA cache at epoch end.
Cleared CUDA cache at epoch end.
Cleared CUDA cache at epoch end.
Cleared CUDA cache at epoch end.
Cleared CUDA cache at epoch end.
Cleared CUDA cache at epoch end.
Cleared CUDA cache at epoch end.
Done training


In [11]:
print("Saving model")
model_save_path = f"./{model_name}-{NUMBER_EPOCHS}--NoLoc"
model.save_pretrained(model_save_path)
processor.save_pretrained(model_save_path)
print("Done")


Saving model
Done
