In [1]:
# Fine-tune a Llama-family causal LM (TinyLlama by default) to turn invoice/receipt OCR output into JSON
import ast
import os
from dataclasses import dataclass, field
from random import randrange
from typing import Optional

import torch
import torch.nn as nn
from accelerate import Accelerator
from datasets import load_dataset,DatasetDict
from peft import AutoPeftModelForCausalLM, LoraConfig, get_peft_model_state_dict
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser, TrainingArguments

from trl import SFTTrainer
from trl.trainer import ConstantLengthDataset
# Use the Apple-silicon GPU as the default device only when PyTorch can actually
# reach it; on any other machine keep PyTorch's own default so that plain tensor
# creation does not fail.
if torch.backends.mps.is_available():
    torch.set_default_device("mps")

  from .autonotebook import tqdm as notebook_tqdm


https://huggingface.co/docs/datasets/loading

In [2]:
from datasets import load_dataset, load_from_disk, concatenate_datasets
# Alternative dataset, currently disabled:
#cord_dataset = load_dataset("mychen76/receipts_sroie_v2")
# Load every split of the invoices/receipts OCR dataset by its Hub id.
dataset = load_dataset("mychen76/invoices-and-receipts_ocr_v1")
# Last expression of the cell: displays the loaded object (splits, features, row counts).
dataset

DatasetDict({
    train: Dataset({
        features: ['image', 'id', 'parsed_data', 'raw_data'],
        num_rows: 2043
    })
    test: Dataset({
        features: ['image', 'id', 'parsed_data', 'raw_data'],
        num_rows: 125
    })
    valid: Dataset({
        features: ['image', 'id', 'parsed_data', 'raw_data'],
        num_rows: 70
    })
})

In [3]:
from random import randrange
# Report the size of the training split, then draw one random training example.
print(f"dataset size: {len(dataset['train'])}")
# `dataset` holds several splits, so len(dataset) is the number of splits rather
# than the number of rows; the index must be drawn from the train split's length.
sample=dataset['train'][randrange(len(dataset['train']))]

dataset size: 2043


In [4]:
# The raw_data field is a dict serialised as Python-literal text. Parse it once
# with ast.literal_eval instead of eval: literal_eval only accepts literals, so
# text coming from the downloaded dataset can never execute code.
raw_record = ast.literal_eval(sample['raw_data'])
ocr_words=raw_record['ocr_words']
ocr_boxes=raw_record['ocr_boxes']
# Show the size of each field followed by its content.
print(len(ocr_words), len(ocr_boxes))
print(ocr_words)
print(ocr_boxes)

1029 8122
['Invoice no: 61356291', 'Date of issue:', '09/06/2012', 'Client:', 'Seller:', 'Chapman, Kim and Green', 'Rodriguez-Stevens', '64731 James Branch', '2280 Angela Plain', 'Smithmouth, NC 26872', 'Hortonshire, MS 93248', 'Tax Id: 949-84-9105', 'Tax Id: 939-98-8477', 'IBAN: GB50ACIE59715038217063', 'ITEMS', 'No.', 'Description', 'Qty', 'UM', 'Net price', 'Net worth', 'VAT [%]', 'Gross', ' worth', 'Wine Glasses Goblets Pair Clear', '12,00', '60,00', '10%', '5,00', 'each', '66,00', '1.', 'Glass', 'With Hooks Stemware Storage', '28,08', '112,32', '10%', '4,00', 'each', '123,55', 'Multiple Uses Iron Wine Rack', 'Hanging Glass', '1,00', '7,50', '7,50', '10%', '8,25', 'Replacement Corkscrew Parts', 'each', 'Spiral Worm Wine Opener Bottle', 'Houdini', 'HOME ESSENTIALS GRADIENT', '1,00', 'each', '12,99', '12,99', '10%', '14,29', '4', 'STEMLESS WINE GLASSES SET', 'OF 4 20 FL OZ (591 ml) NEW', 'SUMMARY', 'VAT [%]', 'Net worth', 'VAT', 'Gross worth', '10%', '192,81', '19,28', '212,09', 'Tot

In [5]:
# The parsed_data field is a dict serialised as Python-literal text; read it with
# ast.literal_eval (literals only) rather than eval so dataset content cannot run code.
ocr_json=ast.literal_eval(sample['parsed_data'])['json']
# Last expression of the cell: displays the length of the target text and the text itself.
len(ocr_json), ocr_json

(1238,
 "{'header': {'invoice_no': '61356291', 'invoice_date': '09/06/2012', 'seller': 'Chapman, Kim and Green 64731 James Branch Smithmouth, NC 26872', 'client': 'Rodriguez-Stevens 2280 Angela Plain Hortonshire, MS 93248', 'seller_tax_id': '949-84-9105', 'client_tax_id': '939-98-8477', 'iban': 'GB50ACIE59715038217063'}, 'items': [{'item_desc': 'Wine Glasses Goblets Pair Clear Glass', 'item_qty': '5,00', 'item_net_price': '12,00', 'item_net_worth': '60,00', 'item_vat': '10%', 'item_gross_worth': '66,00'}, {'item_desc': 'With Hooks Stemware Storage Multiple Uses Iron Wine Rack Hanging Glass', 'item_qty': '4,00', 'item_net_price': '28,08', 'total_net_worth': '112,32', 'item_vat': '10%', 'item_gross_worth': '123,55'}, {'item_desc': 'Replacement Corkscrew Parts Spiral Worm Wine Opener Bottle Houdini', 'item_qty': '1,00', 'item_net_price': '7,50', 'total_net_worth': '7,50', 'item_vat': '10%', 'item_gross_worth': '8,25'}, {'item_desc': 'HOME ESSENTIALS GRADIENT STEMLESS WINE GLASSES SET OF 4

In [6]:
def format_train_instruction(sample):
    """Build the instruction / input / output training prompt for one record.

    Args:
        sample: Mapping with the string fields ``raw_data`` and ``parsed_data``.
            Each field must contain a dict written as a Python literal;
            ``raw_data`` must provide the key ``ocr_boxes`` and ``parsed_data``
            the key ``json``.

    Returns:
        The prompt text: a fixed instruction header, the OCR boxes under
        ``### Input:`` and the target JSON text under ``### Output:``.

    Raises:
        ValueError, SyntaxError: if a field is not a plain Python literal.
        KeyError: if ``ocr_boxes`` or ``json`` is missing from the parsed dict.
    """
    # ast.literal_eval replaces eval here on purpose: the fields come from a
    # downloaded dataset, and literal_eval only accepts literals, so the text can
    # never execute code. The parsing order (raw_data first) is unchanged.
    ocr_boxes = ast.literal_eval(sample['raw_data'])['ocr_boxes']
    target_json = ast.literal_eval(sample['parsed_data'])['json']
    # The prompt wording is kept exactly as it was, so it stays consistent with
    # any prompt built the same way elsewhere.
    return f"""### Instruction:
You are POS receipt expert, and receipt data engineer with many years on working with complex receipt structure. 
I need you parse, detect, recognize and convert following receipt OCR image result into structure receipt format. 
the outout mus be a well-formed json object.```json

### Input:
{ocr_boxes}

### Output:
{target_json}"""

In [7]:
from random import randrange
# Print the training prompt of one random training example. The index is drawn
# from the train split's row count: len(dataset) on the multi-split object is the
# number of splits, which would restrict sampling to the first few rows.
print(format_train_instruction(dataset["train"][randrange(len(dataset["train"]))]))

### Instruction:
You are POS receipt expert, and receipt data engineer with many years on working with complex receipt structure. 
I need you parse, detect, recognize and convert following receipt OCR image result into structure receipt format. 
the outout mus be a well-formed json object.```json

### Input:
[[[[196.0, 110.0], [801.0, 110.0], [801.0, 161.0], [196.0, 161.0]], ('Invoice no: 40378170', 0.9985853433609009)], [[[196.0, 212.0], [517.0, 212.0], [517.0, 259.0], [196.0, 259.0]], ('Date of issue:', 0.9883247017860413)], [[[1204.0, 216.0], [1466.0, 216.0], [1466.0, 256.0], [1204.0, 256.0]], ('10/15/2012', 0.9997990727424622)], [[[192.0, 665.0], [388.0, 665.0], [388.0, 716.0], [192.0, 716.0]], ('Seller:', 0.9997318387031555)], [[[1240.0, 661.0], [1429.0, 661.0], [1429.0, 720.0], [1240.0, 720.0]], ('Client:', 0.9994950890541077)], [[[203.0, 753.0], [931.0, 760.0], [930.0, 811.0], [203.0, 804.0]], ('Patel, Thompson and Montgomery', 0.9964486360549927)], [[[1244.0, 756.0], [1890.0, 7

In [8]:
def chars_token_ratio(dataset, tokenizer, nb_examples=400):
    """
    Return the mean characters-per-token ratio measured on formatted examples.

    At most ``nb_examples`` examples are read from ``dataset``; each one is turned
    into its training prompt and tokenized with ``tokenizer``.
    """
    char_count = 0
    token_count = 0
    # Pairing with range() caps how many examples are pulled from the dataset.
    capped_examples = zip(range(nb_examples), iter(dataset))
    for _position, record in tqdm(capped_examples, total=nb_examples):
        prompt = format_train_instruction(record)
        char_count += len(prompt)
        # Fast tokenizers expose the token list on the encoding; slow ones via tokenize().
        pieces = tokenizer(prompt).tokens() if tokenizer.is_fast else tokenizer.tokenize(prompt)
        token_count += len(pieces)

    return char_count / token_count

In [9]:
# Hugging Face Hub id of the invoices/receipts OCR dataset used for fine-tuning.
dataset_id="mychen76/invoices-and-receipts_ocr_v1"
# Candidate data_dir value for create_datasets.
# NOTE(review): the visible create_datasets(...) call does not pass it - confirm whether it is still needed.
data_dir="data/finetune"

def create_datasets(
    tokenizer,
    dataset_id,
    data_dir=None,
    seq_length=2048,
    num_workers=6,
    streaming=False,
    size_valid_set=10,
    shuffle_buffer=1000,
    seed=None,
    valid_fraction=0.003,
):
    """Load the ``train`` split and wrap it into packed train/validation datasets.

    Args:
        tokenizer: Tokenizer used to estimate characters per token and handed to
            ``ConstantLengthDataset``.
        dataset_id: Dataset identifier passed to ``load_dataset``.
        data_dir: Optional ``data_dir`` forwarded to ``load_dataset``.
        seq_length: Sequence length of each packed example.
        num_workers: ``num_proc`` for ``load_dataset`` (ignored when streaming).
        streaming: Load lazily; validation is then the first ``size_valid_set``
            examples and the remainder is shuffled for training.
        size_valid_set: Validation size, used in streaming mode only.
        shuffle_buffer: Shuffle buffer size, used in streaming mode only.
        seed: Seed for the streaming shuffle / the train-validation split.
            ``None`` (default) keeps the previous non-deterministic behaviour;
            pass an int for a reproducible split.
        valid_fraction: ``test_size`` of the non-streaming split
            (default 0.003, the previously hard-coded value).

    Returns:
        ``(train_dataset, valid_dataset)``: two ``ConstantLengthDataset``
        instances; the training one is infinite, the validation one is not.
    """
    dataset = load_dataset(
        dataset_id,
        data_dir=data_dir,
        split="train",
        token=True,
        num_proc=num_workers if not streaming else None,
        streaming=streaming,
    )
    if streaming:
        print("Loading the dataset in streaming mode")
        valid_data = dataset.take(size_valid_set)
        train_data = dataset.skip(size_valid_set)
        train_data = train_data.shuffle(buffer_size=shuffle_buffer, seed=seed)
    else:
        dataset = dataset.train_test_split(test_size=valid_fraction, seed=seed)
        train_data = dataset["train"]
        valid_data = dataset["test"]
        print(f"Size of the train set: {len(train_data)}. Size of the validation set: {len(valid_data)}")

    # The ratio is measured on the training data and reused for both datasets.
    chars_per_token = chars_token_ratio(train_data, tokenizer)
    print(f"The character to token ratio of the dataset is: {chars_per_token:.2f}")

    train_dataset = ConstantLengthDataset(
        tokenizer,
        train_data,
        formatting_func=format_train_instruction,
        infinite=True,
        seq_length=seq_length,
        chars_per_token=chars_per_token,
    )
    valid_dataset = ConstantLengthDataset(
        tokenizer,
        valid_data,
        formatting_func=format_train_instruction,
        infinite=False,
        seq_length=seq_length,
        chars_per_token=chars_per_token,
    )
    return train_dataset, valid_dataset


## load Model for training
Note that you don't need to pass `device_map` when loading the model for training: it is automatically loaded onto your GPU. You can also set the device map to a specific device if needed (e.g. `cuda:0`, `0`, or `torch.device('cuda:0')`). Please note that `device_map="auto"` should be used for inference only.


> Nota:
> Login con hugging face antes `huggingface-cli login` 
> Instalar cli -> https://huggingface.co/docs/huggingface_hub/guides/cli

In [10]:
#!transformers-cli env

In [16]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# NOTE(review): this flag is not referenced anywhere else in the visible notebook.
use_flash_attention = False
# Hugging Face model id
#model_id = "NousResearch/Llama-2-7b-hf" # non-gated "meta-llama/Llama-2-7b-hf
#model_id="PY007/TinyLlama-1.1B-intermediate-step-240k-503b"
model_id="TinyLlama/TinyLlama_v1.1"
#model_id = "mistralai/Mistral-7B-v0.1" 
#model_id = "meta-llama/Llama-3.2-1B"

# Quantization settings with every load_in_* option disabled.
# NOTE(review): bnb_config is never passed to from_pretrained below (the
# quantization_config argument is commented out), so the model is loaded
# unquantized despite the `model_8bit` variable name.
bnb_config = BitsAndBytesConfig(
    #load_in_4bit=True,
    #load_in_8bit=True,
    #bnb_4bit_use_double_quant=True,
    #bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
##quantization_config=bnb_config, 
# Load model and tokenizer
# token=True presumably relies on a stored Hugging Face login - TODO confirm.
# The device is hard-coded to "mps" (Apple-silicon GPU).
model_8bit = AutoModelForCausalLM.from_pretrained(model_id, 
                                            #quantization_config=bnb_config,  
                                             #use_auth_token=True,
                                             token=True,
                                             trust_remote_code=True,                                                  
                                             device_map="mps")

# Reuse the end-of-sequence token as the padding token and pad on the right.
base_tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
base_tokenizer.pad_token = base_tokenizer.eos_token
base_tokenizer.padding_side = "right"

In [17]:
def print_trainable_parameters(model):
    """
    Print the trainable parameter count, the total parameter count and the
    percentage of parameters that are trainable for ``model``.
    """
    # Materialise once so both totals are computed from the same tensors.
    tensors = [tensor for _name, tensor in model.named_parameters()]
    total_count = sum(tensor.numel() for tensor in tensors)
    trainable_count = sum(tensor.numel() for tensor in tensors if tensor.requires_grad)
    print(
        f"trainable params: {trainable_count} || all params: {total_count} || trainable%: {100 * trainable_count / total_count}"
    )

print_trainable_parameters(model_8bit)

trainable params: 1100048384 || all params: 1100048384 || trainable%: 100.0


In [18]:
## Freeze the base model: no base weight receives gradients; only adapters added later are trained.
for param in model_8bit.parameters():
  param.requires_grad = False  # freeze the model - train adapters later
  if param.ndim == 1:
    # cast the small parameters (e.g. layernorm) to fp32 for stability
    param.data = param.data.to(torch.float32)

model_8bit.gradient_checkpointing_enable()  # reduce number of stored activations
# presumably needed so gradients can still flow through the checkpointed blocks
# when every base weight is frozen - TODO confirm against the transformers docs
model_8bit.enable_input_require_grads()

class CastOutputToFloat(nn.Sequential):
  """Sequential container whose forward result is converted to float32."""

  def forward(self, x):
    # Run the wrapped modules as usual, then cast the result.
    result = super().forward(x)
    return result.to(torch.float32)
# Wrap the output head so the value it returns is float32.
# Note: this wraps whatever lm_head currently is, so re-running the cell nests another wrapper.
model_8bit.lm_head = CastOutputToFloat(model_8bit.lm_head)

In [19]:
# Switch the model to evaluation mode; as the last expression of the cell, the returned module tree is displayed.
model_8bit.eval()


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,), eps=1e-05)
    (rotary_emb): 

In [20]:
# Print the model's memory footprint divided by 1024 three times (presumably bytes -> GiB; the label printed is "GB").
print(model_8bit.get_memory_footprint()/1024/1024/1024, "GB")


4.097999691963196 GB


In [21]:
# Print the maximum number of positions declared in the model config (same value as max_seq_length used for packing).
print(model_8bit.config.max_position_embeddings)


2048


In [22]:
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

# LoRA adapter configuration for causal language modelling.
peft_config = LoraConfig(
    r=64, 
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=["q_proj", "v_proj"], # adapters are attached to the attention query and value projections
    bias="none",
    task_type="CAUSAL_LM",
)

## prepare model for training
# k-bit preparation is disabled; the model above is loaded without a quantization config.
#model = prepare_model_for_kbit_training(model_8bit)
# Wrap the frozen base model with the LoRA adapters and report how many parameters are trainable.
base_model = get_peft_model(model_8bit, peft_config)
base_model.print_trainable_parameters()

  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'
trainable params: 9,011,200 || all params: 1,109,059,584 || trainable%: 0.8125


In [23]:
from transformers import TrainingArguments
import transformers 

# NOTE(review): the output directory and run name mention "mistral7b", while the
# model id selected earlier in the notebook is a TinyLlama checkpoint.
OUTPUT_DIR = "./results/mistral7b_ocr_to_json"
NUM_TRAIN_EPOCHS=3
# NOTE(review): SAVE_STEPS, LEARNING_RATE, TRAIN_STEPS and DEVICE are defined here
# but not used by the TrainingArguments call below (literals or commented-out
# options are used instead).
SAVE_STEPS=20
LOGGING_STEPS=10
LEARNING_RATE=2e-4 #3e-4
TRAIN_STEPS=150  #300
#WARM_UP_STEPS=50  or ratio 
max_seq_length = 2048 # max sequence length for model and packing of the dataset
# Only distinguishes CUDA from CPU; the model itself is loaded on "mps" earlier in the notebook.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# NOTE(review): the next visible cell rebinds `training_args` to a new SFTConfig,
# so this object is not the one the trainer receives - confirm which is intended.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_TRAIN_EPOCHS,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,        
    gradient_accumulation_steps=1,
    gradient_checkpointing=True,        
    optim="paged_adamw_32bit",  
    logging_steps=LOGGING_STEPS,    
    save_total_limit=2,  
    save_strategy="epoch",    
    learning_rate=2e-4,            ## LEARNING_RATE,    
    #bf16=True,
    #tf32=True,        
    #bf16_full_eval=True,
    max_grad_norm=0.3,
    warmup_ratio=0.03,             ## warmup_steps=WARM_UP_STEPS,    
    lr_scheduler_type="constant",  ##"cosine"   
    disable_tqdm=False,              # progress bar stays enabled (False = do not disable tqdm)
    #max_steps=TRAIN_STEPS,
    report_to="tensorboard",
    #save_steps=SAVE_STEPS,
    group_by_length=False,
    #remove_unused_columns=False,
    #evaluation_strategy="epoch",  #steps
    run_name="sft_mistral7b_colorist",
)
# Build the packed train/validation datasets from the train split of dataset_id.
train_dataset, eval_dataset = create_datasets(base_tokenizer, dataset_id, seq_length=max_seq_length)

Size of the train set: 2036. Size of the validation set: 7


100%|██████████| 400/400 [00:01<00:00, 206.38it/s]

The character to token ratio of the dataset is: 1.35





In [24]:
from trl import SFTTrainer,SFTConfig,DataCollatorForCompletionOnlyLM


# Build the trainer configuration from the hyper-parameters chosen earlier in the
# notebook. A bare SFTConfig(packing=True, ...) silently falls back to library
# defaults for everything else (batch size, learning rate, number of epochs,
# output directory), which discards those choices.
training_args = SFTConfig(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_TRAIN_EPOCHS,
    per_device_train_batch_size=1,   # every packed example already holds max_seq_length tokens
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=1,
    gradient_checkpointing=True,
    # Plain PyTorch AdamW: the "paged_*" optimizers need a GPU build of bitsandbytes,
    # and the warning printed when the LoRA adapter was created reports that the
    # installed build has no GPU support.
    optim="adamw_torch",
    logging_steps=LOGGING_STEPS,
    save_total_limit=2,
    save_strategy="epoch",
    learning_rate=LEARNING_RATE,
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    lr_scheduler_type="constant",
    group_by_length=False,
    run_name="sft_mistral7b_colorist",
    packing=True,
    max_seq_length=max_seq_length,
)


# base_model already carries the LoRA adapter (it was produced by get_peft_model),
# so the adapter config is not handed to the trainer a second time.
trainer = SFTTrainer(
    model=base_model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    args=training_args,
)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [25]:
## Adjust model config flags before training
base_model.config.pretraining_tp = 1
base_model.config.use_cache = False  # silence the warnings. Please re-enable for inference!

## pytorch optimization 
# old_state_dict = base_model.state_dict
# base_model.state_dict = (
#     lambda self, *_, **__: get_peft_model_state_dict(
#         self, old_state_dict()
#     )
# ).__get__(base_model, type(base_model)) 
# NOTE(review): the trainer was constructed before this line and keeps its own
# reference to the model, so rebinding the name `base_model` here presumably does
# not make the trainer use the compiled wrapper - confirm whether compilation is
# meant to happen before the trainer is created.
base_model = torch.compile(base_model)

# # Enable cuDNN auto-tuner - NVIDIA cuDNN supports many algorithms to compute a convolution. 
# torch.backends.cudnn.benchmark = True

# # Enable cuDNN auto-tuner - NVIDIA cuDNN supports many algorithms to compute a convolution. 
# torch.backends.cudnn.benchmark = True

In [23]:
# Cap the MPS allocator's high watermark at 0.7.
# NOTE(review): torch is already imported and the model already lives on "mps" at
# this point; presumably the variable has to be set before the MPS allocator is
# initialised to have any effect - TODO confirm, and move to the top if so.
os.environ['PYTORCH_MPS_HIGH_WATERMARK_RATIO'] = '0.7'

In [26]:
%%time
# Run the fine-tuning loop with the trainer built above.
trainer.train()
# Save the resulting model to OUTPUT_DIR.
trainer.save_model(OUTPUT_DIR)

  return func(*args, **kwargs)


RuntimeError: MPS backend out of memory (MPS allocated: 8.25 GB, other allocations: 8.26 GB, max allowed: 20.40 GB). Tried to allocate 4.00 GB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).