In [1]:
import torch
import pandas as pd
from transformers import TrainingArguments, AutoTokenizer, AutoModelForCausalLM, AwqConfig, GPTQConfig
from trl import SFTTrainer, SFTConfig
from transformers import BitsAndBytesConfig
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def load_model_and_tokenizer(model_name, quantization_config, use_gpu = False):
    # Load base model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=quantization_config)
    
    if use_gpu:
        model.to("cuda")
    
    if not tokenizer.chat_template:
        tokenizer.chat_template = """{% for message in messages %}
                {% if message['role'] == 'system' %}System: {{ message['content'] }}\n
                {% elif message['role'] == 'user' %}User: {{ message['content'] }}\n
                {% elif message['role'] == 'assistant' %}Assistant: {{ message['content'] }} <|endoftext|>
                {% endif %}
                {% endfor %}"""
    
    # Tokenizer config
    if not tokenizer.pad_token:
        tokenizer.pad_token = tokenizer.eos_token
        
    return model, tokenizer

def preprocess_function(df, context: bool):
    if context:
        processed_data = df.apply(lambda row: {
            "prompt": [{"role": "user", "content": row["question"] + row["context"]}],
            "completion": [{"role": "assistant", "content": row["answer"]}]
        }, axis=1)
    else:
        processed_data = df.apply(lambda row: {
            "prompt": [{"role": "user", "content": row["question"]}],
            "completion": [{"role": "assistant", "content": row["answer"]}]
        }, axis=1)

    
    return processed_data.tolist()

def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [3]:
DATASET_CHOICE = "arc"       # options: "arc", "boolq", "squad"
FINETUNING = "SFT"
QUANT_METHOD = "bnb_int8"    # options: "bnb_int8", "bnb_int4", "fp16"
HP_PROFILE = "fast"          # options: "fast", "balanced", "high_quality"
INFER_MODE = "chat"          # options: "chat", "generate"

MODEL_NAME = "Qwen/Qwen1.5-1.8B"
OUTPUT_DIR = "./sft_model"

QUANT_METHOD = "BitsAndBytes"  # Opciones: dynamic, adaround, brecq, gptq, quarot, awq

USE_GPU = False

In [4]:
# ============================================================
# Dataset selection
# ============================================================

if DATASET_CHOICE == "arc":
    df = pd.read_parquet("../Datasets/test-ai2_arc.parquet")
elif DATASET_CHOICE == "boolq":
    df = pd.read_parquet("../Datasets/test-boolq.parquet")
elif DATASET_CHOICE == "squad":
    df = pd.read_parquet("../Datasets/test-squad_v2.parquet")
else:
    raise ValueError("Invalid DATASET_CHOICE")

context = DATASET_CHOICE != "arc"

print(f"Loaded dataset: {DATASET_CHOICE}")

Loaded dataset: arc


In [5]:
# --------------------------------------------
# MÉTODOS DE CUANTIZACIÓN
# --------------------------------------------

match QUANT_METHOD:
    case "BitsAndBytes":
        load_in_4bit = True
        bnb_4bit_quant_type = "nf4"
        bnb_4bit_use_double_quant = True # Saves more memory at no additional performance
        bnb_4bit_compute_dtype = torch.bfloat16


        quantization_config = BitsAndBytesConfig(
            load_in_4bit=load_in_4bit,
            bnb_4bit_quant_type=bnb_4bit_quant_type,
            bnb_4bit_use_double_quant=bnb_4bit_use_double_quant,
            bnb_4bit_compute_dtype=bnb_4bit_compute_dtype,
        )

    case "awq":
        bits = 4
        fuse_max_seq_len = 512
        do_fuse = True

        quantization_config = AwqConfig(
            bits=bits,
            fuse_max_seq_len=fuse_max_seq_len,
            do_fuse=do_fuse,
        )

    case "gptq":
        bits = 4
        v2 = True # Activate GPTQ v2
        
        quantization_config = GPTQConfig(
            bits=bits, 
            dataset=dataset, 
            tokenizer=tokenizer,
            v2=v2
        )

        # USAR EORA PARA EJECUTAR

    case "adaround":
        raise NotImplementedError("Implementar AdaRound aquí")

    case "brecq":
        raise NotImplementedError("Implementar BRECQ aquí")

    case "quarot":
        raise NotImplementedError("Implementar QuaRot aquí")

    case _:
        quantization_config = None

In [6]:
model, tokenizer = load_model_and_tokenizer(MODEL_NAME, quantization_config, USE_GPU)

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)


lora_r = 32
target_modules=["query_key_value"]
lora_alpha = 16
lora_dropout=0.1

config = LoraConfig(
    r=lora_r, 
    lora_alpha=lora_alpha, 
    target_modules=target_modules, 
    lora_dropout=lora_dropout, 
    bias="none", 
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


ValueError: Target modules {'query_key_value'} not found in the base model. Please check the target modules and try again.

In [None]:
learning_rate = 8e-5 # Learning rate for training. 

num_train_epochs = 1 #  Set the number of epochs to train the model.

per_device_train_batch_size = 1 # Batch size for each device (e.g., GPU) during training. 

gradient_accumulation_steps = 8 # Number of steps before performing a backward/update pass to accumulate gradients.

gradient_checkpointing = False # Enable gradient checkpointing to reduce memory usage during training at the cost of slower training speed.

packing = False # Multiple examples are packed in the same input sequence to increase training efficiency

assistant_only_loss = False # Loss is computed only on the assistant responses, ignoring user or system messages

max_length = 1024 # Maximum length of the tokenized sequence

num_train_epochs = 1

max_steps = 1

output_dir = ""


logging_steps = 2  # Frequency of logging training progress (log every 2 steps).

train_dataset = preprocess_function(df)

# SFTTrainer config 
sft_config = SFTConfig(
    learning_rate=learning_rate,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=gradient_checkpointing,
    logging_steps=logging_steps,
    packing = packing,
    assistant_only_loss=assistant_only_loss,
    output_dir=output_dir,
    max_length=max_length,
    num_train_epochs=num_train_epochs,
    max_steps=max_steps,
)

sft_trainer = SFTTrainer(
    model=model,
    args=sft_config,
    train_dataset=train_dataset, 
    processing_class=tokenizer,
)

In [None]:
# --------------------------------------------
# EJEMPLO DE INFERENCIA (común para todos)
# --------------------------------------------