In [1]:
from datetime import datetime
import json
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, AwqConfig, GPTQConfig, BitsAndBytesConfig, pipeline
from trl import SFTTrainer, SFTConfig
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
def load_model_and_tokenizer(model_name, quantization_config, device_map):
    # Load base model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=quantization_config, device_map=device_map)
    
    if not tokenizer.chat_template:
        tokenizer.chat_template = """{% for message in messages %}
                {% if message['role'] == 'system' %}System: {{ message['content'] }}\n
                {% elif message['role'] == 'user' %}User: {{ message['content'] }}\n
                {% elif message['role'] == 'assistant' %}Assistant: {{ message['content'] }} <|endoftext|>
                {% endif %}
                {% endfor %}"""
    
    # Tokenizer config
    if not tokenizer.pad_token:
        tokenizer.pad_token = tokenizer.eos_token
        
    return model, tokenizer

def preprocess_function(df, context: bool):
    if context:
        processed_data = df.apply(lambda row: {
            "prompt": [{"role": "user", "content": row["question"] + row["context"]}],
            "completion": [{"role": "assistant", "content": row["answer"]}]
        }, axis=1)
    else:
        processed_data = df.apply(lambda row: {
            "prompt": [{"role": "user", "content": row["question"]}],
            "completion": [{"role": "assistant", "content": row["answer"]}]
        }, axis=1)

    # Convertir el resultado en Dataset de HuggingFace
    return Dataset.from_list(processed_data.tolist())

In [3]:
DATASET_CHOICE = "arc"       # options: "arc", "boolq", "squad"
FINETUNING = "SFT"

MODEL_NAME = "Qwen/Qwen3-0.6B"

QUANT_METHOD = "QLORA"  # Options: "None", "QLORA", "AWQ", "GPTQ"

device_map = {"": 0} if torch.cuda.is_available() else {"": "cpu"}

In [4]:
# ============================================================
# Dataset selection
# ============================================================

if DATASET_CHOICE == "arc":
    df = pd.read_parquet("../Datasets/test-ai2_arc.parquet").head(3)
elif DATASET_CHOICE == "boolq":
    df = pd.read_parquet("../Datasets/test-boolq.parquet")
elif DATASET_CHOICE == "squad":
    df = pd.read_parquet("../Datasets/test-squad_v2.parquet")
else:
    raise ValueError("Invalid DATASET_CHOICE")

context = DATASET_CHOICE != "arc"

print(f"Loaded dataset: {DATASET_CHOICE}")

Loaded dataset: arc


In [None]:
# --------------------------------------------
# MÉTODOS DE CUANTIZACIÓN
# --------------------------------------------

match QUANT_METHOD:
    case "QLORA":
        load_in_4bit = True
        bnb_4bit_quant_type = "nf4"
        bnb_4bit_use_double_quant = True # Saves more memory at no additional performance
        bnb_4bit_compute_dtype = torch.bfloat16


        quantization_config = BitsAndBytesConfig(
            load_in_4bit=load_in_4bit,
            bnb_4bit_quant_type=bnb_4bit_quant_type,
            bnb_4bit_use_double_quant=bnb_4bit_use_double_quant,
            bnb_4bit_compute_dtype=bnb_4bit_compute_dtype,
        )

    case "AWQ":
        bits = 4
        fuse_max_seq_len = 512
        do_fuse = True

        quantization_config = AwqConfig(
            bits=bits,
            fuse_max_seq_len=fuse_max_seq_len,
            do_fuse=do_fuse,
        )

    case "GPTQ":
        bits = 4
        v2 = True # Activate GPTQ v2
        
        quantization_config = GPTQConfig(
            bits=bits, 
            dataset=dataset, 
            tokenizer=tokenizer,
            v2=v2
        )

        # USAR EORA PARA EJECUTAR

    case "adaround":
        raise NotImplementedError("Implementar AdaRound aquí")

    case "brecq":
        raise NotImplementedError("Implementar BRECQ aquí")

    case "quarot":
        raise NotImplementedError("Implementar QuaRot aquí")

    case _:
        quantization_config = None

In [None]:
model, tokenizer = load_model_and_tokenizer(MODEL_NAME, quantization_config, device_map)
model = prepare_model_for_kbit_training(model)

lora_r = 32
target_modules = ["q_proj", "k_proj", "v_proj", "o_proj"]
lora_alpha = 16
lora_dropout = 0.1

config = LoraConfig(
    r=lora_r, 
    lora_alpha=lora_alpha, 
    target_modules=target_modules, 
    lora_dropout=lora_dropout, 
    bias="none", 
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, config)

trainable params: 9175040 || all params: 385024000 || trainable%: 2.382978723404255


In [7]:
# Carpeta de salida donde se guardarán los checkpoints y el modelo entrenado.
output_dir = f"../Models/{MODEL_NAME}-{DATASET_CHOICE}_{FINETUNING}_{QUANT_METHOD}" 

# Tamaño de batch por cada GPU durante el entrenamiento.
per_device_train_batch_size = 1  
 
# Tamaño de batch para evaluación.
per_device_eval_batch_size = 8  

# Número de pasos de acumulación de gradientes antes de hacer backpropagation. 
# Útil cuando batch_size es muy pequeño.
gradient_accumulation_steps = 1  

# Tasa de aprendizaje. Valores comunes para fine-tuning entre 1e-5 y 1e-4.
learning_rate = 8e-5  

# Factor de regularización para evitar overfitting en los pesos del modelo.
weight_decay = 0.0  

# Gradiente máximo permitido (clipping). Previene exploding gradients.
max_grad_norm = 1.0  

# Número de épocas completas sobre el dataset. 
num_train_epochs = 1  

# Número máximo de pasos de entrenamiento. Si está definido, sobreescribe num_train_epochs.
max_steps = 1  

# Porcentaje del entrenamiento usado como warmup (inicio suave de learning rate).
warmup_ratio = 0.0  

# Frecuencia (en pasos) para imprimir métricas de entrenamiento.
logging_steps = 10  

# Si es True, agrupa ejemplos por longitud similar para eficiencia. 
group_by_length = False  

# Reduce el uso de memoria activando checkpointing de gradientes 
# (más lento, pero permite entrenar modelos grandes). 
gradient_checkpointing = True  

# Une varias entradas en una sola secuencia para mejorar eficiencia en entrenamiento. 
packing = True  

# Si True, solo se calcula la pérdida en las respuestas del asistente, 
# ignorando mensajes del usuario/sistema. 
assistant_only_loss = False  

# Longitud máxima de la secuencia tokenizada de entrada. 
max_length = 1024  

# Dataset procesado
train_dataset = preprocess_function(df, context)


# ===============================
# Configuración de entrenamiento
# ===============================
sft_config = SFTConfig(
    output_dir=output_dir,
    learning_rate=learning_rate,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=gradient_checkpointing,
    logging_steps=logging_steps,
    packing=packing,
    assistant_only_loss=assistant_only_loss,
    max_length=max_length,
    num_train_epochs=num_train_epochs,
    max_steps=max_steps,
    no_cuda=True
)


# ===============================
# Inicialización del Trainer
# ===============================
sft_trainer = SFTTrainer(
    model=model,
    args=sft_config,
    train_dataset=train_dataset, 
    processing_class=tokenizer,
)

Tokenizing train dataset: 100%|██████████| 3/3 [00:00<00:00, 118.10 examples/s]
Packing train dataset: 100%|██████████| 3/3 [00:00<00:00, 931.52 examples/s]


In [17]:
# Calculate total and trainable parameters


trainable_params = 0
total_params = 0
for _, param in model.named_parameters():
    total_params += param.numel()
    if param.requires_grad:
        trainable_params += param.numel()
percentage_trainable = trainable_params / total_params * 100

# Print a summary of the model
print("Model Summary:")
print(f"Model name: {MODEL_NAME}")
print(f"Number of parameters: {total_params / 1e9:.4f} billion")
print(f"Number of trainable parameters: {trainable_params / 1e9:.7f} billion")
print(f"Percentage of trainable parameters: {percentage_trainable:.7f}%")
print(f"Number of layers: {model.config.num_hidden_layers}")
print(f"Hidden size: {model.config.hidden_size}")

print('=========================================================================================================')

if torch.cuda.is_available():
    num_gpus = torch.cuda.device_count()
    print(f"Number of GPUs available: {num_gpus}")

    for i in range(num_gpus):
        gpu_name = torch.cuda.get_device_name(i)
        print(f"GPU {i}: {gpu_name}")

    device = torch.device("cuda")
    current_gpu_name = torch.cuda.get_device_name(device.index)
    print(f"Using GPU: {current_gpu_name}")

    memory_stats = torch.cuda.memory_stats(device)
    total_memory = torch.cuda.get_device_properties(device).total_memory
    allocated_memory = memory_stats['allocated_bytes.all.current']
    reserved_memory = memory_stats['reserved_bytes.all.current']
    free_memory = total_memory - reserved_memory

    print(f"Total VRAM: {total_memory / (1024 ** 3):.2f} GB")
else:
    device = torch.device("cpu")
    print("Using CPU")

print('=========================================================================================================')

Model Summary:
Model name: Qwen/Qwen3-0.6B
Number of parameters: 0.3758 billion
Number of trainable parameters: 0.0000000 billion
Percentage of trainable parameters: 0.0000000%
Number of layers: 28
Hidden size: 1024
Number of GPUs available: 1
GPU 0: NVIDIA GeForce GTX 1650
Using GPU: NVIDIA GeForce GTX 1650
Total VRAM: 3.77 GB


In [None]:
# ===============================
# Entrenamiento
# ===============================
sft_trainer.train()

=== GPU INFO ===
GPU: NVIDIA GeForce GTX 1650
Memoria total: 4.05 GB
Memoria usada: 0.00 GB


  ctx_manager = torch.cpu.amp.autocast(cache_enabled=cache_enabled, dtype=self.amp_dtype)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss


TrainOutput(global_step=1, training_loss=7.716121196746826, metrics={'train_runtime': 489.2349, 'train_samples_per_second': 0.002, 'train_steps_per_second': 0.002, 'total_flos': 671765889024.0, 'train_loss': 7.716121196746826})

In [None]:
# --------------------------------------------
# EJEMPLO DE INFERENCIA (común para todos)
# --------------------------------------------

prompt = "Juan and LaKeisha roll a few objects down a ramp. They want to see which object rolls the farthest. What should they do so they can repeat their investigation? A Put the objects in groups. B Change the height of the ramp. C Choose different objects to roll. D Record the details of the investigation."

# Solución alternativa con pipeline
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    device_map=device_map,
    use_cache=False,
    do_sample=True,
    temperature=0.7,
    max_length=200,
    pad_token_id=tokenizer.eos_token_id
)

# Generar respuesta
result = pipe(prompt)
print(result[0]['generated_text'])

Device set to use cpu


Juan and LaKeisha roll a few objects down a ramp. They want to see which object rolls the farthest. What should they do so they can repeat their investigation? A Put the objects in groups. B Change the height of the ramp. C Choose different objects to roll. D Record the details of the investigation. E. Make a graph of the distance traveled.

Answer:
They should choose different objects to roll. So the answer is \boxed{C}.
Answer:
C

Answer:
C
Answer:
C
Answer:
C
Answer:
C

Answer:
C

Answer:
C

Answer:
C

Answer:
C

Answer:
C

Answer:
C

Answer:
C

Answer:
C

Answer:
C

Answer:
C

Answer:
C

Answer:
C

Answer:
C

Answer:
C

Answer:
C

Answer:
C

Answer:
C

Answer:
C

Answer:
C

Answer:
C

Answer:
C

Answer:



In [19]:
model = sft_trainer.model.merge_and_unload()
model.save_pretrained(output_dir, safe_serialization=True)

def safe_serialize(obj):
    """Convierte objetos no serializables a formato serializable"""
    if isinstance(obj, (str, int, float, bool, type(None))):
        return obj
    elif isinstance(obj, (list, tuple)):
        return [safe_serialize(item) for item in obj]
    elif isinstance(obj, dict):
        return {key: safe_serialize(value) for key, value in obj.items()}
    elif isinstance(obj, SFTConfig):
        # Convertir TrainingArguments a dict
        return {k: safe_serialize(v) for k, v in obj.to_dict().items()}
    elif isinstance(obj, BitsAndBytesConfig):
        # Convertir BitsAndBytesConfig a dict
        return {k: safe_serialize(v) for k, v in vars(obj).items() if not k.startswith('_')}
    elif hasattr(obj, '__dict__'):
        # Para otros objetos con atributos
        return {k: safe_serialize(v) for k, v in vars(obj).items() if not k.startswith('_')}
    else:
        # Convertir a string para tipos no serializables
        return str(obj)

# 4. Crear metadatos completos
training_metadata = {
    "model_info": {
        "model_name": "Qwen3-0.6B",
        "fine_tuning_date": datetime.now().isoformat(),
        "model_type": "CausalLM",
        "has_quantization": quantization_config is not None,
        "total_params": total_params,
        "trainable_params": trainable_params,
        "percentage_trainable": percentage_trainable
    },
    "training_parameters": safe_serialize(sft_trainer.args),
    "lora_config": safe_serialize(config),
    "quantization_config": safe_serialize(quantization_config) if quantization_config else None,
    "training_stats": {
        "total_steps": sft_trainer.state.max_steps,
        "epochs_completed": sft_trainer.state.epoch,
    },
    "hardware_info": {
        "device": str(model.device),
        "dtype": str(model.dtype),
    }
}

# 5. Guardar metadatos
with open(f"{output_dir}/training_metadata.json", "w") as f:
    json.dump(training_metadata, f, indent=4, ensure_ascii=False)

# 6. Guardar también la configuración del modelo base (importante)
model.config.save_pretrained(output_dir)

print(f"Modelo guardado en: {output_dir}")
print("Estructura de archivos creada:")
print(f"  - pytorch_model.bin (modelo fusionado)")
print(f"  - config.json (configuración del modelo)")
print(f"  - training_metadata.json (metadatos de entrenamiento)")
print(f"  - tokenizer files")

Modelo guardado en: ../Models/Qwen/Qwen3-0.6B-arc_SFT_QLORA
Estructura de archivos creada:
  - pytorch_model.bin (modelo fusionado)
  - config.json (configuración del modelo)
  - training_metadata.json (metadatos de entrenamiento)
  - tokenizer files
