# Saving and loading
Models created with *transformer_heads* generally integrate well with Hugging Face and work with automatic saving/checkpointing during training, for example via the *Trainer* class. When loading, however, you have to make sure that all heads are attached correctly and that their parameters (and the QLoRA parameters) are loaded correctly.

In [None]:
from transformer_heads import (
    create_headed_qlora,
    load_lora_with_heads,
    HeadConfig,
    load_headed,
    get_multi_head_transformer,
)
from transformer_heads.util.helpers import get_model_params
from transformers import BitsAndBytesConfig
from peft import LoraConfig
import torch

In [None]:
# Name of the base model; every cell below loads it through this variable.
model_path: str = "gpt2"

In [None]:
# Look up the base model's metadata once, then unpack the three entries
# that the head configs and loading calls below rely on.
model_params = get_model_params(model_path)
model_class, hidden_size, vocab_size = (
    model_params[key] for key in ("model_class", "hidden_size", "vocab_size")
)
print(model_params)

Let's define a few example head configs for later use.

In [None]:
# Three example heads covering the different kinds of HeadConfig used in this
# notebook: a causal language-modelling head, a classification head and a
# regression head. Every head reads hidden states of width `hidden_size`.
heads = [
    # Causal LM head attached at layer_hook=-1, with one output per vocabulary entry.
    HeadConfig(
        name="lm_head",
        layer_hook=-1,
        in_size=hidden_size,
        output_activation="linear",
        is_causal_lm=True,
        loss_fct="cross_entropy",
        num_outputs=vocab_size,
    ),
    # Two-layer classification head (2 outputs) attached at layer_hook=-4.
    HeadConfig(
        name="classification_hook",
        layer_hook=-4,
        in_size=hidden_size,
        hidden_size=1024,
        num_layers=2,
        output_activation="linear",
        is_causal_lm=False,
        loss_fct="cross_entropy",
        num_outputs=2,
    ),
    # Single-output regression head attached at layer_hook=-6.
    HeadConfig(
        name="regression_hook",
        layer_hook=-6,
        # Take the input width from the base model instead of hard-coding 4096,
        # so this head matches the other heads and whatever model `model_path`
        # selects.
        in_size=hidden_size,
        output_activation="linear",
        is_causal_lm=False,
        loss_fct="mse",
        num_outputs=1,
        is_regression=True,
    ),
]

In [None]:
# Load the base model in 4-bit precision; the 8-bit mode is explicitly switched off.
quantization_config = BitsAndBytesConfig(load_in_8bit=False, load_in_4bit=True)

## Saving and loading a transformer with attached linear probes

In [None]:
# Step 1: build a quantized base model and attach the heads defined above.
model = load_headed(
    model_class,
    model_path,
    heads,
    quantization_config=quantization_config,
    device_map="cuda",
)

# (Training would happen here.)

# Step 2: write the model to disk, then drop the in-memory copy.
model.save_pretrained("test_model")
del model

# Step 3: restore it. load_headed is pointed at the saved folder through
# head_folder_path instead of being given the head configs again.
model = load_headed(
    model_class,
    model_path,
    quantization_config=quantization_config,
    device_map="cuda",
    head_folder_path="test_model",
)

## Saving and loading a model finetuned with QLoRA and extra heads

In [None]:
# A minimal LoRA setup. Leaving target_modules as None results in all linear
# layers being adapted with LoRA.
lora_config = LoraConfig(
    target_modules=None,
    lora_alpha=16,
    r=64,
)

# create_headed_qlora is the entry point for a model that combines LoRA
# adapters with newly initialized heads.
model = create_headed_qlora(
    base_model_class=model_class,
    model_name=model_path,
    head_configs=heads,
    lora_config=lora_config,
    quantization_config=quantization_config,
    fully_trained_heads=True,
    device_map={"": torch.cuda.current_device()},
)

# (Training would happen here.)

# Saving still goes through the regular Hugging Face API.
model.save_pretrained("test_model_qlora")
del model

# Reload the QLoRA model together with its heads. Only the base model class
# and the save location are required; loading quantized is optional here.
model = load_lora_with_heads(
    model_class,
    "test_model_qlora",
    quantization_config,
    device_map={"": torch.cuda.current_device()},
)