# Saving and loading
Models created with *transformer_heads* generally integrate well with Hugging Face and work with automatic saving/checkpointing during training, for example when using the *Trainer* class. When loading, however, you have to make sure that all heads are attached correctly and that their parameters (and the QLoRA parameters) are loaded correctly.

**GPU Requirements:** For running with GPT-2, 8GB of GPU RAM may be enough. With about 24GB you should be able to run any 7B or 13B model. With an 80GB GPU (A100) you may be able to run a 70B model.

In [1]:
from transformer_heads import (
    create_headed_qlora,
    load_lora_with_heads,
    HeadConfig,
    load_headed,
    get_multi_head_transformer,
)
from transformer_heads.util.helpers import get_model_params
from transformers import BitsAndBytesConfig
from peft import LoraConfig
import torch

In [2]:
# GPT-2 is the fastest option and requires the least memory. However, this works just the same with any Llama or Mistral model. Just change model_path to its Hugging Face path.
model_path = "gpt2"

In [4]:
# Fetch the parameter dict for the chosen model and pull out the three
# entries the rest of the notebook uses.
model_params = get_model_params(model_path)
model_class, hidden_size, vocab_size = [
    model_params[key] for key in ("model_class", "hidden_size", "vocab_size")
]
print(model_params)

{'vocab_size': 50257, 'n_positions': 1024, 'n_embd': 768, 'n_layer': 12, 'n_head': 12, 'n_inner': None, 'activation_function': 'gelu_new', 'resid_pdrop': 0.1, 'embd_pdrop': 0.1, 'attn_pdrop': 0.1, 'layer_norm_epsilon': 1e-05, 'initializer_range': 0.02, 'summary_type': 'cls_index', 'summary_use_proj': True, 'summary_activation': None, 'summary_first_dropout': 0.1, 'summary_proj_to_labels': True, 'scale_attn_weights': True, 'use_cache': True, 'scale_attn_by_inverse_layer_idx': False, 'reorder_and_upcast_attn': False, 'bos_token_id': 50256, 'eos_token_id': 50256, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': None, 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': True, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, '

Let's define some random head configs for later use.

In [5]:
# Three example heads attached at different depths of the same backbone.
# Every head reads hidden states of the backbone, so each one takes
# `hidden_size` as its input width; a hard-coded width only matches models
# that happen to have exactly that hidden size.
heads = [
    # Causal language-modelling head on the last layer, predicting over the
    # full vocabulary.
    HeadConfig(
        name="lm_head",
        layer_hook=-1,
        in_size=hidden_size,
        output_activation="linear",
        is_causal_lm=True,
        loss_fct="cross_entropy",
        num_outputs=vocab_size,
    ),
    # Two-layer classification head (2 outputs) hooked at layer -4.
    HeadConfig(
        name="classification_hook",
        layer_hook=-4,
        in_size=hidden_size,
        hidden_size=1024,
        num_layers=2,
        output_activation="linear",
        is_causal_lm=False,
        loss_fct="cross_entropy",
        num_outputs=2,
    ),
    # Single-output regression head hooked at layer -6.
    HeadConfig(
        name="regression_hook",
        layer_hook=-6,
        # Was hard-coded to 4096, which does not match GPT-2's hidden size.
        in_size=hidden_size,
        output_activation="linear",
        is_causal_lm=False,
        loss_fct="mse",
        num_outputs=1,
        is_regression=True,
    ),
]

In [6]:
# Quantization settings reused by every load call below: 4-bit on, 8-bit off.
quantization_config = BitsAndBytesConfig(load_in_4bit=True, load_in_8bit=False)

## Saving and loading a transformer with attached linear probes

In [7]:
# Keyword arguments shared by both load_headed calls in this cell.
load_kwargs = {
    "device_map": "cuda",
    "quantization_config": quantization_config,
}
save_dir = "test_model"

# Build a quantized model with the heads defined above attached.
model = load_headed(model_class, model_path, heads, **load_kwargs)
# Training would happen here ...
# Write the model to disk, then drop the in-memory reference.
model.save_pretrained(save_dir)
del model

# Load again with load_headed, this time pointing it at the saved folder
# for the heads instead of passing head configs.
model = load_headed(
    model_class,
    model_path,
    head_folder_path=save_dir,
    **load_kwargs,
)

Some weights of TransformerWithHeads were not initialized from the model checkpoint at gpt2 and are newly initialized: ['heads.classification_hook.lins.0.bias', 'heads.classification_hook.lins.0.weight', 'heads.classification_hook.lins.1.weight', 'heads.regression_hook.lins.0.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Some weights of TransformerWithHeads were not initialized from the model checkpoint at gpt2 and are newly initialized: ['heads.classification_hook.lins.0.bias', 'heads.classification_hook.lins.0.weight', 'heads.classification_hook.lins.1.weight', 'heads.regression_hook.lins.0.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Saving and loading a model finetuned with QLoRA and extra heads

In [8]:
# A simple LoRA config. target_modules=None will result in all linear layers being adapted with LoRA
lora_config = LoraConfig(r=64, lora_alpha=16, target_modules=None)
qlora_dir = "test_model_qlora"

# create_headed_qlora is the entry point for models with LoRA and newly initialized heads.
# The whole model is placed on the currently selected CUDA device.
current_gpu = torch.cuda.current_device()
model = create_headed_qlora(
    base_model_class=model_class,
    model_name=model_path,
    quantization_config=quantization_config,
    lora_config=lora_config,
    head_configs=heads,
    fully_trained_heads=True,
    device_map={"": current_gpu},
)
# Training would happen here ...
# Saving still goes through the regular Hugging Face API.
model.save_pretrained(qlora_dir)
del model

# Load the QLoRA model with its heads. Only the base model class and the save location are needed. Loading quantized is fully optional here.
current_gpu = torch.cuda.current_device()
model = load_lora_with_heads(
    model_class,
    qlora_dir,
    quantization_config,
    device_map={"": current_gpu},
)

Some weights of TransformerWithHeads were not initialized from the model checkpoint at gpt2 and are newly initialized: ['heads.classification_hook.lins.0.bias', 'heads.classification_hook.lins.0.weight', 'heads.classification_hook.lins.1.weight', 'heads.regression_hook.lins.0.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
