# Doing something wild

The point of this notebook is not to do anything useful, but to show what is possible with relatively low effort using the *transformer_heads* library. In this example, we will train four heads on a transformer model while using QLoRA to finetune the transformer block weights. The first head will be hooked at layer 9 (-4) and will predict the sentiment of imdb reviews (text classification). The second head will be hooked at the last layer (-1 or 12) and will do causal language modelling on imdb reviews. The third head will be hooked at layer 6 (-7) and will learn to count the number of occurrences of each letter of the alphabet in imdb reviews (text-level regression). The final head will be hooked at layer 4 (-9) and will predict, for each word in an imdb review, how many words follow before the review ends (word-level regression). The final head will also be a small MLP instead of a linear head.

All heads and the QLoRA parameters will be trained jointly (multi-task learning).

In [2]:
from transformer_heads import create_headed_qlora
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    MistralForCausalLM,
    Trainer,
    BitsAndBytesConfig,
    TrainingArguments,
    GPT2Model,
    GPT2LMHeadModel
)
from transformer_heads.util.helpers import DataCollatorWithPadding
from peft import LoraConfig
from transformer_heads.config import HeadConfig
from transformer_heads.util.model import print_trainable_parameters
from transformer_heads.util.evaluate import evaluate_head_wise,get_top_n_preds,get_some_preds
import torch
import pandas as pd

In [3]:
# Backbone selection: keep exactly one group of settings active.
# Larger alternative, kept for reference (all four names must be assigned together):
# model_class, model_path, hidden_size, vocab_size = MistralForCausalLM, "mistralai/Mistral-7B-v0.1", 4096, 32000
model_class = GPT2LMHeadModel
model_path = "gpt2"  # also passed to AutoTokenizer.from_pretrained below
hidden_size = 768  # used as `in_size` of every head config below
vocab_size = 50257  # used as `num_outputs` of the causal LM head below

In [4]:
# One HeadConfig per task described in the notebook introduction. Every head
# reads hidden states of width `hidden_size`; `layer_hook` selects the
# transformer layer it is attached to (negative values count from the end,
# e.g. -1 is the last layer).
head_configs = [
    # Sequence-level classification: positive/negative sentiment (2 classes).
    HeadConfig(
        name="sentiment_head",
        layer_hook=-4,
        in_size=hidden_size,
        output_activation="linear",
        pred_for_sequence=True,
        loss_fct="cross_entropy",
        num_outputs=2,
    ),
    # Causal language modelling over the vocabulary, hooked at the last layer.
    HeadConfig(
        name="causal_lm_head",
        layer_hook=-1,
        in_size=hidden_size,
        output_activation="linear",
        is_causal_lm=True,
        loss_fct="cross_entropy",
        num_outputs=vocab_size,
        is_regression=False,
        output_bias=False,
    ),
    # Sequence-level regression: one output per letter of the alphabet.
    HeadConfig(
        name="alphabet_regression",
        layer_hook=-7,
        in_size=hidden_size,
        output_activation="linear",
        is_causal_lm=False,
        pred_for_sequence=True,
        loss_fct="mse",
        num_outputs=26, # 26 letters in the alphabet
        is_regression=True,
    ),
    # Per-position regression through a small MLP. Hooked at -9 so that it
    # matches the layer stated in the introduction ("layer 4 (-9)") instead of
    # sharing layer -7 with the alphabet head.
    HeadConfig(
        name="num_words_regression",
        layer_hook=-9,
        hidden_size=128, # MLP hidden size
        num_layers=3, # 2 hidden layers in MLP
        in_size=hidden_size,
        output_activation="linear",
        is_causal_lm=False,
        pred_for_sequence=False,
        loss_fct="mse",
        num_outputs=1,
        is_regression=True,
    ),
]

In [5]:
# Load the "imdb" dataset. The cells below iterate over `dd.keys()` as its
# splits and read the "text" and "label" columns of each one.
dd = load_dataset("imdb")

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_path)
# Fall back to the EOS token when the tokenizer defines no padding token, so
# that a pad token is always available.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token

# Letters counted by the "alphabet_regression" head (26 outputs, in this order).
ALPHABET = "abcdefghijklmnopqrstuvwxyz"


def processing_function(examples):
    """Tokenize a batch of reviews and attach one label column per head.

    Args:
        examples: Batch dict with a "text" list (review strings) and a
            "label" list (sentiment labels), as passed by a batched `map`.

    Returns:
        The tokenizer output extended with one entry per configured head,
        keyed by the head's name:
          - "sentiment_head": the dataset's sentiment label.
          - "causal_lm_head": a copy of the token ids.
          - "alphabet_regression": 26 case-insensitive letter counts per review.
          - "num_words_regression": for each token position, how many
            positions follow it in the (possibly truncated) review.

    Raises:
        KeyError: If `head_configs` contains a head for which no label is
            built here.
    """
    # Truncate to the tokenizer's maximum length so that long reviews cannot
    # exceed what the backbone accepts.
    out = tokenizer(examples["text"], padding=False, truncation=True)

    # Regression targets are emitted as floats so they can be compared with
    # float predictions.
    # NOTE(review): confirm the label dtype/shape transformer_heads expects
    # for the two regression heads; tokens are used here as the per-position
    # unit for "num_words_regression".
    labels_by_head = {
        "sentiment_head": examples["label"],
        "causal_lm_head": [list(ids) for ids in out["input_ids"]],
        "alphabet_regression": [
            [float(text.lower().count(letter)) for letter in ALPHABET]
            for text in examples["text"]
        ],
        "num_words_regression": [
            [float(len(ids) - position - 1) for position in range(len(ids))]
            for ids in out["input_ids"]
        ],
    }
    for hc in head_configs:
        out[hc.name] = labels_by_head[hc.name]
    return out


for split in dd.keys():
    # Drop (near-)empty reviews before tokenizing.
    prepared = dd[split].filter(function=lambda example: len(example["text"]) > 10)
    # Fixed seed keeps Restart & Run All reproducible.
    prepared = prepared.shuffle(seed=42)
    prepared = prepared.map(processing_function, batched=True)
    # The raw columns are no longer needed once tokens and labels exist.
    dd[split] = prepared.remove_columns(["text", "label"])

dd.set_format(
    type="torch",
    columns=["input_ids", "attention_mask"] + [hc.name for hc in head_configs],
)