<a href="https://colab.research.google.com/github/abhg86/LLM/blob/main/papier/Pythia.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [31]:
!pip install transformers
!pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [32]:
import torch
import torch.nn as nn
from torch.optim import Adam
from tqdm import tqdm
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import datasets

from transformers import GPTNeoXForCausalLM, pipeline, AutoTokenizer


In [33]:

pipeline = pipeline(
    "text-generation",
    model = "EleutherAI/pythia-160m-deduped",
    revision="step143000",
    cache_dir="./pythia-160m-deduped/step143000"
    )

model = pipeline.model
tokenizer = pipeline.tokenizer
# model = GPTNeoXForCausalLM.from_pretrained(
#   "EleutherAI/pythia-70m-deduped",
#   revision="step143000",
#   cache_dir="./pythia-70m-deduped/step143000",
# )
# tokenizer = AutoTokenizer.from_pretrained(
#     "EleutherAI/pythia-70m-deduped",
#     revision="step143000",
#     cache_dir="./pythia-70m-deduped/step143000",
#     )

inputs = tokenizer("Paris is the capital of", return_tensors="pt")
tokens = model.generate(**inputs, max_length = 50)
tokenizer.decode(tokens[0])

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


'Paris is the capital of the world’s largest city, and the city’s most famous landmark is the World Trade Center.\n\nThe city’s history is a complex of buildings, including the World Trade Center, the World Trade Center,'

In [42]:
class Steer(nn.Module):
  def __init__(self, lm_head, embed_dim, num_steers=2, rank=1000, init_var=1e-2, epsilon=1e-3):
    super().__init__()
    self.projector1 = nn.Linear(embed_dim, rank)
    self.projector2 = nn.Linear(rank, embed_dim)
    self.lm_head = lm_head
    self.rank = rank
    self.epsilon = epsilon
    self.num_steers = num_steers
    self.embed_dim = embed_dim
    self.steer_values = torch.zeros(num_steers)

  def set_values(self, steer_values):
    self.steer_values = steer_values

  def forward(self, x):
    delta = self.projector2(self.projector1(x) * self.steer_values.unsqueeze(1))
    return self.lm_head(x + self.epsilon * delta)

  def regularization_term(self):
    return torch.norm(self.projector1.weight) + torch.norm(self.projector2.weight)

  def state_dict(self):
    return {"projector1": self.projector1, "projector2": self.projector2}

  def load_state_dict(self, state_dict):
    self.projector1 = state_dict["projector1"]
    self.projector2 = state_dict["projector2"]

In [50]:
def train(dataloader, model, tokenizer, n_steps=1000, lr=1e-2, training_steer=0, num_steers=2, max_length=256, regularization=1e-6):
    data_iter = iter(dataloader)

    device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
    model.to(device)

    print("number of training steps:", n_steps)
    start_step = 0
    optimizer = Adam(model.parameters(), lr=lr)

    pbar = tqdm(range(start_step, n_steps))

    for step_i in pbar:
        batch = next(data_iter, None)
        if batch is None:
            data_iter = iter(dataloader)
            batch = next(data_iter, None)

        cur_batch_size = len(batch["text"])
        batch_stance = torch.zeros(cur_batch_size, num_steers).to(device)
        batch_stance[:, training_steer] = torch.Tensor(batch["label"]).to(device)
        batch_text = batch["text"]
        tokenized = tokenizer(batch_text, padding=True, max_length=max_length, truncation=True)
        input_ids = torch.LongTensor(tokenized["input_ids"]).to(device)

        optimizer.zero_grad()

        attention_mask = torch.LongTensor(tokenized["attention_mask"]).to(device)

        loss = model(input_ids, attention_mask,batch_stance.float()).loss
        regularization_term = model.regularization_term()
        (loss + regularization * regularization_term).backward()
        optimizer.step()

        if (step_i+1) % 500 == 0:
            print(pbar.desc, flush=True)

    torch.save([
        model.state_dict(),
        max(n_steps, start_step)
    ], "logs/"+model.name+"_"+str(training_steer)+".pt")



In [47]:
dataset_train = datasets.load_dataset("SetFit/sst5")["train"]
dataloader_train = DataLoader(dataset_train, batch_size=32, shuffle=True)

Repo card metadata block was not found. Setting CardData to empty.


In [53]:
pipeline = pipeline(
    "text-generation",
    model = "EleutherAI/pythia-160m-deduped",
    revision="step143000",
    cache_dir="./pythia-160m-deduped/step143000"
    )

model = pipeline.model
tokenizer = pipeline.tokenizer
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = model.config.eos_token_id

steer = Steer(model.embed_out, model.config.hidden_size)
model.set_output_embeddings(steer)


ValueError: The following `model_kwargs` are not used by the model: ['cache_dir', 'model', 'revision'] (note: typos in the generate arguments will also show up in this list)

In [51]:
train(dataloader_train, model, tokenizer)

number of training steps: 1000


  0%|          | 0/1000 [00:00<?, ?it/s]


ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`.