<a href="https://colab.research.google.com/github/abhg86/LLM/blob/main/papier/Pythia.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers
!pip install datasets



In [2]:
import torch
import torch.nn as nn
from torch.optim import Adam
from tqdm import tqdm
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import datasets

from transformers import GPTNeoXForCausalLM, pipeline, AutoTokenizer


In [3]:

# pipeline = pipeline(
#     "text-generation",
#     model = "EleutherAI/pythia-160m-deduped",
#     revision="step143000",
#     cache_dir="./pythia-160m-deduped/step143000"
#     )

# model = pipeline.model
# tokenizer = pipeline.tokenizer
# # model = GPTNeoXForCausalLM.from_pretrained(
# #   "EleutherAI/pythia-70m-deduped",
# #   revision="step143000",
# #   cache_dir="./pythia-70m-deduped/step143000",
# # )
# # tokenizer = AutoTokenizer.from_pretrained(
# #     "EleutherAI/pythia-70m-deduped",
# #     revision="step143000",
# #     cache_dir="./pythia-70m-deduped/step143000",
# #     )

# inputs = tokenizer("Paris is the capital of", return_tensors="pt")
# tokens = model.generate(**inputs, max_length = 50)
# tokenizer.decode(tokens[0])

In [39]:
class Steer(nn.Module):
  def __init__(self, lm_head, embed_dim, num_steers=2, rank=1000, init_var=1e-2, epsilon=1e-3):
    super().__init__()
    self.projector1 = nn.Linear(embed_dim, rank)
    self.projector2 = nn.Linear(rank, embed_dim)
    self.lm_head = lm_head
    self.rank = rank
    self.epsilon = epsilon
    self.num_steers = num_steers
    self.embed_dim = embed_dim
    self.steer_values = torch.zeros(num_steers)
    self.weight = self.weight()

  def set_values(self, steer_values):
    self.steer_values = steer_values

  def forward(self, x):
    delta = self.projector2(self.projector1(x) * self.steer_values.unsqueeze(1))
    return self.lm_head(x + self.epsilon * delta)

  def regularization_term(self):
    return torch.norm(self.projector1.weight) + torch.norm(self.projector2.weight)

  def state_dict(self, destination=None, prefix='', keep_vars=False):
    # Call the superclass's state_dict method to handle the destination argument
    state_dict_ = super().state_dict(destination, prefix, keep_vars)

    # Add your custom state to the dictionary
    state_dict_[prefix + 'projector1'] = self.projector1.state_dict()
    state_dict_[prefix + 'projector2'] = self.projector2.state_dict()
    return state_dict_
    return {"projector1": self.projector1, "projector2": self.projector2}

  def load_state_dict(self, state_dict):
    self.projector1 = state_dict["projector1"]
    self.projector2 = state_dict["projector2"]

  def weight(self):
    return [self.projector1.weight, self.projector2.weight]

In [44]:
def train(dataloader, model, steer, tokenizer, n_steps=1000, lr=1e-2, training_steer=0, num_steers=2, max_length=256, regularization=1e-6):
    data_iter = iter(dataloader)

    device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
    model.to(device)

    print("number of training steps:", n_steps)
    start_step = 0
    optimizer = Adam(model.parameters(), lr=lr)

    pbar = tqdm(range(start_step, n_steps))

    for step_i in pbar:
        batch = next(data_iter, None)
        if batch is None:
            data_iter = iter(dataloader)
            batch = next(data_iter, None)

        cur_batch_size = len(batch["text"])
        batch_stance = torch.Tensor(batch["label"]).to(device)
        batch_stance = batch_stance.unsqueeze(1)
        batch_text = batch["text"]
        tokenized = tokenizer(batch_text, padding=True, max_length=max_length, truncation=True, return_tensors="pt")
        input_ids = torch.LongTensor(tokenized["input_ids"]).to(device)

        optimizer.zero_grad()

        attention_mask = torch.LongTensor(tokenized["attention_mask"]).to(device)

        steer.set_values(torch.Tensor(batch["label"]).to(device))

        position_ids = torch.arange(0, input_ids.shape[1], dtype=torch.long, device=device)
        position_ids = position_ids.unsqueeze(0).expand_as(input_ids)

        # print("inpu_ids : ", input_ids.shape)
        # print("attention_mask : ", attention_mask.shape)
        # print("position_ids : ", position_ids.shape)
        # print("batch_stance : ", batch_stance)

        output = model(input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids, labels=input_ids)
        loss = output.loss
        regularization_term = steer.regularization_term()
        (loss + regularization * regularization_term).backward()
        optimizer.step()


    torch.save([
        model.state_dict(),
        max(n_steps, start_step)
    ], "train.pt")



In [6]:
dataset_train = datasets.load_dataset("SetFit/sst5")["train"]
dataloader_train = DataLoader(dataset_train, batch_size=32, shuffle=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Repo card metadata block was not found. Setting CardData to empty.


In [7]:
pipeline = pipeline(
    "text-generation",
    model = "EleutherAI/pythia-160m-deduped",
    revision="step143000",
    cache_dir="./pythia-160m-deduped/step143000",
    device= "cuda:0" if torch.cuda.is_available() else "cpu"
    )


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [40]:

model = pipeline.model
tokenizer = pipeline.tokenizer
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = model.config.eos_token_id

for param in model.parameters():
  param.requires_grad = False

steer = Steer(model.embed_out, model.config.hidden_size)
model.lm_head = steer

# vocab_size = len(tokenizer)
# model.resize_token_embeddings(vocab_size)



In [45]:
train(dataloader_train, model, steer, tokenizer)

number of training steps: 1000


100%|██████████| 1000/1000 [02:01<00:00,  8.25it/s]
  state_dict_ = super().state_dict(destination, prefix, keep_vars)
