<a href="https://colab.research.google.com/github/Yyzhang2000/AI-Cookbook/blob/main/llm_fine_tuning_prompt_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
from pathlib import Path

from transformers import GPT2LMHeadModel, GPTNeoForCausalLM

import torch
import torch.nn as nn

In [5]:
class GPTPromptTuningMixin:
    @classmethod
    def from_pretrained(
            cls,
            pretrained_model_name_or_path: str,
            soft_prompt_path: str = None,
            n_tokens: int = None,
            initialize_from_vocab: bool = True,
            random_range: float = 0.5,
            **kwargs
    ):
        model = super().from_pretrained(
            pretrained_model_name_or_path, **kwargs
        )

        for p in model.parameters():
            p.requires_grad = False

        if soft_prompt_path is not None:
            model.set_soft_prompt_embeds(soft_prompt_path)
        elif n_token is not None:
            print("Initializing soft prompt...")
            model.initialize_soft_prompt(
                n_tokens=n_tokens,
                initialize_from_vocab=initialize_from_vocab,
                random_range=random_range,
            )

        return model

    def set_soft_prompt_embeds(
            self,
            soft_prompt_path: str
    ):
        self.soft_prompt = torch.load(
            soft_prompt_path, map_location = torch.device('cpu')
        )

        self.n_tokens = self.soft_prompt.num_embeddings

        print(f"Set soft prompt! (n_tokens: {self.n_tokens})")


    def initialize_soft_prompt(
            self,
            n_tokens: int = 20,
            initialize_from_vocab: bool = True,
            random_range: float = 0.5
    ) -> None:
        self.n_tokens = n_tokens

        if initialize_from_vocab:
            init_prompt_value = self.transformer.wte.weight[:n_tokens].clone().detach()
        else:
            init_prompt_value = torch.FloatTensor(2, 10).uniform_(
                -random_range, random_range
            )


        self.soft_prompt = nn.Embedding(n_tokens, self.config.n_embd)

        self.soft_prompt.weight = nn.parameter.Parameter(
            init_prompt_value
        )


    def _cat_learned_embedding_to_input(
            self, input_ids
    ) -> torch.Tensor:
        inputs_embeds = self.transformer.wte(input_ids)

        if len(list(inputs_embeds.shape)) == 2:
            inputs_embeds = inputs_embeds.unsqueeze(0)

        learned_embeds = self.soft_prompt.weight.repeat(inputs_embeds.size(0), 1, 1)

        inputs_embeds = torch.cat([learned_embeds, inputs_embeds], dim = 1)

        return inputs_embeds

    def _extend_labels(
            self,
            labels,
            ignore_index = -100
    ) -> torch.Tensor:
        if len(list(labels.shape)) == 1:
            labels = labels.unsqueeze(0)

        n_batches = labels.shape[0]

        return torch.cat([
            torch.full((n_batches, self.n_tokens), ignore_index).to(self.device),
            labels
        ],
        dim = 1)

    def _extend_attention_mask(
            self,
            attention_mask
    ) -> torch.Tensor:
        if len(list(attention_mask.shape)) == 1:
            attention_mask = attention_mask.unsqueeze(0)

        n_batches = attention_mask.shape[0]

        return torch.cat([
            torch.full((n_batches, self.n_tokens), 1).to(self.device),
            attention_mask
        ], dim = 1)

    def save_soft_prompt(
            self,
            path: str,
            filename:str = 'soft_promt.model'
    ):
        Path(path).mkdir(parents=True, exist_ok= True)
        torch.save(self.soft_prompt, os.path.join(path, filename))

    def forward(
            self,
            input_ids = None,
            past_key_values = None,
            attention_mask = None,
            token_type_ids = None,
            position_ids = None,
            head_mask = None,
            inputs_embeds = None,
            encoder_hidden_states = None,
            encoder_attention_mask = None,
            labels = None,
            use_cache = None,
            output_attentions = None,
            output_hidden_states = None,
            return_dict = None
    ):
        if input_ids is not None:
            inputs_embeds = self._cat_learned_embedding_to_input(
                input_ids
            ).to(self.device)

        if labels is not None:
            labels = self._extend_labels(labels).to(self.device)

        if attention_mask is not None:
            attention_mask = self._extend_attention_mask(attention_mask).to(self.device)

        return super().forward(
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            labels=labels,
            use_cache=use_cache,
            return_dict=return_dict,
        )

class GPT2PromptTuningLM(GPTPromptTuningMixin, GPT2LMHeadModel):
    def __init__(self, config):
        super().__init__(config)


class GPTNeoPromptTuningLM(GPTPromptTuningMixin, GPTNeoForCausalLM):
    def __init__(self, config):
        super().__init__(config)

In [6]:
from transformers import (
    GPT2TokenizerFast,
    AdamW,
    get_scheduler
)

ImportError: cannot import name 'AdamW' from 'transformers' (/usr/local/lib/python3.11/dist-packages/transformers/__init__.py)

In [7]:
class Config:
    # Same default parameters as run_clm_no_trainer.py in tranformers
    # https://github.com/huggingface/transformers/blob/master/examples/pytorch/language-modeling/run_clm_no_trainer.py
    num_train_epochs = 3
    weight_decay = 0.01
    learning_rate = 0.01
    lr_scheduler_type = "linear"
    num_warmup_steps = 0
    max_train_steps = num_train_epochs

    # Prompt-tuning
    # number of prompt tokens
    n_prompt_tokens = 20
    # If True, soft prompt will be initialized from vocab
    # Otherwise, you can set `random_range` to initialize by randomization.
    init_from_vocab = True
    # random_range = 0.5
args = Config()

In [8]:
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
# Initialize GPT2LM with soft prompt
model = GPT2PromptTuningLM.from_pretrained(
    "gpt2",
    n_tokens=args.n_prompt_tokens,
    initialize_from_vocab=args.init_from_vocab
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

NameError: name 'n_token' is not defined