# Prompt Tuning Example


In [10]:
## import libraries
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer
# Hugging Face Hub identifier of the base causal language model.
model_name = 'bigscience/bloomz-560m'

# Tunables shared by both prompt-tuning runs below.
NUM_VIRTUAL_TOKENS, NUM_EPOCHS = 10, 5

In [11]:
## load the tokenizer and the base model from the Hub
tokenizer = AutoTokenizer.from_pretrained(model_name)

# trust_remote_code is deliberately left at its default (False): the BLOOM
# architecture ships with transformers, so no repository-supplied Python code
# needs to be executed to load this checkpoint.
foundational_model = AutoModelForCausalLM.from_pretrained(model_name)

In [12]:
def get_outputs(model, inputs, max_new_tokens = 100):
    """Generate a continuation for an already-tokenized prompt using beam search.

    Args:
        model: object exposing ``generate(**kwargs)``; in this notebook either the
            base model or a PEFT-wrapped model.
        inputs: mapping with ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_new_tokens: upper bound on the number of newly generated tokens.

    Returns:
        Whatever ``model.generate`` returns for the given inputs.

    Note:
        The end-of-sequence id is read from the notebook-level ``tokenizer``.
    """
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']

    # The model may no longer live on the device the inputs were created on
    # (e.g. after training on an accelerator); align the inputs with the model
    # so generation does not fail with a device-mismatch error.
    device = getattr(model, 'device', None)
    if device is not None:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)

    outputs = model.generate(
        input_ids = input_ids,
        attention_mask = attention_mask,
        max_new_tokens = max_new_tokens,
        repetition_penalty = 1.5,
        early_stopping = True,
        eos_token_id = tokenizer.eos_token_id,
        num_beams = 6
    )

    return outputs

In [13]:
## get a sample response from the foundational model for the "prompt" use case
test_prompt = 'I want you to act as a motivational coach.'

# tokenize once; input_prompt is reused by the later inference cells
input_prompt = tokenizer(test_prompt, return_tensors='pt')
foundational_outputs_prompt = get_outputs(
    foundational_model,
    input_prompt,
    max_new_tokens=50,
)

print(tokenizer.batch_decode(foundational_outputs_prompt, skip_special_tokens=True))

["I want you to act as a motivational coach. Don't be afraid to ask questions"]


In [14]:
## get a sample response from the foundational model for the "sentences" use case
test_sentence = 'There are two things that matter:'

# tokenize once; input_sentences is reused by the later inference cells
input_sentences = tokenizer(test_sentence, return_tensors='pt')
foundational_outputs_sentence = get_outputs(
    foundational_model,
    input_sentences,
    max_new_tokens=50,
)

print(tokenizer.batch_decode(foundational_outputs_sentence, skip_special_tokens=True))

['There are two things that matter: quality and quantity. quality matters more than quantity.']


In [53]:
## preparing the datasets
from datasets import load_dataset

dataset_prompt = 'fka/awesome-chatgpt-prompts'


def _tokenize_prompt_batch(samples):
    """Tokenize the 'prompt' column of a batch of rows."""
    return tokenizer(samples['prompt'])


# load the dataset and add the tokenizer outputs as new columns
data_prompt = load_dataset(dataset_prompt).map(_tokenize_prompt_batch, batched=True)

# keep the first 50 training rows and drop the raw text columns
train_sample_prompt = (
    data_prompt['train']
    .select(range(50))
    .remove_columns(['act', 'prompt'])
)

display(train_sample_prompt)

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 50
})

In [54]:
# Render the tokenized prompt sample as a DataFrame to eyeball input_ids / attention_mask.
pd.DataFrame(train_sample_prompt)

Unnamed: 0,input_ids,attention_mask
0,"[44, 4026, 1152, 427, 1769, 661, 267, 104105, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,"[44, 4026, 1152, 427, 1769, 661, 660, 7165, 24...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,"[44, 4026, 1152, 427, 1769, 661, 660, 33322, 2...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,"[44, 4026, 1152, 427, 1769, 661, 267, 49760, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,"[44, 4026, 1152, 427, 1769, 661, 267, 5484, 11...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
5,"[44, 4026, 1152, 427, 1769, 661, 660, 7165, 14...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
6,"[44, 4026, 1152, 427, 1769, 661, 267, 93949, 7...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
7,"[44, 4026, 1152, 427, 1769, 661, 267, 25008, 2...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
8,"[44, 4026, 1152, 427, 1769, 661, 267, 159667, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
9,"[44, 4026, 1152, 427, 1769, 3269, 731, 84491, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [55]:
## prepare a second dataset (English quotes) for the "sentences" model
dataset_sentences = load_dataset('Abirate/english_quotes')


def _tokenize_quote_batch(samples):
    """Tokenize the 'quote' column of a batch of rows."""
    return tokenizer(samples['quote'])


data_sentences = dataset_sentences.map(_tokenize_quote_batch, batched=True)

# keep the first 25 training rows and drop the raw text / metadata columns
train_sample_sentences = (
    data_sentences['train']
    .select(range(25))
    .remove_columns(['quote', 'author', 'tags'])
)

display(train_sample_sentences)

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 25
})

In [56]:
# Render the tokenized quotes sample as a DataFrame to eyeball input_ids / attention_mask.
pd.DataFrame(train_sample_sentences)

Unnamed: 0,input_ids,attention_mask
0,"[1502, 17143, 33218, 30, 39839, 4384, 632, 112...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
1,"[1502, 10203, 239002, 15, 136192, 1049, 530, 2...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,"[1502, 35417, 11217, 1306, 61759, 29, 368, 713...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,"[1502, 6895, 7112, 38695, 15, 1427, 10512, 350...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
4,"[119533, 22630, 7160, 38695, 632, 3269, 267, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
5,"[1502, 17143, 5268, 1152, 1306, 530, 5894, 359...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
6,"[1502, 124002, 83213, 59020, 3269, 20242, 8839...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
7,"[1502, 5448, 4472, 11700, 361, 19134, 3262, 11...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
8,"[1502, 5448, 3804, 20152, 14275, 15, 1965, 132...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
9,"[1502, 17143, 368, 7458, 861, 1152, 26338, 427...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"


In [57]:
## time to fine tune with Parameter Efficient Fine Tuning
from peft import get_peft_model, PromptTuningConfig, TaskType, PromptTuningInit

# Prompt-tuning configuration shared by both PEFT models:
#   - causal language modelling task (text generation)
#   - virtual tokens start from random values
#   - NUM_VIRTUAL_TOKENS virtual tokens are added and trained
generator_config = PromptTuningConfig(
    task_type=TaskType.CAUSAL_LM,
    prompt_tuning_init=PromptTuningInit.RANDOM,
    num_virtual_tokens=NUM_VIRTUAL_TOKENS,
    tokenizer_name_or_path=model_name,
)

In [58]:
from transformers import set_seed

# The virtual tokens are randomly initialised, so fix the RNG seed before
# creating them; otherwise every run starts from different embeddings.
set_seed(42)
peft_model_prompt = get_peft_model(foundational_model, generator_config)

# print_trainable_parameters() prints the summary itself and returns None,
# so it must not be wrapped in print() (that only adds a stray "None" line).
peft_model_prompt.print_trainable_parameters()

trainable params: 10,240 || all params: 559,224,832 || trainable%: 0.0018311060979495275
None


In [59]:
from transformers import set_seed

# The virtual tokens are randomly initialised, so fix the RNG seed before
# creating them; otherwise every run starts from different embeddings.
set_seed(42)
peft_model_sentences = get_peft_model(foundational_model, generator_config)

# print_trainable_parameters() prints the summary itself and returns None,
# so it must not be wrapped in print() (that only adds a stray "None" line).
peft_model_sentences.print_trainable_parameters()

trainable params: 10,240 || all params: 559,224,832 || trainable%: 0.0018311060979495275
None


In [60]:
## start to create the training configuration
from transformers import TrainingArguments

def create_training_arguments(path, learning_rate = 0.0035, epochs = 6):
    """Build the ``TrainingArguments`` for one prompt-tuning run.

    Args:
        path: output directory handed to ``TrainingArguments``.
        learning_rate: learning rate for the run.
        epochs: number of training epochs.

    Returns:
        A ``TrainingArguments`` instance with automatic batch-size search enabled.
    """
    return TrainingArguments(
        output_dir=path,
        auto_find_batch_size=True,
        learning_rate=learning_rate,
        num_train_epochs=epochs,
    )

In [61]:
import os

In [62]:
## create the directories that hold the trained adapters (no-op when they already exist)

working_dir = './peft'

## it is recommended to store the models separately
output_dir_prompt = os.path.join(working_dir, 'peft_outputs_prompt')
output_dir_sentences = os.path.join(working_dir, 'peft_outputs_sentences')

## makedirs also creates the parent working_dir; exist_ok avoids the
## check-then-create pattern and makes the cell safe to re-run
os.makedirs(output_dir_prompt, exist_ok=True)
os.makedirs(output_dir_sentences, exist_ok=True)


In [63]:
# one TrainingArguments object per model; both train for NUM_EPOCHS epochs
training_args_prompt = create_training_arguments(
    output_dir_prompt,
    learning_rate=0.003,
    epochs=NUM_EPOCHS,
)
training_args_sentences = create_training_arguments(
    output_dir_sentences,
    learning_rate=0.0035,
    epochs=NUM_EPOCHS,
)

In [64]:
from transformers import Trainer, DataCollatorForLanguageModeling 

def create_trainer(model, training_args, train_dataset):
    """Create a ``Trainer`` for causal language modelling on ``train_dataset``.

    Args:
        model: the model to train (here a PEFT-wrapped model).
        training_args: ``TrainingArguments`` for the run.
        train_dataset: tokenized training dataset.

    Returns:
        A ``Trainer`` using a non-masked language-modelling data collator built
        from the notebook-level ``tokenizer``.
    """
    # mlm=False selects causal (not masked) language-modelling collation
    collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
    return Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        data_collator=collator,
    )

In [65]:
## time to fine-tune the models: each PEFT model gets its own Trainer,
## training arguments and dataset, and the two runs happen one after the other
trainer_prompt = create_trainer(peft_model_prompt, training_args_prompt, train_sample_prompt)
trainer_prompt.train()

# the return value of this last call is what the cell displays
trainer_sentences = create_trainer(peft_model_sentences, training_args_sentences, train_sample_sentences)
trainer_sentences.train()

Step,Training Loss


Step,Training Loss


TrainOutput(global_step=5, training_loss=4.071768188476563, metrics={'train_runtime': 51.5815, 'train_samples_per_second': 2.423, 'train_steps_per_second': 0.097, 'total_flos': 28795358208000.0, 'train_loss': 4.071768188476563, 'epoch': 5.0})

In [66]:
## save each trained model into its own output directory
for _trainer, _out_dir in (
    (trainer_prompt, output_dir_prompt),
    (trainer_sentences, output_dir_sentences),
):
    _trainer.model.save_pretrained(_out_dir)

In [67]:
from peft import PeftModel

# Reload the saved prompt adapter on top of the base model for inference only
# (device_map='auto' is intentionally left disabled).
loaded_model_prompt = PeftModel.from_pretrained(
    foundational_model,
    output_dir_prompt,
    is_trainable=False,
)

In [30]:
## inference with the reloaded prompt model on the same test prompt as the baseline
loaded_model_prompt_outputs = get_outputs(model=loaded_model_prompt, inputs=input_prompt)
print(tokenizer.batch_decode(loaded_model_prompt_outputs, skip_special_tokens=True))


['I want you to act as a motivational coach. Learn how to be a motivational coach']


In [48]:
## inference with the reloaded prompt model on the same test prompt as the baseline
## NOTE(review): this cell repeats an identical earlier cell with an
## out-of-order execution count; consider keeping a single copy
loaded_model_prompt_outputs = get_outputs(model=loaded_model_prompt, inputs=input_prompt)
print(tokenizer.batch_decode(loaded_model_prompt_outputs, skip_special_tokens=True))


['I want you to act as a motivational coach. I want you to be a motivational coach.']


In [68]:
## inference with the reloaded prompt model on the same test prompt as the baseline
## NOTE(review): this cell repeats an identical earlier cell with an
## out-of-order execution count; consider keeping a single copy
loaded_model_prompt_outputs = get_outputs(model=loaded_model_prompt, inputs=input_prompt)
print(tokenizer.batch_decode(loaded_model_prompt_outputs, skip_special_tokens=True))


['I want you to act as a motivational coach. I want you to be a motivational coach.']


In [69]:
# Reload the saved sentences adapter on top of the base model for inference only
# (device_map='auto' is intentionally left disabled).
loaded_model_sentences = PeftModel.from_pretrained(
    foundational_model,
    output_dir_sentences,
    is_trainable=False,
)

In [32]:
## inference with the reloaded sentences model on the same test sentence as the baseline
loaded_model_sentences_outputs = get_outputs(model=loaded_model_sentences, inputs=input_sentences)
print(tokenizer.batch_decode(loaded_model_sentences_outputs, skip_special_tokens=True))

['There are two things that matter: income and wealth.']


In [50]:
## inference
# NOTE(review): this cell repeats an identical earlier cell with an
# out-of-order execution count; consider keeping a single copy
loaded_model_sentences_outputs = get_outputs(model=loaded_model_sentences, inputs=input_sentences)
print(tokenizer.batch_decode(loaded_model_sentences_outputs, skip_special_tokens=True))

['There are two things that matter: time and money.']


In [70]:
## inference
# NOTE(review): this cell repeats an identical earlier cell with an
# out-of-order execution count; consider keeping a single copy
loaded_model_sentences_outputs = get_outputs(model=loaded_model_sentences, inputs=input_sentences)
print(tokenizer.batch_decode(loaded_model_sentences_outputs, skip_special_tokens=True))

['There are two things that matter: time and money.']
