# Prompt Tuning Example


In [27]:
## import libraries
import creds
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer
# Hugging Face Hub id of the checkpoint used for both the tokenizer and the base model
model_name = 'bigscience/bloomz-560m'
# number of virtual tokens given to PromptTuningConfig
NUM_VIRTUAL_TOKENS = 10
# number of training epochs handed to create_training_arguments
NUM_EPOCHS = 2

In [28]:
## load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
# reuse the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token
# NOTE(review): trust_remote_code = True allows code shipped with the checkpoint to run
# locally; confirm it is actually required for this model.
# The access token is read from the local `creds` module rather than hard-coded here.
foundational_model = AutoModelForCausalLM.from_pretrained(model_name,
                                                          trust_remote_code = True,
                                                          token = creds.HUGGINGFACE_TOKEN)

In [29]:
## create a function that returns the outputs from the model we have received above, and inputs.
def get_outputs(model, inputs, max_new_tokens = 100):
    """Generate a beam-search continuation for an already tokenised prompt.

    Args:
        model: object exposing ``generate`` (the base model or a PEFT wrapper
            around it).
        inputs: mapping holding ``'input_ids'`` and ``'attention_mask'``
            tensors, e.g. the tokenizer's result with ``return_tensors = 'pt'``.
        max_new_tokens: upper bound on the number of newly generated tokens.

    Returns:
        Whatever ``model.generate`` returns; callers pass it to
        ``tokenizer.batch_decode``.

    Note:
        Reads the notebook-level ``tokenizer`` for the end-of-sequence token id.
    """
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']

    # Training can leave the model on an accelerator while freshly tokenised
    # prompts live on the CPU; move the tensors to the model's device so
    # generate() does not fail with a device mismatch. Objects without a
    # `device` attribute are used as-is.
    target_device = getattr(model, 'device', None)
    if target_device is not None:
        input_ids = input_ids.to(target_device)
        attention_mask = attention_mask.to(target_device)

    outputs = model.generate(
        input_ids = input_ids,
        attention_mask = attention_mask,
        max_new_tokens = max_new_tokens,
        repetition_penalty = 1.5,
        early_stopping = True,
        eos_token_id = tokenizer.eos_token_id,
        num_beams = 6
    )

    return outputs

In [48]:
## get a sample response from the untuned foundational model via get_outputs
test_prompt = 'I want you to act as a motivational coach.'
input_prompt = tokenizer(test_prompt, return_tensors = 'pt')

foundational_outputs_prompt = get_outputs(
    model = foundational_model,
    inputs = input_prompt,
    max_new_tokens = 50,
)

# decode first, then print the list of generated strings
decoded_foundational = tokenizer.batch_decode(foundational_outputs_prompt, skip_special_tokens = True)
print(decoded_foundational)

["I want you to act as a motivational coach. Don't be afraid to ask questions"]


In [46]:
## get a sample response from the untuned foundational model via get_outputs
test_prompt = 'Give three tips for staying healthy.'
input_prompt = tokenizer(test_prompt, return_tensors = 'pt')

foundational_outputs_prompt = get_outputs(
    model = foundational_model,
    inputs = input_prompt,
    max_new_tokens = 50,
)

# decode first, then print the list of generated strings
decoded_foundational = tokenizer.batch_decode(foundational_outputs_prompt, skip_special_tokens = True)
print(decoded_foundational)

['Give three tips for staying healthy.']


In [32]:
## get a sample response from the untuned foundational model via get_outputs
test_prompt = 'What is the capital of France?'
input_prompt = tokenizer(test_prompt, return_tensors = 'pt')

foundational_outputs_prompt = get_outputs(
    model = foundational_model,
    inputs = input_prompt,
    max_new_tokens = 50,
)

# decode first, then print the list of generated strings
decoded_foundational = tokenizer.batch_decode(foundational_outputs_prompt, skip_special_tokens = True)
print(decoded_foundational)

['What is the capital of France? Paris']


In [50]:
## get a sample response from the untuned foundational model via get_outputs
test_prompt = 'What are the three primary colours?'
input_prompt = tokenizer(test_prompt, return_tensors = 'pt')

foundational_outputs_prompt = get_outputs(
    model = foundational_model,
    inputs = input_prompt,
    max_new_tokens = 50,
)

# decode first, then print the list of generated strings
decoded_foundational = tokenizer.batch_decode(foundational_outputs_prompt, skip_special_tokens = True)
print(decoded_foundational)

['What are the three primary colours? Red, Green and Blue']


In [52]:
## get a sample response from the untuned foundational model via get_outputs
test_prompt = 'What is an atom?'
input_prompt = tokenizer(test_prompt, return_tensors = 'pt')

foundational_outputs_prompt = get_outputs(
    model = foundational_model,
    inputs = input_prompt,
    max_new_tokens = 50,
)

# decode first, then print the list of generated strings
decoded_foundational = tokenizer.batch_decode(foundational_outputs_prompt, skip_special_tokens = True)
print(decoded_foundational)

['What is an atom? a single particle of matter']


In [33]:
import datasets
from datasets import load_dataset

# load the train split of the Alpaca instruction dataset
dataset_prompt = load_dataset("tatsu-lab/alpaca", split="train")

# tokenise the 'text' column of every row (batched); this adds 'input_ids' and 'attention_mask'
data_prompt = dataset_prompt.map(lambda samples:tokenizer(samples['text']), batched = True)
# keep the first 1000 tokenised rows as the training sample
train_sample_prompt = data_prompt.select(range(1000))

# shows the full tokenised dataset, not the 1000-row sample
display(data_prompt)

Map: 100%|██████████| 52002/52002 [00:02<00:00, 20792.92 examples/s]


Dataset({
    features: ['instruction', 'input', 'output', 'text', 'input_ids', 'attention_mask'],
    num_rows: 52002
})

In [34]:
# inspect the 1000-row training sample (features and row count)
train_sample_prompt

Dataset({
    features: ['instruction', 'input', 'output', 'text', 'input_ids', 'attention_mask'],
    num_rows: 1000
})

In [35]:
# view the training sample as a DataFrame to eyeball the text and token columns
pd.DataFrame(train_sample_prompt)

Unnamed: 0,instruction,input,output,text,input_ids,attention_mask
0,Give three tips for staying healthy.,,1.Eat a balanced diet and make sure to include...,Below is an instruction that describes a task....,"[111757, 632, 660, 54103, 861, 63808, 267, 201...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,What are the three primary colors?,,"The three primary colors are red, blue, and ye...",Below is an instruction that describes a task....,"[111757, 632, 660, 54103, 861, 63808, 267, 201...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,Describe the structure of an atom.,,"An atom is made up of a nucleus, which contain...",Below is an instruction that describes a task....,"[111757, 632, 660, 54103, 861, 63808, 267, 201...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,How can we reduce air pollution?,,There are a number of ways to reduce air pollu...,Below is an instruction that describes a task....,"[111757, 632, 660, 54103, 861, 63808, 267, 201...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,Describe a time when you had to make a difficu...,,I had to make a difficult decision when I was ...,Below is an instruction that describes a task....,"[111757, 632, 660, 54103, 861, 63808, 267, 201...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
...,...,...,...,...,...,...
995,Name the 6 most populous cities in China.,,The six most populous cities in China are Shan...,Below is an instruction that describes a task....,"[111757, 632, 660, 54103, 861, 63808, 267, 201...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
996,Edit the following text to make it easier to read,"In the 20th centuary,developments in the field...","In the 20th century, developments in the field...","Below is an instruction that describes a task,...","[111757, 632, 660, 54103, 861, 63808, 267, 201...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
997,Determine the surface area of the following fi...,A cube with side length 2 cm,The surface area of the cube is 24 cm².,"Below is an instruction that describes a task,...","[111757, 632, 660, 54103, 861, 63808, 267, 201...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
998,Find the definition of the following financial...,Stock Split,A stock split is a corporate action in which a...,"Below is an instruction that describes a task,...","[111757, 632, 660, 54103, 861, 63808, 267, 201...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [36]:
## time to fine tune with Parameter Efficient Fine Tuning
from peft import get_peft_model, PromptTuningConfig, TaskType, PromptTuningInit

# prompt-tuning configuration that is later handed to get_peft_model
generator_config = PromptTuningConfig(
    task_type = TaskType.CAUSAL_LM, # causal language modelling, i.e. text generation
    prompt_tuning_init = PromptTuningInit.RANDOM, # initialise the virtual tokens with random values
    num_virtual_tokens = NUM_VIRTUAL_TOKENS, # number of virtual tokens to add and train
    tokenizer_name_or_path = model_name # same checkpoint id that was used to load the tokenizer
)

In [37]:
# wrap the base model with the prompt-tuning configuration
peft_model_prompt = get_peft_model(foundational_model, generator_config)
# print_trainable_parameters() prints its own summary and returns None, so wrapping
# it in print() only added a stray "None" line to the cell output
peft_model_prompt.print_trainable_parameters()

trainable params: 10,240 || all params: 559,224,832 || trainable%: 0.0018311060979495275
None


In [38]:
## start to create the training configuration
from transformers import TrainingArguments

def create_training_arguments(path, learning_rate = 0.0035, epochs = 6):
    """Build the TrainingArguments for one fine-tuning run.

    Args:
        path: forwarded as ``output_dir``.
        learning_rate: forwarded as ``learning_rate``.
        epochs: forwarded as ``num_train_epochs``.

    Returns:
        A TrainingArguments instance with ``auto_find_batch_size`` enabled.
    """
    return TrainingArguments(
        output_dir = path,
        learning_rate = learning_rate,
        num_train_epochs = epochs,
        auto_find_batch_size = True,
    )

In [39]:
import os

## create directories to hold the model when they don't exist

working_dir = './peft_baseLLM'

## it is recommended to store the models separately
output_dir_prompt = os.path.join(working_dir, 'peft_outputs_prompt')
output_dir_sentences = os.path.join(working_dir, 'peft_outputs_sentences')

## makedirs(exist_ok = True) creates any missing parent and does nothing for a
## directory that already exists, replacing the separate exists()/mkdir() pairs
for model_dir in (working_dir, output_dir_prompt, output_dir_sentences):
    os.makedirs(model_dir, exist_ok = True)


In [40]:
# use the notebook-level NUM_EPOCHS constant (2) instead of repeating the literal,
# so the epoch count is configured in one place
training_args_prompt = create_training_arguments(output_dir_prompt, learning_rate = 0.003, epochs = NUM_EPOCHS)


In [41]:
from transformers import Trainer, DataCollatorForLanguageModeling 

def create_trainer(model, training_args, train_dataset):
    """Assemble a Trainer for language-model fine-tuning.

    Args:
        model: model handed to the Trainer.
        training_args: passed as the Trainer's ``args``.
        train_dataset: dataset the Trainer trains on.

    Returns:
        The configured Trainer; training is not started here.

    Note:
        Reads the notebook-level ``tokenizer`` to build the data collator.
    """
    # mlm = False: the collator is built without masked-language-model masking
    lm_collator = DataCollatorForLanguageModeling(tokenizer, mlm = False)
    return Trainer(
        model = model,
        args = training_args,
        train_dataset = train_dataset,
        data_collator = lm_collator,
    )

In [42]:
## time to finetune the models
# build the Trainer for the prompt-tuning model on the training sample
trainer_prompt = create_trainer(peft_model_prompt, training_args_prompt, train_sample_prompt)
# last expression of the cell, so the returned TrainOutput is displayed
trainer_prompt.train()

Step,Training Loss


TrainOutput(global_step=64, training_loss=3.493079662322998, metrics={'train_runtime': 1659.7604, 'train_samples_per_second': 1.205, 'train_steps_per_second': 0.039, 'total_flos': 909697514864640.0, 'train_loss': 3.493079662322998, 'epoch': 2.0})

In [43]:
# persist the trained model under output_dir_prompt so it can be reloaded below
trainer_prompt.model.save_pretrained(output_dir_prompt)

In [44]:
from peft import PeftModel

# reload what was saved in output_dir_prompt on top of the base model;
# is_trainable=False because the result is only used for generation here
loaded_model_prompt = PeftModel.from_pretrained(foundational_model,
                                         output_dir_prompt,
                                         #device_map='auto',
                                         is_trainable=False)

In [49]:
## inference with the prompt-tuned model
# Tokenise the prompt inside this cell instead of relying on whichever
# `input_prompt` the kernel last held, so a top-to-bottom run answers the
# prompt shown in this cell's recorded output.
test_prompt = 'I want you to act as a motivational coach.'
input_prompt = tokenizer(test_prompt, return_tensors = 'pt')
loaded_model_prompt_outputs = get_outputs(loaded_model_prompt, input_prompt)
print(tokenizer.batch_decode(loaded_model_prompt_outputs, skip_special_tokens = True))


['I want you to act as a motivational coach. By signing up for our newsletter, you agree to the Terms of Use']


In [47]:
## inference with the prompt-tuned model
# Tokenise the prompt inside this cell instead of relying on whichever
# `input_prompt` the kernel last held, so a top-to-bottom run answers the
# prompt shown in this cell's recorded output.
test_prompt = 'Give three tips for staying healthy.'
input_prompt = tokenizer(test_prompt, return_tensors = 'pt')
loaded_model_prompt_outputs = get_outputs(loaded_model_prompt, input_prompt)
print(tokenizer.batch_decode(loaded_model_prompt_outputs, skip_special_tokens = True))


['Give three tips for staying healthy.']


In [45]:
## inference with the prompt-tuned model
# Tokenise the prompt inside this cell instead of relying on whichever
# `input_prompt` the kernel last held, so a top-to-bottom run answers the
# prompt shown in this cell's recorded output.
test_prompt = 'What is the capital of France?'
input_prompt = tokenizer(test_prompt, return_tensors = 'pt')
loaded_model_prompt_outputs = get_outputs(loaded_model_prompt, input_prompt)
print(tokenizer.batch_decode(loaded_model_prompt_outputs, skip_special_tokens = True))


['What is the capital of France? Paris']


In [51]:
## inference with the prompt-tuned model
# Tokenise the prompt inside this cell instead of relying on whichever
# `input_prompt` the kernel last held, so a top-to-bottom run answers the
# prompt shown in this cell's recorded output.
test_prompt = 'What are the three primary colours?'
input_prompt = tokenizer(test_prompt, return_tensors = 'pt')
loaded_model_prompt_outputs = get_outputs(loaded_model_prompt, input_prompt)
print(tokenizer.batch_decode(loaded_model_prompt_outputs, skip_special_tokens = True))


['What are the three primary colours? Red, Green and Yellow']


In [53]:
## inference with the prompt-tuned model
# Tokenise the prompt inside this cell instead of relying on whichever
# `input_prompt` the kernel last held, so a top-to-bottom run answers the
# prompt shown in this cell's recorded output.
test_prompt = 'What is an atom?'
input_prompt = tokenizer(test_prompt, return_tensors = 'pt')
loaded_model_prompt_outputs = get_outputs(loaded_model_prompt, input_prompt)
print(tokenizer.batch_decode(loaded_model_prompt_outputs, skip_special_tokens = True))


['What is an atom? a molecular entity']


In [25]:
## preparing the datasets
from datasets import load_dataset

# Hub id of the prompt dataset
# NOTE(review): `dataset_prompt` held a Dataset object earlier in the notebook; here it is rebound to a string
dataset_prompt = 'fka/awesome-chatgpt-prompts'

# load the dataset, then tokenise its 'prompt' column (batched)
data_prompt = load_dataset(dataset_prompt)
data_prompt = data_prompt.map(lambda samples:tokenizer(samples['prompt']), batched = True)
# keep the first 50 rows of the train split as the training sample
train_sample_prompt = data_prompt['train'].select(range(50))

# disabled: would drop the raw text columns so only the tokenised features remain
# train_sample_prompt = train_sample_prompt.remove_columns(['act','prompt'])

display(train_sample_prompt)

Dataset({
    features: ['act', 'prompt', 'input_ids', 'attention_mask'],
    num_rows: 50
})

In [29]:
# inspect the full tokenised DatasetDict (splits, features, row counts)
data_prompt

DatasetDict({
    train: Dataset({
        features: ['act', 'prompt', 'input_ids', 'attention_mask'],
        num_rows: 153
    })
})

In [26]:
# view the 50-row training sample as a DataFrame
pd.DataFrame(train_sample_prompt)

Unnamed: 0,act,prompt,input_ids,attention_mask
0,Linux Terminal,I want you to act as a linux terminal. I will ...,"[2, 235285, 1938, 692, 577, 2027, 685, 476, 61...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,English Translator and Improver,"I want you to act as an English translator, sp...","[2, 235285, 1938, 692, 577, 2027, 685, 671, 46...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,`position` Interviewer,I want you to act as an interviewer. I will be...,"[2, 235285, 1938, 692, 577, 2027, 685, 671, 11...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,JavaScript Console,I want you to act as a javascript console. I w...,"[2, 235285, 1938, 692, 577, 2027, 685, 476, 77...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,Excel Sheet,I want you to act as a text based excel. you'l...,"[2, 235285, 1938, 692, 577, 2027, 685, 476, 27...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
5,English Pronunciation Helper,I want you to act as an English pronunciation ...,"[2, 235285, 1938, 692, 577, 2027, 685, 671, 46...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
6,Spoken English Teacher and Improver,I want you to act as a spoken English teacher ...,"[2, 235285, 1938, 692, 577, 2027, 685, 476, 22...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
7,Travel Guide,I want you to act as a travel guide. I will wr...,"[2, 235285, 1938, 692, 577, 2027, 685, 476, 50...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
8,Plagiarism Checker,I want you to act as a plagiarism checker. I w...,"[2, 235285, 1938, 692, 577, 2027, 685, 476, 15...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
9,Character from Movie/Book/Anything,I want you to act like {character} from {serie...,"[2, 235285, 1938, 692, 577, 2027, 1154, 612, 2...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [55]:
## prepare a second dataset: English quotes
dataset_sentences = load_dataset('Abirate/english_quotes')

# tokenise the 'quote' column (batched)
data_sentences = dataset_sentences.map(lambda samples:tokenizer(samples['quote']), batched = True)
# keep the first 25 rows of the train split as the training sample
train_sample_sentences = data_sentences['train'].select(range(25))
# drop the raw columns so only 'input_ids' and 'attention_mask' remain
train_sample_sentences = train_sample_sentences.remove_columns(['quote','author','tags'])
display(train_sample_sentences)

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 25
})

In [56]:
# view the tokenised quote sample as a DataFrame
pd.DataFrame(train_sample_sentences)

Unnamed: 0,input_ids,attention_mask
0,"[1502, 17143, 33218, 30, 39839, 4384, 632, 112...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
1,"[1502, 10203, 239002, 15, 136192, 1049, 530, 2...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,"[1502, 35417, 11217, 1306, 61759, 29, 368, 713...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,"[1502, 6895, 7112, 38695, 15, 1427, 10512, 350...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
4,"[119533, 22630, 7160, 38695, 632, 3269, 267, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
5,"[1502, 17143, 5268, 1152, 1306, 530, 5894, 359...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
6,"[1502, 124002, 83213, 59020, 3269, 20242, 8839...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
7,"[1502, 5448, 4472, 11700, 361, 19134, 3262, 11...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
8,"[1502, 5448, 3804, 20152, 14275, 15, 1965, 132...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
9,"[1502, 17143, 368, 7458, 861, 1152, 26338, 427...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"


In [57]:
## time to fine tune with Parameter Efficient Fine Tuning
from peft import get_peft_model, PromptTuningConfig, TaskType, PromptTuningInit

# prompt-tuning configuration shared by both PEFT models created below
# NOTE(review): repeats the configuration defined earlier in the notebook
generator_config = PromptTuningConfig(
    task_type = TaskType.CAUSAL_LM, # causal language modelling, i.e. text generation
    prompt_tuning_init = PromptTuningInit.RANDOM, # initialise the virtual tokens with random values
    num_virtual_tokens = NUM_VIRTUAL_TOKENS, # number of virtual tokens to add and train
    tokenizer_name_or_path = model_name # same checkpoint id that was used to load the tokenizer
)

In [58]:
# PEFT model that will be trained on the prompt dataset
peft_model_prompt = get_peft_model(foundational_model, generator_config)
# print_trainable_parameters() prints its own summary and returns None, so wrapping
# it in print() only added a stray "None" line to the cell output
peft_model_prompt.print_trainable_parameters()

trainable params: 10,240 || all params: 559,224,832 || trainable%: 0.0018311060979495275
None


In [59]:
# PEFT model that will be trained on the quotes dataset
peft_model_sentences = get_peft_model(foundational_model, generator_config)
# print_trainable_parameters() prints its own summary and returns None, so wrapping
# it in print() only added a stray "None" line to the cell output
peft_model_sentences.print_trainable_parameters()

trainable params: 10,240 || all params: 559,224,832 || trainable%: 0.0018311060979495275
None


In [60]:
## start to create the training configuration
from transformers import TrainingArguments

# NOTE(review): this repeats the definition given earlier in the notebook; the later
# definition shadows the earlier one.
def create_training_arguments(path, learning_rate = 0.0035, epochs = 6):
    """Build the TrainingArguments for one fine-tuning run.

    Args:
        path: forwarded as ``output_dir``.
        learning_rate: forwarded as ``learning_rate``.
        epochs: forwarded as ``num_train_epochs``.

    Returns:
        A TrainingArguments instance with ``auto_find_batch_size`` enabled.
    """
    return TrainingArguments(
        output_dir = path,
        learning_rate = learning_rate,
        num_train_epochs = epochs,
        auto_find_batch_size = True,
    )

In [61]:
import os

In [62]:
## create directories to hold the model when they don't exist

working_dir = './peft'

## it is recommended to store the models separately
output_dir_prompt = os.path.join(working_dir, 'peft_outputs_prompt')
output_dir_sentences = os.path.join(working_dir, 'peft_outputs_sentences')

## makedirs(exist_ok = True) creates any missing parent and does nothing for a
## directory that already exists, replacing the separate exists()/mkdir() pairs
for model_dir in (working_dir, output_dir_prompt, output_dir_sentences):
    os.makedirs(model_dir, exist_ok = True)


In [63]:
# one TrainingArguments per experiment: each writes to its own output directory
# and uses its own learning rate; both share NUM_EPOCHS
training_args_prompt = create_training_arguments(
    path = output_dir_prompt,
    learning_rate = 0.003,
    epochs = NUM_EPOCHS,
)
training_args_sentences = create_training_arguments(
    path = output_dir_sentences,
    learning_rate = 0.0035,
    epochs = NUM_EPOCHS,
)

In [64]:
from transformers import Trainer, DataCollatorForLanguageModeling 

# NOTE(review): this repeats the definition given earlier in the notebook; the later
# definition shadows the earlier one.
def create_trainer(model, training_args, train_dataset):
    """Assemble a Trainer for language-model fine-tuning.

    Args:
        model: model handed to the Trainer.
        training_args: passed as the Trainer's ``args``.
        train_dataset: dataset the Trainer trains on.

    Returns:
        The configured Trainer; training is not started here.

    Note:
        Reads the notebook-level ``tokenizer`` to build the data collator.
    """
    # mlm = False: the collator is built without masked-language-model masking
    lm_collator = DataCollatorForLanguageModeling(tokenizer, mlm = False)
    return Trainer(
        model = model,
        args = training_args,
        train_dataset = train_dataset,
        data_collator = lm_collator,
    )

In [65]:
## time to finetune the models
# first run: prompt model on the prompt sample
trainer_prompt = create_trainer(peft_model_prompt, training_args_prompt, train_sample_prompt)
trainer_prompt.train()

# second run: sentences model on the quotes sample; only this train() call is the
# cell's last expression, so only its TrainOutput is displayed
trainer_sentences = create_trainer(peft_model_sentences, training_args_sentences, train_sample_sentences)
trainer_sentences.train()

Step,Training Loss


Step,Training Loss


TrainOutput(global_step=5, training_loss=4.071768188476563, metrics={'train_runtime': 51.5815, 'train_samples_per_second': 2.423, 'train_steps_per_second': 0.097, 'total_flos': 28795358208000.0, 'train_loss': 4.071768188476563, 'epoch': 5.0})

In [66]:
## save the models
# each trained model goes to its own output directory so they can be reloaded independently
trainer_prompt.model.save_pretrained(output_dir_prompt)
trainer_sentences.model.save_pretrained(output_dir_sentences)

In [67]:
from peft import PeftModel

# reload what was saved in output_dir_prompt on top of the base model;
# is_trainable=False because the result is only used for generation here
loaded_model_prompt = PeftModel.from_pretrained(foundational_model,
                                         output_dir_prompt,
                                         #device_map='auto',
                                         is_trainable=False)

In [30]:
## inference with the prompt-tuned model
# Tokenise the prompt inside this cell instead of relying on whichever
# `input_prompt` the kernel last held, so a top-to-bottom run answers the
# prompt shown in this cell's recorded output.
test_prompt = 'I want you to act as a motivational coach.'
input_prompt = tokenizer(test_prompt, return_tensors = 'pt')
loaded_model_prompt_outputs = get_outputs(loaded_model_prompt, input_prompt)
print(tokenizer.batch_decode(loaded_model_prompt_outputs, skip_special_tokens = True))


['I want you to act as a motivational coach. Learn how to be a motivational coach']


In [48]:
## inference with the prompt-tuned model
# Tokenise the prompt inside this cell instead of relying on whichever
# `input_prompt` the kernel last held, so a top-to-bottom run answers the
# prompt shown in this cell's recorded output.
test_prompt = 'I want you to act as a motivational coach.'
input_prompt = tokenizer(test_prompt, return_tensors = 'pt')
loaded_model_prompt_outputs = get_outputs(loaded_model_prompt, input_prompt)
print(tokenizer.batch_decode(loaded_model_prompt_outputs, skip_special_tokens = True))


['I want you to act as a motivational coach. I want you to be a motivational coach.']


In [68]:
## inference with the prompt-tuned model
# Tokenise the prompt inside this cell instead of relying on whichever
# `input_prompt` the kernel last held, so a top-to-bottom run answers the
# prompt shown in this cell's recorded output.
test_prompt = 'I want you to act as a motivational coach.'
input_prompt = tokenizer(test_prompt, return_tensors = 'pt')
loaded_model_prompt_outputs = get_outputs(loaded_model_prompt, input_prompt)
print(tokenizer.batch_decode(loaded_model_prompt_outputs, skip_special_tokens = True))


['I want you to act as a motivational coach. I want you to be a motivational coach.']


In [69]:
# reload what was saved in output_dir_sentences on top of the base model;
# is_trainable=False because the result is only used for generation here
loaded_model_sentences = PeftModel.from_pretrained(foundational_model,
                                         output_dir_sentences,
                                         #device_map='auto',
                                         is_trainable=False)

In [32]:
## inference with the sentences-tuned model
# `input_sentences` was never defined in this notebook, so this cell raised a
# NameError on a fresh kernel; define it here.
# NOTE(review): the prompt text is reconstructed from the common prefix of the
# recorded outputs - confirm it matches the prompt originally used.
input_sentences = tokenizer('There are two things that matter:', return_tensors = 'pt')
loaded_model_sentences_outputs = get_outputs(loaded_model_sentences, input_sentences)
print(tokenizer.batch_decode(loaded_model_sentences_outputs, skip_special_tokens = True))

['There are two things that matter: income and wealth.']


In [50]:
## inference with the sentences-tuned model
# `input_sentences` was never defined in this notebook, so this cell raised a
# NameError on a fresh kernel; define it here.
# NOTE(review): the prompt text is reconstructed from the common prefix of the
# recorded outputs - confirm it matches the prompt originally used.
input_sentences = tokenizer('There are two things that matter:', return_tensors = 'pt')
loaded_model_sentences_outputs = get_outputs(loaded_model_sentences, input_sentences)
print(tokenizer.batch_decode(loaded_model_sentences_outputs, skip_special_tokens = True))

['There are two things that matter: time and money.']


In [70]:
## inference with the sentences-tuned model
# `input_sentences` was never defined in this notebook, so this cell raised a
# NameError on a fresh kernel; define it here.
# NOTE(review): the prompt text is reconstructed from the common prefix of the
# recorded outputs - confirm it matches the prompt originally used.
input_sentences = tokenizer('There are two things that matter:', return_tensors = 'pt')
loaded_model_sentences_outputs = get_outputs(loaded_model_sentences, input_sentences)
print(tokenizer.batch_decode(loaded_model_sentences_outputs, skip_special_tokens = True))

['There are two things that matter: time and money.']
