In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import os
from datasets import load_dataset
from peft import  get_peft_model, PromptTuningConfig, TaskType, PromptTuningInit
from transformers import Trainer, DataCollatorForLanguageModeling

  from .autonotebook import tqdm as notebook_tqdm


In [22]:
# Hugging Face Hub identifiers: the checkpoint to load and the dataset of example prompts.
model_name = "microsoft/Phi-3-mini-4k-instruct"
dataset_prompt = "fka/awesome-chatgpt-prompts"

# Tunable knobs: training epochs and the number of virtual tokens.
NUM_EPOCHS = 20
NUM_VIRTUAL_TOKENS = 8

In [23]:
# The tokenizer and the base model are both loaded from the checkpoint named by `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# trust_remote_code=True is forwarded to from_pretrained; the loaded model is
# then moved to the GPU in a separate step.
foundational_model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
foundational_model = foundational_model.to('cuda')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.74s/it]


In [24]:
def get_outputs(model, inputs, max_new_tokens=100):
    """Generate a continuation for an already-tokenised prompt.

    Args:
        model: Object exposing ``generate``. If it also exposes a ``device``
            attribute, the inputs are moved there; otherwise ``'cuda'`` is
            used, as before.
        inputs: Mapping with ``"input_ids"`` and ``"attention_mask"`` entries
            that support ``.to(device)``.
        max_new_tokens: Forwarded to ``model.generate``.

    Returns:
        Whatever ``model.generate`` returns.

    Note:
        The end-of-sequence id is read from the module-level ``tokenizer``.
    """
    # Follow the model's own placement instead of assuming a GPU, so the
    # inputs always land on the same device as the weights.
    device = getattr(model, "device", 'cuda')
    return model.generate(
        input_ids=inputs["input_ids"].to(device),
        attention_mask=inputs["attention_mask"].to(device),
        max_new_tokens=max_new_tokens,
        # NOTE(review): early_stopping is documented for beam-based decoding;
        # kept unchanged here to preserve the original call.
        early_stopping=True,
        eos_token_id=tokenizer.eos_token_id,
    )

In [25]:
# Baseline check: tokenise a short instruction, let the base model continue it
# for up to 50 new tokens, and print the decoded text.
input_prompt = tokenizer(
    "I want you to act as a motivational coach. ",
    return_tensors="pt",
)
foundational_outputs_prompt = get_outputs(
    model=foundational_model,
    inputs=input_prompt,
    max_new_tokens=50,
)

print(
    tokenizer.batch_decode(foundational_outputs_prompt, skip_special_tokens=True)
)



['I want you to act as a motivational coach. \n\n**Solution 1:**\n\nAs a motivational coach, I would begin by establishing a connection with the individual seeking guidance. I would ask open-ended questions to understand their current situation, challenges, and aspir']


In [26]:
# Load the prompt dataset and run the tokenizer over its "prompt" column in batches.
data_prompt = load_dataset(dataset_prompt)
data_prompt = data_prompt.map(
    lambda batch: tokenizer(batch["prompt"]),
    batched=True,
)

# Both subsets are carved out of the "train" split:
# rows 0-99 for training, rows 100-149 held out.
train_sample_prompt = data_prompt["train"].select(range(0, 100))
test_sample_prompt = data_prompt["train"].select(range(100, 150))

In [27]:
# Prompt-tuning configuration that is handed to PEFT.
generation_config = PromptTuningConfig(
    tokenizer_name_or_path=model_name,  # identifier of the pre-trained checkpoint
    num_virtual_tokens=NUM_VIRTUAL_TOKENS,  # how many virtual tokens are added and trained
    prompt_tuning_init=PromptTuningInit.RANDOM,  # virtual tokens start from random values
    task_type=TaskType.CAUSAL_LM,  # the model is used to generate text
)

In [28]:
# Wrap the base model with the prompt-tuning configuration.
peft_model_prompt = get_peft_model(foundational_model, generation_config)

# print_trainable_parameters() writes its own summary and returns None, so it is
# called directly; wrapping it in print() only appended a stray "None" line.
peft_model_prompt.print_trainable_parameters()

trainable params: 24,576 || all params: 3,821,104,128 || trainable%: 0.0006
None


In [29]:
from transformers import TrainingArguments
def create_training_arguments(path, learning_rate=0.003, epochs=16):
    """Build the ``TrainingArguments`` used for a training run.

    Args:
        path: Passed through as ``output_dir``.
        learning_rate: Passed through as ``learning_rate``.
        epochs: Passed through as ``num_train_epochs``.

    Returns:
        A ``TrainingArguments`` instance with per-device batch size 1 for both
        training and evaluation, 8 accumulation steps for gradients and for
        evaluation, ``fp16`` enabled, and evaluation and logging once per epoch.
    """
    settings = dict(
        output_dir=path,
        learning_rate=learning_rate,
        num_train_epochs=epochs,
        # Batch sizing and accumulation.
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        gradient_accumulation_steps=8,
        eval_accumulation_steps=8,
        fp16=True,
        # Evaluate and log once per epoch.
        eval_strategy="epoch",
        logging_strategy="epoch",
    )
    return TrainingArguments(**settings)

In [30]:
working_dir = "./"

# Each model's artefacts are kept in their own folder under working_dir.
output_directory_prompt = os.path.join(working_dir, "peft_outputs_prompt")
output_directory_sentences = os.path.join(working_dir, "peft_outputs_sentences")

# makedirs(..., exist_ok=True) creates any missing parent folders and does
# nothing when the folder is already present, so the separate exists()/mkdir()
# checks (and the gap between checking and creating) are no longer needed.
for _directory in (working_dir, output_directory_prompt, output_directory_sentences):
    os.makedirs(_directory, exist_ok=True)


In [31]:
# One TrainingArguments object per output folder; both use learning rate 0.003
# and NUM_EPOCHS epochs.
training_args_prompt = create_training_arguments(
    path=output_directory_prompt,
    learning_rate=0.003,
    epochs=NUM_EPOCHS,
)
training_args_sentences = create_training_arguments(
    path=output_directory_sentences,
    learning_rate=0.003,
    epochs=NUM_EPOCHS,
)

In [32]:
def create_trainer(model, training_args, train_dataset, eval_dataset):
    """Assemble a ``Trainer`` for the given model and datasets.

    Args:
        model: Model to train; passed to ``Trainer`` unchanged.
        training_args: Passed to ``Trainer`` as ``args``.
        train_dataset: Dataset used for training.
        eval_dataset: Dataset used for evaluation.

    Returns:
        The configured ``Trainer``.

    Note:
        The data collator is built from the module-level ``tokenizer``.
    """
    # mlm=False tells the collator not to use masked language modelling.
    collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
    return Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=collator,
    )

In [34]:
# Build the trainer for the prompt-tuned model and start training.
trainer_prompt = create_trainer(
    model=peft_model_prompt,
    training_args=training_args_prompt,
    train_dataset=train_sample_prompt,
    eval_dataset=test_sample_prompt,
)
trainer_prompt.train()

Epoch,Training Loss,Validation Loss
0,2.1614,1.898827
2,2.101,1.892539
4,2.0502,1.896296
6,2.0085,1.904047


KeyboardInterrupt: 

In [14]:
!wandb login --relogin f2ab06dd6bd8ac74fd896151578e02adbb51d623

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/admin/.netrc
