In [1]:
from datasets import load_dataset
from transformers import AutoModelForCausalLM,AutoTokenizer
import transformers
from transformers import TrainingArguments, Trainer
import os
import torch
from peft import LoraConfig, get_peft_model, PeftModel,PeftConfig



In [2]:
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(model_name)
foundation_model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

def get_outputs(model, inputs, max_new_tokens=100):
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=max_new_tokens,
        repetition_penalty=1.5, #Avoid repetition.
        early_stopping=True, #The model can stop before reach the max_length
        eos_token_id=tokenizer.eos_token_id
    )
    return outputs

In [3]:
dataset = "fka/awesome-chatgpt-prompts"
#Create the Dataset to create prompts.
data = load_dataset(dataset)
data = data.map(lambda samples: tokenizer(samples["prompt"]), batched=True)
train_sample = data["train"].select(range(50))
train_sample = train_sample.remove_columns('act')
display(train_sample)

Dataset({
    features: ['prompt', 'input_ids', 'attention_mask'],
    num_rows: 50
})

In [4]:
print(train_sample[0]['prompt'])

Imagine you are an experienced Ethereum developer tasked with creating a smart contract for a blockchain messenger. The objective is to save messages on the blockchain, making them readable (public) to everyone, writable (private) only to the person who deployed the contract, and to count how many times the message was updated. Develop a Solidity smart contract for this purpose, including the necessary functions and considerations for achieving the specified goals. Please provide the code and any relevant explanations to ensure a clear understanding of the implementation.


In [5]:
major, _ = torch.cuda.get_device_capability()
major

8

In [6]:
for name,_ in foundation_model.named_parameters():
    print(name)

model.embed_tokens.weight
model.layers.0.self_attn.q_proj.weight
model.layers.0.self_attn.q_proj.bias
model.layers.0.self_attn.k_proj.weight
model.layers.0.self_attn.k_proj.bias
model.layers.0.self_attn.v_proj.weight
model.layers.0.self_attn.v_proj.bias
model.layers.0.self_attn.o_proj.weight
model.layers.0.mlp.gate_proj.weight
model.layers.0.mlp.up_proj.weight
model.layers.0.mlp.down_proj.weight
model.layers.0.input_layernorm.weight
model.layers.0.post_attention_layernorm.weight
model.layers.1.self_attn.q_proj.weight
model.layers.1.self_attn.q_proj.bias
model.layers.1.self_attn.k_proj.weight
model.layers.1.self_attn.k_proj.bias
model.layers.1.self_attn.v_proj.weight
model.layers.1.self_attn.v_proj.bias
model.layers.1.self_attn.o_proj.weight
model.layers.1.mlp.gate_proj.weight
model.layers.1.mlp.up_proj.weight
model.layers.1.mlp.down_proj.weight
model.layers.1.input_layernorm.weight
model.layers.1.post_attention_layernorm.weight
model.layers.2.self_attn.q_proj.weight
model.layers.2.self

In [7]:
lora_config = LoraConfig(
    r=4, #As bigger the R bigger the parameters to train.
    lora_alpha=1, # a scaling factor that adjusts the magnitude of the weight matrix. Usually set to 1
    target_modules=["q_proj", "v_proj",'k_proj'],
    # modules_to_save=['23.self_attn.k_proj.weight'],
    lora_dropout=0.05, #Helps to avoid Overfitting.
    bias="none", # this specifies if the bias parameter should be trained.
    task_type="CAUSAL_LM"
)

In [8]:
peft_model = get_peft_model(foundation_model, lora_config)
print(peft_model.print_trainable_parameters())

  @custom_fwd
  @custom_bwd
  @custom_fwd(cast_inputs=torch.float16)


trainable params: 368,640 || all params: 494,401,408 || trainable%: 0.07456289444871483
None


In [9]:
working_dir = './'
output_directory = os.path.join(working_dir, "peft_lab_outputs")
training_args = TrainingArguments(
    output_dir=output_directory,
    auto_find_batch_size=True, # Find a correct bvatch size that fits the size of Data.
    learning_rate= 3e-2, # Higher learning rate than full fine-tuning.
    num_train_epochs=2
)

In [10]:
trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_sample,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)
trainer.train()

wandb: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
wandb: Currently logged in as: long8244557. Use `wandb login --relogin` to force relogin


  0%|          | 0/14 [00:00<?, ?it/s]

{'train_runtime': 14.1083, 'train_samples_per_second': 7.088, 'train_steps_per_second': 0.992, 'train_loss': 2.348611831665039, 'epoch': 2.0}


TrainOutput(global_step=14, training_loss=2.348611831665039, metrics={'train_runtime': 14.1083, 'train_samples_per_second': 7.088, 'train_steps_per_second': 0.992, 'total_flos': 25902686169600.0, 'train_loss': 2.348611831665039, 'epoch': 2.0})

In [11]:
#Save the model.
peft_model_path = os.path.join(output_directory, "lora_model")
trainer.model.save_pretrained(peft_model_path)
#Load the Model.
peft_model_1 = PeftModel.from_pretrained(foundation_model,
                                        peft_model_path,
                                        is_trainable=False)

  adapters_weights = torch.load(filename, map_location=torch.device(device))


In [12]:
input_sentences = tokenizer("I want you to act as a motivational coach. ", return_tensors="pt")
input_sentences = {key: value.to(device) for key, value in input_sentences.items()}
foundational_outputs_sentence = get_outputs(peft_model_1, input_sentences, max_new_tokens=50)

print(tokenizer.batch_decode(foundational_outputs_sentence, skip_special_tokens=True))



['I want you to act as a motivational coach.  I will provide some information about what it takes for someone with different personality types, goals and values in order achieve success while working towards achieving their desired outcomes through the use of various methods such as coaching techniques or other forms of therapy that can help them overcome']


In [13]:
peft_model_1

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Qwen2ForCausalLM(
      (model): Qwen2Model(
        (embed_tokens): Embedding(151936, 896)
        (layers): ModuleList(
          (0-23): 24 x Qwen2DecoderLayer(
            (self_attn): Qwen2SdpaAttention(
              (q_proj): Linear(
                in_features=896, out_features=896, bias=True
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=896, out_features=4, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=4, out_features=896, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): Linear(
                in_features=896, out_features=128, bias=True
                (lora_dropo

In [14]:
foundation_model

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): Linear(
            in_features=896, out_features=896, bias=True
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.05, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=896, out_features=4, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=4, out_features=896, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
          )
          (k_proj): Linear(
            in_features=896, out_features=128, bias=True
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.05, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Line

In [15]:
# merge weight
merged_model = peft_model_1.merge_and_unload()
merged_model

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((

In [16]:
merged_model.save_pretrained('output_pretrained/')

In [17]:
# set/switch adapter
data = load_dataset(dataset)
data = data.map(lambda samples: tokenizer(samples["prompt"]), batched=True)
train_sample_1 = data["train"].select(range(150))
display(train_sample_1)

lora_config = LoraConfig(
    r=4,
    lora_alpha=4,
    target_modules=['k_proj'],
    modules_to_save=['23.self_attn.k_proj.weight'],
    lora_dropout=0.001,
    bias="none",
    task_type="CAUSAL_LM"
)

peft_model = get_peft_model(foundation_model, lora_config)
print(peft_model.print_trainable_parameters())

working_dir = './'
output_directory = os.path.join(working_dir, "peft_lab_outputs")
training_args = TrainingArguments(
    output_dir=output_directory,
    auto_find_batch_size=True, # Find a correct bvatch size that fits the size of Data.
    learning_rate= 3e-2, # Higher learning rate than full fine-tuning.
    num_train_epochs=2
)

trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_sample,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)
trainer.train()

#Save the model.
peft_model_path = os.path.join(output_directory, "lora_model_2")
trainer.model.save_pretrained(peft_model_path)
#Load the Model.
peft_model_2 = PeftModel.from_pretrained(foundation_model,
                                        peft_model_path,
                                        is_trainable=False)

Dataset({
    features: ['act', 'prompt', 'input_ids', 'attention_mask'],
    num_rows: 150
})

trainable params: 98,304 || all params: 494,131,072 || trainable%: 0.019894316623747958
None


  0%|          | 0/14 [00:00<?, ?it/s]

{'train_runtime': 11.7908, 'train_samples_per_second': 8.481, 'train_steps_per_second': 1.187, 'train_loss': 2.1818479810442244, 'epoch': 2.0}


  adapters_weights = torch.load(filename, map_location=torch.device(device))


In [18]:
# add adapter
peft_model_1 = PeftConfig.from_pretrained('peft_lab_outputs\lora_model')
peft_model_2 = PeftConfig.from_pretrained('peft_lab_outputs\lora_model_2')
foundation_model.add_adapter(peft_model_1,adapter_name='adapter_1')
foundation_model.add_adapter(peft_model_2,adapter_name='adapter_2')

  peft_model_1 = PeftConfig.from_pretrained('peft_lab_outputs\lora_model')
  peft_model_2 = PeftConfig.from_pretrained('peft_lab_outputs\lora_model_2')


In [21]:
# switch adapter
foundation_model.set_adapter('adapter_1')
input_sentences = tokenizer("I want you to act as a motivational coach. ", return_tensors="pt")
input_sentences = {key: value.to(device) for key, value in input_sentences.items()}
foundational_outputs_sentence = get_outputs(foundation_model, input_sentences, max_new_tokens=50)

print(tokenizer.batch_decode(foundational_outputs_sentence, skip_special_tokens=True))



['I want you to act as a motivational coach.  I will provide some ideas for improving your personal or professional life goals and it is up to me what methods are most effective in achieving them, such techniques that can help people overcome obstacles they face during the process of setting new ones along with creating strategies tailored']


In [None]:
foundation_model.set_adapter('adapter_2')
input_sentences = tokenizer("I want you to act as a motivational coach. ", return_tensors="pt")
input_sentences = {key: value.to(device) for key, value in input_sentences.items()}
foundational_outputs_sentence = get_outputs(foundation_model, input_sentences, max_new_tokens=50)

print(tokenizer.batch_decode(foundational_outputs_sentence, skip_special_tokens=True))

['I want you to act as a motivational coach.  I will provide some ideas for improving self-confidence and your role is not just giving advice but also helping the person come up with strategies that can be successful in achieving their goals based on different factors like learning new skills, overcoming fears etc., while at same']


In [26]:
foundation_model

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): Linear(
            in_features=896, out_features=896, bias=True
            (lora_dropout): ModuleDict(
              (adapter_1): Dropout(p=0.05, inplace=False)
            )
            (lora_A): ModuleDict(
              (adapter_1): Linear(in_features=896, out_features=4, bias=False)
            )
            (lora_B): ModuleDict(
              (adapter_1): Linear(in_features=4, out_features=896, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
          )
          (k_proj): Linear(
            in_features=896, out_features=128, bias=True
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.001, inplace=False)
              (adapter_1): Dropout(p=0.05, inplace=False)
            

In [29]:
with peft_model.disable_adapter():
    input_sentences = tokenizer("I want you to act as a motivational coach. ", return_tensors="pt")
    input_sentences = {key: value.to(device) for key, value in input_sentences.items()}
    foundational_outputs_sentence = get_outputs(peft_model, input_sentences, max_new_tokens=50)

    print(tokenizer.batch_decode(foundational_outputs_sentence, skip_special_tokens=True))



['I want you to act as a motivational coach.  I will provide some information about what motivates people and your role is not only helping them understand the reasons behind that motivation but also providing strategies on how they can achieve it in order for their success stories, while engaging others along with these efforts through group']
