<a href="https://www.kaggle.com/code/ashishraics/finetune-using-peft?scriptVersionId=186392857" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git

In [26]:
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer,DataCollatorForLanguageModeling,BitsAndBytesConfig
from transformers import TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model
import os
from torch.utils.data import DataLoader
import torch

In [27]:
model_name = "deepseek-ai/deepseek-coder-1.3b-instruct"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
foundation_model = AutoModelForCausalLM.from_pretrained(model_name,quantization_config=bnb_config, device_map={"":0})
data = load_dataset("Nan-Do/instructional_code-search-net-python", cache_dir="../working/cache"+"/datasets")
base_model = AutoModelForCausalLM.from_pretrained(model_name)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
next(foundation_model.parameters()).dtype

torch.float16

In [10]:
from peft import prepare_model_for_kbit_training

foundation_model.gradient_checkpointing_enable()
foundation_model = prepare_model_for_kbit_training(foundation_model)

In [11]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=8, 
    lora_alpha=32, 
    target_modules=["q_proj","k_proj"], 
    lora_dropout=0.05, 
    bias="none", 
    task_type="CAUSAL_LM"
)

peft_model = get_peft_model(foundation_model, config)

In [12]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )
    
print_trainable_parameters(peft_model)

trainable params: 1572864 || all params: 740919296 || trainable%: 0.21228546867269063


In [14]:
shuffled_data = data["train"].shuffle(seed=42)

train_data_sample = shuffled_data.select(range(2000))
val_data_sample = shuffled_data.select(range(2000, 2030))

In [15]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

In [16]:
#peft_model = torch.nn.DataParallel(peft_model, device_ids = [0,1]).to(device)
#peft_model.to(device)

In [17]:
def tokenize_function(examples):
    return tokenizer(examples["INSTRUCTION"], examples["RESPONSE"], truncation=True)

train_tokenized_data = train_data_sample.map(tokenize_function, batched=True, remove_columns=["INSTRUCTION", "RESPONSE", "SOURCE"])
val_tokenized_data = val_data_sample.map(tokenize_function, batched=True, remove_columns=["INSTRUCTION", "RESPONSE", "SOURCE"])


data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

train_dataloader = DataLoader(
    train_tokenized_data,
    shuffle=True,
    batch_size=1,  
    collate_fn=data_collator,
)

val_dataloader = DataLoader(
    val_tokenized_data,
    batch_size=1,
    collate_fn=data_collator,
)

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

In [18]:
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=2, 
    per_device_train_batch_size=1, 
    logging_steps=10, 
    learning_rate=5e-5,
    weight_decay=0.01,
    report_to=[],  # disable reporting to external services like wandb
    disable_tqdm=False,
    evaluation_strategy="steps", 
    eval_steps=10,  
    gradient_accumulation_steps=4, 
    lr_scheduler_type='linear',  
    warmup_steps=100,  
)

trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_tokenized_data, 
    eval_dataset=val_tokenized_data,    
    data_collator=data_collator,
)

peft_model.config.use_cache = False 
trainer.train()



Step,Training Loss,Validation Loss
10,1.8601,1.761057
20,1.947,1.760111
30,1.851,1.758253
40,2.0205,1.754841
50,2.1769,1.74851
60,1.9316,1.737558
70,1.8744,1.721354
80,1.7945,1.698768
90,1.8472,1.670522
100,1.683,1.643397




TrainOutput(global_step=1000, training_loss=1.4698223133087158, metrics={'train_runtime': 3352.3336, 'train_samples_per_second': 1.193, 'train_steps_per_second': 0.298, 'total_flos': 9098449199185920.0, 'train_loss': 1.4698223133087158, 'epoch': 2.0})

In [20]:
messages=[
    { 'role': 'user', 'content': "How do I change my pandas dataframe column name"}
]
inputs = tokenizer.apply_chat_template(messages, 
                                       add_generation_prompt=True, 
                                       return_tensors="pt").to(foundation_model.device)



In [31]:
%%time
base_model.to(device)
outputs = base_model.generate(inputs, max_new_tokens=200, do_sample=False,
                         top_k=5, top_p=0.9, num_return_sequences=1, 
                         eos_token_id=tokenizer.eos_token_id,
                        early_stopping=True)

print(tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.


You can change the name of a column in a pandas DataFrame using the `rename` function. Here's an example:

```python
import pandas as pd

# Create a DataFrame
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})

# Rename a column
df = df.rename(columns={'A': 'New_A'})

print(df)
```

In this example, the column 'A' in the DataFrame will be renamed to 'New_A'.

If you want to rename multiple columns, you can pass a dictionary where the keys are the old names and the values are the new names:

```python
df = df.rename(columns={'A': 'New_A', 'B': 'New_B'
CPU times: user 6.52 s, sys: 160 ms, total: 6.68 s
Wall time: 6.67 s


In [29]:
peft_model.to(device)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32256, 2048)
        (layers): ModuleList(
          (0-23): 24 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lo

In [30]:
%%time
outputs = peft_model.generate(inputs, max_new_tokens=200, do_sample=False,
                         top_k=5, top_p=0.9, num_return_sequences=1, 
                         eos_token_id=tokenizer.eos_token_id,
                        early_stopping=True)

print(tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.


You can change the name of a column in a pandas DataFrame using the `rename()` function. Here's an example:

```python
import pandas as pd

# Create a DataFrame
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})

# Rename a column
df = df.rename(columns={'A': 'New_A', 'B': 'New_B'})
```

In this example, the column 'A' in the DataFrame is renamed to 'New_A', and the column 'B' is renamed to 'New_B'.

You can also rename multiple columns at once:

```python
df = df.rename(columns={'A': 'New_A', 'B': 'New_B', 'C
CPU times: user 13.3 s, sys: 7.79 ms, total: 13.3 s
Wall time: 13.3 s
