# Finetune with Humna Eval Datasets

In [1]:
from datasets import load_dataset
dataset = load_dataset("sahil2801/CodeAlpaca-20k")

In [2]:
train_sample_size = 10000
train_dataset = dataset['train'].shuffle(seed=40).select(range(train_sample_size))

In [3]:
for i in range(100):
    doc_string = train_dataset[i]["output"]
    if len(doc_string) > 500:
        print(f"Item {i+1}: Length = {len(doc_string)}")

Item 51: Length = 576
Item 72: Length = 621
Item 74: Length = 1075
Item 82: Length = 560


In [4]:
# val_sample_size = 5000
# val_dataset = dataset['validation'].shuffle(seed=40).select(range(val_sample_size))

In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm
from codebleu import calc_codebleu
import torch

model_name = "Qwen/Qwen2.5-7B-Instruct"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [5]:
print(train_dataset)
print(train_dataset[0])
#train_dataset.reset_format()

Dataset({
    features: ['output', 'instruction', 'input'],
    num_rows: 10000
})
{'output': 'my_tuple = (1, 2, 3)  # Output: (1, 2, 3)', 'instruction': 'Define a tuple in Python.', 'input': ''}


In [7]:
train_dataset.reset_format()
print(train_dataset)
print(train_dataset[0])

Dataset({
    features: ['output', 'instruction', 'input'],
    num_rows: 10000
})
{'output': 'my_tuple = (1, 2, 3)  # Output: (1, 2, 3)', 'instruction': 'Define a tuple in Python.', 'input': ''}


# LORA

In [7]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model
from datasets import load_dataset

def preprocess_function(examples):
    inputs = examples["instruction"]
    outputs = examples["output"]

    formatted_inputs = f"Generate code for the following documentation:\n{inputs}\n\n### Code:\n"
    
    # Tokenize inputs
    model_inputs = tokenizer(
        formatted_inputs,
        truncation=True,
        padding="max_length",
        max_length=512
    )
    
    # Tokenize outputs (labels)
    labels = tokenizer(
        outputs,
        truncation=True,
        padding="max_length",
        max_length=512
    )
    
    # Add labels to model inputs
    model_inputs["labels"] = labels["input_ids"]
    
    return model_inputs

train_dataset.reset_format()
tokenized_train_dataset = train_dataset.map(
    preprocess_function,
    batched=False,
    remove_columns=train_dataset.column_names,
    load_from_cache_file=False 
)
tokenized_train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# # LoRA Configuration
# lora_config = LoraConfig(
#     r=8,  # LoRA rank
#     lora_alpha=32,  # Scaling factor
#     target_modules=["self_attn.q_proj", "self_attn.k_proj", "self_attn.v_proj"],  # Correct target modules
#     lora_dropout=0.1,
#     bias="none",
#     task_type="CAUSAL_LM"
# )

# # Apply LoRA to the model
# model = get_peft_model(model, lora_config)
# model.print_trainable_parameters()  # Check trainable parameters count

# # Define training arguments
# training_args = TrainingArguments(
#     output_dir="./lora_finetuned_model",
#     per_device_train_batch_size=1,
#     gradient_accumulation_steps=4,
#     num_train_epochs=1,
#     learning_rate=2e-4,
#     fp16=True,
#     logging_dir="./logs",
#     logging_steps=10,
#     save_strategy="epoch",
#     save_total_limit=1,
#     report_to="none"
# )

# # Trainer setup
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=tokenized_train_dataset
# )

# # Fine-tune the model
# trainer.train()

# # Save the LoRA-adapted model
# trainer.save_model("./lora_finetuned_model")

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [8]:
lora_config = LoraConfig(
    r=8,  # LoRA rank
    lora_alpha=32,  # Scaling factor
    target_modules=["self_attn.q_proj", "self_attn.k_proj", "self_attn.v_proj"],  # Correct target modules
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # Check trainable parameters count

trainable params: 3,440,640 || all params: 7,619,057,152 || trainable%: 0.0452


In [9]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./lora_finetuned_model_CodeAlpaca",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=1,
    learning_rate=2e-4,
    fp16=True,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="epoch",
    save_total_limit=1,
    report_to="none"
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset
)

# Fine-tune the model
trainer.train()

# Save the LoRA-adapted model
trainer.save_model("./lora_finetuned_model_CodeAlpaca")

Step,Training Loss
10,9.9987
20,1.399
30,1.1008
40,1.0721
50,1.075
60,0.849
70,0.9344
80,0.863
90,1.1533
100,0.6398


# No LORA

In [6]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import PrefixTuningConfig, get_peft_model
from datasets import load_dataset

# Preprocessing function
def preprocess_function(examples):
    inputs = examples["instruction"]
    outputs = examples["output"]

    formatted_inputs = f"Generate code for the following documentation:\n{inputs}\n\n### Code:\n"
    
    # Tokenize inputs
    model_inputs = tokenizer(
        formatted_inputs,
        truncation=True,
        padding="max_length",
        max_length=512
    )
    
    # Tokenize outputs (labels)
    labels = tokenizer(
        outputs,
        truncation=True,
        padding="max_length",
        max_length=512
    )
    
    # Add labels to model inputs
    model_inputs["labels"] = labels["input_ids"]
    
    return model_inputs

# Tokenize and format the dataset
train_dataset.reset_format()
tokenized_train_dataset = train_dataset.map(
    preprocess_function,
    batched=False,
    remove_columns=train_dataset.column_names,
    load_from_cache_file=False 
)
tokenized_train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

prefix_config = PrefixTuningConfig(
    task_type="CAUSAL_LM",  # Type of task
    num_virtual_tokens=30,  # Number of virtual tokens to prepend
)

model = get_peft_model(model, prefix_config)
model.print_trainable_parameters()  # Check trainable parameters count

# Define training arguments
training_args = TrainingArguments(
    output_dir="./prefix_tuned_model_CodeAlpaca",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=1,
    learning_rate=5e-5,
    fp16=True,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="epoch",
    save_total_limit=1,
    report_to="none"
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset
)

# Fine-tune the model
trainer.train()

# Save the Prefix Tuned model
trainer.save_model("./prefix_tuned_model_CodeAlpaca")


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

trainable params: 860,160 || all params: 7,616,476,672 || trainable%: 0.0113


We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class (https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)


Step,Training Loss
10,9.4724
20,9.0001
30,8.8941
40,8.5462
50,8.3211
60,7.8572
70,7.7574
80,7.4919
90,7.2468
100,6.7865


# Test

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm
from codebleu import calc_codebleu
import torch
from peft import PeftModel

base_model_name = "Qwen/Qwen2.5-7B-Instruct" 
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
base_model = AutoModelForCausalLM.from_pretrained(base_model_name)

# Load the LoRA-adapted model
model = PeftModel.from_pretrained(base_model, "./lora_finetuned_model_CodeAlpaca")

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    
# Test prompt
test_prompt = "Generate code for the following documentation:\nCreate a function that multiply two numbers.\n\n### Code:\n"

# Tokenize input
inputs = tokenizer(
    test_prompt,
    return_tensors="pt",
    truncation=True,
    max_length=512
)

# Generate output
outputs = model.generate(
    inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_length=512,
    temperature=0.7,
    top_p=0.9,
)

# Decode and print
generated_code = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated Code:\n", generated_code)


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Generated Code:
 Generate code for the following documentation:
Create a function that multiply two numbers.

### Code:



In [8]:
# Test prompt
test_prompt = "Write a function to generate the nth Fibonacci number..\n"

# Tokenize input
inputs = tokenizer(
    test_prompt,
    return_tensors="pt",
    truncation=True,
    max_length=512
)

# Generate output
outputs = model.generate(
    inputs["input_ids"],
    max_length=512,
    temperature=1.0,
    top_p=0.9,
    top_k=50, 
    do_sample=True
)

# Decode and print
generated_code = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated Code:\n", generated_code)


Generated Code:
 Write a function to generate the nth Fibonacci number..
def Fibonacci


In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import torch
# Load the base model
base_model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-7B-Instruct",
    device_map="auto",
    torch_dtype=torch.float16
)
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct")
# Load LoRA weights into the base model
model = PeftModel.from_pretrained(base_model, "./lora_finetuned_model")

# Test prompt
test_prompt = "Generate code for the following documentation:\nCreate a function that multiply two numbers.\n\n### Code:\n"

# Tokenize input
inputs = tokenizer(
    test_prompt,
    return_tensors="pt",
    truncation=True,
    max_length=256
).to('cuda')  # Ensure inputs are on the same device as the model

outputs = model.generate(
    inputs["input_ids"],
    max_length=256,
    do_sample=True,
    top_p=0.9
)

# Decode and print
generated_code = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated Code:\n", generated_code)


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Generated Code:
 Generate code for the following documentation:
Create a function that multiply two numbers.

### Code:
 def two two multiply two multiply two multiply two multiply multiply multiply two multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multi

In [None]:
# Test prompt
test_prompt = "Generate code for the following documentation:\nCreate a function that download video from website.\n\n### Code:\n"

# Tokenize input
inputs = tokenizer(
    test_prompt,
    return_tensors="pt",
    truncation=True,
    max_length=256
).to('cuda')  # Ensure inputs are on the same device as the model

outputs = model.generate(
    inputs["input_ids"],
    max_length=256,
    do_sample=True,
    top_p=0.9,
    repetition_penalty=1.5  # Penalize token repetition
)

# Decode and print
generated_code = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated Code:\n", generated_code)
