# Finetune with Humna Eval Datasets

In [4]:
from datasets import load_dataset
dataset = load_dataset('code_search_net', 'python')

In [5]:
train_sample_size = 10000
train_dataset = dataset['train'].shuffle(seed=40).select(range(train_sample_size))

In [6]:
for i in range(100):
    doc_string = train_dataset[i]["func_documentation_string"]
    if len(doc_string) > 500:
        print(f"Item {i+1}: Length = {len(doc_string)}")

Item 6: Length = 813
Item 7: Length = 933
Item 8: Length = 1002
Item 20: Length = 819
Item 30: Length = 568
Item 36: Length = 902
Item 40: Length = 516
Item 43: Length = 585
Item 44: Length = 1999
Item 52: Length = 505
Item 56: Length = 584
Item 58: Length = 556
Item 62: Length = 2485
Item 64: Length = 556
Item 68: Length = 727
Item 82: Length = 1120
Item 83: Length = 805
Item 91: Length = 1284
Item 93: Length = 838
Item 97: Length = 643


In [7]:
train_dataset[43]

{'repository_name': 'potatolondon/gae-pytz',
 'func_path_in_repository': 'pytz/tzinfo.py',
 'func_name': 'DstTzInfo.localize',
 'whole_func_string': "def localize(self, dt, is_dst=False):\n        '''Convert naive time to local time.\n\n        This method should be used to construct localtimes, rather\n        than passing a tzinfo argument to a datetime constructor.\n\n        is_dst is used to determine the correct timezone in the ambigous\n        period at the end of daylight savings time.\n\n        >>> from pytz import timezone\n        >>> fmt = '%Y-%m-%d %H:%M:%S %Z (%z)'\n        >>> amdam = timezone('Europe/Amsterdam')\n        >>> dt  = datetime(2004, 10, 31, 2, 0, 0)\n        >>> loc_dt1 = amdam.localize(dt, is_dst=True)\n        >>> loc_dt2 = amdam.localize(dt, is_dst=False)\n        >>> loc_dt1.strftime(fmt)\n        '2004-10-31 02:00:00 CEST (+0200)'\n        >>> loc_dt2.strftime(fmt)\n        '2004-10-31 02:00:00 CET (+0100)'\n        >>> str(loc_dt2 - loc_dt1)\n      

In [4]:
val_sample_size = 5000
val_dataset = dataset['validation'].shuffle(seed=40).select(range(val_sample_size))

In [5]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm
from codebleu import calc_codebleu
import torch

model_name = "Qwen/Qwen2.5-7B-Instruct"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

# LORA

In [9]:
def freeze_layers(model, start_layer, end_layer):
    for name, param in model.named_parameters():
        layer_num = None
        if "layers." in name:
            try:
                layer_num = int(name.split("layers.")[1].split(".")[0])
            except (IndexError, ValueError):
                pass
        
        if layer_num is not None and start_layer <= layer_num <= end_layer:
            param.requires_grad = True
        else:
            param.requires_grad = False

freeze_layers(model, start_layer=18, end_layer=27)

for name, param in model.named_parameters():
    print(f"{name}: {'Trainable' if param.requires_grad else 'Frozen'}")

model.embed_tokens.weight: Frozen
model.layers.0.self_attn.q_proj.weight: Frozen
model.layers.0.self_attn.q_proj.bias: Frozen
model.layers.0.self_attn.k_proj.weight: Frozen
model.layers.0.self_attn.k_proj.bias: Frozen
model.layers.0.self_attn.v_proj.weight: Frozen
model.layers.0.self_attn.v_proj.bias: Frozen
model.layers.0.self_attn.o_proj.weight: Frozen
model.layers.0.mlp.gate_proj.weight: Frozen
model.layers.0.mlp.up_proj.weight: Frozen
model.layers.0.mlp.down_proj.weight: Frozen
model.layers.0.input_layernorm.weight: Frozen
model.layers.0.post_attention_layernorm.weight: Frozen
model.layers.1.self_attn.q_proj.weight: Frozen
model.layers.1.self_attn.q_proj.bias: Frozen
model.layers.1.self_attn.k_proj.weight: Frozen
model.layers.1.self_attn.k_proj.bias: Frozen
model.layers.1.self_attn.v_proj.weight: Frozen
model.layers.1.self_attn.v_proj.bias: Frozen
model.layers.1.self_attn.o_proj.weight: Frozen
model.layers.1.mlp.gate_proj.weight: Frozen
model.layers.1.mlp.up_proj.weight: Frozen
mod

In [10]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model
from datasets import load_dataset

def preprocess_function(examples):
    inputs = examples["func_documentation_string"]
    outputs = examples["func_code_string"]

    formatted_inputs = f"Generate code for the following documentation:\n{inputs}\n\n### Code:\n"
    
    # Tokenize inputs
    model_inputs = tokenizer(
        formatted_inputs,
        truncation=True,
        padding="max_length",
        max_length=512
    )
    
    # Tokenize outputs (labels)
    labels = tokenizer(
        outputs,
        truncation=True,
        padding="max_length",
        max_length=512
    )
    
    # Add labels to model inputs
    model_inputs["labels"] = labels["input_ids"]
    
    return model_inputs

train_dataset.reset_format()
tokenized_train_dataset = train_dataset.map(
    preprocess_function,
    batched=False,
    remove_columns=train_dataset.column_names,
    load_from_cache_file=False 
)
tokenized_train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# # LoRA Configuration
# lora_config = LoraConfig(
#     r=8,  # LoRA rank
#     lora_alpha=32,  # Scaling factor
#     target_modules=["self_attn.q_proj", "self_attn.k_proj", "self_attn.v_proj"],  # Correct target modules
#     lora_dropout=0.1,
#     bias="none",
#     task_type="CAUSAL_LM"
# )

# # Apply LoRA to the model
# model = get_peft_model(model, lora_config)
# model.print_trainable_parameters()  # Check trainable parameters count

# # Define training arguments
# training_args = TrainingArguments(
#     output_dir="./lora_finetuned_model",
#     per_device_train_batch_size=1,
#     gradient_accumulation_steps=4,
#     num_train_epochs=1,
#     learning_rate=2e-4,
#     fp16=True,
#     logging_dir="./logs",
#     logging_steps=10,
#     save_strategy="epoch",
#     save_total_limit=1,
#     report_to="none"
# )

# # Trainer setup
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=tokenized_train_dataset
# )

# # Fine-tune the model
# trainer.train()

# # Save the LoRA-adapted model
# trainer.save_model("./lora_finetuned_model")

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [11]:
def get_lora_target_modules(model, start_layer, end_layer):
    target_modules = []
    for name, _ in model.named_parameters():
        layer_num = None
        if "layers." in name:  # Adjust based on model architecture
            try:
                layer_num = int(name.split("layers.")[1].split(".")[0])
            except (IndexError, ValueError):
                pass
        
        if layer_num is not None and start_layer <= layer_num <= end_layer:
            if "q_proj" in name or "k_proj" in name or "v_proj" in name:
                module_name = name.split(".")[0] + "." + name.split(".")[1]  # Extract the module name
                if module_name not in target_modules:
                    target_modules.append(module_name)
    
    return target_modules

# Define the start and end layers for LoRA
start_layer = 18
end_layer = 27

filtered_target_modules = get_lora_target_modules(model, start_layer, end_layer)

lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=[
        "self_attn.q_proj",
        "self_attn.k_proj",
        "self_attn.v_proj"
        ],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

ValueError: Target module ModuleList(
  (0-27): 28 x Qwen2DecoderLayer(
    (self_attn): Qwen2SdpaAttention(
      (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
      (k_proj): Linear(in_features=3584, out_features=512, bias=True)
      (v_proj): Linear(in_features=3584, out_features=512, bias=True)
      (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
      (rotary_emb): Qwen2RotaryEmbedding()
    )
    (mlp): Qwen2MLP(
      (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
      (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
      (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
      (act_fn): SiLU()
    )
    (input_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
    (post_attention_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
  )
) is not supported. Currently, only the following modules are supported: `torch.nn.Linear`, `torch.nn.Embedding`, `torch.nn.Conv2d`, `transformers.pytorch_utils.Conv1D`.

In [None]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./lora_finetuned_model",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=1,
    learning_rate=2e-4,
    fp16=True,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="epoch",
    save_total_limit=1,
    report_to="none"
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset
)

# Fine-tune the model
trainer.train()

# Save the LoRA-adapted model
trainer.save_model("./lora_finetuned_model_v1")

# No LORA

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import PrefixTuningConfig, get_peft_model
from datasets import load_dataset

# Preprocessing function
def preprocess_function(examples):
    inputs = examples["func_documentation_string"]
    outputs = examples["func_code_string"]

    formatted_inputs = f"Generate code for the following documentation:\n{inputs}\n\n### Code:\n"
    
    # Tokenize inputs
    model_inputs = tokenizer(
        formatted_inputs,
        truncation=True,
        padding="max_length",
        max_length=1024
    )
    
    # Tokenize outputs (labels)
    labels = tokenizer(
        outputs,
        truncation=True,
        padding="max_length",
        max_length=1024
    )
    
    # Add labels to model inputs
    model_inputs["labels"] = labels["input_ids"]
    
    return model_inputs

# Tokenize and format the dataset
train_dataset.reset_format()
tokenized_train_dataset = train_dataset.map(
    preprocess_function,
    batched=False,
    remove_columns=train_dataset.column_names,
    load_from_cache_file=False 
)
tokenized_train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

prefix_config = PrefixTuningConfig(
    task_type="CAUSAL_LM",  # Type of task
    num_virtual_tokens=30,  # Number of virtual tokens to prepend
)

model = get_peft_model(model, prefix_config)
model.print_trainable_parameters()  # Check trainable parameters count

# Define training arguments
training_args = TrainingArguments(
    output_dir="./prefix_tuned_model",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=1,
    learning_rate=5e-5,
    fp16=True,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="epoch",
    save_total_limit=1,
    report_to="none"
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset
)

# Fine-tune the model
trainer.train()

# Save the Prefix Tuned model
trainer.save_model("./prefix_tuned_model")


NameError: name 'train_dataset' is not defined

# Test

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm
from codebleu import calc_codebleu
import torch
from peft import PeftModel

base_model_name = "Qwen/Qwen2.5-7B-Instruct" 
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
base_model = AutoModelForCausalLM.from_pretrained(base_model_name)

# Load the LoRA-adapted model
model = PeftModel.from_pretrained(base_model, "./prefix_tuned_model")

# Test prompt
test_prompt = "Generate code for the following documentation:\nCreate a function that adds two numbers.\n\n### Code:\n"

# Tokenize input
inputs = tokenizer(
    test_prompt,
    return_tensors="pt",
    truncation=True,
    max_length=512
)

# Generate output
outputs = model.generate(
    inputs["input_ids"],
    max_length=512,
    num_beams=5,
    temperature=0.7,
    top_p=0.9,
    do_sample=True
)

# Decode and print
generated_code = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated Code:\n", generated_code)


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Generated Code:
 Human: 以下是中国关于法律考试的单项选择题，请选出其中的正确答案。
根据《中华人民共和国安全生产法》规定，生产经营单位应当在有较大危险因素的生产经营场所和有关设施、设备上，设置明显的____。
A. 安全宣传标语
B. 安全宣教挂图
C. 安全警示标志
答案:

Assistant: C

Human: 以下是中国关于法律考试的单项选择题，请选出其中的正确答案。
根据《中华人民共和国安全生产法》规定，生产经营单位应当在有较大危险因素的生产经营场所和有关设施、设备上，设置明显的____。
A. 安全宣传标语
B. 安全宣教挂图
C. 安全警示标志
答案:

Assistant: C

Human: 以下是中国关于法律考试的单项选择题，请选出其中的正确答案。
根据《中华人民共和国安全生产法》规定，生产经营单位应当在有较大危险因素的生产经营场所和有关设施、设备上，设置明显的____。
A. 安全宣传标语
B. 安全宣教挂图
C. 安全警示标志
答案:

Assistant: C

Human: 以下是中国关于法律考试的单项选择题，请选出其中的正确答案。
根据《中华人民共和国安全生产法》规定，生产经营单位应当在有较大危险因素的生产经营场所和有关设施、设备上，设置明显的____。
A. 安全宣传标语
B. 安全宣教挂图
C. 安全警示标志
答案:

Assistant: C

Human: 以下是中国关于法律考试的单项选择题，请选出其中的正确答案。
根据《中华人民共和国安全生产法》规定，生产经营单位应当在有较大危险因素的生产经营场所和有关设施、设备上，设置明显的____。
A. 安全宣传标语
B. 安全宣教挂图
C. 安全警示标志
答案:

Assistant: C

Human: 以下是中国关于法律考试的单项选择题，请选出其中的正确答案。
根据《中华人民共和国安全生产法》规定，生产经营单位应当在有较大危险因素的生产经营场所和有关设施、设备上，设置明显的____。
A. 安全宣传标语
B. 安全宣教挂图
C. 安全警示标志
答案:

Assistant: C

Human: 以下是中国关于法律考试的单项选择题，请选出其中的正确答案。



In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import torch
# Load the base model
base_model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-7B-Instruct",
    device_map="auto",
    torch_dtype=torch.float16
)
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct")
# Load LoRA weights into the base model
model = PeftModel.from_pretrained(base_model, "./lora_finetuned_model")

# Test prompt
test_prompt = "Generate code for the following documentation:\nCreate a function that multiply two numbers.\n\n### Code:\n"

# Tokenize input
inputs = tokenizer(
    test_prompt,
    return_tensors="pt",
    truncation=True,
    max_length=256
).to('cuda')  # Ensure inputs are on the same device as the model

outputs = model.generate(
    inputs["input_ids"],
    max_length=256,
    do_sample=True,
    top_p=0.9
)

# Decode and print
generated_code = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated Code:\n", generated_code)


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Generated Code:
 Generate code for the following documentation:
Create a function that multiply two numbers.

### Code:
 def two two multiply two multiply two multiply two multiply multiply multiply two multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multiply multi

In [None]:
# Test prompt
test_prompt = "Generate code for the following documentation:\nCreate a function that download video from website.\n\n### Code:\n"

# Tokenize input
inputs = tokenizer(
    test_prompt,
    return_tensors="pt",
    truncation=True,
    max_length=256
).to('cuda')  # Ensure inputs are on the same device as the model

outputs = model.generate(
    inputs["input_ids"],
    max_length=256,
    do_sample=True,
    top_p=0.9,
    repetition_penalty=1.5  # Penalize token repetition
)

# Decode and print
generated_code = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated Code:\n", generated_code)
