<a href="https://colab.research.google.com/github/alee-kolachi/CodeGen-LoRA-Fine-Tuning/blob/main/Untitled15.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
!pip install -q transformers accelerate datasets bitsandbytes peft

In [21]:
import os
os.makedirs('data', exist_ok=True)

In [22]:
from datasets import load_dataset

ds = load_dataset("flytech/python-codes-25k")

In [23]:
ds

DatasetDict({
    train: Dataset({
        features: ['output', 'instruction', 'input', 'text'],
        num_rows: 49626
    })
})

In [24]:
ds['train'][0]

{'output': "```python\ntasks = []\nwhile True:\n    task = input('Enter a task or type 'done' to finish: ')\n    if task == 'done': break\n    tasks.append(task)\nprint(f'Your to-do list for today: {tasks}')\n```",
 'instruction': 'Help me set up my daily to-do list!',
 'input': 'Setting up your daily to-do list...',
 'text': "Help me set up my daily to-do list! Setting up your daily to-do list... ```python\ntasks = []\nwhile True:\n    task = input('Enter a task or type 'done' to finish: ')\n    if task == 'done': break\n    tasks.append(task)\nprint(f'Your to-do list for today: {tasks}')\n```"}

In [25]:
from transformers import AutoTokenizer

MODEL_NAME = "Salesforce/codegen-350M-mono"

tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    use_fast=True
)

if tokenizer.pad_token_id is None:
  tokenizer.add_special_tokens({"pad_token": "[PAD]"})

In [26]:
ds['train'][0]

{'output': "```python\ntasks = []\nwhile True:\n    task = input('Enter a task or type 'done' to finish: ')\n    if task == 'done': break\n    tasks.append(task)\nprint(f'Your to-do list for today: {tasks}')\n```",
 'instruction': 'Help me set up my daily to-do list!',
 'input': 'Setting up your daily to-do list...',
 'text': "Help me set up my daily to-do list! Setting up your daily to-do list... ```python\ntasks = []\nwhile True:\n    task = input('Enter a task or type 'done' to finish: ')\n    if task == 'done': break\n    tasks.append(task)\nprint(f'Your to-do list for today: {tasks}')\n```"}

In [27]:
def preprocess_mask_prompt(example, tokenizer, MAX_LENGTH=512):
    # Build prompt from dataset fields
    human = f"###Human: {example['instruction']}\n{example['input']}\n\n"
    assistant = f"###Assistant:\n{example['output']}"

    # Tokenize
    prompt_ids = tokenizer(human, add_special_tokens=False)['input_ids']
    response_ids = tokenizer(assistant, add_special_tokens=False)['input_ids']

    # Concatenate and truncate
    input_ids = (prompt_ids + response_ids)[:MAX_LENGTH]
    attention_mask = [1] * len(input_ids)

    # Padding
    pad_length = MAX_LENGTH - len(input_ids)
    if pad_length > 0:
        input_ids += [tokenizer.pad_token_id] * pad_length
        attention_mask += [0] * pad_length

    # Labels: mask prompt, keep only response
    labels = [-100] * len(prompt_ids) + response_ids
    labels = labels[:MAX_LENGTH] + [-100] * max(0, MAX_LENGTH - len(labels))

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

In [28]:
tokenized = ds.map(
    lambda x: preprocess_mask_prompt(x, tokenizer=tokenizer),
    remove_columns=['instruction', 'input', 'output', 'text']
)


In [29]:
print(tokenized)

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 49626
    })
})


In [30]:
pip install -U bitsandbytes



In [31]:
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training

bnb_config = BitsAndBytesConfig(
  load_in_4bit=True,
  bnb_4bit_quant_type="nf4",
  bnb_4bit_compute_dtype=torch.float16
)

model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, quantization_config=bnb_config, device_map="auto")
model.resize_token_embeddings(len(tokenizer))
model=prepare_model_for_kbit_training(model)
print("model loaded and prepared")

Some weights of the model checkpoint at Salesforce/codegen-350M-mono were not used when initializing CodeGenForCausalLM: ['transformer.h.0.attn.causal_mask', 'transformer.h.1.attn.causal_mask', 'transformer.h.10.attn.causal_mask', 'transformer.h.11.attn.causal_mask', 'transformer.h.12.attn.causal_mask', 'transformer.h.13.attn.causal_mask', 'transformer.h.14.attn.causal_mask', 'transformer.h.15.attn.causal_mask', 'transformer.h.16.attn.causal_mask', 'transformer.h.17.attn.causal_mask', 'transformer.h.18.attn.causal_mask', 'transformer.h.19.attn.causal_mask', 'transformer.h.2.attn.causal_mask', 'transformer.h.3.attn.causal_mask', 'transformer.h.4.attn.causal_mask', 'transformer.h.5.attn.causal_mask', 'transformer.h.6.attn.causal_mask', 'transformer.h.7.attn.causal_mask', 'transformer.h.8.attn.causal_mask', 'transformer.h.9.attn.causal_mask']
- This IS expected if you are initializing CodeGenForCausalLM from the checkpoint of a model trained on another task or with another architecture (e

model loaded and prepared


In [32]:
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["qkv_proj", "k_proj", "out_proj", "fc_in", "fc_out"], # Updated target modules
    lora_dropout=0.05,
    bias="none",
    task_type='CAUSAL_LM'
)

model = get_peft_model(model, lora_config)
print("LoRA Applied, trainable params: ", sum(p.numel() for p in model.parameters() if p.requires_grad))

LoRA Applied, trainable params:  2621440


In [33]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

training_args = TrainingArguments(
    output_dir="opt350m_lora_out",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=5,
    learning_rate=1e-5,     # conservative for fine-tuning
    weight_decay=0.01,
    warmup_steps=20,
    logging_steps=10,
    save_strategy="epoch",
    fp16=True,
    report_to="none",
    remove_unused_columns=False
)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    data_collator=data_collator
)
print("Trainer ready.")


Trainer ready.


In [None]:
trainer.train()

  return fn(*args, **kwargs)


Step,Training Loss
10,1.6894
20,1.7495
30,1.479
40,1.7136
50,1.4565
60,1.3049
70,1.6836
80,1.5118
90,1.3626
100,1.5098


In [None]:
model.save_pretrained("opt350m_lora_adapter")
tokenizer.save_pretrained("opt350m_lora_adapter")
print("Adapter + tokenizer saved.")


First, let's load the base model and the fine-tuned model (adapter).

In [None]:
MODEL_NAME = "Salesforce/codegen-350M-mono"

In [None]:
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer
from peft import PeftModel

ADAPTER_DIR = "opt350m_lora_adapter"
PROMPT = "###Human: Write a python function to reverse a string.\n\n###Assistant"
MAX_NEW_TOKENS = 200

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
if tokenizer.pad_token_id is None:
    tokenizer.add_special_tokens({"pad_token":"[PAD]"})

bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.float16)

base = AutoModelForCausalLM.from_pretrained(MODEL_NAME, quantization_config=bnb_config, device_map="auto")
base.resize_token_embeddings(len(tokenizer))

fin_base = AutoModelForCausalLM.from_pretrained(MODEL_NAME, quantization_config=bnb_config, device_map="auto")
fin_base.resize_token_embeddings(len(tokenizer))
finetuned = PeftModel.from_pretrained(fin_base, ADAPTER_DIR)

gen_args = dict(max_new_tokens=MAX_NEW_TOKENS, do_sample=True, top_p=0.9, temperature=0.75,
                repetition_penalty=1.25, no_repeat_ngram_size=3,
                eos_token_id=tokenizer.eos_token_id, pad_token_id=tokenizer.pad_token_id)

def gen_cont(model, prompt):
    inputs = tokenizer(prompt, return_tensors="pt").to(next(model.parameters()).device)
    with torch.no_grad():
        seq = model.generate(**inputs, **gen_args)
    cont = seq[0, inputs["input_ids"].shape[1]:]
    return tokenizer.decode(cont, skip_special_tokens=True).strip()

print("PROMPT:\n", PROMPT, "\n")
print("BASE CONT:", gen_cont(base, PROMPT), "\n")
print("FINETUNED CONT:", gen_cont(finetuned, PROMPT))
