In [1]:
pip install -q pandas wandb torch transformers datasets peft accelerate bitsandbytes trl tqdm evaluate bert_score rouge_score nltk sacrebleu codebleu

Note: you may need to restart the kernel to use updated packages.


In [1]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer)

MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"
# Formatting function for SFT
def formatting_prompts_func(examples):
    outputs =[]
    for prompt, completion in zip(examples['prompt'], examples['completion']):
        messages = [
            {"role": "system", "content": "You are a helpful AI assistant specialized in code review and security analysis."},
            {"role": "user", "content": prompt},
            {"role": "assistant", "content": completion}
        ]
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=False
        )
        outputs.append(text)
    return {"text":outputs}

# Load dataset
train_dataset = load_dataset('json', data_files='/kaggle/input/codereviewdataset/train_dataset.jsonl', split='train')
eval_dataset = load_dataset('json', data_files='/kaggle/input/codereviewdataset/eval_dataset.jsonl', split='train')


train_dataset = train_dataset.map(
    formatting_prompts_func, 
    batched=True, 
    remove_columns=['prompt', 'completion']
)
eval_dataset = eval_dataset.map(
    formatting_prompts_func, 
    batched=True, 
    remove_columns=['prompt', 'completion']
)

train_dataset.push_to_hub("alenphilip/Code-Review-Assistant")
eval_dataset.push_to_hub("alenphilip/Code-Review-Assistant-Eval")

In [2]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig
)
from peft import LoraConfig, prepare_model_for_kbit_training
from trl import SFTTrainer, SFTConfig
from datasets import load_dataset
import gc
import os
import wandb
import evaluate
import numpy as np
import nltk

# Set memory optimization
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Clear memory
torch.cuda.empty_cache()
gc.collect()

# Configuration
MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct"
OUTPUT_DIR = "./qwen2.5-7b-sft-qlora"
LORA_R = 32
LORA_ALPHA = 64
LORA_DROPOUT = 0.1
# TARGET_MODULES = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
TARGET_MODULES = ["q_proj", "v_proj", "down_proj", "gate_proj"]

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

# Configure 4-bit quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load model with quantization
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2"
)

# Prepare model for k-bit training
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

# Configure LoRA
peft_config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    target_modules=TARGET_MODULES,
    bias="none",
    task_type="CAUSAL_LM",
)

# Load datasets
train_dataset = load_dataset("alenphilip/Code-Review-Assistant", split="train")
eval_dataset = load_dataset("alenphilip/Code-Review-Assistant-Eval", split="train")

# Load metrics
nltk.download('punkt', quiet=True)
rouge_metric = evaluate.load("rouge")
bleu_metric = evaluate.load("sacrebleu")


# CRITICAL FIX: Convert logits to token IDs immediately to save memory
def preprocess_logits_for_metrics(logits, labels):
    """
    This function runs BEFORE accumulation, saving massive amounts of memory.
    Converts (batch, seq_len, vocab_size) → (batch, seq_len)
    Memory reduction: ~600x smaller!
    """
    if isinstance(logits, tuple):
        logits = logits[0]
    return logits.argmax(dim=-1)


def compute_metrics(eval_preds):
    """
    Now receives token IDs (not logits) thanks to preprocess_logits_for_metrics.
    Computes ROUGE-L and BLEU on assistant responses only.
    """
    try:
        predictions, labels = eval_preds
        
        # Predictions are already token IDs (not logits)
        predicted_ids = predictions
        
        # Clean labels (-100 → pad_token_id)
        predicted_ids = np.where(predicted_ids == -100, tokenizer.pad_token_id, predicted_ids)
        labels = np.where(labels == -100, tokenizer.pad_token_id, labels)
        
        # Decode
        decoded_preds = tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
        
        parsed_preds = []
        parsed_labels = []
        
        for pred, label in zip(decoded_preds, decoded_labels):
            # Extract assistant response from prediction
            if "<|im_start|>assistant" in pred:
                assistant_part = pred.split("<|im_start|>assistant")[-1]
                if "<|im_end|>" in assistant_part:
                    assistant_response = assistant_part.split("<|im_end|>")[0].strip()
                    parsed_preds.append(assistant_response)
                else:
                    parsed_preds.append(assistant_part.strip())
            else:
                parsed_preds.append(pred.strip())
            
            # Extract assistant response from label
            if "<|im_start|>assistant" in label:
                assistant_part = label.split("<|im_start|>assistant")[-1]
                if "<|im_end|>" in assistant_part:
                    assistant_response = assistant_part.split("<|im_end|>")[0].strip()
                    parsed_labels.append(assistant_response)
                else:
                    parsed_labels.append(assistant_part.strip())
            else:
                parsed_labels.append(label.strip())
        
        # Filter out empty strings
        parsed_preds = [p for p in parsed_preds if p.strip()]
        parsed_labels = [l for l in parsed_labels if l.strip()]
        
        if not parsed_preds or not parsed_labels:
            return {"rougeL": 0.0, "bleu": 0.0}

        # Calculate metrics
        rouge_results = rouge_metric.compute(predictions=parsed_preds, references=parsed_labels)
        bleu_results = bleu_metric.compute(predictions=parsed_preds, references=[[l] for l in parsed_labels])
        return {
            "rougeL": rouge_results["rougeL"],
            "bleu": bleu_results["score"],
        }
        
    except Exception as e:
        print(f"Metrics error: {e}")
        return {"rougeL": 0.0, "bleu": 0.0}


# Training arguments with evaluation enabled
training_args = SFTConfig(
    output_dir=OUTPUT_DIR,
    
    # Training settings
    do_train=True,
    do_eval=True,  # ✅ Enable evaluation during training
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,  # Smaller for evaluation (can increase if no OOM)
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    num_train_epochs=2,
    max_grad_norm=0.3,
    
    # Evaluation strategy
    eval_strategy="steps", 
    eval_steps=50,  
    eval_accumulation_steps=4,
    
    # Scheduler and optimization
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    optim="paged_adamw_8bit",
    bf16=True,
    weight_decay=0.05,
    
    # Logging and saving
    logging_steps=10,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=3,  # ✅ Keep only best 3 checkpoints to save disk space
    
    # Model selection based on metrics
    load_best_model_at_end=True,
    metric_for_best_model="rougeL",
    greater_is_better=True,
    
    # SFT specific
    dataset_text_field="text",
    remove_unused_columns=False,
    gradient_checkpointing=True,
    max_length=2048,
    dataloader_pin_memory=False,  # ✅ Save memory
    
    # Push to hub
    push_to_hub=True,
    hub_model_id="alenphilip/Code_Review_Assistant",
    hub_strategy="checkpoint",  # ✅ Push best checkpoint (not just at end)
    
    # Logging
    report_to="wandb"
)

# Clear memory before training
torch.cuda.empty_cache()
gc.collect()

# Create trainer with the memory-efficient preprocessing function
trainer = SFTTrainer(
    model=model,
    args=training_args,
    peft_config=peft_config,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,  # ✅ Now we include eval dataset
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,  # ✅ KEY FIX!
)

# Start training with evaluation
print("Starting training with evaluation...")
print(f"Training on {len(train_dataset)} examples")
print(f"Evaluating on {len(eval_dataset)} examples every {training_args.eval_steps} steps")
print(f"Best model will be selected based on {training_args.metric_for_best_model}")

trainer.train()

print("\n" + "="*70)
print("Training completed!")
print("="*70)


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151645}.


Starting training with evaluation...
Training on 13670 examples
Evaluating on 1726 examples every 50 steps
Best model will be selected based on rougeL


[34m[1mwandb[0m: Currently logged in as: [33malenphilip2071[0m ([33malenphilip2071-google[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss,Rougel,Bleu,Entropy,Num Tokens,Mean Token Accuracy
50,0.7196,0.655296,0.749021,62.702702,0.661059,1206059.0,0.81573
100,0.6431,0.623488,0.738838,60.525961,0.591626,2417754.0,0.823156
150,0.6279,0.607305,0.748553,61.158296,0.595939,3614721.0,0.826756
200,0.6032,0.602193,0.749269,60.260313,0.582608,4806172.0,0.827303
250,0.5672,0.601825,0.748254,61.178798,0.530741,6002932.0,0.828879
300,0.5536,0.597962,0.752772,61.618169,0.52903,7198597.0,0.830216
350,0.538,0.595937,0.753529,61.861999,0.525623,8406525.0,0.83059
400,0.5455,0.595128,0.754338,61.993044,0.525889,9612371.0,0.830819


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)



Training completed!


In [3]:
import torch
from transformers import AutoTokenizer
def save_merged_model():
    from transformers import AutoModelForCausalLM
    from peft import PeftModel
    import os
    
    # Use your final checkpoint
    checkpoint_path = "./qwen2.5-7b-sft-qlora/checkpoint-400"
  
    base_model = AutoModelForCausalLM.from_pretrained(
        "Qwen/Qwen2.5-7B-Instruct",  # Original base model
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True
    )

    peft_model = PeftModel.from_pretrained(
        base_model,
        checkpoint_path,  # Your trained adapter
        torch_dtype=torch.bfloat16
    )
    
    merged_model = peft_model.merge_and_unload()
    merged_model.push_to_hub("alenphilip/Code_Review_Assistant_Model")
    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct", trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "left"
    tokenizer.push_to_hub("alenphilip/Code_Review_Assistant_Model")

save_merged_model()

`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

README.md: 0.00B [00:00, ?B/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            