In [None]:
import os
import csv
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType
from accelerate import Accelerator
from tqdm import tqdm

# Initialize Accelerator
# NOTE(review): `accelerator` is not referenced again in the visible cells;
# confirm whether it is still needed.
accelerator = Accelerator()

# Step 1: Load Dataset
# Reads a local JSON file; later steps access the result through its "train"
# split and expect each record to provide "input" and "output" fields.
dataset_path = "./data/codegen_finetune_pairs.json"  # Path to your dataset
dataset = load_dataset("json", data_files=dataset_path)

# Step 2: Tokenizer and Model
# Only the tokenizer is created here; the model for the same checkpoint name is
# loaded further down, after the dataset has been tokenized.
model_name = "Salesforce/codet5-small"  # Model name
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Preprocess the dataset
def preprocess_function(examples):
    """Tokenize source/target pairs and mask label padding for the loss.

    Args:
        examples: A single example or a batch (as passed by ``Dataset.map``)
            with an ``"input"`` field holding the source text and an
            ``"output"`` field holding the target text.

    Returns:
        The tokenizer output (including ``input_ids``, ``attention_mask`` and
        ``labels``), padded/truncated to 512 tokens, with every padding
        position in ``labels`` replaced by ``-100``.
    """
    model_inputs = tokenizer(
        examples["input"],
        text_target=examples["output"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )

    # The targets are padded to max_length with the pad token. If those ids are
    # left in the labels, the loss is also computed over the padding, which
    # dominates short targets. -100 is the index the loss function ignores.
    pad_token_id = tokenizer.pad_token_id

    def _mask_padding(label_ids):
        """Return label_ids with pad tokens replaced by the ignore index."""
        return [
            -100 if token_id == pad_token_id else token_id
            for token_id in label_ids
        ]

    labels = model_inputs["labels"]
    if labels and isinstance(labels[0], int):
        # Un-batched call: a single flat sequence of token ids.
        model_inputs["labels"] = _mask_padding(labels)
    else:
        # Batched call: one sequence of token ids per example.
        model_inputs["labels"] = [_mask_padding(label_ids) for label_ids in labels]

    return model_inputs

# Tokenize the dataset in batches; `desc` is the label shown on the progress
# bar that map() displays while it runs.
tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
    desc="Tokenizing dataset",
)

# Step 3: Load Model with Quantization Config
# Loads the base checkpoint with its linear layers quantized to 8 bits so that
# it fits in limited GPU memory.
from transformers import BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,  # Enable 8-bit (LLM.int8()) quantization
    llm_int8_threshold=6.0,  # Outlier threshold: hidden-state values above it are computed in fp16 instead of int8
    llm_int8_skip_modules=["lm_head"],  # Leave the output head unquantized
)

model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map="auto",  # Let the library place the model on the available device(s)
)

# Step 4: Configure LoRA
# Imported here because this step is the only user of the helper.
from peft import prepare_model_for_kbit_training

# The base model was loaded in 8-bit. Before attaching adapters, run PEFT's
# preparation step for k-bit training: it freezes the base weights, upcasts the
# remaining half-precision parameters (e.g. layer norms) to fp32 for stability,
# and enables gradient checkpointing / input gradients so that gradients can
# reach the adapters through the frozen quantized layers.
model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    r=8,  # Rank of the low-rank update matrices
    lora_alpha=32,  # Scaling factor applied to the LoRA update
    target_modules=["q", "k", "v", "o"],  # Attention projection layers that receive adapters
    lora_dropout=0.1,  # Dropout applied inside the adapter layers
    bias="none",  # Bias terms are left untouched (not trained)
    task_type=TaskType.SEQ_2_SEQ_LM,
)

# Wrap the prepared model so that only the LoRA adapter weights are trainable.
model = get_peft_model(model, lora_config)

# Step 5: Training Configuration
# Directory that receives checkpoints and the final saved model.
output_dir = "./codet5_qlora_finetuned"
# CSV file that receives the evaluation metrics.
# NOTE(review): this is an absolute path under the filesystem root, while the
# dataset is read from the relative "./data" directory — confirm it is intended.
eval_results_csv = "/data/evaluation_results.csv"

training_args = TrainingArguments(
    # --- Output locations ---
    output_dir=output_dir,
    logging_dir="./logs",
    # --- Schedule ---
    num_train_epochs=3,
    learning_rate=5e-4,  # Explicit learning rate for the optimizer
    # --- Batching: 2 sequences x 8 accumulation steps = 16 per optimizer step ---
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    # --- Evaluation and checkpointing, both once per epoch ---
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3,  # Cap on the number of checkpoints kept on disk
    load_best_model_at_end=True,  # Needs matching evaluation/save strategies
    logging_steps=50,
    # --- Precision and optimizer ---
    fp16=True,  # Mixed-precision training
    optim="adamw_bnb_8bit",  # 8-bit AdamW optimizer
)

# Step 6: Trainer Setup
# Hold out a small evaluation set instead of evaluating on rows that are also
# trained on: with load_best_model_at_end enabled, the "best" checkpoint would
# otherwise be chosen by how well the model fits its own training data.
full_train_dataset = tokenized_datasets["train"]
if len(full_train_dataset) < 2:
    raise ValueError(
        "Need at least 2 examples to create separate train and evaluation splits."
    )

# At most 100 evaluation rows, and never more than ~10% of a small dataset.
eval_size = min(100, max(1, len(full_train_dataset) // 10))
# Fixed seed so the same rows are held out on every run.
split_datasets = full_train_dataset.train_test_split(test_size=eval_size, seed=42)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_datasets["train"],
    eval_dataset=split_datasets["test"],  # Held-out rows, disjoint from training
    tokenizer=tokenizer,
)

# Step 7: Fine-Tuning
# Create the results directory up front so a bad path fails before training
# starts rather than after it.
os.makedirs(os.path.dirname(eval_results_csv) or ".", exist_ok=True)

with open(eval_results_csv, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["epoch", "eval_loss", "eval_runtime", "eval_samples_per_second", "eval_steps_per_second"])

    # A single train() call already runs all `num_train_epochs` epochs (with its
    # own progress bar) and, with the per-epoch evaluation strategy, evaluates
    # after each one. Calling it once per epoch in a loop would repeat the whole
    # multi-epoch run every iteration.
    trainer.train()

    # Every evaluation pass is recorded in the trainer's log history; the
    # entries containing "eval_loss" are the per-epoch evaluation results.
    epoch_evaluations = [
        entry for entry in trainer.state.log_history if "eval_loss" in entry
    ]
    if not epoch_evaluations:
        # No evaluation ran during training (e.g. the strategy was changed);
        # fall back to one evaluation of the final model.
        epoch_evaluations = [trainer.evaluate()]

    # Log evaluation results, one row per evaluation pass
    for epoch, eval_results in enumerate(epoch_evaluations, start=1):
        writer.writerow([
            epoch,
            eval_results.get("eval_loss"),
            eval_results.get("eval_runtime"),
            eval_results.get("eval_samples_per_second"),
            eval_results.get("eval_steps_per_second"),
        ])

# Step 8: Save Fine-Tuned Model
# Persist the (PEFT-wrapped) model first, then the tokenizer, into the same
# output directory so they can be loaded together later.
for artifact in (model, tokenizer):
    artifact.save_pretrained(output_dir)

# Report where the outputs were written.
print(f"Fine-tuned model saved to {output_dir}")
print(f"Evaluation results saved to {eval_results_csv}")

In [None]:
import os
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load model and tokenizer
# NOTE(review): the training cell saves the PEFT-wrapped model into this
# directory; confirm that the installed transformers/peft versions can load it
# through AutoModelForSeq2SeqLM.from_pretrained.
model_dir = "./codet5_qlora_finetuned"  # Directory where the fine-tuned model is stored
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)

# Input text for code generation
# Tokenize a single prompt into a PyTorch tensor of token ids.
input_text = "Generate code for DBPower C300 HD Camera"
input_ids = tokenizer(input_text, return_tensors="pt").input_ids

# Generate code using beam search
outputs = model.generate(
    input_ids,
    max_length=100,           # Maximum length (in tokens) of the generated sequence
    num_beams=5,              # Number of beams kept during beam search
    no_repeat_ngram_size=3,   # No 3-gram may appear more than once in the output
    early_stopping=True,      # Stop the beam search once num_beams finished candidates exist
    length_penalty=1.0,       # Exponent on sequence length in beam scoring; values > 0 favor longer outputs, < 0 shorter ones
)

# Decode and display the generated code
# outputs[0] is the first returned sequence; special tokens are dropped from
# the decoded text.
generated_code = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated Code:")
print(generated_code)
