# **Step 1:** Install the required dependencies

In [1]:
pip install -q accelerate==0.34.2 peft==0.6.2 bitsandbytes transformers trl==0.9.6 torch datasets

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install tensorboardX

Collecting tensorboardX
  Using cached tensorboardX-2.6.2.2-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting protobuf>=3.20 (from tensorboardX)
  Using cached protobuf-5.29.3-cp38-abi3-manylinux2014_x86_64.whl.metadata (592 bytes)
Using cached tensorboardX-2.6.2.2-py2.py3-none-any.whl (101 kB)
Using cached protobuf-5.29.3-cp38-abi3-manylinux2014_x86_64.whl (319 kB)
Installing collected packages: protobuf, tensorboardX
Successfully installed protobuf-5.29.3 tensorboardX-2.6.2.2
Note: you may need to restart the kernel to use updated packages.


# **Step 2:** import required packages

In [1]:
import torch
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, BitsAndBytesConfig, Trainer
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model, PeftModel
from trl import SFTTrainer
import json
import os

  from .autonotebook import tqdm as notebook_tqdm


# **Step 3:** define the model name and the dataset used

In [2]:
# Identifier of the base model; passed to `from_pretrained` for both the tokenizer and the model
MODEL_NAME = "tiiuae/falcon-7b-instruct"
# Identifier of the dataset; passed to `load_dataset` for the "train" and "test" splits
DATASET_NAME = "jitx/Methods2Test_java_unit_test_code"

# **Step 4:** Load the dataset from the remote repository.


*   "train" section is used for training
*   "test" section is used for evaluation

In [5]:
# "train" split of DATASET_NAME: source of the fine-tuning examples
training_dataset = load_dataset(DATASET_NAME, split="train")

In [6]:
# "test" split of DATASET_NAME: source of the evaluation examples
evaluation_dataset = load_dataset(DATASET_NAME, split="test")

# **Step 5:** Select the portion of the datasets used for training and evaluation, and convert it to the model prompt format

In [7]:
# Fixed seed so the shuffle, and therefore the selected subset, is reproducible
seed = 42
# Fraction of each split that is kept
part = 0.02


def _take_fraction(dataset, fraction, shuffle_seed):
    """Shuffle `dataset` with `shuffle_seed` and keep its first `int(len(dataset) * fraction)` rows."""
    shuffled = dataset.shuffle(seed=shuffle_seed)
    keep = int(len(dataset) * fraction)
    return shuffled.select(range(keep))


train_partion = _take_fraction(training_dataset, part, seed)
evaluation_partion = _take_fraction(evaluation_dataset, part, seed)

In [9]:
# Dataset column whose value is passed as the focal method (the prompt input)
INPUT_FIELD = "src_fm"
# Dataset column whose value is passed as the target test case (the expected answer)
OUTPUT_FIELD = "target"

# Function to convert each example
def convert_to_falcon_format(focal_method, target_test_case):
    """Build one training prompt from a focal method and its unit test.

    Args:
        focal_method: Source code of the method under test.
        target_test_case: Source code of the corresponding unit test.

    Returns:
        A single string made of the "### System:", "### Human:" and
        "### Assistant:" sections, in that order. Only the system
        instruction ends with a newline; the other sections are
        concatenated without any extra separator.
    """
    # Instruction shared by every example
    instruction = "Generate unit tests for the following method or function:\n"

    # Section layout of the prompt
    template = "### System: {}### Human: {}### Assistant: {}"
    return template.format(instruction, focal_method, target_test_case)

def _convert_and_save(partition, output_file, label):
    """Convert a dataset partition to prompt records, save them as JSON and preview them.

    Args:
        partition: Iterable of examples that can be indexed with
            ``INPUT_FIELD`` and ``OUTPUT_FIELD``.
        output_file: Path of the JSON file to write. Its parent directory
            is created when it does not exist yet.
        label: Split name shown in the confirmation message.

    Returns:
        The list of ``{"text": <prompt>}`` records written to ``output_file``.
    """
    records = [
        {"text": convert_to_falcon_format(entry[INPUT_FIELD], entry[OUTPUT_FIELD])}
        for entry in partition
    ]

    # Save the converted data to a JSON file
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    with open(output_file, 'w', encoding="utf-8") as f:
        json.dump(records, f, indent=4)

    # Print a few examples to verify the result
    print(f"Converted dataset {label} saved to {output_file}")
    for example in records[:5]:
        print(example)

    return records


# Run the training and evaluation partitions through the same conversion so the two
# files cannot drift apart. As before, `converted_data` and `output_file` end up
# holding the evaluation values once the cell has finished.
for label, partition, output_file in (
    ("Training", train_partion, './resources/dataset/falcon_format_dataset_train.json'),
    ("Evaluation", evaluation_partion, './resources/dataset/falcon_format_dataset_eval.json'),
):
    converted_data = _convert_and_save(partition, output_file, label)

Converted dataset Training saved to ./falcon7b_finetuning/dataset/falcon_format_dataset_train.json
{'text': '### System: Generate unit tests for the following method or function:\n### Human: public Set<String> getOutputResourceFields( T meta ) { return null; }### Assistant: @Test public void testGetOutputResourceFields() throws Exception { assertNull( analyzer.getOutputResourceFields( meta ) ); }'}
{'text': '### System: Generate unit tests for the following method or function:\n### Human: @Override public Long queryFrom(MonetaryAmount amount) { Objects.requireNonNull(amount, "Amount required."); return amount.with(downRounding).getNumber().longValueExact(); }### Assistant: @Test public void shouldReturnMajorPartNegative() { MonetaryAmount monetaryAmount = Money.parse("BHD -1.345"); Long result = query.queryFrom(monetaryAmount); Long expected = -1L; assertEquals(result, expected ); }'}
{'text': '### System: Generate unit tests for the following method or function:\n### Human: public sta

In [None]:
# Hugging Face read token used for remote access.
# It is read from the HF_TOKEN environment variable so that the secret is never
# stored in the notebook; set the variable before starting the kernel.
# When the variable is not set, hf_token is None.
hf_token = os.environ.get("HF_TOKEN")

In [4]:
# Load the tokenizer that belongs to MODEL_NAME.
# `token` is the current name of the authentication argument; it replaces the
# deprecated `use_auth_token`.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True, token=hf_token)

# Configure padding: reuse the end-of-sequence token as pad token and pad on the right
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"



In [5]:
# Configure 4-bit quantization (NF4 with double quantization, float16 compute dtype)
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)

# Load the quantized base model onto the first CUDA device.
# `token` is the current name of the authentication argument; it replaces the
# deprecated `use_auth_token`.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    use_safetensors=True,
    quantization_config=quant_config,
    trust_remote_code=True,
    device_map="cuda:0",
    token=hf_token
)

# Prepare the model for k-bit training
model = prepare_model_for_kbit_training(model)
# NOTE(review): the load output shows the checkpointing warning twice, so this explicit
# call may duplicate what prepare_model_for_kbit_training already does; confirm before removing.
model.gradient_checkpointing_enable()
# Disable the generation cache while training
model.config.use_cache = False



Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00,  2.50s/it]
You are using an old version of the checkpointing format that is deprecated (We will also silently ignore `gradient_checkpointing_kwargs` in case you passed it).Please update to the new format on your modeling file. To use the new format, you need to completely remove the definition of the method `_set_gradient_checkpointing` in your model.
You are using an old version of the checkpointing format that is deprecated (We will also silently ignore `gradient_checkpointing_kwargs` in case you passed it).Please update to the new format on your modeling file. To use the new format, you need to completely remove the definition of the method `_set_gradient_checkpointing` in your model.


In [6]:
# List the quantisation parameters of the loaded model for verification.
# Kept as the last expression of the cell so the notebook displays the resulting dict.
model.config.quantization_config.to_dict()

{'quant_method': <QuantizationMethod.BITS_AND_BYTES: 'bitsandbytes'>,
 '_load_in_8bit': False,
 '_load_in_4bit': True,
 'llm_int8_threshold': 6.0,
 'llm_int8_skip_modules': None,
 'llm_int8_enable_fp32_cpu_offload': False,
 'llm_int8_has_fp16_weight': False,
 'bnb_4bit_quant_type': 'nf4',
 'bnb_4bit_use_double_quant': True,
 'bnb_4bit_compute_dtype': 'float16',
 'bnb_4bit_quant_storage': 'uint8',
 'load_in_4bit': True,
 'load_in_8bit': False}

# **Step 7:** Train the model with the saved dataset files, then save the new LoRA adapter

In [6]:
# LoRA hyper-parameters, kept as separate names so they are easy to tune
lora_r = 16
lora_alpha = 32
Lora_dropout = 0.1

# Adapter configuration for causal language modelling, without bias training
lora_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=Lora_dropout,
    task_type="CAUSAL_LM",
    bias="none",
)


In [7]:
# Reload the prompt-formatted training examples from the JSON file on disk
train_dataset = load_dataset(
    "json",
    data_files={"train": "./resources/dataset/falcon_format_dataset_train.json"},
)

# Reload the prompt-formatted evaluation examples; they are exposed under the "validation" split
evaluation_dataset = load_dataset(
    "json",
    data_files={"validation": "./resources/dataset/falcon_format_dataset_eval.json"},
)

In [8]:
# Dataset size and batch parameters.
# The size is read from the loaded training split instead of being hard-coded, so the
# step arithmetic below stays correct when the sampled fraction of the dataset changes.
dataset_size = len(train_dataset["train"])
per_device_train_batch_size = 2
gradient_accumulation_steps = 2
num_gpus = 1

# Effective batch size and steps per epoch
effective_batch_size = per_device_train_batch_size * gradient_accumulation_steps * num_gpus
steps_per_epoch = dataset_size // effective_batch_size

# Evaluate, log and checkpoint 5 times per epoch. Clamped to at least 1 because a step
# interval of 0 is not a valid value for the "steps" strategies.
steps_per_interval = max(1, steps_per_epoch // 5)

# TrainingArguments configuration
training_args = TrainingArguments(
    # Batch and gradient parameters
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_train_batch_size * 2,  # Double the train batch size for evaluation
    gradient_accumulation_steps=gradient_accumulation_steps,

    # Optimizer and learning rate scheduler
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    fp16=True,
    max_grad_norm=0.3,
    warmup_ratio=0.05,
    lr_scheduler_type="cosine",

    # Training duration
    num_train_epochs=2,

    # Evaluation and logging strategies
    eval_strategy="steps",  # Evaluate at regular steps (replaces the deprecated `evaluation_strategy`)
    eval_steps=steps_per_interval,  # Evaluate 5 times per epoch
    logging_strategy="steps",
    logging_steps=steps_per_interval,  # Log metrics at the same frequency as evaluation

    # Checkpointing
    save_strategy="steps",  # Save checkpoints regularly
    save_steps=steps_per_interval,  # Save 5 times per epoch

    # Output and reporting
    output_dir="./resources/tensorboard",
    report_to="tensorboard",  # Log metrics to TensorBoard
    run_name="falcon7b_finetuning",  # Experiment name for tracking

    # Miscellaneous
    group_by_length=True,  # Group sequences of similar lengths for efficiency
    gradient_checkpointing=True,  # Reduce memory usage during training
    seed=42,  # Ensures reproducibility
    save_total_limit=2,  # Retain only the last 2 checkpoints to save storage
    save_safetensors=True,  # Save model checkpoints in the safetensors format
)

# Print the configuration for verification
print(training_args)


TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=624,
eval_strategy=IntervalStrategy.STEPS,
eval_u



In [9]:
# Pick the splits that were loaded from the JSON files
train_split = train_dataset["train"]
eval_split = evaluation_dataset["validation"]

# Supervised fine-tuning trainer: wraps the quantized model with the LoRA
# configuration and trains on the "text" field, with sequences limited to 512 tokens
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
    tokenizer=tokenizer,
    peft_config=lora_config,
    dataset_text_field="text",
    max_seq_length=512,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
You are using an old version of the checkpointing format that is deprecated (We will also silently ignore `gradient_checkpointing_kwargs` in case you passed it).Please update to the new format on your modeling file. To use the new format, you need to completely remove the definition of the method `_set_gradient_checkpointing` in your model.
  super().__init__(
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [10]:
# Directory that receives the trained adapter
adapter_output_dir = "./resources/trained_model_adapt_param"

# Run the fine-tuning loop, then persist the adapter
trainer.train()
trainer.save_model(output_dir=adapter_output_dir)


  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss
624,1.286,1.17922
1248,1.1679,1.161336
1872,1.1621,1.149861
2496,1.153,1.143124
3120,1.1415,1.136528
3744,1.0937,1.134776
4368,1.0997,1.131975
4992,1.0834,1.129775
5616,1.1043,1.128822
6240,1.0834,1.128627


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
  return fn(*args, **kwargs)
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
  return fn(*args, **kwargs)
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
  return fn(*args, **kwargs)
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
  return fn(*args, **kwargs)
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
  return fn(*args, **kwa