In [1]:
%%capture
# Install Unsloth from PyPI first, then upgrade to the current GitHub build with the "colab-new" extras.
!pip install unsloth
!pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# Pin huggingface_hub to one exact release.
# NOTE(review): the reason for choosing 0.25.0 is not recorded in this notebook — confirm before bumping.
!pip install huggingface_hub==0.25.0

In [1]:
from unsloth import FastLanguageModel
import torch
# Context window used for both model loading and the trainer further down.
# Choose any! We auto support RoPE Scaling internally!
max_seq_length = 8192

# None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
dtype = None

# Use 4bit quantization to reduce memory usage. Can be False.
load_in_4bit = True

# Load the pre-quantized base checkpoint together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Qwen2.5-Coder-1.5B-Instruct-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.10.2: Fast Qwen2 patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA A100 80GB PCIe. Max memory: 79.138 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.1+cu121. CUDA = 8.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Unsloth: We fixed a gradient accumulation bug, but it seems like you don't have the latest transformers version!
Please update transformers via:
`pip uninstall transformers -y && pip install --upgrade --no-cache-dir "git+https://github.com/huggingface/transformers.git"`


We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

In [2]:
# Projection layers that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
    # "embed_tokens", "lm_head",
]

# Wrap the base model with LoRA adapters; `model` is rebound to the wrapped model.
model = FastLanguageModel.get_peft_model(
    model,
    r=32,  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules=lora_target_modules,
    lora_alpha=16,
    lora_dropout=0,  # Supports any, but = 0 is optimized
    bias="none",  # Supports any, but = "none" is optimized
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
    random_state=3407,
    use_rslora=True,  # We support rank stabilized LoRA
    loftq_config=None,  # And LoftQ
)

Unsloth 2024.10.2 patched 28 layers with 0 QKV layers, 28 O layers and 28 MLP layers.


<a name="Data"></a>
### Data Prep
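This notebook fine-tunes on the `quocdat25/fast-apply-coding` dataset (loaded below), whose `original_code`, `update_snippet`, and `final_code` columns are rendered into a single `<|im_start|>`/`<|im_end|>` chat prompt per example. The Unsloth template this notebook is based on used the Alpaca dataset from [yahma](https://huggingface.co/datasets/yahma/alpaca-cleaned), which is a filtered version of 52K of the original [Alpaca dataset](https://crfm.stanford.edu/2023/03/13/alpaca.html). You can replace this code section with your own data prep.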

**[NOTE]** To train only on completions (ignoring the user's input) read TRL's docs [here](https://huggingface.co/docs/trl/sft_trainer#train-on-completions-only).

**[NOTE]** Remember to add the **EOS_TOKEN** to the tokenized output!! Otherwise you'll get infinite generations!

If you want to use the `llama-3` template for ShareGPT datasets, try our conversational [notebook](https://colab.research.google.com/drive/1XamvWYinY6FOSX9GLvnqSjjsNflxdhNc?usp=sharing).

For text completions like novel writing, try this [notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing).

In [3]:
# End-of-sequence marker taken from the loaded tokenizer. formatting_prompts_func
# appends it to every training text so generations have a learned stopping point.
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN

def print_dataset_info(dataset):
    """
    Print a summary of the dataset and return a shuffled version of it.

    The result of ``dataset.shuffle(seed=42)`` is handed back to the caller
    instead of being bound to a local name and discarded. A caller that wants
    the shuffled order must rebind its own variable to the return value;
    callers that ignore the return value behave exactly as before.

    Args:
        dataset: The dataset to print information about. It must provide a
            ``shuffle(seed=...)`` method.

    Returns:
        The object returned by ``dataset.shuffle(seed=42)``.
    """
    print("Dataset loaded successfully.")
    print(f"Dataset info: {dataset}")
    # Fixed seed keeps the shuffled order reproducible across runs.
    shuffled_dataset = dataset.shuffle(seed=42)
    print("Shuffled!!!")
    return shuffled_dataset

In [4]:
def load_fast_apply_coding_dataset():
    """
    Load the fast-apply-coding dataset from Hugging Face.

    No column projection is performed: a ``map`` whose function returns keys
    that already exist only rewrites those columns and keeps every other
    column, so it costs a full pass without selecting anything. Instead, each
    split is checked for the columns the prompt formatter needs, and the
    dataset is returned with all of its columns intact (the optional
    'Token Count' filter in the data-prep cell depends on that).

    Returns:
        dataset: The loaded dataset; every split is guaranteed to contain the
            'original_code', 'update_snippet', and 'final_code' columns.

    Raises:
        KeyError: If any split is missing one of the required columns.
    """
    from datasets import load_dataset

    required_columns = ('original_code', 'update_snippet', 'final_code')
    dataset = load_dataset("quocdat25/fast-apply-coding")

    # Fail early with a readable message rather than deep inside a later map().
    for split_name, split in dataset.items():
        missing = [name for name in required_columns if name not in split.column_names]
        if missing:
            raise KeyError(
                f"Split '{split_name}' of quocdat25/fast-apply-coding is missing columns: {missing}"
            )
    return dataset

def formatting_prompts_func(examples):
    """
    Render a batch of examples into full chat-formatted training strings.

    Args:
        examples: Mapping with parallel sequences under the keys
            "original_code", "update_snippet", and "final_code".

    Returns:
        A dict with a single key "text" holding one formatted string per
        example. Each string is the filled-in template with surrounding
        whitespace stripped and EOS_TOKEN appended.
    """
    rows = zip(
        examples["original_code"],
        examples["update_snippet"],
        examples["final_code"],
    )

    # System + user + assistant turns; the assistant turn carries the target output.
    prompt_template = """<|im_start|>system
You are an coding assistant that helps merge code updates, ensuring every modification is fully integrated.<|im_end|>
<|im_start|>user
Merge all changes from the <update> snippet into the <code> below.
- Preserve the code's structure, order, comments, and indentation exactly.
- Output only the updated code, enclosed within <updated-code> and </updated-code> tags.
- Do not include any additional text, explanations, placeholders, ellipses, or code fences.

<code>{original_code}</code>

<update>{update_snippet}</update>

Provide the complete updated code.<|im_end|>
<|im_start|>assistant
<updated-code>{final_code}</updated-code>
"""

    def render(before, patch, after):
        # strip() removes the template's trailing newline before the EOS marker is added.
        filled = prompt_template.format(
            original_code=before,
            update_snippet=patch,
            final_code=after,
        )
        return filled.strip() + EOS_TOKEN

    return {"text": [render(before, patch, after) for before, patch, after in rows]}

In [5]:
# Pull the raw splits from the Hub.
dataset = load_fast_apply_coding_dataset()

# Optional length cap (default = False): switch the condition to True to drop
# examples whose 'Token Count' column exceeds TOKEN_LIMIT.
filtered_dataset = dataset
if False:
    TOKEN_LIMIT = 2500
    filtered_dataset = dataset.filter(lambda example: example['Token Count'] <= TOKEN_LIMIT)

# Render every row into a single "text" field and discard the source columns.
source_columns = filtered_dataset["train"].column_names
formatted_dataset = filtered_dataset.map(
    formatting_prompts_func,
    batched=True,
    remove_columns=source_columns,
)

print_dataset_info(formatted_dataset)

print("\nFormatted Dataset Sample:")
print(formatted_dataset["train"].select(range(1)))


Dataset loaded successfully.
Dataset info: DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 5688
    })
    test: Dataset({
        features: ['text'],
        num_rows: 200
    })
})
Shuffled!!!

Formatted Dataset Sample:
Dataset({
    features: ['text'],
    num_rows: 1
})


In [6]:
# Show the fully rendered prompt of the first training example.
print(formatted_dataset["train"][0]["text"])


<|im_start|>system
You are an coding assistant that helps merge code updates, ensuring every modification is fully integrated.<|im_end|>
<|im_start|>user
Merge all changes from the <update> snippet into the <code> below.
- Preserve the code's structure, order, comments, and indentation exactly.
- Output only the updated code, enclosed within <updated-code> and </updated-code> tags.
- Do not include any additional text, explanations, placeholders, ellipses, or code fences.

<code>import { buttonVariants } from '@/components/ui/Button';
import Link from 'next/link';
import { cn } from '@/lib/utils/helpers';
import config from '@/lib/config/marketing';

export default function CTA() {
  const {
    copy: { cta }
  } = config;

  return (
    <div className="">
      <div className="px-6 py-24 sm:px-6 sm:py-32 lg:px-8">
        <div className="mx-auto max-w-2xl text-center">
          <h2 className="text-3xl font-bold tracking-tight  sm:text-4xl">
            {cta.heading}
            <br 

<a name="Train"></a>
### Train the model
Now let's use Hugging Face TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). This run trains for one full epoch (`num_train_epochs = 1`, which works out to 177 optimizer steps at a total batch size of 32). For a quick smoke test, set `max_steps = 60` instead. We also support TRL's `DPOTrainer`!

In [7]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Probe bf16 support once and derive both precision flags from the answer.
bf16_supported = is_bfloat16_supported()

# Optimisation hyperparameters: 8 sequences per device x 4 accumulation steps.
training_args = TrainingArguments(
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,  # higher = more stable
    warmup_steps=15,
    num_train_epochs=1,  # Set this for 1 full training run.
    learning_rate=1e-4,
    fp16=not bf16_supported,
    bf16=bf16_supported,
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    seed=3407,
    output_dir="outputs",
)

# Supervised fine-tuning on the "text" column of the train split.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=formatted_dataset["train"],
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,  # Can make training 5x faster for short sequences.
    args=training_args,
)

In [8]:
#@title Show current memory stats
BYTES_PER_GIB = 1024 ** 3  # same divisor as / 1024 / 1024 / 1024

gpu_stats = torch.cuda.get_device_properties(0)
reserved_bytes = torch.cuda.max_memory_reserved()

# Baseline taken before training; the final stats cell subtracts it from the peak.
start_gpu_memory = round(reserved_bytes / BYTES_PER_GIB, 3)
max_memory = round(gpu_stats.total_memory / BYTES_PER_GIB, 3)

print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA A100 80GB PCIe. Max memory = 79.138 GB.
1.643 GB of memory reserved.


In [None]:
# from unsloth import unsloth_train
# trainer_stats = unsloth_train(trainer)
# Run fine-tuning. The result is kept because the final stats cell reads
# trainer_stats.metrics['train_runtime'].
# NOTE(review): this cell's logged output asks to use Unsloth's fixed
# gradient accumulation — presumably the commented-out unsloth_train path above; confirm.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 5,688 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 4
\        /    Total batch size = 32 | Total steps = 177
 "-____-"     Number of trainable parameters = 36,929,536


**** Unsloth: Please use our fixed gradient_accumulation_steps by updating transformers and Unsloth!


[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mquocdat-le-insacvl2[0m ([33mquocdat-le-insacvl-recall-ai[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
1,0.5596
2,0.5715
3,0.608
4,0.6147
5,0.6104
6,0.5787
7,0.5709
8,0.6768
9,0.5518
10,0.5271


In [None]:
#@title Show final memory and time stats
# Peak reserved memory in GB, and the share of it added since the pre-training baseline.
used_memory = round(torch.cuda.max_memory_reserved() / 1024 ** 3, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)

# Both figures as a percentage of the device's total memory.
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

train_runtime = trainer_stats.metrics['train_runtime']
print(f"{train_runtime} seconds used for training.")
print(f"{round(train_runtime / 60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

In [None]:
import os

# Release tag interpolated into the repository names of the (currently
# commented-out) save / push cells below.
version='FastApply-1.5B-Instruct'

# Hugging Face access token, read from the HF_TOKEN environment variable so no
# credential is ever saved inside the notebook. It is None when the variable is unset.
# NOTE(review): presumably the push helpers then fall back to a cached login — confirm.
hf_token = os.environ.get("HF_TOKEN")

In [None]:
# model.save_pretrained(f"lora_model/v{version}")
# tokenizer.save_pretrained(f"lora_model/v{version}")

# model.push_to_hub(f"quocdat25/fast-apply_lora-4b-v0.{version}", token=hf_token)
# tokenizer.push_to_hub(f"quocdat25/fast-apply_lora-4b-v0.{version}", token=hf_token)



In [None]:
# model.push_to_hub_merged(f"quocdat25/fast-apply-16bit-v0.{version}", tokenizer, save_method="merged_16bit", token=hf_token)

In [None]:
# Save to 8bit Q8_0
# if True: model.save_pretrained_gguf("model", tokenizer,)
# Remember to go to https://huggingface.co/settings/tokens for a token!
# And change hf to your username!
# if True: model.push_to_hub_gguf(f"quocdat25/fast-apply_gguf-Q8_0-v0.{version}", tokenizer, token = hf_token)

# Save to 16bit GGUF
# if True: model.save_pretrained_gguf("model", tokenizer, quantization_method = "f16")
# if True: model.push_to_hub_gguf(f"quocdat25/fast-apply_gguf-16bit_0-v0.{version}", tokenizer, quantization_method = "f16", token = hf_token)

# Save to q4_k_m GGUF
# if True: model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")
# if True: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "q4_k_m", token = "")

# Save to multiple GGUF options - much faster if you want multiple!
# Disabled by default: change the condition to True to run the upload. The
# repository name and token below are placeholders that must be filled in first.
if False:
    model.push_to_hub_gguf(
        "hf/model", # Change hf to your username!
        tokenizer,
        quantization_method = ["q4_k_m", "q8_0", "q5_k_m",],
        token = "", # Get a token at https://huggingface.co/settings/tokens
    )

In [None]:
# model.push_to_hub_merged(f"quocdat25/vLLM-fast-apply-4bit-v0.{version}", tokenizer, save_method = "merged_4bit_forced", token = hf_token)

In [None]:
# Ask runpodctl to stop the pod named by $RUNPOD_POD_ID once the notebook reaches this cell.
# NOTE(review): presumably RUNPOD_POD_ID is provided by the RunPod environment — confirm.
! bash -c "runpodctl stop pod $RUNPOD_POD_ID"