In [1]:
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print ('Using device: ', device)

Using device:  cuda


In [2]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


  GPU_BUFFERS = tuple([torch.empty(2*256*2048, dtype = dtype, device = f"cuda:{i}") for i in range(n_gpus)])


==((====))==  Unsloth 2025.4.7: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    NVIDIA GeForce RTX 4060. Num GPUs = 1. Max memory: 7.996 GB. Platform: Windows.
O^O/ \_/ \    Torch: 2.7.0+cu128. CUDA: 8.9. CUDA Toolkit: 12.8. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [3]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2025.4.7 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [4]:
from datasets import load_dataset, concatenate_datasets

# Define a function to flatten the 'story_generation' output
def flatten_story_output(example):
    # Flatten the 'output' field
    if isinstance(example["output"], dict):
        example["title"] = example["output"].get("title", "")
        example["story"] = example["output"].get("story", "")
        example["questions"] = example["output"].get("questions", [])

    # Remove the nested 'output' field after flattening
    example["output"] = ""
    return example

# Load the datasets
dataset_grammar = load_dataset("json", data_files="datasets/grammar.json", split="train")
dataset_translation = load_dataset("json", data_files="datasets/translations.json", split="train")
dataset_story = load_dataset("json", data_files="datasets/stories.json", split="train")

# Apply the flattening function to the story dataset
dataset_story = dataset_story.map(flatten_story_output)

# Combine the datasets
combined_dataset = concatenate_datasets([dataset_grammar, dataset_translation, dataset_story]).shuffle(seed=32)

In [5]:
combined_dataset[10]

{'task': 'story_generation',
 'instruction': 'Generate a JSON object that contains random Cebuano story with comprehension questions and answers.',
 'input': '',
 'output': '',
 'title': 'Ang Pagpangadyi ni Tomas',
 'story': 'Si Tomas usa ka batang lalaki nga nagpuyo sa usa ka gamay nga baryo.\nBisan pa man sa kalisod sa kinabuhi, adunay dakong pangandoy si Tomas nga mahimong usa ka mang-uuma.\nUsa ka adlaw, nagkita siya sa iyang amahan sa uma.\n"Papa, gusto ko nga magtubo nga pareho nimo. Gusto ko mag-uma nga magmalipayong mabuhi," ingon ni Tomas.\n"Anaa sa imong mga kamot ang imong kalampusan, anak. Apan kinahanglan nga magpaningkamot ka," tubag sa iyang amahan.\nNagpadayon si Tomas sa pagtabang sa iyang amahan sa uma matag adlaw.\nNagtanom sila og mga prutas ug gulay, ug nag-atiman sa mga hayop.\n"Bisan og lisod ang trabaho, nagsugod ko og makat-on og daghang butang, Papa," ingon ni Tomas.\nAng iyang amahan nagtudlo kaniya sa mga teknik sa pagpanguma ug pag-atiman sa uma.\nNiadtong 

In [6]:
import json

EOS_TOKEN = tokenizer.eos_token  # Must add EOS_TOKEN

def formatting_prompts_func(examples):
    texts = []

    for instruction, input_text, output, title, story, question_list, task in zip(
        examples["instruction"],
        examples["input"],
        examples["output"],
        examples["title"],
        examples["story"],
        examples["questions"],
        examples["task"]
    ):
        if task == "story_generation":
            # Build question JSONs as string blocks
            question_blocks = []
            for q in question_list:
                question_blocks.append(f'''{{
        "question": {json.dumps(q["question"])},
        "choices": [{", ".join(json.dumps(choice) for choice in q["choices"])}],
        "answer": {json.dumps(q["answer"])},
        "explanation": {json.dumps(q["explanation"])}
    }}''')

            # Join questions
            questions_str = ",\n        ".join(question_blocks)

            # Final story block
            story_json_block = f"""{{
    "title": {json.dumps(title)},
    "story": {json.dumps(story)},
    "questions": [
        {questions_str}
    ]
}}"""

            text = f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Input:
{input_text}

### Response:
{story_json_block}
""" + EOS_TOKEN

        else:
            # Assume you're using a simpler 3-part format for non-story tasks
            text = f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Input:
{input_text}

### Response:
{output}
""" + EOS_TOKEN

        texts.append(text)

    return {"text": texts, }
pass

In [7]:
dataset = combined_dataset.map(formatting_prompts_func, batched = True,)
dataset[10]

{'task': 'story_generation',
 'instruction': 'Generate a JSON object that contains random Cebuano story with comprehension questions and answers.',
 'input': '',
 'output': '',
 'title': 'Ang Pagpangadyi ni Tomas',
 'story': 'Si Tomas usa ka batang lalaki nga nagpuyo sa usa ka gamay nga baryo.\nBisan pa man sa kalisod sa kinabuhi, adunay dakong pangandoy si Tomas nga mahimong usa ka mang-uuma.\nUsa ka adlaw, nagkita siya sa iyang amahan sa uma.\n"Papa, gusto ko nga magtubo nga pareho nimo. Gusto ko mag-uma nga magmalipayong mabuhi," ingon ni Tomas.\n"Anaa sa imong mga kamot ang imong kalampusan, anak. Apan kinahanglan nga magpaningkamot ka," tubag sa iyang amahan.\nNagpadayon si Tomas sa pagtabang sa iyang amahan sa uma matag adlaw.\nNagtanom sila og mga prutas ug gulay, ug nag-atiman sa mga hayop.\n"Bisan og lisod ang trabaho, nagsugod ko og makat-on og daghang butang, Papa," ingon ni Tomas.\nAng iyang amahan nagtudlo kaniya sa mga teknik sa pagpanguma ug pag-atiman sa uma.\nNiadtong 

In [8]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 1,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 130,
        num_train_epochs = 2, # Set this for 1 full training run.
        # max_steps = 60,
        learning_rate =2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
)

In [9]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 5,420 | Num Epochs = 2 | Total steps = 1,354
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040/8,000,000,000 (0.52% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,1.4517
2,1.3942
3,1.5121
4,1.2749
5,1.2999
6,1.6036
7,1.3455
8,1.3432
9,1.3383
10,1.2146


In [10]:
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}
"""

In [None]:
# alpaca_prompt = Copied from above


from transformers import TextStreamer
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    alpaca_prompt.format(
        "Translate this Cebuano sentence to English: Nalingaw kami sa pagdula sa basketball.", # instruction
        "", # input
        ""
    )
], return_tensors = "pt").to("cuda")

text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 2000)

<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Translate this Cebuano sentence to English: Nalingaw kami sa pagdula sa basketball.

### Input:


### Response:

We enjoyed playing basketball.
<|end_of_text|>


In [13]:
# alpaca_prompt = Copied from above
from transformers import TextStreamer
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    alpaca_prompt.format(
        "Generate a JSON object that contains random Cebuano story with comprehension questions and answers.", # instruction
        "", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 2000)

<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Generate a JSON object that contains random Cebuano story with comprehension questions and answers.

### Input:


### Response:

{
    "title": "Ang Pagpaningkamot ni Juan",
    "story": "Si Juan usa ka batang lalaki nga nagpuyo sa baryo.\nBisan pa sa kalisod, dili siya magpasagad sa iyang mga pangandoy.\nUsa ka adlaw, naglakaw siya paingon sa umahan aron magtrabaho.\n\"Juan, nganong maglisod man ka magtrabaho?\" pangutana ni Lola.\n\"Lola, gusto ko nga makatabang sa pamilya ug makab-ot ang akong mga pangandoy,\" tubag ni Juan.\n\"Maayo kaayo ka, Juan. Ang imong paningkamot magahatag og maayo nga resulta,\" ingon ni Lola.\nNagpadayon si Juan sa iyang trabaho sa umahan.\nSa paglabay sa mga adlaw, nakakita siya og mga tanom nga nagsugod og pagtubo.\n\"Lola, tan-awa! Ang mga tanom nagatubo!\" sigaw 

In [28]:
model.save_pretrained("lora/cebuano_lora_epoch_model")
tokenizer.save_pretrained("lora/cebuano_lora_epoch_model")

('lora/cebuano_lora_epoch_model\\tokenizer_config.json',
 'lora/cebuano_lora_epoch_model\\special_tokens_map.json',
 'lora/cebuano_lora_epoch_model\\tokenizer.json')

In [23]:
%ls

 Volume in drive C has no label.
 Volume Serial Number is F6E1-C05A

 Directory of c:\Users\user\Documents\Projects\ThesisAI\llama.cpp

04/05/2025  06:23 am    <DIR>          .
04/05/2025  06:17 am    <DIR>          ..
04/05/2025  06:06 am             4,853 .clang-format
04/05/2025  06:06 am               930 .clang-tidy
04/05/2025  06:06 am    <DIR>          .devops
04/05/2025  06:06 am               257 .dockerignore
04/05/2025  06:06 am               103 .ecrc
04/05/2025  06:06 am             1,011 .editorconfig
04/05/2025  06:06 am               583 .flake8
04/05/2025  06:06 am    <DIR>          .github
04/05/2025  06:06 am             1,837 .gitignore
04/05/2025  06:06 am               110 .gitmodules
04/05/2025  06:06 am               463 .pre-commit-config.yaml
04/05/2025  06:06 am            48,966 AUTHORS
04/05/2025  06:06 am            22,242 build-xcframework.sh
04/05/2025  06:06 am    <DIR>          ci
04/05/2025  06:06 am    <DIR>          cmake
04/05/2025  06:06 am       

In [26]:
%git checkout b3345
%git submodule update --init --recursive
%make clean
%make all -j
%git log -1

UsageError: Line magic function `%git` not found.


In [27]:
model.save_pretrained_gguf("save/model", tokenizer,)

RuntimeError: *** Unsloth: Failed compiling llama.cpp using os.system(...) with error 1. Please report this ASAP!