In [None]:
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    !pip install --no-deps unsloth

In [None]:
from unsloth import FastLanguageModel
from transformers import TextStreamer
import torch # Import torch to specify dtype if needed

# 1. Load the model and tokenizer
max_seq_length = 2048 # Or your desired maximum sequence length
# Automatically set dtype for a slight speedup depending on your GPU
dtype = None
load_in_4bit = True # Set to True to load in 4-bit for reduced memory usage

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/DeepSeek-R1-Distill-Qwen-1.5B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

# Optional: Enable faster inference - Unsloth often handles this automatically
# FastLanguageModel.for_inference(model)

# 2. Prepare your input prompt
# Use the chat template appropriate for Qwen models.
messages = [
    {"role": "system", "content": "You are a helpful AI assistant. The question will be in Polish language."},
    {"role": "user", "content": "Cześć! Jak się masz?"},
]

# Apply the chat template
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Tokenize the input prompt and move to GPU
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

# 3. Perform inference
# Use TextStreamer for streaming the output in Colab
text_streamer = TextStreamer(tokenizer)

print("Generating response...")
_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=150, use_cache=True) # Adjust max_new_tokens as needed

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.4.7: Fast Qwen2 patching. Transformers: 4.51.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/1.81G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/236 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/6.78k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/472 [00:00<?, ?B/s]

Generating response...
<｜begin▁of▁sentence｜><｜begin▁of▁sentence｜>You are a helpful AI assistant. The question will be in Polish language.<｜User｜>Cześć! Jak się masz?<｜Assistant｜><think>
Alright, the user greeted me with "Cześć!" which is a Polish way of saying "Hello". I responded with a friendly "Cześć, chcesz contextsze?". So, I need to respond in a similar manner. I should keep it in Polish and make sure it's welcoming.

Maybe I can add another question to keep it going. So, I'll say "Zajecie, chcesz contextsze?". That should be appropriate and keep the conversation flowing smoothly.
</think>

Zajecie, chcesz contextsze?<｜end▁of▁sentence｜>


In [None]:
!git clone -q https://github.com/fxmeng/TransMLA.git


In [None]:
from datasets import load_dataset
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template

MODEL_NAME = "unsloth/DeepSeek-R1-Distill-Qwen-1.5B"
MAX_LEN    = 4096

raw_ds = load_dataset("emplocity/owca", split="train")

template_tokenizer = get_chat_template(tokenizer, chat_template="alpaca")
def format_as_chat(example):
    messages = []
    instruction = example.get("instruction", "").strip()
    input_text = example.get("input", "").strip()
    output = example.get("output", "").strip()

    if input_text:
        messages.append({"role": "user", "content": f"{instruction}\n\n{input_text}"})
    else:
        messages.append({"role": "user", "content": instruction})

    messages.append({"role": "assistant", "content": output})

    return {
        "text": tokenizer.apply_chat_template(messages, tokenize=False)
    }

train_ds = raw_ds.map(format_as_chat, remove_columns=raw_ds.column_names)

Repo card metadata block was not found. Setting CardData to empty.


Map:   0%|          | 0/51713 [00:00<?, ? examples/s]

In [None]:
from trl import SFTTrainer
import torch, time, json
from unsloth import UnslothTrainingArguments

FastLanguageModel.for_inference(model)   # Flash-Attn 2 path

model = FastLanguageModel.get_peft_model(
        model, r=16, lora_alpha=16,
        target_modules=["q_proj","k_proj","v_proj","o_proj",
                        "gate_proj","up_proj","down_proj"])

t0 = time.time()
args = UnslothTrainingArguments(
    output_dir="out_mha",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=1,
    learning_rate=2e-4,
    logging_steps=20,
    save_strategy="no",
)
trainer = SFTTrainer(
        model=model, tokenizer=tokenizer, train_dataset=train_ds,
        max_seq_length=MAX_LEN,
        args=args)
trainer.train()
json.dump({"wall_time_s": time.time()-t0}, open("out_mha/time.json","w"))


Unsloth: Already have LoRA adapters! We shall skip this step.


Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/51713 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 51,713 | Num Epochs = 1 | Total steps = 12,928
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 4 x 1) = 4
 "-____-"     Trainable parameters = 18,464,768/5,000,000,000 (0.37% trained)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33ms184339[0m ([33ms184339-politechnika-gda-ska[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
20,3.5348
40,2.7535
60,2.3832
80,2.3464
100,2.0536
120,2.207
140,2.1327
160,2.1383
180,2.214
200,2.3046


Unsloth: Will smartly offload gradients to save VRAM!


KeyboardInterrupt: 

In [None]:

from google.colab import files # For downloading later
# Define a path in your Colab environment to save the adapters
adapter_save_path = "/content/my_lora_adapters"
os.makedirs(adapter_save_path, exist_ok=True)

try:
    print(f"Saving PEFT adapters to {adapter_save_path}...")
    # This saves the LoRA adapters and the adapter_config.json
    # For Unsloth models, this is the standard way to save your LoRA fine-tune
    model.save_pretrained(adapter_save_path)

    # It's also good practice to save the tokenizer alongside your adapters
    if 'tokenizer' in locals() and tokenizer is not None:
        tokenizer.save_pretrained(adapter_save_path)
        print(f"Tokenizer saved to {adapter_save_path}")
    else:
        # If you loaded the model with FastLanguageModel, the tokenizer might be attached to it
        # Or you might have it in a different variable.
        # For Unsloth, often the tokenizer is loaded with the model:
        # model, tokenizer = FastLanguageModel.from_pretrained(...)
        # So, ensure your 'tokenizer' variable is correctly referenced.
        print("Tokenizer variable not found or is None. Please save it manually if needed.")

    print("PEFT adapters saved successfully.")

    # To download the saved adapters from Colab:
    # Option A: Download individual files (if you know them, e.g., adapter_model.safetensors, adapter_config.json)
    # files.download(os.path.join(adapter_save_path, "adapter_model.safetensors"))
    # files.download(os.path.join(adapter_save_path, "adapter_config.json"))

    # Option B: Zip the entire directory and download the zip (more convenient)
    archive_path = f"{adapter_save_path}.zip"
    !zip -r {archive_path} {adapter_save_path}
    print(f"Adapters zipped to {archive_path}")
    files.download(archive_path)

except Exception as e:
    print(f"Error saving PEFT adapters: {e}")
    import traceback
    traceback.print_exc()

Saving PEFT adapters to /content/my_lora_adapters...
Tokenizer saved to /content/my_lora_adapters
PEFT adapters saved successfully.
updating: content/my_lora_adapters/ (stored 0%)
updating: content/my_lora_adapters/special_tokens_map.json (deflated 70%)
updating: content/my_lora_adapters/tokenizer.json (deflated 81%)
updating: content/my_lora_adapters/adapter_model.safetensors (deflated 7%)
updating: content/my_lora_adapters/adapter_config.json (deflated 56%)
updating: content/my_lora_adapters/README.md (deflated 66%)
updating: content/my_lora_adapters/tokenizer_config.json (deflated 84%)
Adapters zipped to /content/my_lora_adapters.zip


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
!pip install evaluate rouge_score sacrebleu nltk bert_score --quiet
import nltk
nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True) # Often needed for METEOR if you use it
nltk.download('omw-1.4', quiet=True) # Also for wordnet

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m80.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m55.0 MB/s[0m eta [36m0:00

True

In [None]:
from unsloth import FastLanguageModel # Ensure it's imported
import torch

# Assume 'model' and 'tokenizer' are already loaded and 'model' is your fine-tuned PeftModel
# model = ...
# tokenizer = ...
# MAX_LEN = 4096 # As defined in your training

# Set model to evaluation mode
if hasattr(model, 'eval'):
    model.eval()
    print("Model set to evaluation mode.")

# Enable Unsloth's fast inference path
# This should be called on the PeftModel directly
FastLanguageModel.for_inference(model)
print("Unsloth's FastLanguageModel.for_inference(model) applied.")

Model set to evaluation mode.
Unsloth's FastLanguageModel.for_inference(model) applied.


In [None]:
from datasets import load_dataset, Dataset

print("Loading and preparing evaluation dataset slice (last 5000 samples from OWCA)...")
raw_ds_full = load_dataset("emplocity/owca", split="train")

total_raw_samples = len(raw_ds_full)
eval_samples_count = 5000 # Number of samples to evaluate
# eval_samples_count = 100 # Use a smaller number for quicker testing first!

raw_eval_slice = None
if total_raw_samples >= eval_samples_count:
    eval_indices = range(total_raw_samples - eval_samples_count, total_raw_samples)
    raw_eval_slice = raw_ds_full.select(eval_indices)
elif total_raw_samples > 0:
    raw_eval_slice = raw_ds_full # Use all if less than desired count
    print(f"Warning: Dataset has only {total_raw_samples} samples. Evaluating on all available.")
else:
    print("Error: Raw dataset 'emplocity/owca' is empty or could not be loaded.")

if raw_eval_slice:
    print(f"Prepared evaluation slice with {len(raw_eval_slice)} samples.")
    # You can inspect a sample: print(raw_eval_slice[0])
else:
    print("Evaluation cannot proceed without data.")

Loading and preparing evaluation dataset slice (last 5000 samples from OWCA)...


Repo card metadata block was not found. Setting CardData to empty.


Prepared evaluation slice with 5000 samples.


In [None]:
import pandas as pd # For easier handling later
from tqdm.auto import tqdm # For progress bar

generated_responses = []
reference_responses = []

# Define max_new_tokens for generation - adjust based on expected output length
# Your example used 64, which might be short for some OWCA tasks.
MAX_NEW_TOKENS_EVAL = 256 # Or higher if needed

if raw_eval_slice:
    print(f"\nGenerating responses for {len(raw_eval_slice)} evaluation samples...")
    for example in tqdm(raw_eval_slice):
        instruction = example.get("instruction", "").strip()
        input_text = example.get("input", "").strip()
        reference_output = example.get("output", "").strip()

        messages_for_prompt = []
        if input_text:
            content = f"{instruction}\n\n{input_text}"
        else:
            content = instruction
        messages_for_prompt.append({"role": "user", "content": content})

        # Apply chat template to create the prompt string for generation
        # add_generation_prompt=True is important for inference.
        prompt_string = tokenizer.apply_chat_template(
            messages_for_prompt,
            tokenize=False,
            add_generation_prompt=True # Crucial for telling the model to generate next
        )

        inputs = tokenizer(prompt_string, return_tensors="pt").to("cuda")

        with torch.no_grad():
            outputs_tokens = model.generate(
                **inputs,
                max_new_tokens=MAX_NEW_TOKENS_EVAL,
                use_cache=True,
                pad_token_id=tokenizer.eos_token_id # Important for proper stopping
                # You can add other generation parameters like:
                # do_sample=True, top_k=50, top_p=0.9, temperature=0.7,
            )

        # Decode the full output (prompt + generation)
        full_decoded_output = tokenizer.batch_decode(outputs_tokens, skip_special_tokens=True)[0]

        # Extract only the generated part (assistant's response)
        # This depends on how apply_chat_template formats with add_generation_prompt=True
        # Often, the prompt_string is part of full_decoded_output.
        # A robust way is to decode only the new tokens:
        input_length = inputs.input_ids.shape[1]
        generated_tokens_only = outputs_tokens[0, input_length:]
        decoded_generation = tokenizer.decode(generated_tokens_only, skip_special_tokens=True).strip()

        generated_responses.append(decoded_generation)
        reference_responses.append(reference_output)

    print(f"\nFinished generation. Generated {len(generated_responses)} responses.")
    # Example of first few generations:
    for i in range(min(3, len(generated_responses))):
        print(f"\n--- Example {i+1} ---")
        # Reconstruct prompt for display if needed (or get from raw_eval_slice)
        example_prompt_instr = raw_eval_slice[i]['instruction']
        example_prompt_input = raw_eval_slice[i]['input']
        if example_prompt_input:
            print(f"Instruction:\n{example_prompt_instr}\nInput:\n{example_prompt_input}")
        else:
            print(f"Instruction:\n{example_prompt_instr}")
        print(f"Reference Output:\n{reference_responses[i]}")
        print(f"Generated Output:\n{generated_responses[i]}")
else:
    print("Skipping generation as there is no evaluation data.")


Generating responses for 5000 evaluation samples...


  0%|          | 0/5000 [00:00<?, ?it/s]


Finished generation. Generated 5000 responses.

--- Example 1 ---
Instruction:
Proszę podać przykład danych, które mogą posłużyć do odpowiedzi na następujące pytanie.
Input:
Jaka jest średnia temperatura w Stanach Zjednoczonych?
Reference Output:

Generated Output:
Średnia temperatura w Stanach Zjednoczonych wynosi 13,4°C (57,8°F).

--- Example 2 ---
Instruction:
Proszę odnaleźć pierwotną przyczynę wystąpienia tego błędu oraz dokładnie ją wyjaśnić.
Input:
Program podaje mi komunikat o błędzie: "invalid syntax".
Reference Output:

Generated Output:


--- Example 3 ---
Instruction:
Wygeneruj pierwsze linie powieści osadzonej w przyszłości.
Reference Output:

Generated Output:



In [None]:
import evaluate as hf_evaluate # Renamed to avoid conflict if you have 'evaluate' variable

if generated_responses and reference_responses:
    print("\nCalculating ROUGE scores...")
    rouge_metric = hf_evaluate.load('rouge')
    rouge_scores = rouge_metric.compute(predictions=generated_responses, references=reference_responses)
    print(rouge_scores)

    print("\nCalculating BLEU score...")
    bleu_metric = hf_evaluate.load('sacrebleu') # SacreBLEU is often preferred
    # BLEU expects references to be a list of lists (for multiple references per prediction)
    # For OWCA, we typically have one reference.
    bleu_scores = bleu_metric.compute(predictions=generated_responses, references=[[r] for r in reference_responses])
    print(f"BLEU Score: {bleu_scores['score']:.4f}")
    # print(f"BLEU Details: {bleu_scores}")


    # Optional: METEOR (requires wordnet and punkt to be downloaded by nltk)
    print("\nCalculating METEOR score...")
    try:
        meteor_metric = hf_evaluate.load('meteor')
        meteor_scores = meteor_metric.compute(predictions=generated_responses, references=reference_responses)
        print(meteor_scores)
    except Exception as e:
        print(f"Could not compute METEOR score: {e}. Ensure NLTK's wordnet is downloaded.")

    #Optional: BERTScore (can be slow for 5000 samples without GPU acceleration for scoring)
    print("\nCalculating BERTScore (this might take a while)...")
    try:
    #     bertscore_metric = hf_evaluate.load("bertscore")
    #     # You might need to specify a model_type for bertscore if it doesn't auto-detect well
    #     # For Polish, "allegro/herbert-base-cased" or a multilingual one might be good choices.
    #     # Or let it use its default.
    #     # To run on GPU for scoring:
    #     # bert_scores = bertscore_metric.compute(predictions=generated_responses, references=reference_responses, lang="pl", device="cuda")
         bert_scores = bertscore_metric.compute(predictions=generated_responses, references=reference_responses, lang="pl") # lang="pl" for Polish
         avg_f1 = sum(bert_scores['f1']) / len(bert_scores['f1'])
         print(f"BERTScore (Average F1): {avg_f1:.4f}")
     except Exception as e:
         print(f"Could not compute BERTScore: {e}")

else:
    print("Skipping metric calculation as no responses were generated.")