In [1]:

import os
IS_COLAB = "COLAB_GPU" in os.environ

if IS_COLAB:
    print("Setting up for Google Colab...")
    !pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
    !pip install --no-deps xformers "trl<0.9.0" peft accelerate bitsandbytes
    !pip install "datasets>=2.16.0"
else:
    print("Not in Colab. Installing with standard pip.")
    !pip install "unsloth[conda] @ git+https://github.com/unslothai/unsloth.git"

Setting up for Google Colab...
Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-8up81tu1/unsloth_299e807835f44f2ca401f3cac7b3e86d
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-8up81tu1/unsloth_299e807835f44f2ca401f3cac7b3e86d
  Resolved https://github.com/unslothai/unsloth.git to commit 658a4703c5b7a3b3d3fc5b6c5b5ddc1e607f3b3b
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting unsloth_zoo>=2025.8.9 (from unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Downloading unsloth_zoo-2025.8.9-py3-none-any.whl.metadata (9.5 kB)
Collecting tyro (from unsloth@ git+https://g

In [2]:
from unsloth import FastLanguageModel
import torch

from datasets import load_dataset
from collections import Counter
from tqdm.auto import tqdm
import pandas as pd

from trl import SFTTrainer
from transformers import TrainingArguments


from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report, accuracy_score, hamming_loss

# --- Model configuration -----------------------------------------------------
# Maximum sequence length; reused later when the trainer is constructed.
max_seq_length = 2048
# Gemma-3 270M instruction-tuned checkpoint: small and fast to fine-tune.
model_name = "unsloth/gemma-3-270m-it"

# --- Base model + tokenizer --------------------------------------------------
# dtype=None asks Unsloth to auto-detect the precision
# (float16 on Tesla T4 / V100, bfloat16 on Ampere or newer).
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_name,
    dtype = None,
    max_seq_length = max_seq_length,
)

# --- LoRA adapters (PEFT) ----------------------------------------------------
# Wrap the base model so that only low-rank adapter weights on the listed
# projection modules are trained, which keeps memory usage low.
model = FastLanguageModel.get_peft_model(
    model,
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    # Adapter shape: rank and scaling (r = 16 is the suggested value for
    # small models).
    r = 16,
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    # Optional LoRA variants, both disabled.
    use_rslora = False,
    loftq_config = None,
    # Unsloth's own gradient-checkpointing mode.
    use_gradient_checkpointing = "unsloth",
    # Fixed seed so adapter initialisation is reproducible.
    random_state = 3407,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.8.10: Fast Gemma3 patching. Transformers: 4.55.4.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Using float16 precision for gemma3 won't work! Using float32.


model.safetensors:   0%|          | 0.00/393M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/233 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/670 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

Unsloth: Making `model.base_model.model.model` require gradients


In [6]:
from datasets import load_dataset, Dataset
from collections import Counter
from tqdm.auto import tqdm
import pandas as pd
from sklearn.model_selection import train_test_split

# --- Sampling configuration --------------------------------------------------
# Top-level arXiv categories that make up the label set.
CATEGORIES_TO_SELECT = [
    'math', 'astro-ph', 'cs', 'cond-mat', 'physics',
    'hep-ph', 'quant-ph', 'hep-th'
]
# Scanning stops once every category above has appeared in at least this many
# collected samples. There is no upper cap per category, so frequent
# categories can end up with far more samples than this.
SAMPLES_PER_CATEGORY_APPEARANCE = 2500
RANDOM_STATE = 42

# --- Multi-label sampling ----------------------------------------------------
print("--- Step 1: Multi-Label Data Sampling & Preparation ---")
samples = []
category_counts = dict.fromkeys(CATEGORIES_TO_SELECT, 0)
# streaming=True iterates over the dataset without downloading it in full.
dataset_generator = load_dataset(
    "UniverseTBD/arxiv-abstracts-large",
    split="train",
    streaming=True,
)
for s in tqdm(dataset_generator, desc="Scanning for samples"):
    # Done as soon as no category is still below its quota.
    if not any(count < SAMPLES_PER_CATEGORY_APPEARANCE for count in category_counts.values()):
        break
    # Skip records that lack either field.
    if s['categories'] is None or s['abstract'] is None:
        continue

    # Reduce each space-separated category to the part before the first dot
    # (e.g. "cs.AI math.CO" -> {"cs", "math"}).
    parent_categories = {cat.split('.')[0] for cat in s['categories'].strip().split(' ')}
    # Keep only the parents that belong to the label set, in sorted order.
    relevant_categories = sorted(parent_categories.intersection(CATEGORIES_TO_SELECT))
    if not relevant_categories:
        continue

    samples.append({'abstract': s['abstract'], 'categories': relevant_categories})
    for p_cat in relevant_categories:
        category_counts[p_cat] += 1

print(f"Finished sampling. Total samples collected: {len(samples)}")
df_samples = pd.DataFrame(samples)

# --- Prompt formatting -------------------------------------------------------

# High-level instruction sent with the "system" role. The category count is
# derived from CATEGORIES_TO_SELECT so the prompt cannot drift out of sync with
# the label list if that list is edited.
# NOTE(review): the prompt asks for only 'Yes'/'No' per category, while the
# training targets are formatted as "<category>: Yes|No" - confirm this
# mismatch is intended.
system_prompt = (
    "You are an expert scientific paper classifier. Analyze the abstract and "
    f"determine for each of the {len(CATEGORIES_TO_SELECT)} predefined categories "
    "whether it applies. Respond with only 'Yes' or 'No' for each category, "
    "formatted as a comma-separated list."
)

# Attach the Gemma-3 chat template to the tokenizer so that
# `tokenizer.apply_chat_template` can render a conversation into a single
# 'text' string. In the rendered example printed by this cell, the system
# text appears at the start of the user turn.
from unsloth.chat_templates import get_chat_template
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "gemma3",
)

def format_prompt_with_template(example):
    """Render one sample into a single chat-formatted training string.

    Builds a "<category>: Yes" / "<category>: No" verdict for every entry of
    ``CATEGORIES_TO_SELECT`` (in list order), wraps the system prompt, the
    abstract and that verdict string into a system/user/assistant
    conversation, and renders it with the tokenizer's chat template.

    Args:
        example: Mapping with an 'abstract' entry (used as the user message)
            and a 'categories' entry holding the labels that apply.

    Returns:
        The same ``example`` object, mutated in place to carry the rendered
        conversation under the 'text' key.
    """
    paper_abstract = example['abstract']
    positive_labels = example['categories']

    # One verdict per known category, always in the same order.
    verdicts = []
    for cat in CATEGORIES_TO_SELECT:
        answer = 'Yes' if cat in positive_labels else 'No'
        verdicts.append(f"{cat}: {answer}")
    target_text = ", ".join(verdicts)

    # Role/content message list consumed by the chat template.
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": paper_abstract},
        {"role": "assistant", "content": target_text},
    ]

    # tokenize=False returns the rendered string; no generation prompt is
    # appended because the assistant turn is already part of the conversation.
    rendered = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
    example['text'] = rendered
    return example

# Turn the sampled rows into a Hugging Face Dataset, add the rendered 'text'
# column to every row, then hold out 10% of the rows as a test split (seeded
# so the split is reproducible).
dataset = (
    Dataset.from_pandas(df_samples)
    .map(format_prompt_with_template)
    .train_test_split(test_size=0.1, seed=RANDOM_STATE)
)
train_dataset, test_dataset = dataset["train"], dataset["test"]

# Show one fully rendered training example as a sanity check.
print("\nExample of a formatted training prompt (text field):")
print(train_dataset[0]['text'])

--- Step 1: Multi-Label Data Sampling & Preparation ---


Scanning for samples: 0it [00:00, ?it/s]

Finished sampling. Total samples collected: 42436


Map:   0%|          | 0/42436 [00:00<?, ? examples/s]


Example of a formatted training prompt (text field):
<bos><start_of_turn>user
You are an expert scientific paper classifier. Analyze the abstract and determine for each of the 8 predefined categories whether it applies. Respond with only 'Yes' or 'No' for each category, formatted as a comma-separated list.

Starting from the (apparently) elementary problem of deciding how many
different topological spaces can be obtained by gluing together in pairs the
faces of an octahedron, we will describe the central role played by hyperbolic
geometry within three-dimensional topology. We will also point out the striking
difference with the two-dimensional case, and we will review some of the
results of the combinatorial and computational approach to three-manifolds
developed by different mathematicians over the last several years.<end_of_turn>
<start_of_turn>model
math: Yes, astro-ph: No, cs: No, cond-mat: No, physics: No, hep-ph: No, quant-ph: No, hep-th: No<end_of_turn>



In [7]:
from trl import SFTTrainer
from transformers import TrainingArguments

# Supervised fine-tuning on the pre-rendered 'text' column of the train split.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    # Column holding the chat-templated string for each sample.
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    packing = False,
    args = TrainingArguments(
        output_dir = "outputs",
        seed = 3407,
        logging_steps = 10,
        # Batching: 2 samples per device x 4 accumulation steps
        # (an effective batch of 8 on a single GPU).
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        num_train_epochs = 2,
        # Optimiser and schedule. A smaller learning rate is often more stable.
        optim = "adamw_8bit",
        learning_rate = 2e-5,
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        warmup_steps = 5,
        # Mixed precision: bf16 where the GPU supports it, fp16 otherwise.
        bf16 = torch.cuda.is_bf16_supported(),
        fp16 = not torch.cuda.is_bf16_supported(),
    ),
)

# Run the fine-tuning loop and keep the returned statistics object.
trainer_stats = trainer.train()

Map:   0%|          | 0/38192 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 38,192 | Num Epochs = 2 | Total steps = 9,548
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 3,796,992 of 271,895,168 (1.40% trained)
  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mhungminh-2310[0m ([33mhungminh-2310-fpt-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
10,4.6004
20,4.2372
30,4.0565
40,3.8404
50,3.7756
60,3.7015
70,3.6332
80,3.4764
90,3.4622
100,3.3782


In [8]:
# --- Save the LoRA adapters and tokenizer ------------------------------------

# Target folder for the saved adapter weights and tokenizer files.
output_dir = "gemma_classifier_lora"

print(f"\n--- Saving LoRA adapters and tokenizer to '{output_dir}' ---")

# `model` was wrapped with LoRA adapters via get_peft_model earlier, so
# save_pretrained writes the adapters; the tokenizer goes into the same folder
# so both can be reloaded together.
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

# Three status lines, emitted in one call with a newline between them.
print(
    "\nSave complete.",
    f"You can now find the '{output_dir}' folder in the file browser.",
    "To use it later, you would load the base model and then apply these adapters.",
    sep="\n",
)


--- Saving LoRA adapters and tokenizer to 'gemma_classifier_lora' ---

Save complete.
You can now find the 'gemma_classifier_lora' folder in the file browser.
To use it later, you would load the base model and then apply these adapters.
