In [None]:

import os
IS_COLAB = "COLAB_GPU" in os.environ

if IS_COLAB:
    print("Setting up for Google Colab...")
    !pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
    !pip install --no-deps xformers "trl<0.9.0" peft accelerate bitsandbytes
    !pip install "datasets>=2.16.0"
else:
    print("Not in Colab. Installing with standard pip.")
    !pip install "unsloth[conda] @ git+https://github.com/unslothai/unsloth.git"

Setting up for Google Colab...
Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-lrpyhmf2/unsloth_7eefbded302244f8ade1b339890b1534
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-lrpyhmf2/unsloth_7eefbded302244f8ade1b339890b1534
  Resolved https://github.com/unslothai/unsloth.git to commit 73dd1227b09ab5e4d8a151e15410a643c99a1e82
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting unsloth_zoo>=2025.8.9 (from unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Downloading unsloth_zoo-2025.8.9-py3-none-any.whl.metadata (9.5 kB)
Collecting tyro (from unsloth@ git+https://g

In [None]:
from unsloth import FastLanguageModel
import torch

from datasets import load_dataset
from collections import Counter
from tqdm.auto import tqdm
import pandas as pd

from trl import SFTTrainer
from transformers import TrainingArguments


from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report, accuracy_score, hamming_loss

# --- Model configuration ---
# Sequence-length cap passed to the loader here and reused later by the trainer.
max_seq_length = 2048
# Gemma-3 270M instruction-tuned checkpoint: small and fast to fine-tune.
model_name = "unsloth/gemma-3-270m-it"

# Load the base model together with its tokenizer.
# dtype=None lets the loader auto-detect the precision
# (float16 on Tesla T4 / V100, bfloat16 on Ampere and newer).
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=None,
)

# Projection layers that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# LoRA hyper-parameters, gathered in one place so they are easy to tune.
lora_settings = dict(
    r=16,  # suggested rank for small models
    target_modules=lora_target_modules,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

# Wrap the base model with LoRA adapters for memory-efficient fine-tuning (PEFT).
model = FastLanguageModel.get_peft_model(model, **lora_settings)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.8.10: Fast Gemma3 patching. Transformers: 4.55.4.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Using float16 precision for gemma3 won't work! Using float32.


model.safetensors:   0%|          | 0.00/393M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/233 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/670 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

Unsloth: Making `model.base_model.model.model` require gradients


In [None]:
pip freeze

absl-py==1.4.0
absolufy-imports==0.3.1
accelerate==1.10.1
aiofiles==24.1.0
aiohappyeyeballs==2.6.1
aiohttp==3.12.15
aiosignal==1.4.0
alabaster==1.0.0
albucore==0.0.24
albumentations==2.0.8
ale-py==0.11.2
altair==5.5.0
annotated-types==0.7.0
antlr4-python3-runtime==4.9.3
anyio==4.10.0
anywidget==0.9.18
argon2-cffi==25.1.0
argon2-cffi-bindings==25.1.0
array_record==0.8.1
arviz==0.22.0
astropy==7.1.0
astropy-iers-data==0.2025.8.25.0.36.58
astunparse==1.6.3
atpublic==5.1
attrs==25.3.0
audioread==3.0.1
Authlib==1.6.2
autograd==1.8.0
babel==2.17.0
backcall==0.2.0
beartype==0.21.0
beautifulsoup4==4.13.5
betterproto==2.0.0b6
bigframes==2.17.0
bigquery-magics==0.10.3
bitsandbytes==0.47.0
bleach==6.2.0
blinker==1.9.0
blis==1.3.0
blobfile==3.0.0
blosc2==3.7.2
bokeh==3.7.3
Bottleneck==1.4.2
bqplot==0.12.45
branca==0.8.1
Brotli==1.1.0
build==1.3.0
CacheControl==0.14.3
cachetools==5.5.2
catalogue==2.0.10
certifi==2025.8.3
cffi==1.17.1
chardet==5.2.0
charset-normalizer==3.4.3
chex==0.1.90
clarabel==0

In [None]:
from datasets import load_dataset
from collections import Counter
from tqdm.auto import tqdm
import pandas as pd

from trl import SFTTrainer
from transformers import TrainingArguments


from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report, accuracy_score, hamming_loss

# --- Sampling configuration ---
# Parent categories (the part of an arXiv tag before the first '.') kept as labels.
CATEGORIES_TO_SELECT = [
    'math', 'astro-ph', 'cs', 'cond-mat', 'physics',
    'hep-ph', 'quant-ph', 'hep-th'
]
# The scan stops once every label above has been seen at least this many times.
SAMPLES_PER_CATEGORY_APPEARANCE = 2500
RANDOM_STATE = 42

# --- Multi-label data sampling ---
print("--- Step 1: Multi-Label Data Sampling & Preparation ---")
category_counts = dict.fromkeys(CATEGORIES_TO_SELECT, 0)
samples = []
# Stream the dataset so that records are consumed one at a time while scanning.
dataset_generator = load_dataset("UniverseTBD/arxiv-abstracts-large", split="train", streaming=True)

for record in tqdm(dataset_generator, desc="Scanning for samples"):
    # Stop as soon as no category is still below its quota.
    still_short = any(
        seen < SAMPLES_PER_CATEGORY_APPEARANCE for seen in category_counts.values()
    )
    if not still_short:
        break

    raw_categories = record['categories']
    abstract_text = record['abstract']
    if raw_categories is None or abstract_text is None:
        continue

    # Reduce each space-separated tag to its parent, e.g. 'math.GT' -> 'math'.
    parent_categories = {tag.split('.')[0] for tag in raw_categories.strip().split(' ')}
    # Keep only the parents we classify on; skip records that have none of them.
    relevant_categories = sorted(parent_categories.intersection(CATEGORIES_TO_SELECT))
    if not relevant_categories:
        continue

    samples.append({'abstract': abstract_text, 'categories': relevant_categories})
    for label in relevant_categories:
        category_counts[label] += 1

print(f"Finished sampling. Total samples collected: {len(samples)}")
df_samples = pd.DataFrame(samples)

# --- Data formatting for instruction fine-tuning ---
# The classification task is recast as text-to-text: every sample gets a 'text'
# column holding the full prompt (instruction, category list, abstract) followed
# by the expected answer in the model turn.

# Placeholders, in order: category list, abstract, target labels.
prompt_template = """<start_of_turn>user
Classify the following scientific abstract into one or more of the predefined categories.

Categories: {}

Abstract:
{}<end_of_turn>
<start_of_turn>model
{}<end_of_turn>"""

# The same category list is shown in every prompt.
category_list_str = ", ".join(CATEGORIES_TO_SELECT)


def format_prompt(example):
    """Attach the formatted training prompt to one sample.

    Args:
        example: Mapping-like row with an 'abstract' string and a
            'categories' sequence of label strings.

    Returns:
        The same ``example`` object, with a new 'text' entry containing the
        filled-in prompt template. The labels are rendered as a
        comma-separated string in the model turn.
    """
    joined_labels = ", ".join(example['categories'])
    example['text'] = prompt_template.format(
        category_list_str,
        example['abstract'],
        joined_labels,
    )
    return example

# Add the formatted 'text' column by running format_prompt over every row.
df_samples = df_samples.apply(format_prompt, axis=1)

# Hand the frame over to Hugging Face `datasets`, then hold out 10% for testing.
from datasets import Dataset
full_dataset = Dataset.from_pandas(df_samples)
dataset = full_dataset.train_test_split(test_size=0.1, seed=RANDOM_STATE)
train_dataset, test_dataset = dataset["train"], dataset["test"]

# Show one formatted sample as a sanity check of the prompt layout.
print("\nExample of a formatted training prompt:")
print(train_dataset[0]['text'])

--- Step 1: Multi-Label Data Sampling & Preparation ---


README.md:   0%|          | 0.00/810 [00:00<?, ?B/s]

Scanning for samples: 0it [00:00, ?it/s]

Finished sampling. Total samples collected: 42436

Example of a formatted training prompt:
<start_of_turn>user
Classify the following scientific abstract into one or more of the predefined categories.

Categories: math, astro-ph, cs, cond-mat, physics, hep-ph, quant-ph, hep-th

Abstract:
  Starting from the (apparently) elementary problem of deciding how many
different topological spaces can be obtained by gluing together in pairs the
faces of an octahedron, we will describe the central role played by hyperbolic
geometry within three-dimensional topology. We will also point out the striking
difference with the two-dimensional case, and we will review some of the
results of the combinatorial and computational approach to three-manifolds
developed by different mathematicians over the last several years.
<end_of_turn>
<start_of_turn>model
math<end_of_turn>


In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments

# Choose the mixed-precision mode from hardware support:
# bf16 where the GPU supports it, fp16 otherwise.
use_bf16 = torch.cuda.is_bf16_supported()

# Optimisation hyper-parameters for the fine-tuning run.
training_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # effective batch size: 2 * 4 = 8
    warmup_steps=5,
    num_train_epochs=2,
    learning_rate=5e-5,
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=1,  # log the loss at every optimisation step
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="outputs",
)

# Supervised fine-tuning on the pre-formatted 'text' column; packing is left
# off to keep one sample per sequence.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    args=training_args,
)

# Run the training loop and keep the returned statistics for later inspection.
trainer_stats = trainer.train()

Map (num_proc=2):   0%|          | 0/38192 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 38,192 | Num Epochs = 2 | Total steps = 9,548
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 3,796,992 of 271,895,168 (1.40% trained)


Step,Training Loss
1,3.6533
2,3.6603
3,3.7529
4,3.2359
5,3.5586
6,3.523
7,3.4681
8,3.5927
9,3.6296
10,3.6321


In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report, accuracy_score, hamming_loss

# --- Evaluation on the held-out test split ---
# Predictions are produced by generating text for each test abstract, parsing
# the generated labels, and comparing them with the true labels as a
# multi-label indicator matrix.
print("\n--- Starting Evaluation on Test Set ---")

# Binarize the true labels; the column order follows CATEGORIES_TO_SELECT.
true_labels_str = [sorted(x) for x in test_dataset['categories']]
mlb = MultiLabelBinarizer(classes=CATEGORIES_TO_SELECT)
Y_true = mlb.fit_transform(true_labels_str)

# Same prompt as used for training, but ending where the model's turn begins
# so that the model has to produce the labels itself.
eval_prompt_template = """<start_of_turn>user
Classify the following scientific abstract into one or more of the predefined categories.

Categories: {}

Abstract:
{}<end_of_turn>
<start_of_turn>model"""


def parse_predicted_labels(answer, allowed_labels):
    """Turn the generated answer text into a sorted list of valid labels.

    Args:
        answer: Text generated by the model for one example (the prompt must
            already be removed).
        allowed_labels: Collection of label strings considered valid.

    Returns:
        Sorted list of the distinct comma-separated items in ``answer`` that
        are members of ``allowed_labels``. Anything after an
        ``<end_of_turn>`` marker is ignored.
    """
    # Guard against the end-of-turn marker surviving decoding as plain text.
    answer = answer.split("<end_of_turn>")[0]
    candidates = {piece.strip() for piece in answer.split(',')}
    return sorted(label for label in candidates if label in allowed_labels)


predicted_labels_str = []
for example in tqdm(test_dataset, desc="Generating predictions"):
    prompt = eval_prompt_template.format(category_list_str, example['abstract'])
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    # Greedy decoding (do_sample=False), generated once per example.
    outputs = model.generate(
        **inputs,
        max_new_tokens=20,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=False,
    )

    # Decode only the newly generated tokens. Decoding the full sequence and
    # splitting on "<start_of_turn>model" is unreliable: if the turn markers are
    # stripped by skip_special_tokens=True, the whole prompt (including its
    # "Categories: ..." line) would be parsed as the model's answer.
    generated_tokens = outputs[0][prompt_length:]
    answer = tokenizer.decode(generated_tokens, skip_special_tokens=True)

    predicted_labels_str.append(parse_predicted_labels(answer, CATEGORIES_TO_SELECT))

# Binarize predicted labels with the same column order as Y_true.
Y_pred = mlb.transform(predicted_labels_str)

# Subset accuracy counts an example as correct only if every label matches.
subset_acc = accuracy_score(Y_true, Y_pred)
hamming = hamming_loss(Y_true, Y_pred)
report = classification_report(Y_true, Y_pred, target_names=CATEGORIES_TO_SELECT, zero_division=0)

# --- RESULTS ---
print("\n" + "="*50)
print("--- Gemma Fine-Tuning Multi-Label Results ---")
print("="*50)
print(f"Overall Subset Accuracy: {subset_acc:.4f}")
print(f"Hamming Loss: {hamming:.4f}\n")
print("Per-Category Performance:")
print(report)


--- Starting Evaluation on Test Set ---


NameError: name 'test_dataset' is not defined

In [None]:
# Write the LoRA adapter weights and the tokenizer files into the same folder
# so they can be reloaded together later.
for artifact in (model, tokenizer):
    artifact.save_pretrained("gemma_classifier_lora")

print("Gemma model with LoRA adapters saved to 'gemma_classifier_lora'")

# A fully merged model for easy deployment can be exported instead with:
# model.save_pretrained_merged("gemma_classifier_merged", tokenizer, save_method = "merged_16bit")

Gemma model with LoRA adapters saved to 'gemma_classifier_lora'


In [None]:
# --- FINAL STEP: export a standalone model with the adapters merged in ---
merged_output_dir = "gemma_classifier_merged"

print("Merging LoRA adapters into the base model and saving...")

# save_method="merged_16bit" writes the merged weights in float16,
# a standard precision for inference.
model.save_pretrained_merged(merged_output_dir, tokenizer, save_method="merged_16bit")

print("Fully merged model saved to 'gemma_classifier_merged'.")
print("This folder is now a standalone model ready for deployment or sharing.")

Merging LoRA adapters into the base model and saving...


config.json: 0.00B [00:00, ?B/s]

Found HuggingFace hub cache directory: /root/.cache/huggingface/hub
Checking cache directory for required files...
Cache check failed: model.safetensors not found in local cache.
Not all required files found in cache. Will proceed with downloading.


Unsloth: Merging weights into 16bit:   0%|          | 0/1 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/536M [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit: 100%|██████████| 1/1 [01:21<00:00, 81.26s/it]


Fully merged model saved to 'gemma_classifier_merged'.
This folder is now a standalone model ready for deployment or sharing.
