# Full Finetuning with SmolLM2-135M for News Topic Classification

In [1]:
%%capture
!pip install -q transformers datasets accelerate sentencepiece huggingface_hub wandb

In [2]:
import os, random, numpy as np, torch, platform
import wandb
from huggingface_hub import login

print("Transformers:", __import__("transformers").__version__)
print("Datasets:", __import__("datasets").__version__)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("Device:", torch.cuda.get_device_name(0))

random.seed(42); np.random.seed(42); torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

Transformers: 4.44.2
Datasets: 4.0.0
CUDA available: True
Device: Tesla T4


# Authenticate to Hugging Face and Weights & Biases (W&B)

In [3]:
from getpass import getpass

hf_token = getpass("üîë Enter your Hugging Face token (press Enter to skip): ")
wb_token = getpass("üîë Enter your Weights & Biases token (or leave blank to skip): ")

if hf_token.strip():
    login(hf_token)
else:
    print("HF login skipped.")

if wb_token.strip():
    wandb.login(key=wb_token)
    run = wandb.init(project="Full-FT-SmolLM2-AGNews", job_type="training", anonymous="allow")
else:
    os.environ["WANDB_DISABLED"] = "true"
    print("W&B disabled for this run.")

üîë Enter your Hugging Face token (press Enter to skip): ¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑
üîë Enter your Weights & Biases token (or leave blank to skip): ¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33maditya_rajpurohit[0m ([33maditya_rajpurohit-san-jose-state-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


# Load SmolLM2-135M for FULL finetuning

In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer

base_model = "HuggingFaceTB/SmolLM2-135M"

tokenizer = AutoTokenizer.from_pretrained(base_model, use_fast=True, token=hf_token if hf_token.strip() else None)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

model = AutoModelForCausalLM.from_pretrained(
    base_model,
    torch_dtype=(torch.bfloat16 if use_bf16 else (torch.float16 if torch.cuda.is_available() else torch.float32)),
    device_map="auto",
    token=hf_token if hf_token.strip() else None,
)

# memory saver during full fine-tuning
if hasattr(model, "gradient_checkpointing_enable"):
    model.gradient_checkpointing_enable()

print("‚úÖ Model ready for FULL finetuning (all weights trainable).")

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/831 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/704 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/269M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

‚úÖ Model ready for FULL finetuning (all weights trainable).


# Load topic classification data (AG News)

In [5]:
from datasets import load_dataset

def load_topic_data():
    try:
        ds = load_dataset("ag_news")
        print("‚úÖ Loaded dataset: ag_news")
        return ds, "ag_news"
    except Exception as e:
        print("‚ö†Ô∏è Could not load 'ag_news' ‚Üí", e)
        print("‚Ü™Ô∏è Falling back to 'dbpedia_14'...")
        ds = load_dataset("dbpedia_14")
        print("‚úÖ Loaded dataset: dbpedia_14")
        return ds, "dbpedia_14"

raw, ds_name = load_topic_data()

if ds_name == "ag_news":
    train, test = raw["train"], raw["test"]
    valid = train.select(range(4000))
    train = train.select(range(4000, 4000+10000))
    text_field, label_field = "text", "label"
    LABELS = ["World", "Sports", "Business", "Sci/Tech"]
else:
    train, test = raw["train"], raw["test"]
    valid = train.select(range(4000))
    train = train.select(range(4000, 4000+10000))
    text_field, label_field = "content", "label"
    LABELS = [f"Class_{i}" for i in range(14)]


PROMPT = """You are an assistant that classifies a news article into ONE topic from this set:
{}

### Article:
{}

### Instructions:
Respond with exactly one topic from the set above (no extra words).

### Topic:
{}"""

EOS = tokenizer.eos_token
LABEL_MAP = {i: name for i, name in enumerate(LABELS)}
label_set_str = ", ".join(LABELS)

def format_topic(batch):
    texts = batch[text_field]
    labels = batch[label_field]
    out_texts = []
    for t, y in zip(texts, labels):
        gold = LABEL_MAP[int(y)] if int(y) in LABEL_MAP else str(y)
        out_texts.append(PROMPT.format(label_set_str, t, gold) + EOS)
    return {"text": out_texts}

train = train.map(format_topic, batched=True, remove_columns=train.column_names)
valid = valid.map(format_topic, batched=True, remove_columns=valid.column_names)

print("‚úÖ Sample formatted example:\n", train["text"][0][:800])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

‚úÖ Loaded dataset: ag_news


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

‚úÖ Sample formatted example:
 You are an assistant that classifies a news article into ONE topic from this set:
World, Sports, Business, Sci/Tech

### Article:
Court Deals Blow to Movie Studios (Reuters) Reuters - A federal appeals court on Thursday\delivered a stinging blow to the anti-piracy efforts of major\movie studios and music companies by ruling several Internet\file-sharing software companies are not liable for copyright\infringement.

### Instructions:
Respond with exactly one topic from the set above (no extra words).

### Topic:
Sci/Tech<|endoftext|>


# Configure Trainer & TrainingArguments (FULL FT)

In [6]:
from trl import SFTTrainer
from transformers import TrainingArguments

max_seq_length = 512

args = TrainingArguments(
    output_dir="smollm2_135m_fullft_agnews",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,
    num_train_epochs=2,
    learning_rate=5e-5,
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",
    weight_decay=0.05,
    logging_steps=20,
    save_strategy="epoch",
    evaluation_strategy="epoch",
    bf16=use_bf16,
    fp16=(not use_bf16 and torch.cuda.is_available()),
    report_to=("wandb" if os.environ.get("WANDB_DISABLED","false")!="true" else "none"),
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train,
    eval_dataset=valid,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    packing=False,
    args=args,
)

# parameter counts
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Params: total={total_params/1e6:.1f}M | trainable={trainable_params/1e6:.1f}M")

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Params: total=134.5M | trainable=134.5M


# Train the model

In [7]:
if torch.cuda.is_available():
    gpu = torch.cuda.get_device_properties(0)
    print(f"Using GPU: {gpu.name} ({round(gpu.total_memory/1e9, 2)} GB VRAM)")

train_result = trainer.train()

Using GPU: Tesla T4 (15.83 GB VRAM)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Epoch,Training Loss,Validation Loss
1,1.6107,1.584209
2,1.5483,1.581759


We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


# Runtime statistics

In [8]:
used_mem = round(torch.cuda.max_memory_reserved() / 1e9, 3) if torch.cuda.is_available() else "CPU"
mins = round(train_result.metrics.get("train_runtime", 0)/60, 2)
print(f"‚è± Runtime: {mins} minutes")
print(f"üíæ Peak reserved GPU memory: {used_mem} GB")

metrics = trainer.evaluate()
print("Eval metrics :", metrics)

‚è± Runtime: 29.1 minutes
üíæ Peak reserved GPU memory: 9.257 GB


Eval metrics : {'eval_loss': 1.581758737564087, 'eval_runtime': 83.9995, 'eval_samples_per_second': 47.619, 'eval_steps_per_second': 5.952, 'epoch': 2.0}


# Inference test on a fresh headlines

In [9]:
from transformers import pipeline

model.eval()

def build_infer_prompt(article_text: str) -> str:
    return f"""You are an assistant that classifies a news article into ONE topic from this set:
{', '.join(LABELS)}

### Article:
{article_text}

### Instructions:
Respond with exactly one topic from the set above (no extra words).

### Topic:
"""

gen = pipeline(
    "text-generation",
    model=trainer.model,
    tokenizer=tokenizer,
    torch_dtype=(torch.bfloat16 if use_bf16 else (torch.float16 if torch.cuda.is_available() else torch.float32)),
    device_map="auto",
)

samples = [
    "The central bank raised interest rates amid concerns about inflation in the manufacturing sector.",
    "The team clinched the championship title after a stunning overtime victory.",
    "Scientists have unveiled a new quantum processor that dramatically improves error rates.",
    "Trade tensions rise as global markets react to new tariffs on imported goods."
]

for s in samples:
    prompt = build_infer_prompt(s)
    out = gen(prompt, max_new_tokens=8, do_sample=False)  # greedy for stable labels
    print("Article:", s)
    print("Model output:", out[0]["generated_text"].split("### Topic:")[-1].strip())
    print("-"*60)

Article: The central bank raised interest rates amid concerns about inflation in the manufacturing sector.
Model output: Business/Economy

### Instructions
------------------------------------------------------------
Article: The team clinched the championship title after a stunning overtime victory.
Model output: Sports

### Instructions:
Resp
------------------------------------------------------------
Article: Scientists have unveiled a new quantum processor that dramatically improves error rates.
Model output: Sci/Tech

### Instructions:
------------------------------------------------------------
Article: Trade tensions rise as global markets react to new tariffs on imported goods.
Model output: Business

### Instructions:
Resp
------------------------------------------------------------


# Save model locally and Hugging Face

In [None]:
save_name_local = "SmolLM2-135M-AGNews-FullFT"
repo_id = "username/SmolLM2-135M-AGNews-FullFT"

trainer.save_model(save_name_local)
tokenizer.save_pretrained(save_name_local)

push = input("Push to Hugging Face Hub? (y/N): ").strip().lower()
if push == "y":
    trainer.model.push_to_hub(repo_id)
    tokenizer.push_to_hub(repo_id)
    print("‚úÖ Model pushed to HF Hub:", repo_id)
else:
    print("Saved locally in:", save_name_local)

Push to Hugging Face Hub? (y/N): y


Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...-FullFT/model.safetensors:   2%|2         | 6.02MB /  269MB            

README.md: 0.00B [00:00, ?B/s]

‚úÖ Model pushed to HF Hub: aditya-rajpurohit/SmolLM2-135M-AGNews-FullFT
