In [1]:
import os
os.environ["TRANSFORMERS_VERBOSITY"] = "info"
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from trl import SFTTrainer, SFTConfig
from model import HuggingfaceModel

In [2]:
dataset = load_dataset("stanfordnlp/imdb", split="train")
dataset = dataset.select(range(1000))

In [3]:
from utils import cast_datast_to_instruction_format
dataset = cast_datast_to_instruction_format(dataset, "text", "label")

In [4]:
dataset

Dataset({
    features: ['prompt', 'completion'],
    num_rows: 1000
})

In [6]:
dataset[11]

{'prompt': "I can't believe that those praising this movie herein aren't thinking of some other film. I was prepared for the possibility that this would be awful, but the script (or lack thereof) makes for a film that's also pointless. On the plus side, the general level of craft on the part of the actors and technical crew is quite competent, but when you've got a sow's ear to work with you can't make a silk purse. Ben G fans should stick with just about any other movie he's been in. Dorothy S fans should stick to Galaxina. Peter B fans should stick to Last Picture Show and Target. Fans of cheap laughs at the expense of those who seem to be asking for it should stick to Peter B's amazingly awful book, Killing of the Unicorn.",
 'completion': '0'}

In [7]:
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-0.6B")

loading configuration file config.json from cache at /home/bo/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B/snapshots/e6de91484c29aa9480d55605af694f39b081c455/config.json
Model config Qwen3Config {
  "architectures": [
    "Qwen3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151645,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 40960,
  "max_window_layers": 28,
  "model_type": "qwen3",
  "num_attention_heads": 16,
  "num_hidden_layers": 28,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 1000000,
  "sliding_window": null,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.52.4",
  "use_cache": true,
  "use_sliding_window": false,
  "vocab_size": 151936
}

loading weights file model.safetensors from cache at /home/bo/.cache/huggi

In [8]:
training_args = SFTConfig(
    # learning_rate=5e-5,
    num_train_epochs=1,
    save_steps=30,
    max_length=512, 
    # per_device_train_batch_size=6,
    logging_steps=10,
    report_to="tensorboard", 
    eos_token="<|im_end|>",
    output_dir="qwen3-0.6B-imdb"
)

trainer = SFTTrainer(
    model,
    train_dataset=dataset,
    args=training_args,
)

trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading file vocab.json from cache at /home/bo/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B/snapshots/e6de91484c29aa9480d55605af694f39b081c455/vocab.json
loading file merges.txt from cache at /home/bo/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B/snapshots/e6de91484c29aa9480d55605af694f39b081c455/merges.txt
loading file tokenizer.json from cache at /home/bo/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B/snapshots/e6de91484c29aa9480d55605af694f39b081c455/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /home/bo/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B/snapshots/e6d

Converting train dataset to ChatML:   0%|          | 0/1000 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

[2025-06-14 07:48:13,342] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-06-14 07:48:15,602] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False


The following columns in the Training set don't have a corresponding argument in `Qwen3ForCausalLM.forward` and have been ignored: completion, prompt. If completion, prompt are not expected by `Qwen3ForCausalLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1,000
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 125
  Number of trainable parameters = 596,049,920


Step,Training Loss


Saving model checkpoint to qwen3-0.6B-imdb/checkpoint-30
Configuration saved in qwen3-0.6B-imdb/checkpoint-30/config.json
Configuration saved in qwen3-0.6B-imdb/checkpoint-30/generation_config.json
Model weights saved in qwen3-0.6B-imdb/checkpoint-30/model.safetensors
chat template saved in qwen3-0.6B-imdb/checkpoint-30/chat_template.jinja
tokenizer config file saved in qwen3-0.6B-imdb/checkpoint-30/tokenizer_config.json
Special tokens file saved in qwen3-0.6B-imdb/checkpoint-30/special_tokens_map.json
Saving model checkpoint to qwen3-0.6B-imdb/checkpoint-60
Configuration saved in qwen3-0.6B-imdb/checkpoint-60/config.json
Configuration saved in qwen3-0.6B-imdb/checkpoint-60/generation_config.json
Model weights saved in qwen3-0.6B-imdb/checkpoint-60/model.safetensors
chat template saved in qwen3-0.6B-imdb/checkpoint-60/chat_template.jinja
tokenizer config file saved in qwen3-0.6B-imdb/checkpoint-60/tokenizer_config.json
Special tokens file saved in qwen3-0.6B-imdb/checkpoint-60/special_

TrainOutput(global_step=125, training_loss=0.19899940490722656, metrics={'train_runtime': 1527.4019, 'train_samples_per_second': 0.655, 'train_steps_per_second': 0.082, 'total_flos': 1244401609211904.0, 'train_loss': 0.19899940490722656})

In [10]:
hfmodel = HuggingfaceModel("/home/bo/workspace/training/qwen3-0.6B-imdb/checkpoint-120")
# model_name = "models/Qwen3-4B"
# model_name = "Qwen/Qwen3-0.6B"
# model_name = "Qwen/Qwen3-0.6B-base"
# model_name = "opt-350m-imdb/checkpoint-5500"
# model = HuggingfaceModel(model_name)

loading file vocab.json
loading file merges.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading file chat_template.jinja
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
loading configuration file /home/bo/workspace/training/qwen3-0.6B-imdb/checkpoint-120/config.json
Model config Qwen3Config {
  "architectures": [
    "Qwen3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151645,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 40960,
  "max_window_layers": 28,
  "model_type": "qwen3",
  "num_attention_heads": 16,
  "num_hidden_layers": 28,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 1000000,
  "sliding_window":

Model /home/bo/workspace/training/qwen3-0.6B-imdb/checkpoint-120 loaded successfully. Device: cuda:0


In [11]:
hfmodel.generate("what about it", temperature=0.1)

'0'

In [12]:
model_inputs = hfmodel.tokenizer(["this is great"], return_tensors="pt").to(
    hfmodel.model.device
)
model_outputs = hfmodel.model.generate(
    **model_inputs, max_new_tokens=512, temperature=0.1
)
output_ids = model_outputs[0][len(model_inputs.input_ids[0]) :].tolist()
response = hfmodel.tokenizer.decode(
    output_ids, skip_special_tokens=True
).strip("\n")

In [13]:
response

'0'