In [None]:
!pip install -U transformers



## Local Inference on GPU
Model page: https://huggingface.co/unsloth/Qwen3-0.6B-Base

⚠️ If the generated code snippets do not work, please open an issue on the [model repo](https://huggingface.co/unsloth/Qwen3-0.6B-Base) and/or on [huggingface.js](https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/src/model-libraries-snippets.ts) 🙏

In [None]:
# Quick-start path: build a text-generation pipeline straight from the Hub checkpoint.
from transformers import pipeline

pipe = pipeline(
    task="text-generation",
    model="unsloth/Qwen3-0.6B-Base",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.19G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/166 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/617 [00:00<?, ?B/s]

Device set to use cuda:0


In [None]:
# Lower-level path: create the tokenizer and the causal-LM as separate objects,
# using the library's default loading options.
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained(
    "unsloth/Qwen3-0.6B-Base",
)
model = AutoModelForCausalLM.from_pretrained(
    "unsloth/Qwen3-0.6B-Base",
)

In [None]:
# Load the tokenizer and the model that the rest of the notebook fine-tunes.
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

tokenizer = AutoTokenizer.from_pretrained("unsloth/Qwen3-0.6B-Base")

# Keep the weights in float32. The training cell enables mixed precision, which
# expects full-precision master weights; loading the weights themselves in
# float16 and then training with fp16 autocast is numerically fragile (the
# recorded run logged a training loss of 0.0 at every step).
model = AutoModelForCausalLM.from_pretrained(
    "unsloth/Qwen3-0.6B-Base",
    dtype=torch.float32,
    device_map="auto",
)

# Total parameters
total_params = sum(p.numel() for p in model.parameters())
print(f"Total parameters: {total_params:,}")

Total parameters: 596,049,920


In [None]:
from datasets import load_dataset

# Fetch the Alpaca-GPT4 instruction data and keep only its "train" split.
ds = load_dataset("vicgalle/alpaca-gpt4")["train"]
print(ds)

# Shuffle with a fixed seed so the subsets below are the same on every run.
dataset = ds.shuffle(seed=42)
print(dataset)

# First 2000 shuffled rows for training, the following 100 for testing.
train_dataset = dataset.select(range(0, 2000))
test_dataset = dataset.select(range(2000, 2100))

print(f"Train size: {len(train_dataset)}, Test size: {len(test_dataset)}")

Dataset({
    features: ['instruction', 'input', 'output', 'text'],
    num_rows: 52002
})
Dataset({
    features: ['instruction', 'input', 'output', 'text'],
    num_rows: 52002
})
Train size: 2000, Test size: 100


In [None]:
max_seq_length = 512

def preprocess(example):
    """Render one record as a tagged prompt and tokenize it for causal-LM training.

    Args:
        example: Mapping with "instruction" and "output" keys and an optional
            "input" key (treated as an empty string when absent).

    Returns:
        The tokenizer output, truncated/padded to ``max_seq_length``, with an
        added "labels" entry: a copy of "input_ids" in which every padding
        position (attention_mask == 0) is replaced by -100.
    """
    # Combine instruction + optional input + output
    prompt = (
        f"<system> You are a helpful assistant. </system>\n"
        f"<user>{example['instruction']}\n{example.get('input','')}</user>\n"
        f"<assistant>{example['output']}</assistant>"
    )

    tokenized = tokenizer(
        prompt,
        truncation=True,
        max_length=max_seq_length,
        padding="max_length"
    )

    # Labels for causal LM. Padding positions get -100, the index the
    # cross-entropy loss ignores, so the loss is computed on real tokens only.
    # The attention mask is used (not the pad token id) so a real token that
    # shares the pad id is not masked by accident.
    tokenized["labels"] = [
        token_id if is_real else -100
        for token_id, is_real in zip(tokenized["input_ids"], tokenized["attention_mask"])
    ]
    return tokenized

# Tokenize both splits one example at a time (train first, then test).
train_dataset, test_dataset = [
    split.map(preprocess, batched=False)
    for split in (train_dataset, test_dataset)
]

# Show the first processed record of each split as a sanity check.
for split in (train_dataset, test_dataset):
    print(split[0])

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

{'instruction': 'What would be the best type of exercise for a person who has arthritis?', 'input': '', 'output': "If a person has arthritis, low-impact exercises that are gentle on their joints are the best options. Some effective forms of low-impact exercises include:\n\n1. Walking: Walking is a simple, low-impact exercise that can help improve strength, balance and joint flexibility. \n\n2. Water exercises: Swimming or performing water aerobics are great ways to exercise in a weightless environment, which reduces pressure and strain on the joints. \n\n3. Yoga: Practicing yoga is a good way to gently stretch the muscles, improve flexibility and balance, and reduce joint stiffness. \n\n4. Cycling: Stationary cycling is another low-impact way to exercise, as it takes the weight off the joints, and can improve cardiovascular health, as well as leg strength. \n\n5. Tai Chi: Tai Chi is a slow, gentle movement that can help improve balance, flexibility and strength. It's also a meditative 

In [None]:
# Every example is padded to the same length, so len(input_ids) would only
# measure the padded size. Summing the attention mask counts real tokens.

# Number of tokens in train dataset (padding excluded)
total_train_tokens = sum(sum(x["attention_mask"]) for x in train_dataset)

# Number of tokens in test dataset (padding excluded)
total_test_tokens = sum(sum(x["attention_mask"]) for x in test_dataset)

print(f"Total train tokens: {total_train_tokens}")
print(f"Total test tokens: {total_test_tokens}")

Total train tokens: 1024000
Total test tokens: 51200


In [None]:
# Mean token count per example, derived from the totals computed earlier.
avg_train_tokens = total_train_tokens / train_dataset.num_rows
avg_test_tokens = total_test_tokens / test_dataset.num_rows

print(f"Average train tokens per example: {avg_train_tokens:.1f}")
print(f"Average test tokens per example: {avg_test_tokens:.1f}")

Average train tokens per example: 512.0
Average test tokens per example: 512.0


In [None]:
from peft import LoraConfig, PeftModel, get_peft_model, TaskType

# LoRA adapter settings: rank-8 adapters on the attention query and value
# projections only; bias terms are not trained.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=[
        "q_proj",
        "v_proj",
    ],
    bias="none"
)

# Apply LoRA to model. The assignment rebinds `model`, so the guard keeps this
# cell safe to re-run: an already wrapped model is not wrapped a second time.
# To apply a changed `lora_config`, reload the base model first.
if not isinstance(model, PeftModel):
    model = get_peft_model(model, lora_config)

# Check trainable parameters
model.print_trainable_parameters()

trainable params: 1,146,880 || all params: 597,196,800 || trainable%: 0.1920


In [None]:
from transformers import Trainer, TrainingArguments, DataCollatorForSeq2Seq

# Hyper-parameters for the LoRA fine-tuning run.
training_args = TrainingArguments(
    output_dir="./qwen3_lora",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,  # effective batch = 16
    learning_rate=1e-4,             # LoRA adapters are typically trained with a higher LR than full fine-tuning
    fp16=True,
    num_train_epochs=3,
    logging_steps=50,
    eval_strategy="epoch",          # actually evaluate on eval_dataset, once per epoch
    save_strategy="epoch",          # checkpoint once per epoch (a step-based save_steps would be ignored)
    save_total_limit=2,
    report_to="wandb"               # NOTE: prompts for a W&B login when no credentials are configured
)

# Pads inputs (and pads labels with -100) to a multiple of 8 within each batch.
data_collator = DataCollatorForSeq2Seq(tokenizer, pad_to_multiple_of=8)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    processing_class=tokenizer,     # replaces the deprecated `tokenizer=` argument
    data_collator=data_collator
)

trainer.train()

  trainer = Trainer(
The model is already on multiple devices. Skipping the move to device specified in `args`.
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 2


[34m[1mwandb[0m: You chose 'Use an existing W&B account'
[34m[1mwandb[0m: Logging into https://api.wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: Find your API key here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mjoseneto023dev[0m ([33mjoseneto023dev-microsoft[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
50,0.0
100,0.0
150,0.0
200,0.0
250,0.0
300,0.0
350,0.0


TrainOutput(global_step=375, training_loss=0.0, metrics={'train_runtime': 1016.2329, 'train_samples_per_second': 5.904, 'train_steps_per_second': 0.369, 'total_flos': 8139835441152000.0, 'train_loss': 0.0, 'epoch': 3.0})

In [None]:
# Persist the fine-tuned model to a local directory.
model.save_pretrained(save_directory="./qwen3_LoRA_tuning")

In [None]:
from transformers import pipeline

# Training leaves the model in train mode; switch to eval mode so dropout
# (including the LoRA dropout) is disabled while generating.
model.eval()

# Bound the generation with `max_new_tokens`. The model's generation config
# already sets a `max_new_tokens` default, which conflicts with `max_length`
# and triggers a warning, so `max_length=256` did not reliably cap the output.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=256,
)

# Same tag layout as the training prompts, left open at the assistant turn.
prompt = "<system> You are a helpful assistant. </system>\n<user>Explain gradient descent simply.</user>\n<assistant>"
output = pipe(prompt)
print(output[0]['generated_text'])

Device set to use cuda:0
`generation_config` default values have been modified to match model-specific defaults: {'max_new_tokens': 2048}. If this is not desired, please set these values explicitly.


<system> You are a helpful assistant. </system>
<user>Explain gradient descent simply.</user>
<assistant>Gradient descent is an optimization algorithm used in machine learning and data science to minimize the loss function (or error) in a model. The idea is to iteratively adjust the model parameters in the direction that reduces the error. Here's a breakdown:

1. **Loss Function**: The loss function measures how well the model's predictions match the actual data. For example, in a linear regression model, the loss function could be the mean squared error (MSE) between predicted and actual values.

2. **Gradient**: The gradient of the loss function with respect to the model parameters is a vector that indicates the direction of steepest decrease. In other words, it tells you the direction in which the loss function decreases most rapidly.

3. **Gradient Descent**: The algorithm updates the model parameters by moving in the direction opposite to the gradient. This is done iteratively, ty