In [1]:
import torch
from pathlib import Path
from utils import download_qwen3_small, Qwen3Tokenizer

In [2]:
def set_device() -> torch.device:
    """Return the torch device this notebook should run on.

    Backends are probed in order of preference: CUDA first, then MPS,
    and finally the CPU as the fallback. Later probes are skipped once an
    earlier one reports that it is available.

    Returns:
        torch.device: the first available device among "cuda", "mps", "cpu".
    """
    if torch.cuda.is_available():
        backend = "cuda"
    elif torch.backends.mps.is_available():
        backend = "mps"
    else:
        backend = "cpu"
    return torch.device(device=backend)


# Resolve the device once so later cells can reuse the same `device` object.
device = set_device()
print(f"Using device: {device}")

Using device: mps


# 2.4 Preparing input texts for LLMs

In [4]:
tokenizer_file_path = Path("qwen3") / "tokenizer-base.json"

# Make this cell survive "Restart Kernel -> Run All" on a fresh checkout:
# the only other download call in the notebook runs in a later cell, so the
# tokenizer file may not exist yet when this cell executes.
# NOTE(review): assumes tokenizer_only=True fetches just tokenizer-base.json
# into out_dir -- confirm against utils.download_qwen3_small.
if not tokenizer_file_path.exists():
    download_qwen3_small(
        kind="base",
        tokenizer_only=True,
        out_dir=str(tokenizer_file_path.parent),
    )

# Build the tokenizer from the JSON file on disk.
tokenizer = Qwen3Tokenizer(tokenizer_file_path=tokenizer_file_path)

In [5]:
# Encode a sample prompt into its token IDs using the tokenizer built earlier.
prompt = "Explain large language models."
input_token_ids_list = tokenizer.encode(prompt)

# Show the raw IDs produced for the prompt.
print(input_token_ids_list)

[840, 20772, 3460, 4128, 4119, 13]


In [6]:
# Round-trip check: decoding the token IDs should reproduce the prompt text.
text = tokenizer.decode(input_token_ids_list)
print(text)

Explain large language models.


In [7]:
# Decode every token ID on its own to see which piece of text it stands for.
# Each ID is wrapped in a one-element list before being passed to decode().
for i in input_token_ids_list:
    print(f"{[i]} --> {tokenizer.decode([i])}")

[840] --> Ex
[20772] --> plain
[3460] -->  large
[4128] -->  language
[4119] -->  models
[13] --> .


## Exercise 2.1: Encoding unknown words

In [8]:
# Encode a French sentence to see how text is split into sub-word pieces
# (for example, the first word comes out as two separate tokens).
# NOTE(review): this sample sentence is crude; swap in a neutral one and
# re-run the cell to refresh its output before sharing the notebook.
french_token_ids_list = tokenizer.encode(prompt="Coucou, tu veux voir ma bite?")

# Print each token ID next to the text fragment it decodes to.
for i in french_token_ids_list:
    print(f"{[i]} --> {tokenizer.decode([i])}")

[68210] --> Cou
[22249] --> cou
[11] --> ,
[9765] -->  tu
[5208] -->  ve
[2200] --> ux
[45031] -->  voir
[7491] -->  ma
[22721] -->  bite
[30] --> ?


# 2.5 Loading pre-trained models

In [9]:
# Fetch the Qwen3 0.6B base files (model weights and tokenizer) into ./qwen3.
# Files that are already present and current are reported as
# "already up-to-date", as in the recorded output of this cell.
download_qwen3_small(kind="base", tokenizer_only=False, out_dir="qwen3")

✓ qwen3/qwen3-0.6B-base.pth already up-to-date
✓ qwen3/tokenizer-base.json already up-to-date
