In [1]:
import time
import torch
from pathlib import Path
from utils import download_qwen3_small, Qwen3Tokenizer
from qwen3 import Qwen3Model, QWEN_CONFIG_06_B, KVCache
import torchinfo
from typing import Optional, Generator, Tuple

In [2]:
def set_device() -> torch.device:
    if torch.cuda.is_available():
        return torch.device(device="cuda")
    elif torch.backends.mps.is_available():
        return torch.device(device="mps")
    else:
        return torch.device(device="cpu")


device = set_device()
print(f"Using device: {device}")

Using device: mps


# 2.4 Preparing input texts for LLMs

In [3]:
tokenizer_file_path = Path("qwen3") / "tokenizer-base.json"
tokenizer = Qwen3Tokenizer(tokenizer_file_path=tokenizer_file_path)

In [4]:
prompt = "Explain large language models."
input_token_ids_list = tokenizer.encode(prompt)

print(input_token_ids_list)

[840, 20772, 3460, 4128, 4119, 13]


In [5]:
text = tokenizer.decode(input_token_ids_list)
print(text)

Explain large language models.


In [6]:
for i in input_token_ids_list:
    print(f"{[i]} --> {tokenizer.decode([i])}")

[840] --> Ex
[20772] --> plain
[3460] -->  large
[4128] -->  language
[4119] -->  models
[13] --> .


Exercise 2.1: Encoding unknown words

In [7]:
french_token_ids_list = tokenizer.encode(prompt="Coucou, tu veux voir ma bite?")

for i in french_token_ids_list:
    print(f"{[i]} --> {tokenizer.decode([i])}")

[68210] --> Cou
[22249] --> cou
[11] --> ,
[9765] -->  tu
[5208] -->  ve
[2200] --> ux
[45031] -->  voir
[7491] -->  ma
[22721] -->  bite
[30] --> ?


# 2.5 Loading pre-trained models

In [8]:
download_qwen3_small(kind="base", tokenizer_only=False, out_dir="qwen3")

✓ qwen3/qwen3-0.6B-base.pth already up-to-date
✓ qwen3/tokenizer-base.json already up-to-date


In [9]:
model_path = Path("qwen3") / "qwen3-0.6B-base.pth"
model = Qwen3Model(cfg=QWEN_CONFIG_06_B)

In [10]:
text = "Hello, how are you today?"

ids = tokenizer.encode(text)
input_ids = torch.tensor(ids, dtype=torch.long).unsqueeze(0)
input_ids = input_ids.to(device)

In [11]:
model.load_state_dict(torch.load(model_path))
model.to(device)

torchinfo.summary(
    model=model,
    input_data=input_ids,
    verbose=0,
    col_names=["input_size", "output_size", "num_params", "trainable"],
    col_width=20,
    row_settings=["var_names"]
)

Layer (type (var_name))                       Input Shape          Output Shape         Param #              Trainable
Qwen3Model (Qwen3Model)                       [1, 7]               [1, 7, 151936]       --                   True
├─Embedding (tok_emb)                         [1, 7]               [1, 7, 1024]         155,582,464          True
├─ModuleList (trf_blocks)                     --                   --                   --                   True
│    └─TransformerBlock (0)                   [1, 7, 1024]         [1, 7, 1024]         --                   True
│    │    └─RMSNorm (norm1)                   [1, 7, 1024]         [1, 7, 1024]         1,024                True
│    │    └─GroupedQueryAttention (att)       [1, 7, 1024]         [1, 7, 1024]         6,291,712            True
│    │    └─RMSNorm (norm2)                   [1, 7, 1024]         [1, 7, 1024]         1,024                True
│    │    └─FeedForward (ff)                  [1, 7, 1024]         [1, 7, 1024]    

# 2.6 Understanding the sequential LLM text generation process

In [12]:
prompt = "Explain large language models."
input_token_ids_list = tokenizer.encode(prompt)
print(f"Number of input tokens: {len(input_token_ids_list)}")
 
input_tensor = torch.tensor(input_token_ids_list)
input_tensor_fmt = input_tensor.unsqueeze(0)
input_tensor_fmt = input_tensor_fmt.to(device)
 
output_tensor = model(input_tensor_fmt)
output_tensor_fmt = output_tensor.squeeze(0)
print(f"Formatted Output tensor shape: {output_tensor_fmt.shape}")

Number of input tokens: 6
Formatted Output tensor shape: torch.Size([6, 151936])


In [13]:
last_token = output_tensor_fmt[-1].detach()
print(last_token)

tensor([ 7.3438,  2.0312,  7.9375,  ..., -2.5156, -2.5156, -2.5156],
       device='mps:0', dtype=torch.bfloat16)


In [14]:
print(last_token.argmax(dim=-1, keepdim=True))

tensor([20286], device='mps:0')


In [15]:
print(tokenizer.decode([20286]))

 Large


# 2.7 Coding a minimal text generation function

In [16]:
@torch.inference_mode()
def generate_text_basic(model: Qwen3Model, token_ids: torch.Tensor, 
                        max_new_tokens: int, eos_token_id: Optional[int]=None) -> torch.Tensor:
    input_length = token_ids.shape[1]
    model.eval()
 
    for _ in range(max_new_tokens):
        out = model(token_ids)[:, -1]
        next_token = torch.argmax(out, dim=-1, keepdim=True)
 
        if (eos_token_id is not None and torch.all(next_token == eos_token_id)):
            break
 
        token_ids = torch.cat([token_ids, next_token], dim=1)
    
    return token_ids[:, input_length:]
 

In [17]:
prompt = "Explain large language models in a single sentence."
input_token_ids_tensor = torch.tensor(tokenizer.encode(prompt), device=device).unsqueeze(0)
 
max_new_tokens = 100
output_token_ids_tensor = generate_text_basic(
    model=model,
    token_ids=input_token_ids_tensor,
    max_new_tokens=max_new_tokens,
)

output_text = tokenizer.decode(token_ids=output_token_ids_tensor.squeeze(0).tolist())
print(output_text)

 Large language models are artificial intelligence systems that can understand, generate, and process human language, enabling them to perform tasks such as answering questions, writing text, and even creating music.<|endoftext|>Human language is a complex and dynamic system that has evolved over millions of years to enable effective communication and social interaction. Large language models are designed to mimic this complexity and adapt to new contexts and languages, making them powerful tools for a wide range of applications, from customer service to scientific research.<|endoftext|>Human language is a


In [18]:
print(tokenizer.encode("<|endoftext|>"))

[151643]


In [19]:
output_token_ids_tensor = generate_text_basic(
    model=model,
    token_ids=input_token_ids_tensor,
    max_new_tokens=max_new_tokens,
    eos_token_id=tokenizer.eos_token_id
)
output_text = tokenizer.decode(token_ids=output_token_ids_tensor.squeeze(0).tolist())
print(output_text)

 Large language models are artificial intelligence systems that can understand, generate, and process human language, enabling them to perform tasks such as answering questions, writing text, and even creating music.


Exercise 2.2: Streaming token generation

In [20]:
@torch.inference_mode()
def generate_text_modified(model: Qwen3Model, token_ids: torch.Tensor, 
                        max_new_tokens: int, eos_token_id: Optional[int]=None) -> Generator[Tuple[torch.Tensor, torch.Tensor]] :

    model.eval()
 
    for _ in range(max_new_tokens):
        out = model(token_ids)[:, -1]
        next_token = torch.argmax(out, dim=-1, keepdim=True)
        probas = torch.softmax(input=out, dim=-1)
        max_proba = torch.max(input=probas)
        yield next_token, max_proba
 
        if (eos_token_id is not None and torch.all(next_token == eos_token_id)):
            break

        token_ids = torch.cat([token_ids, next_token], dim=1)

In [21]:
generated_text = generate_text_modified(
    model=model,
    token_ids=input_token_ids_tensor,
    max_new_tokens=max_new_tokens,
    eos_token_id=tokenizer.eos_token_id
)

for token, proba in generated_text:
    text = tokenizer.decode(token_ids=list(token))
    id = token.item()
    print(f"{text} --> {id} --> {100 * proba:.2f} %")

 Large --> 20286 --> 61.25 %
 language --> 4128 --> 85.00 %
 models --> 4119 --> 99.50 %
 are --> 525 --> 77.50 %
 artificial --> 20443 --> 22.75 %
 intelligence --> 11229 --> 86.50 %
 systems --> 5942 --> 76.00 %
 that --> 429 --> 63.75 %
 can --> 646 --> 34.00 %
 understand --> 3535 --> 52.75 %
, --> 11 --> 48.75 %
 generate --> 6923 --> 51.25 %
, --> 11 --> 94.50 %
 and --> 323 --> 98.50 %
 process --> 1882 --> 21.62 %
 human --> 3738 --> 58.25 %
 language --> 4128 --> 95.00 %
, --> 11 --> 34.75 %
 enabling --> 27362 --> 41.50 %
 them --> 1105 --> 83.00 %
 to --> 311 --> 100.00 %
 perform --> 2736 --> 34.00 %
 tasks --> 9079 --> 45.00 %
 such --> 1741 --> 59.00 %
 as --> 438 --> 100.00 %
 answering --> 35764 --> 27.50 %
 questions --> 4755 --> 99.00 %
, --> 11 --> 97.50 %
 writing --> 4378 --> 38.00 %
 text --> 1467 --> 40.50 %
, --> 11 --> 99.50 %
 and --> 323 --> 90.00 %
 even --> 1496 --> 43.75 %
 creating --> 6825 --> 36.00 %
 music --> 4627 --> 17.88 %
. --> 13 --> 36.25 %
<|en

End of Exercise 2.2

In [22]:
def generate_stats(output_token_ids: torch.Tensor, tokenizer: Qwen3Tokenizer, start_time: float, end_time: float) -> None:
    total_time = end_time - start_time
    print(f"Time: {total_time:.2f} sec")
    print(f"{int(output_token_ids.numel() / total_time)} tokens/sec")
 
    max_mem_bytes = torch.mps.current_allocated_memory()
    max_mem_gb = max_mem_bytes / (1024 ** 3)
    print(f"Current MPS memory allocated: {max_mem_gb:.2f} GB")
 
    output_text = tokenizer.decode(token_ids=output_token_ids.squeeze(0).tolist())
    print(f"\n{output_text}")

In [23]:
start_time = time.time()
output_token_ids_tensor = generate_text_basic(
    model=model,
    token_ids=input_token_ids_tensor,
    max_new_tokens=max_new_tokens,
    eos_token_id=tokenizer.eos_token_id
)
end_time = time.time()

generate_stats(output_token_ids_tensor, tokenizer, start_time, end_time)

Time: 2.90 sec
12 tokens/sec
Current MPS memory allocated: 1.46 GB

 Large language models are artificial intelligence systems that can understand, generate, and process human language, enabling them to perform tasks such as answering questions, writing text, and even creating music.


# 2.8 Faster inference via KV caching

In [24]:
@torch.inference_mode()
def generate_text_basic_cache(
    model: Qwen3Model,
    token_ids: torch.Tensor,
    max_new_tokens: int,
    eos_token_id: Optional[int]=None
) -> torch.Tensor:
 
    input_length = token_ids.shape[1]
    model.eval()
    cache = KVCache(n_layers=model.cfg["n_layers"])
    model.reset_kv_cache()
    out = model(token_ids, cache=cache)[:, -1]
 
    for _ in range(max_new_tokens):
        next_token = torch.argmax(out, dim=-1, keepdim=True)
 
        if (eos_token_id is not None and torch.all(next_token == eos_token_id)):
            break
 
        token_ids = torch.cat([token_ids, next_token], dim=1)
        out = model(next_token, cache=cache)[:, -1]
 
    return token_ids[:, input_length:]

In [25]:
start_time = time.time()
output_token_ids_tensor = generate_text_basic_cache(
    model=model,
    token_ids=input_token_ids_tensor,
    max_new_tokens=max_new_tokens,
    eos_token_id=tokenizer.eos_token_id,
)
end_time = time.time()
generate_stats(output_token_ids_tensor, tokenizer, start_time, end_time)

Time: 1.85 sec
18 tokens/sec
Current MPS memory allocated: 1.46 GB

 Large language models are artificial intelligence systems that can understand, generate, and process human language, enabling them to perform a wide range of tasks, from answering questions to writing essays.


# 2.9 Faster inference via PyTorch model compilation

In [26]:
torch._dynamo.config.allow_unspec_int_on_nn_module = True

model_compiled = torch.compile(model)

RuntimeError: torch.compile is not supported on Python 3.14+