In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from lib.llama3.reference_impl.generation import Llama
import os

ckpt_dir = os.path.expanduser("~/.llama/checkpoints/Llama3.2-1B/original/")
tokenizer_path = ckpt_dir + "tokenizer.model"

llama = Llama.build(
    ckpt_dir=ckpt_dir,
    tokenizer_path=tokenizer_path,
    max_seq_len=512,
    max_batch_size=1,
    device="cpu",
)

next(llama.model.parameters()).device.type

Loaded in 7.09 seconds


'cpu'

In [4]:
llama.text_completion("What is the meaning of life?", max_gen_len=10).generation

' Is there a purpose? Is there a reason for'

In [3]:
shakespeare_text = open("./data/tinyshakespeare.txt", "r").read()

# Display the first few lines
print("First few lines of Shakespeare's text:")
print(shakespeare_text[:500])

# Get some statistics
total_chars = len(shakespeare_text)
total_lines = shakespeare_text.count("\n")

print(f"\nTotal characters: {total_chars}")
print(f"Total lines: {total_lines}")

First few lines of Shakespeare's text:
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor

Total characters: 1115394
Total lines: 40000


In [4]:
import json
from lib.llama3.reference_impl.model import ModelArgs, Transformer
from pathlib import Path


with open(Path(ckpt_dir) / "params.json", "r") as f:
    params = json.loads(f.read())

params["dim"] //= 4
params["n_heads"] //= 4
params["n_kv_heads"] //= 4
params["n_layers"] //= 4

model_args: ModelArgs = ModelArgs(
    max_seq_len=512,
    max_batch_size=16,
    **params,
)

model = Transformer(model_args)

In [5]:
import torch

# Check model parameters
print("Model Parameters:")
print(f"Vocabulary Size: {model.vocab_size}")
print(f"Number of Layers: {model.n_layers}")
print(f"Embedding Dimension: {model.params.dim}")
print(f"Number of Attention Heads: {model.params.n_heads}")
print(f"Max Sequence Length: {model.params.max_seq_len}")
print(f"Feedforward Dimension: {model.layers[0].feed_forward.w1.out_features}")

# Check if parameters are initialized
print("\nParameter Initialization:")
for name, param in model.named_parameters():
    if param.requires_grad:
        print(f"{name}: {'Initialized' if param.sum().item() != 0 else 'Not initialized'}")

# Verify shapes of key components
print("\nKey Component Shapes:")
print(f"Token Embeddings: {model.tok_embeddings.weight.shape}")
print(f"Output Layer: {model.output.weight.shape}")
print(f"First Layer Query Weight: {model.layers[0].attention.wq.weight.shape}")
print(f"First Layer Key Weight: {model.layers[0].attention.wk.weight.shape}")
print(f"First Layer Value Weight: {model.layers[0].attention.wv.weight.shape}")

# Check for NaNs or infinities
print("\nNaN/Inf Check:")
for name, param in model.named_parameters():
    if torch.isnan(param).any() or torch.isinf(param).any():
        print(f"Warning: {name} contains NaN or Inf values")
    else:
        print(f"{name}: OK")


Model Parameters:
Vocabulary Size: 128256
Number of Layers: 4
Embedding Dimension: 512
Number of Attention Heads: 8
Max Sequence Length: 512
Feedforward Dimension: 2048

Parameter Initialization:
tok_embeddings.weight: Initialized
layers.0.attention.wq.weight: Initialized
layers.0.attention.wk.weight: Initialized
layers.0.attention.wv.weight: Initialized
layers.0.attention.wo.weight: Initialized
layers.0.feed_forward.w1.weight: Initialized
layers.0.feed_forward.w2.weight: Initialized
layers.0.feed_forward.w3.weight: Initialized
layers.0.attention_norm.weight: Initialized
layers.0.ffn_norm.weight: Initialized
layers.1.attention.wq.weight: Initialized
layers.1.attention.wk.weight: Initialized
layers.1.attention.wv.weight: Initialized
layers.1.attention.wo.weight: Initialized
layers.1.feed_forward.w1.weight: Initialized
layers.1.feed_forward.w2.weight: Initialized
layers.1.feed_forward.w3.weight: Initialized
layers.1.attention_norm.weight: Initialized
layers.1.ffn_norm.weight: Initialized

In [6]:
# Calculate number of trainable parameters
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"\nTotal Trainable Parameters: {total_params:,}")

# Calculate size in gigabytes (assuming float32 parameters)
size_in_gb = total_params * 4 / (1024**3)  # 4 bytes per float32 parameter
print(f"Approximate Model Size: {size_in_gb:.2f} GB")


Total Trainable Parameters: 146,543,104
Approximate Model Size: 0.55 GB


In [51]:
import torch
import torch.nn.functional as F

# Tokenize the text
tokens = llama.tokenizer.encode(shakespeare_text, bos=False, eos=False)

# Set up optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

# Get the device
device = next(model.parameters()).device

# Training loop
from tqdm import tqdm

seq_len = model.params.max_seq_len
num_epochs = 1
batch_size = 16
model.train()
for epoch in range(num_epochs):
    total_loss = 0
    num_batches = (len(tokens) - seq_len) // (batch_size * seq_len)
    
    with tqdm(total=num_batches, desc=f"Epoch {epoch+1}/{num_epochs}", unit="batch") as pbar:
        for i in range(0, len(tokens) - seq_len, batch_size * seq_len):
            batch_end = min(i + batch_size * seq_len, len(tokens) - seq_len)
            x = torch.tensor([tokens[j:j+seq_len] for j in range(i, batch_end, seq_len)], dtype=torch.long).to(device)
            y = torch.tensor([tokens[j+1:j+seq_len+1] for j in range(i, batch_end, seq_len)], dtype=torch.long).to(device)
            
            optimizer.zero_grad()
            
            logits = model(x, 0)
            loss = F.cross_entropy(logits.view(-1, model.vocab_size), y.view(-1))
            
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
            
            pbar.update(1)
            pbar.set_postfix({"Loss": f"{loss.item():.4f}"})
    
    avg_loss = total_loss / num_batches
    print(f"Epoch {epoch+1}/{num_epochs} completed. Average Loss: {avg_loss:.4f}")

print("Training completed!")

# Generate some text using the trained model
model.eval()
print(Llama(model, llama.tokenizer, model_args).text_completion("To be or not to be, ", max_gen_len=100).generation)

Epoch 1/1: 37batch [05:30,  8.93s/batch, Loss=0.3155]                     


Epoch 1/1 completed. Average Loss: 0.3303
Training completed!
 to wooing with a wife, my heart.

JULIET:
What's he will make good night.

JULIET:
So much to be my fortune, and I love.

JULIET:
I'll stay the county.

JULIET:
So would be.

JULIET:
So: but give me the sin that I but hate thee.

JULIET:
I will defend the heart; one
How night you love the better, but my love


In [52]:
# Checkpoint the model
import os

# Create a directory for checkpoints if it doesn't exist
checkpoint_dir = "checkpoints"
os.makedirs(checkpoint_dir, exist_ok=True)

# Save the model state
checkpoint_path = os.path.join(checkpoint_dir, "llama3_model_checkpoint.pth")
torch.save(model.state_dict(), checkpoint_path)

print(f"Model checkpoint saved to {checkpoint_path}")

# Save the model arguments
import json

model_args_path = os.path.join(checkpoint_dir, "llama3_model_args.json")
with open(model_args_path, 'w') as f:
    json.dump(vars(model.params), f, indent=2)

print(f"Model arguments saved to {model_args_path}")

Model checkpoint saved to checkpoints/llama3_model_checkpoint.pth
Model arguments saved to checkpoints/llama3_model_args.json


In [54]:
model.eval()
print(Llama(model, llama.tokenizer, model_args).text_completion("To be or not to be,", temperature=1.0, max_gen_len=500, echo=True).generation.split("<|begin_of_text|>")[1])

To be or not to be, no will to die through love itself.
Why, how to tread how do to honour newly your brother,
But to have five thousand thanks too much to Clarence:
I'll give my soul,
To should our speech of gold and too:
You are dear train, and father, poor brother,
Ere further conference with a passing small.
O Dorsetable.
Your sense may beggarly the tomb,
And bid me mistress sit dispatch: past the boy,
And well lost with one thing just proportion,
And over the board, under his liking!
And all the watchful eye of dear faith,
More fierce and an inditeous wrath!
How well, lords, I befall, and lay,
Is not forgot the tyrant, to fill the crown,
And manage of your glorious sun: regent join'd!
Yet would youravenousoddess, that went;
And well we have heard of all run a needful';
Anduile me with the root
And buryWhat! myself become a tyrant
Stands without the brat's king in Bosworth
To leap upon a black tidings was;
And in all my tumble down: great leaving me,
'Twere a bloody axe to that mak