## Preparing model for training

### In this notebook, we are going to configure models based on Meta's Llama family of models. The transformers library has several tools for working with these models

### Start by creating a **LlamaConfig** object to configure the architecture of the model:

In [1]:
# Ignore insignificant warnings (ex: deprecation warnings)
import warnings
warnings.filterwarnings('ignore')

# Set a seed value for reproducibility
import torch

def fix_torch_seed(seed=42):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

fix_torch_seed()

In [2]:
# LlamaConfig is a configuration class for the Llama model
from transformers import LlamaConfig
config = LlamaConfig()
print(config)

LlamaConfig {
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 2048,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "transformers_version": "4.51.0",
  "use_cache": true,
  "vocab_size": 32000
}



In [3]:
# now lets try to update parameters to change the model architecture
config.num_hidden_layers = 12      # reduced from 32 to 12
config.hidden_size = 1024          # reduced 1/4 from 4096 to 1024
config.intermediate_size = 4096    # reduced 1/3 from 11008 to 4096 (dimension of MLP representations)
config.num_key_value_heads = 8     # reduced 1/4 from 32 to 8 (defaults to num_attention_heads=32)
config.torch_dtype = "bfloat16"    # for half-precision training
config.use_cache = False           # `True` is incompatible w/ gradient checkpointing
print(config)

LlamaConfig {
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "max_position_embeddings": 2048,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 12,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.51.0",
  "use_cache": false,
  "vocab_size": 32000
}



### We will explore four different ways to initialize the weights of a model for training:

1. **Random weight initialization**
2. **Using an existing model for continued pre-training**
1. **Downscaling an existing model**
1. **Upscaling an existing model**

In [4]:
# 1. Randonly initialize the weights of the model
from transformers import LlamaForCausalLM
model = LlamaForCausalLM(config)
print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 1024)
    (layers): ModuleList(
      (0-11): 12 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=1024, out_features=4096, bias=False)
          (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=1024, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=1024, out_features=4096, bias=False)
          (up_proj): Linear(in_features=1024, out_features=4096, bias=False)
          (down_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((1024,), eps=1e-06)
        (post_attention_layernorm): LlamaRMSNorm((1024,), eps=1e-06)
      )
    )
    (norm): LlamaRMSNorm((1024,), eps=1e-06)
    (rotary_emb)

In [5]:
# print the number of parameters in the model defined
def print_nparams(model):
    """Calculate the total number of model parameters"""
    nparams = sum(p.numel() for p in model.parameters())
    print(f"The total number of parameters is: {nparams}")

print_nparams(model)

The total number of parameters is: 342385664


In [6]:
# Take a look at a sample of the weights in a single layer
layer_name = "model.layers.0.self_attn.q_proj.weight"

for name, param in model.named_parameters():
    if name == layer_name:
        print(f"First 30 weights of layer '{layer_name}':")
        print(param.data.view(-1)[:30])
        break

First 30 weights of layer 'model.layers.0.self_attn.q_proj.weight':
tensor([ 0.0217,  0.0204, -0.0008,  0.0087, -0.0089, -0.0291,  0.0166, -0.0086,
         0.0004,  0.0017, -0.0089, -0.0095, -0.0135, -0.0160, -0.0148, -0.0131,
         0.0104,  0.0200,  0.0348,  0.0110,  0.0082, -0.0011, -0.0233, -0.0113,
         0.0087,  0.0267, -0.0030, -0.0272, -0.0098, -0.0089])


In [None]:
# lets load the tokenizer
# we will try using the model for inference. 
# Note that the current model has randomly initialized weights
from transformers import pipeline
pipe = pipeline("text-generation", model="facebook/opt-125m")

# Load a tokenizer from the facebook/opt-125m, 
# which is compatible with the Llama-2 tokenizer
tokenizer = pipe.tokenizer

# Run simple inference with prompt
from transformers import TextStreamer

prompt = "I am a human. I love"

inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

streamer = TextStreamer(
    tokenizer, 
    skip_prompt=True, 
    skip_special_tokens=True
)

# we will see random outputs because 'model' is randomly initialized and not trained yet
outputs = model.generate(
    **inputs, 
    streamer=streamer, 
    use_cache=True, 
    max_new_tokens=128, 
    do_sample=False
)


Device set to use mps:0
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


 artifacts enough enough enough enough enough enough enough enough enough enough enough enough enough enough enough enough enough enough humanitarian enough enough humanitarian enough humanitarian humanitarian humanitarian humanitarian humanitarian humanitarian humanitarian humanitarian humanitarian hyper sponsors sponsorsseason c hyperseason c c c c c c upheaval upheaval upheaval upheaval upheaval upheaval upheaval upheaval ne nut nut nut nut nut nut nut nut nut nut nut nut nut nut nut nut nut nut nut nut nut nut nut nut nut nut nut nut nut nut nut nut nut nut nut nut nut nut nut nut nut nut nut nut nut nut nut nut nut nut nut nut nut nut nut nut nut nut nut nut nut nut nut nut nut nut nut nut nut nut nut nut nut


In [10]:
# Remove the model from memory to avoid crashing the kerne
# NOTE: We're running large models in a limited environment. Run me if you encounter any memory issues.
import gc
del model
del streamer
del outputs
gc.collect()

0

In [11]:
# 2. Reuse general pretrained model weights
# we will load an existing model and can use it as it is to continue pretraining on new data
from transformers import pipeline
pipe = pipeline("text-generation", model="facebook/opt-125m")


Device set to use mps:0


In [12]:
model = pipe.model
print(model)

OPTForCausalLM(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 768, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 768)
      (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (layers): ModuleList(
        (0-11): 12 x OPTDecoderLayer(
          (self_attn): OPTSdpaAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,)

In [13]:
# Remove the model from memory to avoid crashing the kernel
# NOTE: We're running large models in a limited environment. Run me if you encounter any memory issues.
del model
gc.collect()

33

In [14]:
# 3. Downscaling from a general pretrained model
# we will downscale the model to a smaller size so we will
# downscale our facebook/opt-125m model from a 12 layer model to a 10 layer model
from transformers import pipeline
pipe = pipeline("text-generation", model="facebook/opt-125m")
tokenizer = pipe.tokenizer

Device set to use mps:0


In [16]:
print(pipe.model)

OPTForCausalLM(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 768, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 768)
      (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (layers): ModuleList(
        (0-11): 12 x OPTDecoderLayer(
          (self_attn): OPTSdpaAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,)

In [17]:
model = pipe.model
print_nparams(model)

The total number of parameters is: 125239296


In [18]:
# Remove the middle two layers (layers 5 and 6) and update the configuration
decoder = model.model.decoder  # OPTDecoder
layers = decoder.layers  # ModuleList

# Current layers: 0,1,2,3,4,5,6,7,8,9,10,11
# We want to remove layers 5,6 (the middle ones)
middle_start = len(layers) // 2 - 1  # 5
new_layers = torch.nn.ModuleList(
    list(layers[:middle_start]) +  # layers 0-4
    list(layers[middle_start + 2:])  # layers 7-11
)

# Replace the layers
decoder.layers = new_layers

# Verify the number of layers
print(f"Number of layers after removal: {len(decoder.layers)}")  # Should show 10

Number of layers after removal: 10


In [19]:
# Clear the memory to avoid crashing the kernel
import gc
del model
gc.collect()

577

In [44]:
# 4. Depth Upscaling from a general pretrained model
# It is a technique to upscale a model by adding more layers to it.
# We will upscale our facebook/opt-125m model from a 12 layer model to a 16 layer model
# The following are the steps we will take
#   1. Configure a 16 layer model and initialize it with random weights
#   2. Load the 12 layer facebook/opt-125m model into memory
#   3. Copy the bottom 8 and top 8 layers from the 12 layer model and use them to overwrite the random weights of the 16 layer model
#   4. Copy over the embedding and classifying layers to replace the randomly initialized counterparts in the 16 layer model
from transformers import OPTConfig

# For the OPT model, the layers are organized differently than LLaMA
# 1. Configure a 16 layer model
# config = OPTConfig(
#     num_hidden_layers=16,  # We want our model to have 16 final layers
#     hidden_size=1024,  
#     intermediate_size=4096,  
#     num_attention_heads=32,  
#     num_key_value_heads=8,
#     torch_dtype="bfloat16",
#     use_cache=False 
# )
config = OPTConfig(
    num_hidden_layers=16,        # Increase from 12 to 16 layers
    hidden_size=768,            # Same as OPT-125m
    ffn_dim=3072,              # Same as OPT-125m
    num_attention_heads=12,     # Same as OPT-125m
    vocab_size=50272,          # Same as OPT-125m
    torch_dtype="bfloat16",
    use_cache=False
)
print(config)


OPTConfig {
  "_remove_final_layer_norm": false,
  "activation_function": "relu",
  "attention_dropout": 0.0,
  "bos_token_id": 2,
  "do_layer_norm_before": true,
  "dropout": 0.1,
  "enable_bias": true,
  "eos_token_id": 2,
  "ffn_dim": 3072,
  "hidden_size": 768,
  "init_std": 0.02,
  "layer_norm_elementwise_affine": true,
  "layerdrop": 0.0,
  "max_position_embeddings": 2048,
  "model_type": "opt",
  "num_attention_heads": 12,
  "num_hidden_layers": 16,
  "pad_token_id": 1,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.51.0",
  "use_cache": false,
  "vocab_size": 50272,
  "word_embed_proj_dim": 768
}



In [45]:
# the number of parameters are around 221M of the upscaled model
model = OPTForCausalLM(config)
model = model.to(dtype=torch.bfloat16)  # convert to bfloat16
print_nparams(model)

The total number of parameters is: 153590784


In [37]:
from transformers import pipeline
pipe = pipeline("text-generation", model="facebook/opt-125m")

pretrained_model = pipe.model
tokenizer = pipe.tokenizer

# the number of parameters are around 125M of the pretrained exisiting model
print_nparams(pretrained_model)

Device set to use mps:0


The total number of parameters is: 125239296


In [None]:
from copy import deepcopy

# Take first 8 and last 8 layers from pretrained model's 12 layers
model.model.decoder.layers = deepcopy(pretrained_model.model.decoder.layers[:8]) \
    + deepcopy(pretrained_model.model.decoder.layers[-8:])

# 4. Copy embeddings and other components
model.model.decoder.embed_tokens = deepcopy(pretrained_model.model.decoder.embed_tokens)
model.lm_head = deepcopy(pretrained_model.lm_head)

print(f"Number of layers after upscaling: {len(model.model.decoder.layers)}")  # Should show 16
print_nparams(model) 

Number of layers after upscaling: 16
The total number of parameters is: 192199680


In [51]:
print(model.config)

OPTConfig {
  "_attn_implementation_autoset": true,
  "_remove_final_layer_norm": false,
  "activation_function": "relu",
  "attention_dropout": 0.0,
  "bos_token_id": 2,
  "do_layer_norm_before": true,
  "dropout": 0.1,
  "enable_bias": true,
  "eos_token_id": 2,
  "ffn_dim": 3072,
  "hidden_size": 768,
  "init_std": 0.02,
  "layer_norm_elementwise_affine": true,
  "layerdrop": 0.0,
  "max_position_embeddings": 2048,
  "model_type": "opt",
  "num_attention_heads": 12,
  "num_hidden_layers": 16,
  "pad_token_id": 1,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.51.0",
  "use_cache": false,
  "vocab_size": 50272,
  "word_embed_proj_dim": 768
}



In [42]:
# Run simple inference to show no trained model
prompt = "I am a GenAI expert. I love"

inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

streamer = TextStreamer(
    tokenizer, 
    skip_prompt=True, 
    skip_special_tokens=True
)

outputs = model.generate(
    **inputs, 
    streamer=streamer, 
    use_cache=True, 
    max_new_tokens=128, 
    do_sample=False
)

 the game and I am a fan of the game. I am not a fan of the game. I am not a fan of the game. I am not a fan of the game. I am not a fan of the game. I am not a fan of the game. I am not a fan of the game. I am not a fan of the game. I am not a fan of the game. I am not a fan of the game. I am not a fan of the game. I am not a fan of the game. I am not a fan of the game. I am not a fan of the game.
