# Layer exploration (continued) 
We're trying to explore the layers so we're comfortable modifying things by hand. 

In [None]:
# Run on 1 x RTX A6000
!pip install -q wandb -U
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q -U datasets scipy ipywidgets matplotlib
!pip install plotly.express
!pip install scikit-learn
!pip install -U flash-attn --no-build-isolation
!pip install pyyaml
!pip install pyarrow
!pip install termcolor
!pip install pandas
!pip install tqdm
!pip install python-dotenv
# If distutils error, https://stackoverflow.com/a/78050586

In [1]:
### Load libraries
# import flash_attn
# from dotenv import main
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import json
import jinja2
import os
import sys
import re
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig # for quantization
import plotly
from transformers import pipeline, set_seed
from tqdm import tqdm

# auth for gated repos (like llama) - gen token here: https://huggingface.co/settings/tokens
# `notebook_login` has no token parameter (a positional argument is interpreted as
# `new_session`), so passing os.getenv('HF_TOKEN') to it never used the token.
# Authenticate with `login(token=...)` when HF_TOKEN is set; otherwise fall back to
# the interactive notebook widget.
from huggingface_hub import login, notebook_login
hf_token = os.getenv('HF_TOKEN')
if hf_token:
    login(token = hf_token)
else:
    notebook_login()

# model ids
# Kept as a list; later cells index it as model_id[0] when loading the tokenizer.
model_id = ["microsoft/Phi-3-mini-4k-instruct"]

# Set seed for reproducibility 
# Seeds torch's default random generator only.
torch.random.manual_seed(0)

# Increase max width of pd df columns 
pd.set_option('max_colwidth', 300)

# Instantiate jinja environment - used later for icl prompting 
# NOTE(review): no use of `environment` is visible in this part of the notebook - confirm it is still needed.
environment = jinja2.Environment()

# Device string that tokenized inputs are moved to in later cells (`.to(device)`).
device = 'cuda'

# requirements.txt
# !pip3 freeze > requirements.txt

User is already logged in.


In [2]:
# Define utility functions 
# mem. monitoring! 
def check_memory():
    """Print allocated, reserved, and total memory for CUDA device 0.

    Each byte count is divided by 1024**3 and printed with a "GB" suffix,
    one line per reading.
    """
    bytes_per_unit = 1024 * 1024 * 1024

    # Each reading is fetched lazily, right before its line is printed.
    readings = (
        ("Allocated", lambda: torch.cuda.memory_allocated(0)),
        ("Reserved", lambda: torch.cuda.memory_reserved(0)),
        ("Total", lambda: torch.cuda.get_device_properties(0).total_memory),
    )
    for label, read_bytes in readings:
        print("%s: %fGB" % (label, read_bytes() / bytes_per_unit))

# notification/text-to-speech
def text_to_speech(text):
    """Speak `text` aloud using the platform's command-line speech tool.

    Uses `say` on macOS and `espeak` on Linux; on any other platform a notice
    is printed instead. The text is handed to the tool as a single argument
    without going through a shell, so quotes, `$`, backticks, or `;` inside
    `text` can neither break the command nor inject extra commands.

    Args:
        text: The message to speak. Converted with `str()` before use.
    """
    import subprocess  # local import keeps this helper self-contained

    if sys.platform == 'darwin':
        command = 'say'
    elif sys.platform.startswith('linux'):
        command = 'espeak'
    else:
        print("Text-to-speech is not supported on this platform.")
        return

    try:
        # check = False: a failing speech tool must not interrupt the notebook.
        subprocess.run([command, str(text)], check = False)
    except FileNotFoundError:
        # Notification is best-effort: report a missing tool instead of raising.
        print(f"Text-to-speech command '{command}' was not found on this system.")

# parse + template phi inputs
def parse_phi(messages: list[dict], append_response_start = True) -> str:
    """
    Render a multi-turn conversation as a single Phi-3 style prompt string.

    Every message must provide 'role' and 'content'. Each turn is written as
    "<|role|>\\ncontent<|end|>", turns are joined with newlines, and the whole
    string is prefixed with "<s>". When `append_response_start` is truthy, a
    trailing "\\n<|assistant|>" is added so the model continues as the assistant.

    Example output (append_response_start = True):
    # <s><|system|>
    # You are a helpful AI assistant.<|end|>
    # <|user|>
    # Guess my dog's name!<|end|>
    # <|assistant|>
    """
    turns = []
    for message in messages:
        turns.append(f"<|{message['role']}|>\n{message['content']}<|end|>")

    prompt = '<s>' + '\n'.join(turns)
    if not append_response_start:
        return prompt
    return prompt + "\n<|assistant|>"

# print(parse_phi([
#     {'role': 'system', 'content': 'Hello'}, {'role': 'user', 'content': '1+1?'}, {'role': 'assistant', 'content': '2'}
# ], False))

# model eval
def eval_model(model, tokenizer, prompt, max_new_tokens = 1, device = None):
    """Greedy-decode a continuation of `prompt` and return the decoded text.

    Args:
        model: Causal LM exposing `.eval()`, `.generate(...)` and `.device`.
        tokenizer: Tokenizer matching `model`.
        prompt: Fully templated prompt string (see `parse_phi`).
        max_new_tokens: Generation budget. The default of 1 keeps this
            notebook's existing behaviour; callers that need a complete answer
            (e.g. a JSON object) must pass a larger value.
        device: Device the tokenized inputs are moved to. Defaults to
            `model.device`, so no notebook-level global is required.

    Returns:
        The first decoded sequence from `tokenizer.batch_decode` (the prompt
        followed by the generated tokens).
    """
    if device is None:
        device = model.device
    tokens = tokenizer(prompt, return_tensors = 'pt').to(device)

    # Stop on the tokenizer's EOS and, when the vocabulary has it, on the
    # "<|end|>" end-of-turn marker. The previous list held the EOS id twice
    # (convert_tokens_to_ids(eos_token) is the same id as eos_token_id).
    stop_ids = [tokenizer.eos_token_id]
    end_id = tokenizer.convert_tokens_to_ids("<|end|>")
    unk_id = getattr(tokenizer, 'unk_token_id', None)
    if end_id is not None and end_id != unk_id and end_id not in stop_ids:
        stop_ids.append(end_id)

    model.eval()
    with torch.no_grad():
        # Greedy decoding: temperature / top_p are sampling-only settings and are
        # not passed together with do_sample = False.
        res = model.generate(
            **tokens,
            max_new_tokens = max_new_tokens,
            do_sample = False,
            eos_token_id = stop_ids
        )
    return tokenizer.batch_decode(res)[0]

# assess model perf
def get_model_performance(eval_df, base_model, tokenizer, verbose = False):
    """Run the model on every row of `eval_df` and score its JSON answers.

    For each row the prompt in `row['llm_input']` is sent through `eval_model`.
    The last `{...}` span in the response whose line contains both "rationale"
    and "answer" is parsed as JSON, and its 'answer' is compared with
    `row['solution']`. Rows whose response has no such span, or whose span is
    not valid JSON with both keys, are reported and skipped; they do not count
    towards `responses` or `accuracy`.

    Args:
        eval_df: DataFrame with 'llm_input', 'question', 'difficulty' and
            'solution' columns.
        base_model: Model forwarded to `eval_model`.
        tokenizer: Tokenizer forwarded to `eval_model`.
        verbose: When truthy, the per-row records are included under 'val_dict'.

    Returns:
        dict with 'responses' (number of parsed answers) and 'accuracy'
        (fraction correct, `nan` when nothing could be parsed), plus
        'val_dict' when `verbose` is truthy.
    """
    val = []
    for idx, row in tqdm(eval_df.iterrows(), total = len(eval_df)):
        response = eval_model(model = base_model, tokenizer = tokenizer, prompt = row['llm_input'])

        # error handling for malformed outputs: no match used to raise IndexError
        # outside the try block and abort the whole evaluation
        matches = re.findall(r'(?=.*"rationale")(?=.*"answer"){.*?}', response)
        if not matches:
            print(f"No JSON answer found in response for row {idx}; skipping.")
            continue
        response_json = matches[-1] # extract response + json

        try:
            response_dict = json.loads(response_json)
            answer = response_dict['answer']
            rationale = response_dict['rationale']
        except (json.JSONDecodeError, KeyError, TypeError) as e:
            print("Exception occurred:", e)
            continue

        # validate model preds against correct answer
        is_correct_pred = 1 if answer == row['solution'] else 0

        # validation dictionary
        val.append({'question': row['question'], 'response': response_json,
                    'difficulty': row['difficulty'],
                    'answer': answer,
                    'rationale': rationale,
                    'correct_solution': row['solution'],
                    'is_correct_pred': is_correct_pred})

    # metrics - guard the empty case instead of failing on a missing column / division by zero
    n_responses = len(val)
    if n_responses:
        accuracy = sum(record['is_correct_pred'] for record in val) / n_responses
    else:
        accuracy = float('nan')

    perf_dict = {'responses': n_responses, 'accuracy': accuracy}
    if verbose:
        perf_dict['val_dict'] = val

    return perf_dict

In [4]:
# Utility functions (cont.) - instantiate base_model; load eval_dict
def reload_base_model(model_id = "microsoft/Phi-3-mini-4k-instruct", add_tokenizer = True, quantize = False, return_tokenizer = False):
    """Load a causal LM (and optionally its tokenizer) from the Hugging Face hub.

    Args:
        model_id: Hub id of the model to load.
        add_tokenizer: Kept for backward compatibility and no longer has any
            effect. It used to trigger a tokenizer load whose result was
            discarded; use `return_tokenizer` to actually get a tokenizer.
        quantize: When True, load with a 4-bit NF4 BitsAndBytes config
            (double quantization, bfloat16 compute). Defaults to False, which
            matches the previous behaviour where the config was built but
            never passed to the model.
        return_tokenizer: When True, also load the tokenizer and return
            `(base_model, tokenizer)`.

    Returns:
        The model, or `(model, tokenizer)` when `return_tokenizer` is True.
    """
    model_kwargs = {
        'device_map': 'auto', # not sure what's up with device_map, but this is what causes errors
        'trust_remote_code': True,
    }
    if quantize:
        # Only build the quantization config when it is actually going to be used.
        model_kwargs['quantization_config'] = BitsAndBytesConfig(
            load_in_4bit = True,
            bnb_4bit_use_double_quant = True,
            bnb_4bit_quant_type = 'nf4',
            bnb_4bit_compute_dtype = torch.bfloat16
        )

    base_model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs)

    if not return_tokenizer:
        return base_model

    # Load tokenizer - remove bos token since parse_phi already pre-pends it
    tokenizer = AutoTokenizer.from_pretrained(model_id,
                                              add_eos_token = False,
                                              add_bos_token = False,
                                              padding_side = 'left')
    return base_model, tokenizer

def load_eval_df(file_path = None, includes_math = False): # turn off math for now due to high failure rate
    """Load the evaluation questions and build the prompt for each one.

    Reads `<cwd>/data/base_prompt.json` (a list of chat messages) and the
    questions file, then adds two columns:
      * 'full_question': the question followed by one "<code>. <text>" line
        per entry in the row's 'options';
      * 'llm_input': `parse_phi` applied to the base prompt plus the full
        question.

    Args:
        file_path: Path to the questions JSON. Defaults to
            `<cwd>/data/question.json`, resolved when the function is called
            (previously it was frozen at definition time).
        includes_math: When falsy, rows whose 'type' is 'math' are dropped.

    Returns:
        A pandas DataFrame of questions with the two extra columns.
    """
    data_dir = os.path.join(os.getcwd(), 'data')
    if file_path is None:
        file_path = os.path.join(data_dir, 'question.json')

    # load base prompt (context managers so the file handles are closed)
    with open(os.path.join(data_dir, 'base_prompt.json')) as bp_file:
        bp_json = json.load(bp_file)

    # load eval questions
    with open(file_path) as q_file:
        q_json = json.load(q_file)

    def _full_question(row):
        # Question text followed by its lettered options, one per line.
        return row['question'] + '\n' + '\n'.join([o['code'] + '. ' + o['text'] for o in row['options']])

    def _llm_input(row):
        # NOTE(review): the question is appended with role 'assistant', and parse_phi
        # then opens another assistant turn - confirm whether 'user' was intended.
        return parse_phi(bp_json + [{'role': 'assistant', 'content': row['full_question']}])

    eval_df = pd.DataFrame(q_json).assign(
        full_question = lambda df: df.apply(_full_question, axis = 1),
        llm_input = lambda df: df.apply(_llm_input, axis = 1)
    )

    if not includes_math:
        eval_df = eval_df[eval_df['type'] != 'math']

    return eval_df

In [None]:
# # Load bnb config, base model, and tokenizer
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit = True,
#     bnb_4bit_use_double_quant = True,
#     bnb_4bit_quant_type = 'nf4',
#     bnb_4bit_compute_dtype = torch.bfloat16
# )

# base_model = AutoModelForCausalLM.from_pretrained(
#     model_id[0],
#     device_map = 'auto', # not sure what's up with device_map, but this is what causes errors
#     quantization_config = bnb_config,
#     trust_remote_code = True
# )

# # Load tokenizer - remove bos token since my function already pre-pends
# tokenizer = AutoTokenizer.from_pretrained(model_id[0],
#                                          add_eos_token = False,
#                                          add_bos_token = False,
#                                          padding_side = 'left')

# Load self-attention layer
The goal is to load the self-attention layer, identify where it sits on the architecture diagram, and identify its inputs and outputs (along with the dimensions of each).

**Self-note:** remember to wrap forward passes in `torch.no_grad()` so gradients aren't tracked or accumulated.

In [8]:
# Re-instantiate model 
# reload_base_model returns only the model, so the tokenizer is loaded separately below.
base_model = reload_base_model()

# Load eval dict 
# Uses the default questions path and drops 'math' rows (includes_math defaults to False).
eval_df = load_eval_df()

# Load tokenizer - remove bos token since my function already pre-pends
# (parse_phi starts every prompt with '<s>', hence add_bos_token = False)
tokenizer = AutoTokenizer.from_pretrained(model_id[0],
                                         add_eos_token = False,
                                         add_bos_token = False,
                                         padding_side = 'left')

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# '<s>' is written by hand because the tokenizer was created with add_bos_token = False.
sen = "<s>My dog is a good boy who likes to"

# tokenize sentence 
dog_tok = tokenizer(sen, return_tensors = 'pt').to(device)
# .squeeze() drops size-1 dimensions before the shape is printed
print(f"Token dims: {dog_tok['input_ids'].squeeze().shape}")

# gen. embeddings / hidden states (ref. of "hidden states" changes over time) 
# embed_tokens is the model's input embedding module, applied directly to the token ids
dog_embed = base_model.model.embed_tokens(dog_tok['input_ids'])
print(f"Embedding dims: {dog_embed.squeeze().shape}")

#################### NOW ENTERING TRANSFORMERS ###########################

In [None]:
# get position id's again (o.w. will silently fail since model looks for dims)
# this comes from line ~1064 in https://github.com/huggingface/transformers/blob/main/src/transformers/models/phi3/modeling_phi3.py#L243
# seq_length is the number of tokens in the prompt (second dim of input_ids)
seq_length = dog_tok['input_ids'].shape[1]

# positions 0 .. seq_length - 1; the "+ 0" presumably stands in for a past/cached length of zero - TODO confirm
position_ids = torch.arange(0, seq_length + 0, dtype=torch.long, device = device)
# reshape to (1, seq_length) so there is a leading batch dimension
position_ids = position_ids.unsqueeze(0).view(-1, seq_length)

# basically, it's just tracking seq. length of inputs

In [None]:
# this is a single transformers block :) - we're going to go inside of it
# NOTE(review): this cell calls apply_rotary_pos_emb, which is defined in a later cell
# of the notebook - on a fresh kernel that cell has to be run first.
one_block = base_model.model.layers[0]
# print(one_block)

with torch.no_grad():
    # layer_norm on hidden states 
    hidden_states = one_block.input_layernorm(dog_embed)
    
    # enter self_attn layer 
    # this is all of the self_attn stuff at once 
    # self_attn = one_block.self_attn(hidden_states, position_ids = position_ids)
    # print(self_attn[0].shape)

    # hidden_states_two = hidden_states + self_attn[0]

    # # enter MLP 
    # print(one_block.mlp(hidden_states_two).shape)
    # print(one_block.self_attn.head_dim, one_block.self_attn.hidden_size)

    # o_proj is a linear layer that seems to prep. for future transforms; also injects more weights that can 
    # be trained / can hold meaning 
    # o_proj_output = one_block.self_attn.o_proj(dog_embed)
    # print(o_proj_output.shape) # 11 x 3072 
    
    # qkv proj - these are now stacked; like a mega-tensor 
    # (one linear layer produces query, key and value concatenated along the last dim)
    qkv = one_block.self_attn.qkv_proj(hidden_states)
    print(qkv.shape) 

    # call forward on the attn module 
    # self_attn = one_block.self_attn(hidden_states, position_ids = position_ids)
    # batch size and sequence length come from the normalized hidden states
    bsz, q_len, _ = hidden_states.size()
    print(bsz, q_len)

    # width of the query slice inside the stacked qkv tensor
    query_pos = one_block.self_attn.num_heads * one_block.self_attn.head_dim
    print(query_pos)

    # slice the stacked projection along the last dim: [ query | key | value ]
    query_states = qkv[..., :query_pos] # should be ~1/3
    key_states = qkv[..., query_pos : query_pos + one_block.self_attn.num_key_value_heads * one_block.self_attn.head_dim]
    value_states = qkv[..., query_pos + one_block.self_attn.num_key_value_heads * one_block.self_attn.head_dim :]
    print(query_states.shape, key_states.shape, value_states.shape)

    # re-shape each (head_dim is D/H)
    # -> (bsz, heads, q_len, head_dim) after the transpose
    # NOTE(review): key/value are sliced with num_key_value_heads above but reshaped with
    # num_heads here; that only works when the two are equal - confirm for this model.
    query_states = query_states.view(bsz, q_len, one_block.self_attn.num_heads, one_block.self_attn.head_dim).transpose(1, 2)
    key_states = key_states.view(bsz, q_len, one_block.self_attn.num_heads, one_block.self_attn.head_dim).transpose(1, 2)
    value_states = value_states.view(bsz, q_len, one_block.self_attn.num_heads, one_block.self_attn.head_dim).transpose(1, 2)
    # print(query_states.shape)

    # sequence length as seen by the keys (dim -2 after the transpose)
    kv_seq_len = key_states.shape[-2]
    print(kv_seq_len)

    # now, apply rotary embeddings (return to figure out what is going on here) 
    cos, sin = one_block.self_attn.rotary_emb(value_states, position_ids, seq_len=kv_seq_len)
    query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
    print(query_states.shape)

    # raw scaled dot-product scores: no mask and no softmax are applied in this cell
    attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(one_block.self_attn.head_dim)
    print(attn_weights.shape) # it is 11 x 11 per head - the 32 is the number of attention heads (num_heads), not the number of blocks
    # mlp portion - gate up proj 
    # gate_up_proj_output = one_block.mlp.gate_up_proj(qkv_proj_output)
    # print(gate_up_proj_output.shape)

    # attention mask piece helps ensure that things only pay attention to what occurs before; ow everything "pays attention" to everything 
    # this is a way to force boundaries 

In [57]:
# Display the module tree of the first decoder layer (bare expression -> rich cell output).
base_model.model.layers[0]

Phi3DecoderLayer(
  (self_attn): Phi3Attention(
    (o_proj): Linear(in_features=3072, out_features=3072, bias=False)
    (qkv_proj): Linear(in_features=3072, out_features=9216, bias=False)
    (rotary_emb): Phi3RotaryEmbedding()
  )
  (mlp): Phi3MLP(
    (gate_up_proj): Linear(in_features=3072, out_features=16384, bias=False)
    (down_proj): Linear(in_features=8192, out_features=3072, bias=False)
    (activation_fn): SiLU()
  )
  (input_layernorm): Phi3RMSNorm()
  (resid_attn_dropout): Dropout(p=0.0, inplace=False)
  (resid_mlp_dropout): Dropout(p=0.0, inplace=False)
  (post_attention_layernorm): Phi3RMSNorm()
)

In [64]:
# Testing for transformers block
# Runs the first half of decoder layer 0 by hand: input layernorm -> self-attention -> residual add.
# NOTE(review): _prepare_4d_causal_attention_mask is imported in a later cell
# (from py_helpers.phi3), so on a fresh kernel that cell must be run before this one.
with torch.no_grad(): 
    prompt = '<s>I am a dog and I like to eat meat! My favorite'
    base_model.eval()
    # NOTE(review): generated_tokens is not used anywhere in this cell.
    generated_tokens = 0
    input_ids = tokenizer(prompt, return_tensors = 'pt').to(device)['input_ids']

    # number of prompt tokens
    N = input_ids.shape[1]

    # get embeddings
    embeds_output = base_model.model.embed_tokens(input_ids)
    hidden_state = embeds_output

    position_ids = torch.arange(0, N, dtype=torch.long, device=device).unsqueeze(0).view(-1, N) # Create position IDs
    attention_mask = _prepare_4d_causal_attention_mask(None, (1, N), embeds_output, 0, sliding_window = base_model.model.config.sliding_window) # Make an attention mask to hide right context

    # NOW ENTERING TRANSFORMERS LAYERS 
    decoder_layer = base_model.model.layers[0]
    # layer norm on hidden states - line 853 (https://github.com/huggingface/transformers/blob/main/src/transformers/models/phi3/modeling_phi3.py#L810)

    # store residuals 
    # (the pre-layernorm input is kept so it can be added back after attention)
    residual = hidden_state # line 851
    hidden_states = decoder_layer.input_layernorm(hidden_state) 

    # now, self attn - line 856
    attn_outputs, self_attn_weights, present_key_value = decoder_layer.self_attn(
        hidden_states = hidden_states,
        attention_mask = attention_mask,
        position_ids = position_ids,
        output_attentions = True # asks the module to also return its attention weights (self_attn_weights); present_key_value relates to KV caching - TODO confirm what it holds when no cache is passed
        # past_key_value = ## don't have - optional, cached 
        # output_attentions = ## don't have - line 842; whether to return attention tensors of all attention layers 
        # use_cache = use_cache ### don't have - optional, related to caching 
    )

    # line 865 
    # residual connection around the attention sub-layer
    hidden_states = residual + decoder_layer.resid_attn_dropout(attn_outputs)

    print(hidden_states)
    
    
    # self_attn 
    # base_model.model.layers[0].self_attn(

    # input_layernorm
    # resid_attn_dropout
    # resid_mlp_dropout
    # post_attention_layernorm

tensor([[[-0.0448,  0.1045,  0.0509,  ...,  0.0457, -0.0130, -0.0439],
         [ 0.0190,  0.0146,  0.0208,  ...,  0.0138, -0.0274,  0.0017],
         [ 0.0145,  0.0256,  0.0012,  ...,  0.0170, -0.0121, -0.0658],
         ...,
         [-0.0229, -0.0233, -0.0096,  ..., -0.0166,  0.0090,  0.0271],
         [ 0.0370,  0.0240, -0.0191,  ..., -0.0205,  0.0200, -0.0249],
         [-0.0550, -0.0360, -0.0253,  ..., -0.0052, -0.0096,  0.0105]]],
       device='cuda:0')


In [9]:
from py_helpers.phi3 import _prepare_4d_causal_attention_mask

@torch.no_grad()
def generate_multiple_outputs(model, tokenizer, prompt = '<s>I am a dog and I like to eat meat! My favorite', max_tokens = 128, device = 'cuda'):
    model.eval()
    generated_tokens = 0
    input_ids = tokenizer(prompt, return_tensors = 'pt').to(device)['input_ids']

    while True:
        N = input_ids.shape[1]

        # Get embeddings
        embeds_output = model.model.embed_tokens(input_ids)
        hidden_state = embeds_output
        
        # Get some parameters needed for transformers layers
        position_ids = torch.arange(0, N, dtype=torch.long, device=device).unsqueeze(0).view(-1, N) # Create position IDs
        attention_mask = _prepare_4d_causal_attention_mask(None, (1, N), embeds_output, 0, sliding_window = model.model.config.sliding_window) # Make an attention mask to hide right context
    
        # Execute transformers layers
        for i, layer in enumerate(model.model.layers):
            # hidden_state = layer(hidden_state, position_ids = position_ids, attention_mask = attention_mask)[0]

            #### enumerate the entire transformers block - start w/ self_attn
            # self_attn
            
            
            # mlp 

            # layernorm + dropout
            ###################################
    
        # RMS norm the final transformer layer output - this is after all 32 transformer blocsk
        hidden_state = model.model.norm(hidden_state)
    
        # Run LM head
        logits = model.lm_head(hidden_state)

        # Get argmax tokens + concatenate onto previous tokens
        output_token = torch.argmax(F.softmax(logits.squeeze(), dim = 1), dim = 1)[-1]
        input_ids = torch.cat((input_ids, output_token.view(1, 1)), dim = 1)

        # Break while loop if EOS or generation > max tokens
        generated_tokens = generated_tokens + 1
        if output_token in [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|end|>")] or generated_tokens >= max_tokens:
            break

    final_output = tokenizer.decode(input_ids.squeeze())
    return final_output

# Test
# '<s>' is included by hand because the tokenizer was created with add_bos_token = False.
test_prompt = '<s>I am a dog and I like to eat meat! My favorite'
# Use function
# Uses the function's defaults for max_tokens (128) and device ('cuda').
print('my_model + manual generation', generate_multiple_outputs(base_model, tokenizer, prompt = test_prompt))



my_model + manual generation <s> I am a dog and I like to eat meat! My favorite food is chicken. I also like to play fetch with my owner. I am a very friendly dog and I love to cuddle with my owner. I am a golden retriever and I am 5 years old. I am very smart and I can learn many tricks. I am a good boy and I always obey my owner. I am a happy dog and I enjoy life.
<|assistant|> That's a lovely introduction to yourself, a golden retriever! Golden retrievers are known for their friendly and loyal nature, as well as their intelligence and ability to learn. It's wonderful to


In [None]:
# Copied from transformers.models.llama.modeling_llama.rotate_half
def rotate_half(x):
    """Swap the two halves of the last dimension, negating the half that moves to the front.

    The split point is `x.shape[-1] // 2`; the result is `cat(-back, front)`
    along the last dimension.
    """
    midpoint = x.shape[-1] // 2
    front, back = x[..., :midpoint], x[..., midpoint:]
    return torch.cat((-back, front), dim=-1)


def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Rotate the query and key tensors with the given rotary cos/sin tables.

    Both tensors are transformed as `t * cos + rotate_half(t) * sin`, after
    `cos` and `sin` receive one extra axis at `unsqueeze_dim` so they can
    broadcast against `q` and `k`.

    Args:
        q (`torch.Tensor`): Query tensor.
        k (`torch.Tensor`): Key tensor.
        cos (`torch.Tensor`): Cosine table of the rotary embedding.
        sin (`torch.Tensor`): Sine table of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Accepted for call-site compatibility; not read by this function.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            Axis inserted into `cos` / `sin` before multiplying. With tables of
            shape [batch_size, seq_len, head_dim], use 1 when q and k are
            [batch_size, heads, seq_len, head_dim] and 2 when they are
            [batch_size, seq_len, heads, head_dim].
    Returns:
        `tuple(torch.Tensor)`: the rotated query and the rotated key.
    """
    cos_b, sin_b = (table.unsqueeze(unsqueeze_dim) for table in (cos, sin))

    def _rotate(states):
        # Same arithmetic for queries and keys.
        return (states * cos_b) + (rotate_half(states) * sin_b)

    return _rotate(q), _rotate(k)


In [13]:
# track layer defs 
# One record per named parameter: running index, qualified name, and tensor shape.
layer_names = []
for idx, (name, param) in enumerate(base_model.named_parameters()): 

    # store layer names (for testing) 
    layer_names.append({'idx': idx, 'name': name, 'dims': param.shape})

# view layers 
# Bare expression so the frame renders as the cell's rich output.
pd.DataFrame(layer_names)

Unnamed: 0,idx,name,dims
0,0,model.embed_tokens.weight,"(32064, 3072)"
1,1,model.layers.0.self_attn.o_proj.weight,"(3072, 3072)"
2,2,model.layers.0.self_attn.qkv_proj.weight,"(9216, 3072)"
3,3,model.layers.0.mlp.gate_up_proj.weight,"(16384, 3072)"
4,4,model.layers.0.mlp.down_proj.weight,"(3072, 8192)"
...,...,...,...
190,190,model.layers.31.mlp.down_proj.weight,"(3072, 8192)"
191,191,model.layers.31.input_layernorm.weight,"(3072,)"
192,192,model.layers.31.post_attention_layernorm.weight,"(3072,)"
193,193,model.norm.weight,"(3072,)"
