# Selective lesioning
Now, our aim is to erode a model in a **controlled manner**. Michael Levin's "multiple levels of competency" (planaria; evaluate biology by how well it reacts to errors) is a main inspiration. How can higher level layers of "agents" resolve errors from lower levels? Why are these systems (biological agents + llm's) so robust? How robust is our model to perturbances? 

Model performance is tracked by benchmarking over a question set (generated w/ an assist from gpt-4 :)). 

To do: 
+ Load base model + replace mps code x
+ Pull in question set x
+ Write some code to test/track tokens per second
+ Establish basic eval framework - need to (1) feed questions (async?); (2) randomly shut off weights from non-embed layers; (3) track perf. changes as culling increases
+ think abt ways to selectively kill off weights :) (got some good suggestions from group :D)
+ (lower prio) Figure out way to track code efficiency/read up on o complexity 

Notes: 
+ Keep an eye on mem. use - disk space can be monitored via `du -hs $HOME /workspace/*` - we have 100GB avail. 

In [2]:
### Load libraries
# import flash_attn
from dotenv import main
import torch
import torch.nn as nn
import json
import jinja2
import os
import sys
import re
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig # for quantization
import plotly
from transformers import pipeline, set_seed
from tqdm import tqdm

# auth for gated repos (like llama) - gen token here: https://huggingface.co/settings/tokens
from huggingface_hub import notebook_login
notebook_login(os.getenv('HF_TOKEN'))

# model ids
model_id = ["microsoft/Phi-3-mini-4k-instruct"]

# Set seed for reproducibility 
torch.random.manual_seed(0)

# Increase max width of pd df columns 
pd.set_option('max_colwidth', 300)

# Instantiate jinja environment - used later for icl prompting 
environment = jinja2.Environment()

device = 'cuda'

# requirements.txt
# !pip3 freeze > requirements.txt

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
# Define utility functions 
# mem. monitoring! 
def check_memory():
    print("Allocated: %fGB"%(torch.cuda.memory_allocated(0)/1024/1024/1024))
    print("Reserved: %fGB"%(torch.cuda.memory_reserved(0)/1024/1024/1024))
    print("Total: %fGB"%(torch.cuda.get_device_properties(0).total_memory/1024/1024/1024))

# notification/text-to-speech
def text_to_speech(text):
    if sys.platform == 'darwin':
        os.system(f'say "{text}"')
    elif sys.platform.startswith('linux'):
        os.system(f'espeak "{text}"')
    else:
        print("Text-to-speech is not supported on this platform.")

# parse + template phi inputs
def parse_phi(messages: list[dict], append_response_start = True) -> str:
    """
    Converts a multi-turn conversation into a Llama-3-tokenizable input.

    Output format:
    # <s><|system|>
    # You are a helpful AI assistant.<|end|>
    # <|user|>
    # Guess my dog's name!<|end|>
    # <|assistant|>
    """
    format = '<s>'
    
    format += '\n'.join([f"<|{m['role']}|>\n{m['content']}<|end|>" for m in messages])

    if append_response_start:
        format += "\n<|assistant|>"
    
    return format

# print(parse_phi([
#     {'role': 'system', 'content': 'Hello'}, {'role': 'user', 'content': '1+1?'}, {'role': 'assistant', 'content': '2'}
# ], False))

# model eval
def eval_model(model, tokenizer, prompt):
    tokens = tokenizer(prompt, return_tensors = 'pt').to(device)
    model.eval()
    with torch.no_grad():
        res = model.generate(
            **tokens,
            max_new_tokens = 1,
            do_sample = False,
            temperature = 0.6,
            top_p = 0.9,
            eos_token_id = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids(tokenizer.eos_token)]
        )
    return tokenizer.batch_decode(res)[0]

# assess model perf
def get_model_performance(eval_df, base_model, tokenizer, verbose = False): 

    val = []
    for idx, row in tqdm(eval_df.iterrows()): 
        response = eval_model(model = base_model, tokenizer = tokenizer, prompt = row['llm_input'])

        # error handling for malformed outputs 
        response_json = re.findall(r'(?=.*"rationale")(?=.*"answer"){.*?}', response)[-1] # extract response + json

        # initialize keep_going + check if response_json is empty list 
        try:
            response_dict = json.loads(response_json)
            
            # validate model preds against correct answer 
            if response_dict['answer'] == row['solution']:
                # print('✅ Good answer - 😎👍')
                is_correct_pred = 1
            elif response_dict['answer'] != row['solution']: 
                # print('❌ Wrong answer!!') 
                is_correct_pred = 0
                
            # validation dictionary 
            val_dict = {'question': row['question'], 'response': response_json,
                        'difficulty': row['difficulty'],
                        'answer': response_dict['answer'],
                        'rationale': response_dict['rationale'],
                        'correct_solution': row['solution'],
                        'is_correct_pred': is_correct_pred} 
            # print(val_dict['question'], '\n\n')
            val.append(val_dict)
            keep_going = False
    
        except Exception as e:
            print("Exception occurred:", e)

    val_df = pd.DataFrame(val)

    # metrics 
    n_responses = len(val_df)
    accuracy = sum(val_df['is_correct_pred'])/n_responses

    if verbose == True: 
        perf_dict = {'responses': n_responses, 'accuracy': accuracy, 'val_dict': val}
    else: 
        perf_dict = {'responses': n_responses, 'accuracy': accuracy}
        
    return(perf_dict)

In [4]:
# Utility functions (cont.) - instantiate base_model; load eval_dict
def reload_base_model(model_id = "microsoft/Phi-3-mini-4k-instruct", add_tokenizer = True): 
    # Load bnb config, base model, and tokenizer
    bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_use_double_quant = True,
    bnb_4bit_quant_type = 'nf4',
    bnb_4bit_compute_dtype = torch.bfloat16
    )

    base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map = 'auto', # not sure what's up with device_map, but this is what causes errors
    quantization_config = bnb_config,
    trust_remote_code = True
    )

    if add_tokenizer == True: 
        # Load tokenizer - remove bos token since my function already pre-pends
        tokenizer = AutoTokenizer.from_pretrained(model_id,
                                                 add_eos_token = False,
                                                 add_bos_token = False,
                                                 padding_side = 'left')

    return(base_model)

def load_eval_df(file_path = os.getcwd() + '/data/question.json', includes_math = False): # turn off math for now due to high failure rate
    # load base prompt 
    bp_file_path = os.getcwd() + '/data/base_prompt.json'
    bp_json = json.load(open(bp_file_path))

    # load eval questions 
    q_json = json.load(open(file_path))

    if includes_math == True: 
        eval_df = pd.DataFrame(q_json).assign(
         full_question = lambda df: df.apply(lambda row: row['question'] + '\n' + '\n'.join([o['code'] + '. ' + o['text'] for o in row['options']]),  axis = 1),
         llm_input = lambda df: df.apply(lambda row: parse_phi(bp_json + [{'role': 'assistant', 'content': row['full_question']}]), axis = 1)
        )
    else: 
        eval_df = pd.DataFrame(q_json).assign(
         full_question = lambda df: df.apply(lambda row: row['question'] + '\n' + '\n'.join([o['code'] + '. ' + o['text'] for o in row['options']]),  axis = 1),
         llm_input = lambda df: df.apply(lambda row: parse_phi(bp_json + [{'role': 'assistant', 'content': row['full_question']}]), axis = 1)
        )

        eval_df = eval_df[eval_df['type'] != 'math']

    return(eval_df)

In [5]:
# Load bnb config, base model, and tokenizer
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_use_double_quant = True,
    bnb_4bit_quant_type = 'nf4',
    bnb_4bit_compute_dtype = torch.bfloat16
)

base_model = AutoModelForCausalLM.from_pretrained(
    model_id[0],
    device_map = 'auto', # not sure what's up with device_map, but this is what causes errors
    quantization_config = bnb_config,
    trust_remote_code = True
)

# Load tokenizer - remove bos token since my function already pre-pends
tokenizer = AutoTokenizer.from_pretrained(model_id[0],
                                         add_eos_token = False,
                                         add_bos_token = False,
                                         padding_side = 'left')

`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Initial eval. setup
Here, we template our questions and run an initial evaluation of phi-3's performance before modification.

In [6]:
# set base prompt 
base_prompt = [
    {
        "role": "system",
        "content": "You are a helpful, honest, and intelligent AI assistant who can only respond with a single JSON object. Solve each of the following questions. Return a JSON object containing two keys, `rationale` and `answer`."
    },
    {
        "role": "user",
        "content": "What's the integer ceiling of 5/3?\nA. 3\nB. 4.25\nC. dog\nD. 2"
    },
    {
        "role": "assistant",
        "content": '{"rationale": "5/3 is between 1 (3/3) and 2 (6/3), so the integer ceiling is 2.", "answer": "D"}'
    }, 
    {
        "role": "user",
        "content": "What's the capital of the U.S. state of Georgia?\nA. Tblisi\nB. Atlanta\nC. Nashville\nD. Toronto"
    },
    {
        "role": "assistant",
        "content": '{"rationale": "The capital of the U.S. state of Georgia is Atlanta, located in the Northwest of the state.", "answer": "B"}'
    }
]


In [7]:
# create eval/questions df 
# GPT-4 generation prompt
# I am benchmarking an LLM. I want you to create 100 MMLU-style questions. Return them in a JSON array of the format specified below. The questions should be a mix of easy/medium/hard difficulty. 
# The types should be "math", "extraction", "reasoning", "facts". 
# - "Math" questions should be related to arithmetic, calculus, or statistics. 
# - "Extraction" questions should focus on NLP-style NER tasks.
# - "Reasoning" should focus on logic. 
# - "Facts" should be focused on facts related to science or nature.
# Here is an example of a question (do not use this question).
# ```
# [
# {"question": "Suppose you have a data source that generates binary messages. Each message can either be 0 or 1. If both outcomes are equally likely, what is the entropy of this data source?", "options": [{"code": "A", "text": "0 bits"}, {"code": "B", "text": "0.5 bits"}, {"code": "C", "text": "1 bit"}, {"code": "D", "text": "2 bits"}], "solution": "C", "difficulty": "hard", "type": "math"},
# {"question": "What element is represented by the symbol 'Na' on the periodic table?", "options": [{"code": "A", "text": "Nitrogen"}, {"code": "B", "text": "Nickel"}, {"code": "C", "text": "Neon"}, {"code": "D", "text": "Sodium"}], "solution": "D", "difficulty": "easy", "type": "facts"},
# ]
# ```

# Load questions.json
q_file_path = os.getcwd() + '/data/question.json'
q_file = open(q_file_path)
q_list = json.load(q_file) # yields list of dicts 


# create list of dicts, with addtl. keys allocated for full question + llm input
eval_df = pd.DataFrame(q_list).assign(
     full_question = lambda df: df.apply(lambda row: row['question'] + '\n' + '\n'.join([o['code'] + '. ' + o['text'] for o in row['options']]),  axis = 1),
     llm_input = lambda df: df.apply(lambda row: parse_phi(base_prompt + [{'role': 'assistant', 'content': row['full_question']}]), axis = 1)
)

# print(len(eval_df)) 
# print(eval_df['llm_input'][0]) # check on single input to ensure correct structure :) 
# eval_df.groupby('difficulty').count() # overall eval metrics

In [None]:
# question + answer - generate validation dictionary 
# is it better to keep eval_df as a list of dicts like prev? isn't iterrows() slightly less efficient? 
curated_eval_df = eval_df[eval_df['type'] != 'math'] # have to remove math for now, failure rates too high; prob. issue w/ trying to output latex etc.

val = []
for idx, row in curated_eval_df.iterrows(): # limit to first 10 rows for now during testing 
    print(f"Now processing question {idx}") 

    # answer validation 
    keep_going = True 
    
    while keep_going == True: 
        # generate response 
        response = eval_model(model = base_model, tokenizer = tokenizer, prompt = row['llm_input'])
        # print(response)

        # error handling for malformed outputs 
        response_json = re.findall(r'(?=.*"rationale")(?=.*"answer"){.*?}', response)[-1] # extract response + json

        # initialize keep_going + check if response_json is empty list 
        try:
            response_dict = json.loads(response_json)
            
            # validate model preds against correct answer 
            if response_dict['answer'] == row['solution']:
                # print('✅ Good answer - 😎👍')
                is_correct_pred = 1
            elif response_dict['answer'] != row['solution']: 
                # print('❌ Wrong answer!!') 
                is_correct_pred = 0
                
            # validation dictionary 
            val_dict = {'question': row['question'], 'response': response_json,
                        'difficulty': row['difficulty'],
                        'answer': response_dict['answer'],
                        'rationale': response_dict['rationale'],
                        'correct_solution': row['solution'],
                        'is_correct_pred': is_correct_pred} 
            print(val_dict['question'], '\n\n')
            val.append(val_dict)
            keep_going = False

        except Exception as e:
            print("Exception occurred:", e)

# notify when execution finishes
text_to_speech("Hello, responses are done generating!")

In [None]:
# prediction summary (allow this to serve as control - eventually will want to store output more formally :))
val_df = pd.DataFrame(val)

# metrics 
n_responses = len(val_df)
accuracy = sum(val_df['is_correct_pred'])/n_responses

print(f"n_responses = {n_responses}\naccuracy: {accuracy}")

# Performance under perturbance

There are two steps here: **(1) need to identify phi-3 activation layers** (can do this by doing a forward pass, storing output, and looking at output distribution) and **(2) determine/carry out weight culling using a given method** (e.g. randomly killing weights, sort weights by magnitude and kill the smallest first, forward passes over multiple inputs + look at average firing for neurons and cull the weights feeding into neurons with low activation across inputs). 

In [None]:
base_model = AutoModelForCausalLM.from_pretrained(
    model_id[0],
    device_map = 'auto', # not sure what's up with device_map, but this is what causes errors
    quantization_config = bnb_config,
    trust_remote_code = True
)

In [None]:
# Objective: cull 20% of weights from a single layer (here, I'm working with one piece of an initial mlp layer) 
my_tensor = base_model.model.layers[0].self_attn.o_proj.weight

# calc # of weights to eliminate within layer 
num_elements = my_tensor.numel()
num_to_zero = int(num_elements * .20)

# Generate a mask w/ zeroes and ones - this helps mark out locations to be zeroed
mask = torch.zeros_like(my_tensor) # zeros_like returns tensor filled w/ 0-valued scalars; overall size is same as input
mask[num_to_zero:] = 1

# shuffle mask 
mask = mask[torch.randperm(num_elements)]

# check for # starting zeroes 
print(f'starting zeroes in my_tensor: {(my_tensor == 0).sum().item()}')

# element-wise multiplication to zero out weights 
my_tensor *= mask 

# check # 0s in resulting tensor - resulting zeroes can be slightly lower due to some starting zeroes
print(f'zeroes in resulting tensor: {(my_tensor == 0).sum().item()}, elements to zero: {num_to_zero}')

In [None]:
# load eval_df 
eval_df = load_eval_df(includes_math = True)

# assess perf
perf_dict = get_model_performance(eval_df.sample(n = 50), base_model = base_model, tokenizer = tokenizer, verbose = True)

# check perf 
print(f'# responses generated: {perf_dict["responses"]}, overall accuracy: {perf_dict["accuracy"]}')

# Tracking activations (w/ forward hooks) 


In [None]:
# re-instantiate model
base_model = reload_base_model()

In [19]:
base_model.model.layers[0].self_attn

Phi3Attention(
  (o_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
  (qkv_proj): Linear4bit(in_features=3072, out_features=9216, bias=False)
  (rotary_emb): Phi3RotaryEmbedding()
)

In [10]:
# Identify phi-3 activation layers 
layer_names = []
for idx, (name, param) in enumerate(base_model.named_parameters()): 

    # store layer names (for testing) 
    layer_names.append({'idx': idx, 'name': name, 'dims': param.shape})

# view layers 
pd.DataFrame(layer_names)


# Identify phi-3 activation layers 
# my_param = []
# for idx, (name, param) in enumerate(base_model.named_parameters()): 
#     if name == 'model.layers.0.mlp.gate_up_proj.weight':
#         # my_param = param
#         print(param)

Unnamed: 0,idx,name,dims
0,0,model.embed_tokens.weight,"(32064, 3072)"
1,1,model.layers.0.self_attn.o_proj.weight,"(4718592, 1)"
2,2,model.layers.0.self_attn.qkv_proj.weight,"(14155776, 1)"
3,3,model.layers.0.mlp.gate_up_proj.weight,"(25165824, 1)"
4,4,model.layers.0.mlp.down_proj.weight,"(12582912, 1)"
...,...,...,...
190,190,model.layers.31.mlp.down_proj.weight,"(12582912, 1)"
191,191,model.layers.31.input_layernorm.weight,"(3072,)"
192,192,model.layers.31.post_attention_layernorm.weight,"(3072,)"
193,193,model.norm.weight,"(3072,)"


In [25]:
for idx, (name, param) in enumerate(base_model.named_parameters()): 
    if name == 'model.embed_tokens.weight':
        p = param
        
a, b = p.size()

In [27]:
b

3072

In [37]:
base_model.model.layers[0].self_attn

Phi3Attention(
  (o_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
  (qkv_proj): Linear4bit(in_features=3072, out_features=9216, bias=False)
  (rotary_emb): Phi3RotaryEmbedding()
)

In [29]:
# store activations 
activation = {} 

# define function to capture activations 
def getActivation(name): 
    # hook signature 
    def hook(model, input, output): 
        activation[name] = output[0].detach() # in future, might want to check if tuple and if it is extract 0th elem. - for now, we're just losing the batch # in embed
    return hook

# set a single, sample input 
test_input = eval_df['llm_input'][0]

# register forward hooks on a chosen layer - let's choose model.layers.0.self_attn.o_proj.weight for now :) 
h1 = base_model.model.embed_tokens.register_forward_hook(getActivation('embed_layer'))
h2 = base_model.model.layers[0].self_attn.register_forward_hook(getActivation('self_attention'))

# forward pass + store activations for this pass - this step has an issue; 'tuple object has no attribute detach'
test_response = eval_model(model = base_model, tokenizer = tokenizer, prompt = test_input)

# print activation 
print(activation)

# detach hooks 
h1.remove()    
h2.remove()

You are not running the flash-attention implementation, expect numerical differences.


{'embed_layer': tensor([[-0.0303,  0.0913,  0.0562,  ...,  0.0101, -0.0216, -0.0242],
        [ 0.0076,  0.0106,  0.0063,  ..., -0.0194, -0.0118,  0.0005],
        [-0.0315,  0.0128,  0.0415,  ...,  0.0255,  0.0112,  0.0004],
        ...,
        [ 0.0228, -0.0231, -0.0232,  ...,  0.0137,  0.0439,  0.0144],
        [-0.0096,  0.0334,  0.0075,  ..., -0.0352,  0.0219,  0.0205],
        [-0.0074, -0.0078,  0.0261,  ..., -0.0342,  0.0139,  0.0206]],
       device='cuda:0', dtype=torch.float16), 'self_attention': tensor([[[-0.0095,  0.0123, -0.0048,  ...,  0.0309,  0.0044, -0.0198],
         [-0.0161,  0.0176, -0.0060,  ...,  0.0166,  0.0069, -0.0184],
         [ 0.0024, -0.0059,  0.0043,  ...,  0.0021, -0.0053,  0.0014],
         ...,
         [-0.0097,  0.0026, -0.0010,  ..., -0.0039, -0.0057, -0.0037],
         [-0.0065,  0.0099,  0.0004,  ..., -0.0040, -0.0082, -0.0068],
         [-0.0082,  0.0070,  0.0002,  ..., -0.0049, -0.0089, -0.0032]]],
       device='cuda:0', dtype=torch.float16)

In [31]:
activation['embed_layer'].shape

torch.Size([269, 3072])

In [None]:
# store activations 
activation = {} 

# define function to capture activations 
def getActivationTwo(name): 
    # hook signature 
    def hook(model, input, output): 
        activation[name] = output
    return hook

# set a single, sample input 
test_input = eval_df['llm_input'][0]

# register forward hooks on a chosen layer - let's choose model.layers.0.self_attn.o_proj.weight for now :) 
h2 = base_model.model.layers[0].self_attn.register_forward_hook(getActivationTwo('self_attn')) # tk - question about how we know what comprises a layer? what's the right level of hierarchy to discuss at?

# forward pass + store activations for this pass - this step has an issue; 'tuple object has no attribute detach'
test_response = eval_model(model = base_model, tokenizer = tokenizer, prompt = test_input)[0]

# print activation 
print(activation)

# detach hooks 
h2.remove()              

In [None]:
base_model.model.layers[0].self_attn

In [None]:
activation = {}
for name, child in base_model.model.layers[0].named_children(): 
    print(name, child)
    if name == 'mlp': 
        my_child_h1 = child.register_forward_hook(getActivation('mlp'))


# detach hooks at end to prevent memory errors 

outputs = []
with torch.no_grad(): 
    # base_model('banana')
    bn_tok = tokenizer('banana', return_tensors = 'pt').to(device)
    outputs.append(base_model(**bn_tok))

In [None]:
import torch
import numpy as np
import matplotlib.pyplot as plt 
mlp_output = activation['mlp'].squeeze().cpu().numpy().flatten() # squeeze to remove batch dim.; flatten - necessary for plotting 



In [None]:
mlp_output*100

In [None]:
plt.hist(mlp_output*100, bins = 20, alpha = 0.75)

In [None]:
# to get activations for internal layers do below
for name, child in my_child.named_children(): 
    print(name, child)
    if name == 'o_proj': 
        my_grandie_h1 = child.register_forward_hook(getActivation('o_proj'))



In [None]:
my_grandie_h1

In [None]:
# for module in base_model.model.layers[0].modules(): 
#     print(module)

layer = base_model.model.named_parameters()
for name, module in layer.named_children():
    print(name, module)


In [None]:
my_grandie_h1

In [None]:
base_model.model.named_parameters()

In [None]:
# Killing weights randomly across layers 
# Here, we simply kill a random portion of weights across layers, scaling up from 10-90% of weights in 5% increments. The only goal here is 
# to see how performance changes. 

# Desired output: line chart tracking successively increasing performance decay. 

# 1. Subset out activation layers (these are the ones we can modify) and count # of neurons across them so we can figure out which quantity = 10, 15, 20% etc. :) 
## a. Look at layer outputs; run forward pass and store intermediary computations - this tells us which are the activation layers 

In [None]:
# layer_names # these are the available layers 
torch.nn.ModuleList([layer for i, layer in enumerate(base_model.model.layers) if i != 16])