# Circuit Analysis

This notebook handles the simple, static parts of the circuit analysis; the more complex, interactive analysis happens in the app.



# Set Up

In [None]:
import sys 
sys.path.append('..')
import torch 
import json 
from IPython.display import display, HTML
from src.decision_transformer.utils import (
    load_decision_transformer,
    # get_max_len_from_model_type,
)
from src.environments.registration import register_envs
from src.environments.environments import make_env

# Register the project's environments so make_env can build them.
register_envs()

from src.config import EnvironmentConfig

model_path = "../models/MiniGrid-MemoryS7FixedStart-v0/WorkingModel.pt"
# Only the JSON-encoded environment config is read from this checkpoint here, so
# map everything to the CPU: a checkpoint saved on a GPU then also opens on a
# machine without one.
state_dict = torch.load(model_path, map_location="cpu")

# The environment config is stored in the checkpoint as a JSON string.
env_config = state_dict["environment_config"]
env_config = EnvironmentConfig(**json.loads(env_config))

# make_env returns a factory; calling it builds the environment itself.
env = make_env(env_config, seed=4200, idx=0, run_name="dev")
env = env()

dt = load_decision_transformer(
    model_path, env, tlens_weight_processing=True
)

## Functions

In [None]:
import pandas as pd
import itertools
import numpy as np

def tensor_to_long_data_frame(tensor_result, dimension_names):
    """Flatten a tensor into a long-format DataFrame with one row per element.

    Args:
        tensor_result: A torch tensor of any rank >= 1. It is detached and
            moved to the CPU before conversion, so tensors that require grad
            or live on another device are accepted.
        dimension_names: One column name per tensor dimension, in order.

    Returns:
        A DataFrame with one integer column per entry of ``dimension_names``
        (holding the element's index along that dimension) followed by a
        ``Score`` column holding the element's value. Rows are in row-major
        order (last dimension varies fastest).

    Raises:
        AssertionError: If the number of names differs from the tensor rank.
    """
    assert len(tensor_result.shape) == len(
        dimension_names
    ), "The number of dimension names must match the number of dimensions in the tensor"

    flat_scores = tensor_result.detach().cpu().reshape(-1).numpy()

    # from_product builds the same row-major index as enumerating every index
    # tuple, without materialising a Python tuple per element.
    index = pd.MultiIndex.from_product(
        [range(size) for size in tensor_result.shape],
        names=dimension_names,
    )
    df = pd.DataFrame({"Score": flat_scores}, index=index)
    return df.reset_index()


def get_row_names_from_index_labels(names, index_labels):
    """Build human-readable row names from the product of per-level labels.

    Args:
        names: Level names for the index; must contain 2 or 3 entries.
        index_labels: One iterable of labels per level. Every combination of
            labels (Cartesian product, in order) produces one row name.

    Returns:
        A Series indexed by the MultiIndex of label combinations whose values
        are ``"{0}, ({1},{2})"`` for three levels or ``"({0},{1})"`` for two.

    Raises:
        ValueError: If ``names`` does not have 2 or 3 entries.
    """
    # Pick the format up front so an unsupported dimensionality fails clearly.
    if len(names) == 3:
        template = "{0}, ({1},{2})"
    elif len(names) == 2:
        template = "({0},{1})"
    else:
        raise ValueError("Index labels must be 2 or 3 dimensional")

    indices = list(itertools.product(*index_labels))
    multi_index = pd.MultiIndex.from_tuples(
        indices,
        names=names,
    )
    return multi_index.to_series().apply(lambda labels: template.format(*labels))

## Components

In [None]:

# Pull detached copies of the weights used below out of the model.
# MLP input weights (transposed) for blocks 0, 1 and 2.
mlp0_in, mlp1_in, mlp2_in = (
    dt.transformer.blocks[layer].mlp.W_in.T.detach() for layer in range(3)
)

# MLP input biases for the same three blocks.
mlp0_in_bias, mlp1_in_bias, mlp2_in_bias = (
    dt.transformer.blocks[layer].mlp.b_in.detach() for layer in range(3)
)

# MLP output weights for the same three blocks.
mlp0_out, mlp1_out, mlp2_out = (
    dt.transformer.blocks[layer].mlp.W_out.detach() for layer in range(3)
)

# Attention value / output weights as exposed on the transformer
# (presumably stacked over layers and heads -- TODO confirm).
W_V = dt.transformer.W_V.detach()
W_O = dt.transformer.W_O.detach()

# Attention Head Stuff

In [None]:
# inner OV circuits.
# The einsum contracts the shared `d` axis of W_V (l, h, m, d) and W_O (l, h, d, n),
# giving one (m, n) matrix per layer/head.
W_OV = torch.einsum("lhmd,lhdn->lhmn", W_V, W_O)
print(W_OV.shape) # for each layer/head we have a mapping of in vectors to out vectors

# Unembedding Values
W_U = dt.action_predictor.weight
W_U = W_U.detach()
print(W_U.shape)


# NOTE: torch.linalg.svd returns (U, S, Vh). The third value, bound to `V` here, is
# the (conjugate-)transposed factor: its ROWS are the right singular vectors, while
# the left singular vectors are the COLUMNS of U.
U, S, V = torch.linalg.svd(W_OV)
print(U.shape, S.shape, V.shape)

In [None]:
# Dot every row of V (the right singular vectors, see torch.linalg.svd's Vh) with
# the action-predictor weight vectors.
projection = V @ W_U.T # this is the final circuit
projection.shape # for each layer/head we have a mapping to "out vectors"

In [None]:
# Inspect the shape of the singular-value tensor returned by the SVD.
S.shape

In [None]:
# What if we instead project onto W_in from the MLP?
W_in = mlp2_in # calculate the projection matrix
# W_in = mlp2_in / np.linalg.norm(mlp2_in, axis=1)[:, None] # normalize by the norm of the output vectors

# Recompute the SVD so U, S, V are fresh; V again holds the transposed factor (Vh).
U, S, V = torch.linalg.svd(W_OV)
# V = dt.transformer.ln_final(V) # layernorm

# Dot every row of V with every row of W_in (the block-2 MLP input weights).
projection = V @ W_in.T # this is the final circuit
projection.shape # for each layer/head we have a mapping to "out vectors"

# Note: layernorm might be worth applying; ignored for now.

In [None]:
# Norms taken along dim 0 of the V matrix belonging to layer 0, head 0.
V[0,0].norm(dim=0)

In [None]:
# Pass V through the transformer's final layer norm.
# NOTE(review): this rebinds the global V, so any cell executed afterwards that reads V
# sees the layer-normed tensor until the SVD is recomputed. `projection` was
# computed before this line and is not affected.
V = dt.transformer.ln_final(V)

In [None]:
# But we know the rank is much smaller, so keep only the first 32 entries along
# dim 2 (the singular-vector axis of the projection).
projection = projection[:,:,0:32]

In [None]:
# Flatten the projection into one row per (layer, head, singular direction, neuron)
# so it is easier to make sense of.
ov_mlp_in_congruence = tensor_to_long_data_frame(projection, ["layer", "head", "head_right_svd", "neuron"])

# Rename heads to L{layer}H{head}. Vectorised string concatenation gives the same
# labels as a row-wise apply(...format(*x)) but does not depend on the column
# order and avoids building a Series for every row.
ov_mlp_in_congruence["layer"] = ov_mlp_in_congruence["layer"].astype(str)
ov_mlp_in_congruence["head"] = (
    "L" + ov_mlp_in_congruence["layer"] + "H" + ov_mlp_in_congruence["head"].astype(str)
)

# Do the same for neurons; the projection used the block-2 MLP, hence the L2 prefix.
ov_mlp_in_congruence["neuron"] = "L2N" + ov_mlp_in_congruence["neuron"].astype(str)

print(ov_mlp_in_congruence.shape)
ov_mlp_in_congruence.head() # length 3*8*32*256

In [None]:
# Summary statistics of the congruence scores, including the extreme percentiles.
ov_mlp_in_congruence.Score.describe(percentiles=[0.001, 0.01, 0.25, 0.5, 0.95, 0.99, 0.999])

In [None]:
# Now let's visualize this: empirical CDF of all congruence scores.
import plotly.express as px
fig = px.ecdf(ov_mlp_in_congruence, x="Score")
# make plot smaller
fig.update_layout(
    autosize=False,
    width=500,
    height=300,
)
fig.show()

In [None]:
# Count how often each neuron appears among the strongly negative and the strongly
# positive congruence scores. The thresholds are hard-coded (presumably read off
# the describe() output above -- confirm when the model changes).
# A notebook cell only renders its last expression, so both tables are passed to
# display() explicitly; otherwise the negative-tail counts are computed and dropped.
display(ov_mlp_in_congruence[ov_mlp_in_congruence.Score < -0.277].neuron.value_counts())
display(ov_mlp_in_congruence[ov_mlp_in_congruence.Score > 0.286078].neuron.value_counts())

In [None]:
# Baseline for how many extreme edges a random selection contains: draw 768 random
# edges, count how many lie above the upper or below the lower threshold, and
# average the counts over 100 independent draws.
# The thresholds are hard-coded (presumably the 99.9th / 0.1th percentiles from the
# describe() output -- confirm when the model changes).
n_edges = 768
n_trials = 100
upper_threshold = 0.380826
lower_threshold = -0.377494

above_counts = []
below_counts = []
for trial in range(n_trials):
    scores = ov_mlp_in_congruence.sample(n_edges).Score.values
    above_counts.append((scores > upper_threshold).sum())
    below_counts.append((scores < lower_threshold).sum())

# Average number of edges per draw beyond each threshold.
print(np.mean(above_counts))
print(np.mean(below_counts))

In [None]:

# Restrict the congruence table to a hand-picked set of heads and plot their scores.
heads = ["L0H0", "L0H4", "L1H0", "L2H1"]
tmp = ov_mlp_in_congruence[ov_mlp_in_congruence["head"].isin(heads)]
px.strip(
    tmp,
    # x="neuron",
    y="Score",
    color="head",
    hover_data=["neuron", "head", "head_right_svd"],
    template="plotly_dark",
).show()

# display(tmp.sort_values("Score").head(5))
# ov_mlp_in_congruence.query("head == @head and layer == @layer").sort_values("Score").tail(5)

# make a table with the 5 highest-scoring rows for each head
top_neurons = tmp.groupby(["head", "layer"]).apply(lambda x: x.sort_values("Score").tail(5))
top_neurons = top_neurons.reset_index(drop=True)
# top_neurons

# ... and the 5 lowest-scoring rows for each head
bottom_neurons = tmp.groupby(["head", "layer"]).apply(lambda x: x.sort_values("Score").head(5))
bottom_neurons = bottom_neurons.reset_index(drop=True)

# bottom_neurons

# concatenate them (from here on top_neurons holds both extremes)
top_neurons = pd.concat([top_neurons, bottom_neurons])

# Get top connections by aggregating over head / head_right_svd; list the neurons and the scores.
# The group keys become index levels, so the duplicate columns are dropped afterwards.
top_connections = top_neurons.groupby(["head", "head_right_svd"]).apply(lambda x: x.sort_values("Score"))
top_connections.drop(["head", "head_right_svd", "layer"], axis=1, inplace=True)
# top_connections = top_connections.reset_index(drop=True)
top_connections

In [None]:
# Show only the top connections that belong to head L1H0.
top_connections.query("head == 'L1H0'")

In [None]:
# List neurons by head / head_right_svd.
# "level_2" is the column reset_index creates for the unnamed third index level.
top_connections.reset_index().drop("level_2", axis=1).groupby(["head", "head_right_svd"]).apply(lambda x: x["neuron"].tolist())

In [None]:
# List head / head_right_svd pairs by neuron.
# "level_2" is the column reset_index creates for the unnamed third index level.
top_connections.reset_index().drop("level_2", axis=1).groupby(["neuron"]).apply(lambda x: x[["head", "head_right_svd"]].values.tolist())

In [None]:
# Score distribution for a hand-picked list of layer-2 neurons, across all heads.
neurons= [108, 204, 1, 4, 235, 79, 63, 132, 169, 255, 151]
neurons= [f"L2N{n}" for n in neurons]
heads = ["L0H0", "L0H4", "L1H0", "L2H1"]
tmp = ov_mlp_in_congruence.copy()
# tmp = tmp[tmp["head"].isin(heads)]
# Flag columns; neither is used by the plot below.
tmp["in_candidate_heads"] = tmp["head"].isin(heads)
# NOTE(review): the name says "top 5" but `< 6` keeps six indices (0..5) -- confirm which is intended.
tmp["in_top_5_svd"] = tmp["head_right_svd"] < 6
tmp = tmp[tmp["neuron"].isin(neurons)]
# tmp = tmp[tmp["head_right_svd"] < 6]
# tmp = tmp.sort_values("neuron")
px.strip(
    tmp,
    x="neuron",
    y="Score",
    color="neuron",
    hover_data=["neuron", "head", "head_right_svd"],
    template="plotly_dark",
).show()

In [None]:
# Score distribution for the single neuron L2N108, coloured by head.
# `neurons` and `heads` are re-declared but not used by this cell (the filters are commented out).
neurons= [108, 204, 1, 4, 235, 79, 63, 132, 169, 255, 151]
neurons= [f"L2N{n}" for n in neurons]
heads = ["L0H0", "L0H4", "L1H0", "L2H1"]
tmp = ov_mlp_in_congruence.copy()
# tmp = tmp[tmp["head"].isin(heads)]
# tmp["in_candidate_heads"] = tmp["head"].isin(heads)
# tmp["in_top_5_svd"] = tmp["head_right_svd"] < 6
tmp = tmp[tmp["neuron"] == "L2N108"]
# tmp = tmp[tmp["head_right_svd"] < 6]
# tmp = tmp.sort_values("neuron")
px.strip(
    tmp,
    x="neuron",
    y="Score",
    color="head",
    hover_data=["neuron", "head", "head_right_svd"],
    template="plotly_dark",
).show()

In [None]:
# Mean absolute congruence per (neuron, head) pair, averaged over singular directions.
neurons= [108, 204, 1, 4, 235, 79, 63, 132, 169, 255, 151]
neurons= [f"L2N{n}" for n in neurons]
heads = ["L0H0", "L0H4", "L1H0", "L2H1"]
tmp = ov_mlp_in_congruence.copy()
tmp["Score"] = tmp["Score"].abs()
# group by neurons and heads and calculate the average score
tmp = tmp.groupby(["neuron", "head"]).Score.mean().reset_index()
# mark the pairs whose head is one of the hand-picked candidate heads
tmp['candidate_head'] = tmp["head"].isin(heads)
# tmp
# one ECDF curve per neuron (all neurons, not only the hand-picked ones)
fig = px.ecdf(tmp, x = "Score", color = "neuron")
fig.show()

# strip plot restricted to the hand-picked neurons
px.strip(
    tmp[tmp.neuron.isin(neurons)],
    x="neuron",
    y="Score",
    color="candidate_head",
    hover_data=["neuron", "head"],
    template="plotly_dark",
).show()


In [None]:
# Inspect the size of the most recently built tmp frame.
tmp.shape

In [None]:
# Neuron x head table of the maximum absolute congruence, for the hand-picked heads and neurons.
neurons= [108, 204, 1, 4, 235, 79, 63, 132, 169, 255, 151]
neurons= [f"L2N{n}" for n in neurons]
heads = ["L0H0", "L0H4", "L1H0", "L2H1"]
tmp = ov_mlp_in_congruence.copy()
tmp["Score"] = tmp["Score"].abs()
# replace "Score" with rank in entire dataset
# tmp["Score"] = tmp["Score"].rank().astype(int)

# filter by heads and neurons
tmp = tmp[tmp["head"].isin(heads)]
tmp = tmp[tmp["neuron"].isin(neurons)]

# aggregate by head/neuron and take the max score
tmp = tmp.groupby(["neuron", "head"]).Score.max().reset_index()
# pivot the table: one row per neuron, one column per head
tmp = tmp.pivot(index="neuron", columns="head", values="Score")
# color the table; the colour scale is fixed to a hard-coded range [0, 0.524874]
tmp.style.background_gradient(cmap="Blues", vmin =0, vmax = 0.524874)

In [None]:
# let's make a graph. 
heads = ["L0H0", "L0H4", "L1H0", "L2H1"]
neurons= [108, 204, 1, 4, 235, 79, 63, 132, 169, 255, 151]
neurons = [f"L2N{n}" for n in neurons]

# Label every head / singular-direction pair, e.g. "L0H0 -> 3".
# Note this adds a column to the shared ov_mlp_in_congruence frame itself.
ov_mlp_in_congruence["head_direction"] = ov_mlp_in_congruence["head"] + " -> " + ov_mlp_in_congruence["head_right_svd"].astype(str)

# now let's do some arbitrary filtering: 
tmp = ov_mlp_in_congruence.copy()

# filter by heads and neurons
tmp = tmp[tmp["head"].isin(heads)]
tmp = tmp[tmp["neuron"].isin(neurons)]

# Keep only the tails: scores below the 1% quantile or above the 99% quantile.
# The quantiles are computed on the already-filtered tmp, not on the full table.
q = tmp["Score"].quantile([0.01, 0.99])
tmp = tmp[(tmp["Score"] < q[0.01]) | (tmp["Score"] > q[0.99])]
tmp.shape

In [None]:
import networkx as nx 
from pyvis.network import Network

# Create a graph with one edge per remaining row: head direction -> neuron, carrying the score.
G = nx.from_pandas_edgelist(tmp, source="head_direction", target="neuron", edge_attr="Score")

# TODO: print out the number of disconnected components
# Colors per head, Wes Anderson palette.
# NOTE(review): color_head_map is defined but never applied to the graph in this cell.
color_head_map = {
    "L0H0": "#2A9D8F",
    "L0H4": "#E9C46A",
    "L1H0": "#F4A261",
    "L2H1": "#E76F51",
}


# Create a pyvis network from the graph and write it to tmp.html.
nt = Network("1000px", "2000px", notebook=True, bgcolor="#222222", font_color="white")
nt.from_nx(G)
nt.show("tmp.html")

# Heads SVD to Actions

In [None]:
# For completeness, also look at how each head's singular directions write
# directly to the action outputs.
# NOTE(review): V is whatever the most recently executed cell left behind (raw
# SVD factor or its layer-normed version) -- confirm which one is intended here.
unembed_projection = V @ W_U.T  # this is the final circuit
unembed_projection = unembed_projection[:, :, 0:32]  # keep the first 32 singular directions
ov_unembed_congruence = tensor_to_long_data_frame(unembed_projection, ["layer", "head", "head_right_svd", "Action"])

# Rename heads to L{layer}H{head}. Vectorised string concatenation gives the same
# labels as a row-wise apply(...format(*x)) without depending on column order.
ov_unembed_congruence["layer"] = ov_unembed_congruence["layer"].astype(str)
ov_unembed_congruence["head"] = (
    "L" + ov_unembed_congruence["layer"] + "H" + ov_unembed_congruence["head"].astype(str)
)

display(ov_unembed_congruence.head())
ov_unembed_congruence.Score.describe(percentiles=[0.001, 0.01, 0.25, 0.5, 0.95, 0.99, 0.999])

In [None]:
# Strip plot of every head-direction -> action score, coloured by head.
# heads = ["L0H0", "L0H4", "L1H0", "L2H1"]
# tmp = ov_mlp_in_congruence[ov_mlp_in_congruence["head"].isin(heads)]
px.strip(ov_unembed_congruence,
    y="Score",
    color="head",
    hover_data=["Action", "head", "head_right_svd"],
    template="plotly_dark",
).show()

In [None]:
# Convert long to wide: one row per (layer, head, head_right_svd), one score column per action.
from src.streamlit_app.constants import ACTION_NAMES

ov_unembed_congruence_wide = ov_unembed_congruence.pivot(index=["layer", "head", "head_right_svd"], columns="Action", values="Score")
# Name the action columns via ACTION_NAMES so later cells can refer to them by
# name (e.g. "left" / "right"). Renaming the wide columns, rather than rewriting
# the long frame's "Action" column, keeps this cell safe to re-run; columns that
# are already strings are left untouched.
ov_unembed_congruence_wide = ov_unembed_congruence_wide.rename(
    columns=lambda action: action if isinstance(action, str) else ACTION_NAMES[int(action)]
)
# reset column index
ov_unembed_congruence_wide.reset_index(inplace=True)
# set opacity column to be 1 - head_right_svd / 32, so earlier singular directions are more opaque
ov_unembed_congruence_wide["opacity"] = 1 - ov_unembed_congruence_wide["head_right_svd"] / 32
ov_unembed_congruence_wide.opacity.max()

In [None]:

# Scatter plot of the "left" score against the "right" score for every head direction.
# NOTE(review): this needs "left" and "right" columns in the wide frame, i.e. the
# action indices must have been mapped to names before this cell runs.
# NOTE(review): `opacity` receives a whole column here; plotly express documents it
# as a single float -- confirm the per-point opacity renders as intended.
fig = px.scatter(
    ov_unembed_congruence_wide,
    y="left",
    x="right",
    color="head",
    opacity= ov_unembed_congruence_wide["opacity"],
    hover_data=["head", "head_right_svd"],
    template="plotly_dark",
)
# get max abs value in either left or right (plus a small margin)
max_abs = np.max(np.abs(ov_unembed_congruence_wide[["left", "right"]].values))+0.05
# set the same symmetric range on both axes
fig.update_layout(xaxis=dict(range=[-max_abs, max_abs]), yaxis=dict(range=[-max_abs, max_abs]))
# fig.update_layout(xaxis=dict(aspectmode='equal'), yaxis=dict(aspectmode='equal'))
# add dotted grey y=x line
fig.add_shape(
    type="line",
    x0=-max_abs,
    y0=-max_abs,
    x1=max_abs,
    y1=max_abs,
    line=dict(
        color="Grey",
        width=1,
        dash="dot",
    ),
)

fig.show()


# State Observations into Heads SVD

In [None]:
# get the labels. 
from src.streamlit_app.constants import SPARSE_CHANNEL_NAMES
import itertools 

# One label per (channel, x, y) combination, with x and y each ranging over 0..6.
all_index_labels = [
    SPARSE_CHANNEL_NAMES,
    list(range(7)),
    list(range(7)),
]
indices = list(itertools.product(*all_index_labels))
# formatted as "<channel>, (<x>,<y>)"
index_labels = ["{0}, ({1},{2})".format(*index) for index in indices]
print(index_labels[:4])

# extract just the channels (the text before the first comma of each label)
channel_labels = [label.split(",")[0] for label in index_labels]
print(channel_labels[:4])

# Transposed state-embedding weights (presumably one row per label above -- TODO confirm).
embedding = dt.state_embedding.weight.detach().T
print(embedding.shape)

In [None]:
# Now we can get the cosine similarity matrix, but first let's filter for the channels we care about.
channels_we_care_about = ["key", "ball", "unseen", "empty", "green", "grey", "red"]
index_mask = [label in channels_we_care_about for label in channel_labels]
print(sum(index_mask)) # 7*7*7 = 343 channels

# Keep only the embedding rows (and their labels) that belong to those channels.
restricted_embeddings = embedding[index_mask]
restricted_labels = [label for label, mask in zip(index_labels, index_mask) if mask]
print(restricted_embeddings.shape)
print(len(restricted_labels))

# Normalise every row to unit L2 norm. torch's own norm keeps the whole computation
# in torch (no implicit tensor -> NumPy -> tensor round trip) and also works for
# tensors that are not on the CPU.
restricted_embeddings = restricted_embeddings / restricted_embeddings.norm(dim=1, keepdim=True)

In [None]:
# Project the (normalised) state embeddings onto the input directions of every OV circuit.
# W_in = mlp2_in # calculate the projection matrix
# # W_in = mlp2_in / np.linalg.norm(mlp2_in, axis=1)[:, None] # normalize by the norm of the output vectors

U, S, V = torch.linalg.svd(W_OV)
# V = dt.transformer.ln_final(V) # layernorm

# The left singular vectors are the COLUMNS of U. Keep the first 32 of them and
# transpose so that each row is one singular vector; slicing the row axis instead
# would truncate every vector to its first 32 coordinates.
U = U[..., :32].transpose(-2, -1)

# Dot every retained left singular vector with every restricted embedding.
projection = U @ restricted_embeddings.T  # this is the final circuit
projection.shape # for each layer/head: (singular vector, embedding) scores

# # note layernorm might be worth applying, ignore for now. 


In [None]:
# Flatten the projection into one row per (layer, head, left singular direction, embedding).
ov_embedding_congruence = tensor_to_long_data_frame(projection, ["layer", "head", "head_left_svd", "embedding"])

# Replace the integer embedding index with its restricted label.
ov_embedding_congruence["embedding"] = ov_embedding_congruence["embedding"].map(dict(zip(range(len(restricted_labels)), restricted_labels)))

# Rename heads to L{layer}H{head}. Vectorised string concatenation gives the same
# labels as a row-wise apply(...format(*x)) without depending on column order.
ov_embedding_congruence["head"] = (
    "L" + ov_embedding_congruence["layer"].astype(str) + "H" + ov_embedding_congruence["head"].astype(str)
)
# The layer is now encoded in the head label, so drop the separate column.
ov_embedding_congruence.drop(columns=["layer"], inplace=True)

display(ov_embedding_congruence.Score.describe(percentiles=[0.001, 0.01, 0.25, 0.5, 0.95, 0.99, 0.999]))
ov_embedding_congruence # one row per (layer, head, kept singular direction, restricted embedding)

In [None]:
# Summary statistics of the embedding congruence scores, including the extreme percentiles.
display(ov_embedding_congruence.Score.describe(percentiles=[0.001, 0.01, 0.25, 0.5, 0.95, 0.99, 0.999]))

In [None]:

import plotly.express as px 

# Per-head scores for the 'ball' and 'key' embeddings at position (2,6), one facet per embedding.
# heads = ["L0H0", "L0H4", "L1H0", "L2H1"]
px.strip(
    ov_embedding_congruence.query("embedding == 'ball, (2,6)' or embedding == 'key, (2,6)'"),
    y="Score",
    color="head",
    x="head",
    facet_col="embedding",
    hover_data=["embedding", "head", "head_left_svd"],
    template="plotly_dark",
).show()



In [None]:
# Per-head scores for the 'key' and 'ball' embeddings at positions (1,2) and (5,2).
tmp = ov_embedding_congruence.query("embedding == 'key, (1,2)' or embedding == 'key, (5,2)' or embedding == 'ball, (1,2)' or embedding == 'ball, (5,2)'")
px.strip(
    tmp,
    y="Score",
    color="head",
    x="head",
    facet_col="embedding",
    hover_data=["embedding", "head", "head_left_svd"],
    template="plotly_dark",
).show()


In [None]:
# Per-head scores for the 'key' and 'ball' embeddings at positions (1,6) and (5,6).
tmp = ov_embedding_congruence.query("embedding == 'key, (1,6)' or embedding == 'key, (5,6)' or embedding == 'ball, (1,6)' or embedding == 'ball, (5,6)'")
px.strip(
    tmp,
    y="Score",
    color="head",
    x="head",
    facet_col="embedding",
    hover_data=["embedding", "head", "head_left_svd"],
    template="plotly_dark",
).show()

In [None]:
import plotly.express as px 

# All embedding scores for the hand-picked candidate heads.
heads = ["L0H0", "L0H4", "L1H0", "L2H1"]
px.strip(
    ov_embedding_congruence[ov_embedding_congruence["head"].isin(heads)],
    y="Score",
    color="head",
    x="head",
    hover_data=["embedding", "head", "head_left_svd"],
    template="plotly_dark",
).show()



In [None]:
# All embedding scores for the eight layer-0 heads; points are coloured by whether
# their embedding label contains the literal text "ball, (2,6)".
heads = [f"L0H{i}" for i in range(8)]
px.strip(
    ov_embedding_congruence[ov_embedding_congruence["head"].isin(heads)],
    y="Score",
    color=ov_embedding_congruence[ov_embedding_congruence["head"].isin(heads)].embedding.str.contains("ball, (2,6)", regex=False),
    x="head",
    hover_data=["embedding", "head", "head_left_svd"],
    template="plotly_dark",
).show()


In [None]:
# Use every head here (the head filter is commented out).
tmp = ov_embedding_congruence#[ov_embedding_congruence["head"].isin(heads)]

# make a table with the 5 highest-scoring embeddings for each head
top_embeddings = tmp.groupby(["head"]).apply(lambda x: x.sort_values("Score").tail(5))
top_embeddings = top_embeddings.reset_index(drop=True)
# top_neurons

# ... and the 5 lowest-scoring embeddings for each head
bottom_embeddings = tmp.groupby(["head"]).apply(lambda x: x.sort_values("Score").head(5))
bottom_embeddings = bottom_embeddings.reset_index(drop=True)

# bottom_neurons

# concatenate them (from here on top_embeddings holds both extremes)
top_embeddings = pd.concat([top_embeddings, bottom_embeddings])

# Get top connections by aggregating over head / head_left_svd; list the embeddings and the scores.
# NOTE: this rebinds top_connections, replacing any table of the same name built earlier.
top_connections = top_embeddings.groupby(["head", "head_left_svd"]).apply(lambda x: x.sort_values("Score"))
top_connections.drop(["head", "head_left_svd"], axis=1, inplace=True)
# top_connections = top_connections.reset_index(drop=True)
top_connections

In [None]:
# Bar chart of how often each embedding appears among the top connections.
px.bar(top_connections.reset_index().embedding.value_counts())

In [None]:
# List embeddings by head / head_left_svd.
# "level_2" is the column reset_index creates for the unnamed third index level.
top_connections.reset_index().drop("level_2", axis=1).groupby(["head", "head_left_svd"]).apply(lambda x: x["embedding"].tolist())

In [None]:
# List head / head_left_svd pairs by embedding.
# "level_2" is the column reset_index creates for the unnamed third index level.
top_connections.reset_index().drop("level_2", axis=1).groupby(["embedding"]).apply(lambda x: x[["head", "head_left_svd"]].values.tolist())

# Comparing in to out

In [None]:
# If the SVD decomposition is meaningful, the inputs/outputs tied to a singular
# direction of head L0H0 should make sense.

# Ten highest- and ten lowest-scoring embeddings for L0H0, head_left_svd == 1.
display(ov_embedding_congruence.query("head == 'L0H0' and head_left_svd == 1").sort_values("Score", ascending=False).head(10))
display(ov_embedding_congruence.query("head == 'L0H0' and head_left_svd == 1").sort_values("Score", ascending=False).tail(10))

In [None]:
# Ten highest- and ten lowest-scoring embeddings for L1H0, head_left_svd == 1.
display(ov_embedding_congruence.query("head == 'L1H0' and head_left_svd == 1").sort_values("Score", ascending=False).head(10))
display(ov_embedding_congruence.query("head == 'L1H0' and head_left_svd == 1").sort_values("Score", ascending=False).tail(10))

In [None]:
# Ten highest-scoring embeddings for L1H0, head_left_svd == 1 (same query as the first display of the previous cell).
display(ov_embedding_congruence.query("head == 'L1H0' and head_left_svd == 1").sort_values("Score", ascending=False).head(10))

In [None]:

# # let's flatten this so we can make easier sense of it. 
# ov_mlp_in_congruence = tensor_to_long_data_frame(projection, ["layer", "head", "head_right_svd", "neuron"])
# # rename heads to L{Layer}H{head}
# ov_mlp_in_congruence["head"] = ov_mlp_in_congruence["head"].astype(str)
# ov_mlp_in_congruence["layer"] = ov_mlp_in_congruence["layer"].astype(str)
# ov_mlp_in_congruence["head"] = ov_mlp_in_congruence.apply(lambda x: "L{0}H{1}".format(*x), axis=1)

# #Let's do the same thing for Neurons
# ov_mlp_in_congruence["neuron"] = "L2N" + ov_mlp_in_congruence["neuron"].astype(str)

# print(ov_mlp_in_congruence.shape)
# ov_mlp_in_congruence.head() # length 3*8*32*256

In [None]:
# Inspect the shape of the stacked OV matrices.
W_OV.shape

In [None]:
import torch.nn.functional as F
# get cosine similarity of OV vectors with themselves
# NOTE(review): both arguments are the same tensor, so each entry compares a
# vector (taken along dim=2) with itself, which is presumably ~1 everywhere.
# A pairwise comparison between different vectors would need different operands
# -- confirm the intent.

w_OV_similarity = F.cosine_similarity(W_OV, W_OV, dim=2)

In [112]:
# Inspect the shape of the similarity result (dim 2 has been reduced away).
w_OV_similarity.shape

torch.Size([3, 8, 256])