# Set Up


In [1]:
!nvidia-smi 
%load_ext autoreload
%autoreload 2
%reload_ext autoreload
import os
if "cd" not in globals():
    os.chdir("../")
    cd = True
print(os.getcwd())
save_dir = 'paper/figures'

Wed Jan 18 14:50:34 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 465.19.01    CUDA Version: 11.3     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA Tesla V1...  On   | 00000000:00:05.0 Off |                    0 |
| N/A   55C    P0   127W / 300W |   1413MiB / 32510MiB |     56%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import pandas as pd
import einops
import wandb
import statsmodels.api as sm
import json
import pickle 
import copy

from tqdm.notebook import tqdm

# plotting
from functools import partial
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
import plotly.subplots

pio.renderers.default = "vscode"

# my own tooling
from utils.hook_points import HookPoint, HookedRootModule
from utils.plotting import *
from utils.groups import *
from utils.models import *
from utils.config import *
from utils.figures import *

In [3]:
if torch.cuda.is_available:
    print('Good to go!')
else:
    print('Things might be rather slow')

Good to go!


In [4]:
task_dir = "paper/mainline-S5"

seed, frac_train, layers, lr, group_param, weight_decay, betas, num_epochs, group_type, architecture_type = load_cfg(task_dir)
group = group_type(group_param, init_all = True)
model = architecture_type(layers, group.order, seed).cuda()
model.load_state_dict(torch.load(f"{task_dir}/model.pt"), strict=False)
model.eval()
all_data, _, all_labels, _, _= generate_train_test_data(group, frac_train = 1)
logits, activations = model.run_with_cache(group.all_data, return_cache_object=False)
activations['logits'] = logits
#metric_obj = Metrics(group, training=False, track_metrics = True)
key_reps= []
with open(os.path.join(task_dir, 'key_reps.txt'), 'r') as f:
    for line in f:
        key_reps.append(line.strip())
metrics_path = os.path.join(task_dir, 'metrics.csv')
summary_metrics_path = os.path.join(task_dir, 'summary_metrics.json')

# load the metrics
metrics = pd.read_csv(metrics_path)
summary_metrics = json.load(open(summary_metrics_path, 'r'))

reps_to_plot = list(group.non_trivial_irreps.keys())

Computing multiplication table...
... loading from file
Computing trace tensor cube for trivial representation
... loading from file
Computing trace tensor cube for sign representation
... loading from file
Computing trace tensor cube for standard representation
... loading from file
Computing trace tensor cube for standard_sign representation
... loading from file
Computing trace tensor cube for s5_5d_a representation
... loading from file
Computing trace tensor cube for s5_5d_b representation
... loading from file
Computing trace tensor cube for s5_6d representation
... loading from file


# Logit Attribution


In [5]:
# figure: a slice of true vs hypothetical logit cubes
logits = activations['logits']
logits = logits.reshape(group.order, group.order, group.order)

true_logits = logits[:, :, 0]/logits[:, :, 0].abs().max()
key_rep_logits = []
for key_rep in key_reps:
    key_rep_logits.append(group.irreps[key_rep].logit_trace_tensor_cube[:, :, 0]/group.irreps[key_rep].logit_trace_tensor_cube[:, :, 0].abs().max())
key_rep_logits = torch.stack(key_rep_logits, dim=0)

stack = torch.vstack([true_logits.unsqueeze(0), key_rep_logits])
print(stack.shape)
fig = px.imshow(to_numpy(stack), color_continuous_scale='RdBu', color_continuous_midpoint=0.0, title=f'logit 0', facet_col=0, labels={'x':'b', 'y':'a', 'facet_col': 'label'})
fig.layout.annotations[0]['text'] = 'true logit 0 over all inputs' 
for i in range(len(key_reps)):
    fig.layout.annotations[i+1]['text'] = f"{key_reps[i]} hypothesised logit 0"
fig.show()
fig.write_image(f"{save_dir}/logit_cubes.png", width=1000)


torch.Size([3, 120, 120])


In [6]:
# figure: evolution of cosine similarity
template = "logit_{}_rep_trace_similarity"
lines_from_template(metrics, template, reps_to_plot, title="cosine similarity of true logits and hypothesised logits", yaxis="cosine similarity", save=f"{save_dir}/logit_similarity.png", log_x=True, legend_pos='tl')


Saving to paper/figures/logit_similarity.png


In [7]:
# evidence: end of training logit cosine similarity

template = "logit_{}_rep_trace_similarity"
print("cosine similarity of true logits and hypothesised logits at end of training")
for irrep in group.non_trivial_irreps:
    print(f"{irrep}: {summary_metrics[template.format(irrep)]}")


cosine similarity of true logits and hypothesised logits at end of training
sign: 0.5090430974960327
standard: 0.7674336433410645
standard_sign: -1.102307578548789e-09
s5_5d_a: -1.0477378964424133e-09
s5_5d_b: 0.001785235945135355
s5_6d: 5.187757778912783e-09


In [8]:
# percentage explained
print(summary_metrics['percent_logits_explained'])

0.848082423210144


# Embeddings and Unembeddings

In [9]:
# figure: percent a, b, c embed by representation over course of training

template = "percent_x_embed_{}_rep"
lines_from_template(metrics, template, reps_to_plot, title="Fraction of variance of left embedding explained by representation", yaxis="fraction of variance", save=f"{save_dir}/percent_x_embed.png", log_x=True, legend_pos='tl')
template = "percent_y_embed_{}_rep"
lines_from_template(metrics, template, reps_to_plot, title="Fraction of variance of right embedding explained by representation", yaxis="fraction of variance", save=f"{save_dir}/percent_y_embed.png", log_x=True, legend_pos='tl')
template = "percent_unembed_{}_rep"
lines_from_template(metrics, template, reps_to_plot, title="Fraction of variance of unembedding explained by representation", yaxis="fraction of variance", save=f"{save_dir}/percent_unembed.png", log_x=True, legend_pos='tl')

Saving to paper/figures/percent_x_embed.png
Saving to paper/figures/percent_y_embed.png
Saving to paper/figures/percent_unembed.png


In [10]:
# table: percents explained
print("fraction of variance explained by representation at end of training")
print("x, y, unembed")
for rep in key_reps:
    print(f"{rep}: {summary_metrics['percent_x_embed_{}_rep'.format(rep)]}, {summary_metrics['percent_y_embed_{}_rep'.format(rep)]}, {summary_metrics['percent_unembed_{}_rep'.format(rep)]}")

fraction of variance explained by representation at end of training
x, y, unembed
standard: 0.9304873943328857, 0.9304895401000977, 0.8446281552314758
sign: 0.06951265037059784, 0.06951045244932175, 0.09584786742925644


# Hidden Layer Neurons

In [11]:
# figure: evolution of \rho(a), \rho(b), \rho(ab)
template = "percent_hidden_{}_rep"
lines_from_template(metrics, template, reps_to_plot, title="Fraction of variance of MLP neurons explained by representation", yaxis="fraction of variance", save=f"{save_dir}/percent_hidden.png", log_x=True, legend_pos='tl')

Saving to paper/figures/percent_hidden.png


In [12]:
# evidence: neuron clustering pre ReLU

threshold = 1

x_embed = model.x_embed
y_embed = model.y_embed

x_embed_summed = x_embed.pow(2).sum(dim=0)
off_neurons_x = (x_embed_summed < threshold).nonzero().squeeze()

y_embed_summed = y_embed.pow(2).sum(dim=0)
off_neurons_y = (y_embed_summed < threshold).nonzero().squeeze()

assert (off_neurons_x == off_neurons_y).all()

off_neurons = off_neurons_x

print(f'Off neurons: {len(off_neurons)}, {off_neurons}')

rep_neurons = {}

print('Neurons corresponding to each representation')
for rep_name in group.non_trivial_irreps:
    rep = group.irreps[rep_name].orth_rep
    coefs_x = rep.T @ x_embed
    coefs_y = rep.T @ y_embed
    coefs_x_summed = coefs_x.pow(2).sum(dim=0)
    coefs_y_summed = coefs_y.pow(2).sum(dim=0)

    x_neurons = (coefs_x_summed > threshold).nonzero().squeeze()
    y_neurons = (coefs_y_summed > threshold).nonzero().squeeze()
    assert (x_neurons == y_neurons).all()
    x_neurons = torch.tensor(x_neurons)
    if x_neurons.dim() == 0:
        x_neurons = x_neurons.unsqueeze(0)
    rep_neurons[rep_name] = x_neurons
    print(f'{rep_name}: {len(x_neurons)}, {x_neurons}')

print(rep_neurons)

all_neurons = torch.arange(model.W_U.shape[0])
unaccounted_neurons = set(all_neurons.tolist())
unaccounted_neurons -= set(off_neurons.tolist())
for rep_name, neurons in rep_neurons.items():
    unaccounted_neurons -= set(neurons.tolist())

print('Unaccounted neurons')
print(unaccounted_neurons)

Off neurons: 2, tensor([ 48, 110], device='cuda:0')
Neurons corresponding to each representation
sign: 7, tensor([  2,   8,  17,  65, 111, 113, 120], device='cuda:0')
standard: 119, tensor([  0,   1,   3,   4,   5,   6,   7,   9,  10,  11,  12,  13,  14,  15,
         16,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,
         31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,
         45,  46,  47,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,
         60,  61,  62,  63,  64,  66,  67,  68,  69,  70,  71,  72,  73,  74,
         75,  76,  77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,
         89,  90,  91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102,
        103, 104, 105, 106, 107, 108, 109, 112, 114, 115, 116, 117, 118, 119,
        121, 122, 123, 124, 125, 126, 127], device='cuda:0')
standard_sign: 0, tensor([], device='cuda:0', dtype=torch.int64)
s5_5d_a: 0, tensor([], device='cuda:0', dtype=torch.int64)
s5_5d_b: 


To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).



In [13]:
# evidence and table: neuron clustering in post hidden layer

threshold = 110

hidden = activations['hidden'].reshape(group.order**2, -1)

hidden_summed = hidden.pow(2).sum(dim=0)
off_neurons = (hidden_summed < threshold).nonzero().squeeze()

assert (off_neurons == off_neurons_x).all()

print(f'Off neurons: {off_neurons}')


fracs_explained_x = {}
fracs_explained_y = {}
fracs_explained_xy = {}
fracs_explained_trivial = {}

for rep_name in group.irreps.keys():
    group.irreps[rep_name].hidden_reps_x = group.irreps[rep_name].rep[all_data[:, 0]].reshape(group.order**2, -1)
    group.irreps[rep_name].hidden_reps_x_orth = torch.linalg.qr(group.irreps[rep_name].hidden_reps_x)[0]
    group.irreps[rep_name].hidden_reps_y = group.irreps[rep_name].rep[all_data[:, 1]].reshape(group.order**2, -1)
    group.irreps[rep_name].hidden_reps_y_orth = torch.linalg.qr(group.irreps[rep_name].hidden_reps_y)[0]
    group.irreps[rep_name].hidden_reps_xy = group.irreps[rep_name].rep[all_labels].reshape(group.order*group.order, -1)
    group.irreps[rep_name].hidden_reps_xy_orth = torch.linalg.qr(group.irreps[rep_name].hidden_reps_xy)[0]

for rep_name in key_reps:
    rep_x = group.irreps[rep_name].hidden_reps_x_orth
    rep_y = group.irreps[rep_name].hidden_reps_y_orth
    rep_xy = group.irreps[rep_name].hidden_reps_xy_orth

    trivial = group.irreps['trivial'].hidden_reps_x_orth

    coefs_x = rep_x.T @ hidden
    coefs_y = rep_y.T @ hidden
    coefs_xy = rep_xy.T @ hidden

    coefs_trivial = trivial.T @ hidden

    coefs_x_summed = coefs_x.pow(2).sum(dim=0)
    coefs_y_summed = coefs_y.pow(2).sum(dim=0)
    coefs_xy_summed = coefs_xy.pow(2).sum(dim=0)
    coefs_trivial_summed = coefs_trivial.pow(2).sum(dim=0)


    neurons = rep_neurons[rep_name]

    frac_x = (coefs_x_summed[neurons]).sum() / (hidden[:, neurons].pow(2).sum())
    frac_y = (coefs_y_summed[neurons]).sum() / (hidden[:, neurons].pow(2).sum())
    frac_xy = (coefs_xy_summed[neurons]).sum() / (hidden[:, neurons].pow(2).sum())
    frac_trivial = (coefs_trivial_summed[neurons]).sum() / (hidden[:, neurons].pow(2).sum())

    fracs_explained_x[rep_name] = frac_x
    fracs_explained_y[rep_name] = frac_y
    fracs_explained_xy[rep_name] = frac_xy
    fracs_explained_trivial[rep_name] = frac_trivial

print('Neurons corresponding to each representation')
for key in key_reps:
    print(f'frac variance explained in {key} x, y, xy: {fracs_explained_x[key], fracs_explained_y[key], fracs_explained_xy[key], fracs_explained_trivial[key]}')
    print(f'Sum of explained variance: {fracs_explained_x[key] + fracs_explained_y[key] + fracs_explained_xy[key] + fracs_explained_trivial[key]}')

Off neurons: tensor([ 48, 110], device='cuda:0')
Neurons corresponding to each representation
frac variance explained in standard x, y, xy: (tensor(0.2718, device='cuda:0'), tensor(0.2546, device='cuda:0'), tensor(0.0776, device='cuda:0'), tensor(0.3134, device='cuda:0'))
Sum of explained variance: 0.917451024055481
frac variance explained in sign x, y, xy: (tensor(0.2500, device='cuda:0'), tensor(0.2500, device='cuda:0'), tensor(0.2500, device='cuda:0'), tensor(0.2500, device='cuda:0'))
Sum of explained variance: 0.9999998211860657


In [14]:
# evidence: only \rho(ab) is important
hidden = activations['hidden'].reshape(group.order**2, -1)
loss = loss_fn(logits.reshape(group.order**2, -1), all_labels)
print(f'baseline loss: {loss}')

for rep_name in group.non_trivial_irreps:

    hidden_rep_x = group.irreps[rep_name].hidden_reps_x_orth
    hidden_rep_y = group.irreps[rep_name].hidden_reps_y_orth
    hidden_rep_xy = group.irreps[rep_name].hidden_reps_xy_orth

    coefs_x = hidden_rep_x.T @ hidden
    coefs_y = hidden_rep_y.T @ hidden
    coefs_xy = hidden_rep_xy.T @ hidden
    coefs_trivial = trivial.T @ hidden

    hidden_x = hidden_rep_x @ coefs_x
    hidden_y = hidden_rep_y @ coefs_y
    hidden_xy = hidden_rep_xy @ coefs_xy
    hidden_trivial = trivial @ coefs_trivial

    hidden_ablated_x = hidden - hidden_x
    hidden_ablated_y = hidden - hidden_y
    hidden_ablated_xy = hidden - hidden_xy
    hidden_ablated_trivial = hidden - hidden_trivial

    logits_x = hidden_ablated_x @ model.W_U
    logits_y = hidden_ablated_y @ model.W_U
    logits_xy = hidden_ablated_xy @ model.W_U
    logits_trivial = hidden_ablated_trivial @ model.W_U

    loss_x = loss_fn(logits_x, all_labels)
    loss_y = loss_fn(logits_y, all_labels)
    loss_xy = loss_fn(logits_xy, all_labels)
    loss_trivial = loss_fn(logits_trivial, all_labels)

    print(f'Ablating directions corresponding to {rep_name} rep loss, xy: {loss_xy}, x: {loss_x}, y: {loss_y}, trivial: {loss_trivial}')
    

baseline loss: 2.3838242884590774e-06
Ablating directions corresponding to sign rep loss, xy: 0.0009095180870913975, x: 2.372950527269995e-06, y: 2.3709135243412416e-06, trivial: 2.419312382417704e-06
Ablating directions corresponding to standard rep loss, xy: 7.5507839045127305, x: 2.209098210702922e-06, y: 2.2082870514831203e-06, trivial: 2.419312382417704e-06
Ablating directions corresponding to standard_sign rep loss, xy: 2.383824176064554e-06, x: 2.3838226086524663e-06, y: 2.3838241945102326e-06, trivial: 2.419312382417704e-06
Ablating directions corresponding to s5_5d_a rep loss, xy: 2.383824804402695e-06, x: 2.383821273166266e-06, y: 2.3838145275565636e-06, trivial: 2.419312382417704e-06
Ablating directions corresponding to s5_5d_b rep loss, xy: 2.5641532875129013e-06, x: 2.382457362479762e-06, y: 2.3824629432198597e-06, trivial: 2.419312382417704e-06
Ablating directions corresponding to s5_6d rep loss, xy: 2.3838250839977197e-06, x: 2.383818820888995e-06, y: 2.3838137551645755e

In [15]:
# evidence: explicit extraction of \rho(ab). 

def projection_matrix_general(B):
    """Compute the projection matrix onto the space spanned by the columns of `B`
    Args:
        B: ndarray of dimension (D, M), the basis for the subspace
    
    Returns:
        P: the projection matrix
    """
    P = B @ (B.T @ B).inverse() @ B.T
    return P

hidden = activations['hidden'].reshape(group.order*group.order, -1)
hidden_to_reps_proj = {}
coefs = {}

for rep_name in key_reps:
    hidden_reps_xy = group.irreps[rep_name].hidden_reps_xy



    P = projection_matrix_general(hidden_reps_xy)
    hidden_xy = P @ hidden

    hidden_to_reps_proj[rep_name] = hidden_reps_xy.T @ hidden_xy

    imshow(hidden_to_reps_proj[rep_name], title=f'Change of basis from neuron basis to rho(ab) {rep_name} representation basis', input2='neuron basis', input1='representation basis', save=f'{save_dir}/hidden_to_{rep_name}_rep_change_of_basis.png')

    hidden_in_rep = hidden_xy @ hidden_to_reps_proj[rep_name].T

    theoretical_reps = hidden_reps_xy.reshape(group.order*group.order, -1)
    imshow(hidden_in_rep[:10], title=f'Projected hidden layer in the {rep_name} representation basis', input2='representation basis', input1='input index')
    imshow(theoretical_reps[:10], title=f'rho(ab) in {rep_name}', input2='representation basis', input1='input index')

    sim = F.cosine_similarity(hidden_in_rep.flatten(), theoretical_reps.flatten(), dim=0)
    print(f'Cosine similarity between hidden layer and theoretical representations: {sim}')

    # get the coef
    coef = (hidden_in_rep.norm() / theoretical_reps.norm())
    coefs[rep_name] = coef

Saving to paper/figures/hidden_to_standard_rep_change_of_basis.png


Cosine similarity between hidden layer and theoretical representations: 0.999484121799469


Saving to paper/figures/hidden_to_sign_rep_change_of_basis.png


Cosine similarity between hidden layer and theoretical representations: 1.0000001192092896


# Logit Computation

In [16]:
# evidence: 
rep_name = 'standard'
W_U = model.W_U
rep = group.irreps[rep_name].rep.reshape(group.order, -1)
W_U_rep = hidden_to_reps_proj[rep_name] @ W_U @ rep [group.inverses]
print(W_U_rep.shape)
imshow(W_U_rep, title='Unembedding matrix in both input and output representation space', input2='input representation basis', input1='output representation basis', save=f'{save_dir}/unembedding_matrix_in_rep_basis.png')

real_linear_map = W_U_rep > 1e5
sim = F.cosine_similarity(W_U_rep.flatten(), real_linear_map.flatten(), dim=0)
print(f'Cosine similarity between unembedding matrix and real linear map: {sim}')

torch.Size([16, 16])


Saving to paper/figures/unembedding_matrix_in_rep_basis.png
Cosine similarity between unembedding matrix and real linear map: 0.9985767602920532


In [17]:
# do this orthogonally

def projection_matrix_general(B):
    """Compute the projection matrix onto the space spanned by the columns of `B`
    Args:
        B: ndarray of dimension (D, M), the basis for the subspace
    
    Returns:
        P: the projection matrix
    """
    P = B @ (B.T @ B).inverse() @ B.T
    return P

hidden = activations['hidden'].reshape(group.order*group.order, -1)
hidden_to_reps_proj_orth = {}
coefs = {}

for rep_name in key_reps:
    hidden_reps_xy = group.irreps[rep_name].hidden_reps_xy_orth


    P = projection_matrix_general(hidden_reps_xy)
    hidden_xy = P @ hidden

    hidden_to_reps_proj_orth[rep_name] = hidden_reps_xy.T @ hidden_xy

    imshow(hidden_to_reps_proj_orth[rep_name], title=f'Change of basis from neuron basis to rho(ab) {rep_name} representation basis', input2='neuron basis', input1='representation basis', save=f'{save_dir}/hidden_to_{rep_name}_orth_rep_change_of_basis.png')

    hidden_in_rep = hidden_xy @ hidden_to_reps_proj_orth[rep_name].T

    theoretical_reps = hidden_reps_xy.reshape(group.order*group.order, -1)
    imshow(hidden_in_rep[:10], title=f'Projected hidden layer in the {rep_name} representation basis', input2='representation basis', input1='input index')
    imshow(theoretical_reps[:10], title=f'rho(ab) in {rep_name}', input2='representation basis', input1='input index')

    sim = F.cosine_similarity(hidden_in_rep.flatten(), theoretical_reps.flatten(), dim=0)
    print(f'Cosine similarity between hidden layer and theoretical representations: {sim}')

    # get the coef
    coef = (hidden_in_rep.norm() / theoretical_reps.norm())
    coefs[rep_name] = coef

Saving to paper/figures/hidden_to_standard_orth_rep_change_of_basis.png


Cosine similarity between hidden layer and theoretical representations: 0.9995379447937012


Saving to paper/figures/hidden_to_sign_orth_rep_change_of_basis.png


Cosine similarity between hidden layer and theoretical representations: 1.0000001192092896


In [18]:
# percentage of W_U explained
W_U = model.W_U
for key, value in hidden_to_reps_proj_orth.items():
    rep_orth = group.irreps[key].orth_rep.reshape(group.order, -1)
    W_U_rep_neuron_basis = W_U[rep_neurons[key]]
    W_U_rep = W_U_rep_neuron_basis @ rep_orth[group.inverses]
    print(W_U_rep_neuron_basis.shape)
    print(f'Percentage of W_u explained in {key} representation: {(torch.norm(W_U_rep.flatten()) / torch.norm(W_U_rep_neuron_basis)).pow(2)}')


torch.Size([119, 120])
Percentage of W_u explained in standard representation: 0.9340784549713135
torch.Size([7, 120])
Percentage of W_u explained in sign representation: 0.9993630647659302


# Ablations

In [19]:
# MLP neurons
hidden = activations['hidden'].reshape(group.order**2, -1)

hidden_constructed = torch.zeros_like(hidden)
for key, value in hidden_to_reps_proj_orth.items():
    CoB_orth = value
    rep_orth = group.irreps[key].orth_rep
    hidden_reps_orth = group.irreps[key].hidden_reps_xy_orth
    hidden_rep = hidden_reps_orth @ CoB_orth
    hidden_rep = hidden_rep.reshape(group.order**2, -1)
    hidden_constructed += coefs[key] * hidden_rep
    

logits_constructed = hidden_constructed @ model.W_U
loss_constructed = loss_fn(logits_constructed, all_labels)
loss_base = loss_fn(hidden @ model.W_U, all_labels)
# percentage chagne
print(loss_base)
print(loss_constructed)


tensor(2.3838e-06, device='cuda:0', dtype=torch.float64,
       grad_fn=<NegBackward0>)
tensor(-0., device='cuda:0', dtype=torch.float64, grad_fn=<NegBackward0>)


In [20]:
# W_U

W_U = model.W_U
hidden = activations['hidden'].reshape(group.order**2, -1)
# restrict W_U to only output representation space
W_U_cont = torch.zeros_like(W_U)
for rep in key_reps:
    rep_orth = group.irreps[rep].orth_rep
    W_U_rep = W_U @ rep_orth[group.inverses]
    W_U_cont += W_U_rep @ rep_orth[group.inverses].T

W_U_null = W_U - W_U_cont
base_logits = model(all_data)
base_loss = loss_fn(base_logits, all_labels)
new_logits = hidden @ W_U_cont
null_logits = hidden @ W_U_null
new_loss = loss_fn(new_logits, all_labels)
null_loss = loss_fn(null_logits, all_labels)
print(base_loss)
print(new_loss)
print(null_loss)
#percent change
print((base_loss - new_loss)/base_loss)

tensor(2.3838e-06, device='cuda:0', dtype=torch.float64,
       grad_fn=<NegBackward0>)
tensor(2.0779e-06, device='cuda:0', dtype=torch.float64,
       grad_fn=<NegBackward0>)
tensor(4.7955, device='cuda:0', dtype=torch.float64, grad_fn=<NegBackward0>)
tensor(0.1283, device='cuda:0', dtype=torch.float64, grad_fn=<DivBackward0>)


In [21]:
# logits
template = 'logit_excluded_loss_{}_rep'
for rep in key_reps:
    print(f'Excluding {rep}: {summary_metrics[template.format(rep)]}')

print(f'Excluding all: {summary_metrics["total_logit_excluded_loss"]}')

Excluding standard: 7.279840630335901
Excluding sign: 0.0006027620128980678
Excluding all: 7.601493474399844


In [22]:
# logits: ablating other directions improves performance...

# Full Circuit Analysis: Sign rep


In [23]:
# figure: blocky neurons

signature_neurons = rep_neurons['sign']
print(signature_neurons)
sig_labels = [str(x) for x in signature_neurons.tolist()]

hidden = activations['hidden'].reshape(group.order, group.order, -1)
fig = px.imshow(to_numpy(hidden[:, :, signature_neurons]), color_continuous_scale='RdBu', color_continuous_midpoint=0.0, title=f'hidden activations', facet_col=2, labels={'x':'b', 'y':'a', 'facet_col': 'neuron'})
for i, neuron in enumerate(sig_labels):
    fig.layout.annotations[i]['text'] = f'neuron = {neuron}' 
fig.show()
fig.write_image(f'{save_dir}/blocky_sign_neurons.png')


tensor([  2,   8,  17,  65, 111, 113, 120], device='cuda:0')


In [24]:
# identify form of neurons
sigs = group.signatures.unsqueeze(-1)
xs = model.x_embed[:, signature_neurons]
stack = torch.hstack([sigs, xs]).T
imshow(stack, y=['sig'] + sig_labels, input2='input group element', title='Total x embeddings on select neurons')
ys = y_embed[:, signature_neurons]
stack = torch.hstack([sigs, ys]).T
imshow(stack, y=['sig'] + sig_labels, input2='input group element', title='Total y embeddings on select neurons')

In [25]:
# evidence: form of W_U on sign neurons

sigs = group.signatures.unsqueeze(-1)
W_U_signatures = model.W_U[signature_neurons, :].T
stack = torch.hstack([sigs, W_U_signatures]).T
imshow(stack, y=['sig'] + sig_labels, input2='output group element', title='W_U on select neurons')

# Progress Measures

In [26]:
# figure: total excluded loss
keys = ['total_embed_excluded_loss', 'total_embed_restricted_loss', 'test_loss', 'train_loss']
lines_from_keys(metrics, keys, title='Excluded Loss', labels=['Excluded Loss', 'Restricted Loss', 'Test Loss', 'Train Loss'], yaxis='Loss', save=f'{save_dir}/total_embed_excluded_and_restricted_loss.png', log_x=True, log_y = True, legend_pos='bl')


Saving to paper/figures/total_embed_excluded_and_restricted_loss.png


In [27]:
# figure: total excluded loss
keys = ['total_logit_excluded_loss', 'total_logit_restricted_loss', 'test_loss', 'train_loss']
lines_from_keys(metrics, keys, title='Excluded Loss', labels=['Excluded Loss', 'Restricted Loss', 'Test Loss', 'Train Loss'], yaxis='Loss', save=f'{save_dir}/total_logit_excluded_and_restricted_loss.png', log_x=True, log_y = True, legend_pos='bl')


Saving to paper/figures/total_logit_excluded_and_restricted_loss.png


In [28]:
# figure: excluded loss by rep
template = 'logit_excluded_loss_{}_rep'
lines_from_template(metrics, template, reps_to_plot,title='Excluded Loss by Representation', yaxis='Loss', save=f'{save_dir}/logit_excluded_loss_by_rep.png', log_y = True, log_x = True, legend_pos='bl')

Saving to paper/figures/logit_excluded_loss_by_rep.png


In [29]:
template = 'logit_restricted_loss_{}_rep'
lines_from_template(metrics, template, reps_to_plot,title='Restricted Loss by Representation', yaxis='Loss', save=f'{save_dir}/logit_restricted_loss_by_rep.png', log_y = True, log_x = True, legend_pos='bl')

Saving to paper/figures/logit_restricted_loss_by_rep.png


In [30]:
# figure: sum of square weights
keys = ['sum_of_squared_weights']
lines_from_keys(metrics, keys, title='Sum of Square Weights', labels=['Sum of Square Weights'], yaxis='Sum of Square Weights', save=f'{save_dir}/sum_of_square_weights.png', log_x=True, log_y=True)

Saving to paper/figures/sum_of_square_weights.png


In [54]:
labels = [str(group.idx_to_perm(x)) for x in range(group.order)]

In [61]:
# t-SNE projection of unembed 

from sklearn.manifold import TSNE
U = model.W_U.T.cpu().detach().numpy()
U_tsne = TSNE(n_components=2, learning_rate='auto',
            init='random', perplexity=3).fit_transform(U)
U_tsne.shape

(120, 2)

In [62]:
fig = px.scatter(x=U_tsne[:, 0], y=U_tsne[:, 1], title='t-SNE projection of W_U')
fig.show()

In [64]:
X=model.x_embed.cpu().detach().numpy()
X_tsne = TSNE(n_components=2, learning_rate='auto',
            init='random', perplexity=3).fit_transform(X)
print(X_tsne.shape)
fig = px.scatter(x=X_tsne[:, 0], y=X_tsne[:, 1], text=labels, title='t-SNE projection of W_x')
fig.show()
            

(120, 2)


In [46]:
Y = model.y_embed.cpu().detach().numpy()
Y_tsne = TSNE(n_components=2, learning_rate='auto',
            init='random', perplexity=3).fit_transform(Y)
fig = px.scatter(x=Y_tsne[:, 0], y=Y_tsne[:, 1], labels=np.arange(group.order), title='t-SNE projection of W_y')
fig.show()

In [71]:
from sklearn.cluster import KMeans

k_means = KMeans(n_clusters=20, random_state=0).fit(Y_tsne)
print(k_means)

#plot the clusters
fig = px.scatter(x=Y_tsne[:, 0], y=Y_tsne[:, 1], color=k_means.labels_, title='t-SNE projection of W_y')
fig.show()

#print out the clusters
for i in range(10):
    print(f'cluster {i}:')
    for j in range(len(k_means.labels_)):
        if k_means.labels_[j] == i:
            print(labels[j])

KMeans(n_clusters=20, random_state=0)






cluster 0:
(0 3 1 4 2)
(0 3)(1 4 2)
(4)(0 2 1 3)
(4)(0 3)(1 2)
(4)(0 2)(1 3)
(4)(0 3 1 2)
cluster 1:
(2 4 3)
(0 1 3 2 4)
(1 3 4)
(0 1)(2 4 3)
(3 4)
(0 1 3 4)
(1 3 2 4)
(0 1)(3 4)
cluster 2:
(4)
(0 4)
(4)(0 1)
(0 1 4)
(1 4)
(0 4 1)
cluster 3:
(0 3)(1 4)
(4)(0 3 2)
(4)(0 1 3)
(0 1 4 3 2)
(0 3 2)(1 4)
(4)(0 3)
(4)(0 1 3 2)
cluster 4:
(4)(0 2 1)
(4)(1 2)
(0 4)(1 2)
(1 4 2)
(0 4 2 1)
(0 2 1 4)
cluster 5:
(0 1 2 4)
(0 2 4)
(2 4)
(0 2 4 1)
(0 1)(2 4)
(1 2 4)
cluster 6:
(4)(0 1 2)
(0 2)(1 4)
(4)(0 2)
(0 1 4 2)
(0 4 2)
(0 4 1 2)
cluster 7:
(0 3)(1 2 4)
(0 2)(1 3 4)
(0 3 4 1 2)
(0 3 4 2)
(0 2)(3 4)
(0 3)(2 4)
cluster 8:
(0 2 4 3 1)
(1 3)(2 4)
(0 2 4)(1 3)
(0 3 1)(2 4)
(0 3 1 2 4)
(1 2 4 3)
cluster 9:
(1 4 3 2)
(0 4 3 2 1)
(4)(0 3 2 1)
(0 3 2 1 4)
(4)(1 3 2)
(0 4)(1 3 2)
