In [3]:
%%capture
!pip install torch tqdm matplotlib scikit-learn

In [5]:
import os
import torch
from matplotlib import pyplot as plt
from sklearn.manifold import TSNE

In [6]:
def save_activation_projection_tsne(
    model1_activations1,
    model1_activations2,
    model2_activations1,
    model2_activations2,
    fname,
    title,
    model1_label1="Model 1 Positive Examples",
    model1_label2="Model 1 Negative Examples",
    model2_label1="Model 2 Positive Examples",
    model2_label2="Model 2 Negative Examples",
    random_state=0,
):
    """
    Project four groups of activations to 2-D with a single joint t-SNE fit
    and save a scatter plot of the result to `fname`.

    model1_activations1: n_samples x vector dim tensor
    model1_activations2: n_samples x vector dim tensor
    model2_activations1: n_samples x vector dim tensor
    model2_activations2: n_samples x vector dim tensor
    fname: path handed to `plt.savefig`
    title: figure title
    model*_label*: legend text for the corresponding group
    random_state: seed forwarded to t-SNE so the figure is reproducible;
        pass None to get a different embedding on every call.

    All four groups are concatenated and embedded together, then split back
    apart for plotting. activations1 groups are drawn in blue and
    activations2 groups in red; model 1 uses circles and model 2 squares.

    Raises ValueError when fewer than two samples are supplied in total.
    """
    # (activations, colour, marker, legend label) for each group, in draw order.
    groups = [
        (model1_activations1, "blue", "o", model1_label1),
        (model1_activations2, "red", "o", model1_label2),
        (model2_activations1, "blue", "s", model2_label1),
        (model2_activations2, "red", "s", model2_label2),
    ]

    plt.clf()

    # detach() so tensors that still track gradients can be converted to NumPy.
    activations = torch.cat([group[0].detach().cpu() for group in groups], dim=0)
    # NumPy has no bfloat16, and t-SNE works in float32/float64 anyway.
    if activations.dtype not in (torch.float32, torch.float64):
        activations = activations.float()
    activations_np = activations.numpy()

    n_samples = activations_np.shape[0]
    if n_samples < 2:
        raise ValueError(
            f"t-SNE needs at least 2 samples in total, got {n_samples}"
        )

    # t-SNE transformation. scikit-learn requires perplexity < n_samples, so
    # the default of 30 is lowered for small inputs instead of failing.
    tsne = TSNE(
        n_components=2,
        perplexity=float(min(30.0, n_samples - 1)),
        random_state=random_state,
    )
    projected_activations = tsne.fit_transform(activations_np)

    # Walk the projected rows in concatenation order, handing each group its
    # own slice and building the matching legend entry alongside it.
    legend_handles = []
    start = 0
    for group_activations, color, marker, label in groups:
        stop = start + group_activations.shape[0]
        points = projected_activations[start:stop]
        start = stop

        if len(points) > 0:
            # One scatter call per group rather than one per point.
            plt.scatter(
                points[:, 0], points[:, 1], color=color, marker=marker, alpha=0.4
            )

        # Proxy handle: opaque, fixed-size marker so the legend stays readable
        # even though the plotted points are translucent.
        legend_handles.append(
            plt.Line2D(
                [0],
                [0],
                marker=marker,
                color="w",
                markerfacecolor=color,
                markersize=10,
                label=label,
            )
        )

    plt.legend(handles=legend_handles)
    plt.title(title)
    plt.xlabel("t-SNE 1")
    plt.ylabel("t-SNE 2")
    plt.savefig(fname)

In [None]:
def plot_all_activations(
    layers,
    first_dirpath="refusal_data_2splits_0_exp_data",
    second_dirpath="refusal_data_2splits_1_exp_data",
    output_dir="clustering",
):
    """
    Save one t-SNE activation plot per layer, comparing two data splits.

    layers: iterable of layer identifiers; each is interpolated into the
        `positive_layer_{layer}.pt` / `negative_layer_{layer}.pt` file names.
    first_dirpath: directory holding the saved activations of split 1.
    second_dirpath: directory holding the saved activations of split 2.
    output_dir: directory the figures are written to; created if missing.

    For every layer the positive and negative tensors of both splits are
    loaded and passed to `save_activation_projection_tsne`, which writes
    `activations_layer_{layer}.png` into `output_dir`.
    """
    # exist_ok avoids the check-then-create race and also builds parent dirs.
    os.makedirs(output_dir, exist_ok=True)

    def load_activations(dirpath, polarity, layer):
        # Load onto CPU so files saved from a GPU run still open on a
        # CPU-only machine; the plotting step works on CPU data anyway.
        path = os.path.join(dirpath, f"{polarity}_layer_{layer}.pt")
        return torch.load(path, map_location="cpu")

    for layer in layers:
        first_pos = load_activations(first_dirpath, "positive", layer)
        first_neg = load_activations(first_dirpath, "negative", layer)
        second_pos = load_activations(second_dirpath, "positive", layer)
        second_neg = load_activations(second_dirpath, "negative", layer)
        save_activation_projection_tsne(
            first_pos,
            first_neg,
            second_pos,
            second_neg,
            os.path.join(output_dir, f"activations_layer_{layer}.png"),
            f"t-SNE projected activations layer {layer}",
            model1_label1="Split 1 Positive Examples",
            model1_label2="Split 1 Negative Examples",
            model2_label1="Split 2 Positive Examples",
            model2_label2="Split 2 Negative Examples",
        )