## Part I: 验证probe翻译的棋盘与groundtruth一样

In [1]:
import os
%load_ext autoreload
%autoreload 2
# make deterministic
from mingpt.utils import set_seed
set_seed(44)
import math
import time
import numpy as np
from copy import deepcopy
import pickle
import seaborn as sns
import torch
import torch.nn as nn
from torch.utils.data.dataloader import DataLoader
from torch.nn import functional as F
from torch.utils.data import Subset
from tqdm import tqdm
from matplotlib import pyplot as plt

from data import get_othello, plot_probs, plot_mentals
from data.othello import permit, start_hands, OthelloBoardState, permit_reverse
from mingpt.dataset import CharDataset
from mingpt.model import GPT, GPTConfig, GPTforProbeIA
from mingpt.utils import sample, intervene, print_board
from mingpt.probe_model import BatteryProbeClassification, BatteryProbeClassificationTwoLayer
championship = False
mid_dim = 128
how_many_history_step_to_use = 99
exp = f"state_tl{mid_dim}"
if championship:
    exp += "_championship"

### load benchmark dataset and its samples
(verification的相同samples)

In [2]:
with open("intervention_benchmark.pkl", "rb") as input_file:
    dataset = pickle.load(input_file)

case_ids = [0, 16, 99, 123, 233, 500, 666, 777, 888, 1000]
completions = [dataset[case_id]["history"] for case_id in case_ids]

In [3]:
with open("intervention_benchmark.pkl", "rb") as input_file:
    dataset = pickle.load(input_file)

completions = [data["history"] for data in dataset]

### load non-linear probes

In [4]:
probes = {}
layer_s = 1
layer_e = 9
for layer in range(layer_s, layer_e):
    p = BatteryProbeClassificationTwoLayer(torch.cuda.current_device(), probe_class=3, num_task=64, mid_dim=mid_dim)
    load_res = p.load_state_dict(torch.load(f"./ckpts/battery_othello/{exp}/layer{layer}/checkpoint.ckpt")) # state_tl128/layer[4:9]/checkpoint
    p.eval() # Set the module in evaluation mode.
    probes[layer] = p # probes用来装(layer数量, probe) pairs

RuntimeError: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx

### load trained models

In [None]:
# othello = get_othello(ood_perc=.2, data_root="data/othello_pgn", wthor=False)
othello = get_othello(ood_perc=0., data_root=None, wthor=False, ood_num=1)
train_dataset = CharDataset(othello)

mconf = GPTConfig(61, 59, n_layer=8, n_head=8, n_embd=512)

models = {}
for layer in range(layer_s, layer_e):
    model = GPTforProbeIA(mconf, probe_layer=layer)
    # model = GPT(mconf)
    load_res = model.load_state_dict(torch.load("./ckpts/gpt_synthetic.ckpt" if not championship else "./ckpts/gpt_championship.ckpt"))
    if torch.cuda.is_available():
        device = torch.cuda.current_device()
        model = model.to(device)
    _ = model.eval()
    models[layer] = model

### compare the board labels between the probe and the model for each layer

In [None]:
err_rate = 0
for j, completion in enumerate(completions):
    probe_trans = []
    partial_game = torch.tensor([train_dataset.stoi[s] for s in completion], dtype=torch.long).to(device) 
    '''get ground truth'''
    ab = OthelloBoardState()
    ab.update(completion, prt=False)
    pre_intv_truth = ab.get_state()
    '''1st layer'''
    p = probes[layer_s]
    whole_mid_act = models[layer_s].forward_1st_stage(partial_game[None, :])
    mid_act = whole_mid_act[0, -1] #[512, ], first batch, last token
    pre_intv_logits = p(mid_act[None, :])[0].squeeze(0)     
    labels_pre_intv = pre_intv_logits.detach().argmax(dim=-1) 
    probe_trans.append(labels_pre_intv) # add the probe's translation to the list
    '''后续layers'''
    for i, layer in enumerate(range(layer_s, layer_e - 1)):  # 4, 5, 6, 7, indices of the layers to be passed
        p = probes[layer+1]
        whole_mid_act = models[layer_s].forward_2nd_stage(whole_mid_act, layer, layer+1)[0]
        mid_act = whole_mid_act[0, -1] #? first time step? [512, ]
        pre_intv_logits = p(mid_act[None, :])[0].squeeze(0)
        labels_pre_intv = pre_intv_logits.detach().argmax(dim=-1)
        probe_trans.append(labels_pre_intv)
    assert len(probe_trans) == 8
    for i, tran in enumerate(probe_trans):
        match = torch.tensor(pre_intv_truth).to('cuda:0') == tran.to(torch.get_default_dtype())
        err = torch.count_nonzero(~match).item()
        if err:
            # print("_____________________________________________________________")
            # print("sample_id: ", j, ", layer: ", i+1)
            # print("mismatch numbers: ", err)
            # print("truth: \n", torch.tensor(pre_intv_truth).to('cuda:0'))
            # print("translation: \n", tran.to(torch.get_default_dtype()))
            # print("match resut: \n", match)
            err_rate += err

print("overall error rate: ", (err_rate / (64 * 8 * len(dataset))) * 100, "%")


## Part II: Verify $D(x)=D(x')$

### Check the correctness of OthelloGPT's decoder

In [5]:
'''load the decoder'''
championship = False

mconf = GPTConfig(61, 59, n_layer=8, n_head=8, n_embd=512) #vocab_size = 61

# models = {}
# for layer in range(layer_s, layer_e):
model = GPTforProbeIA(mconf, probe_layer=-1) #probe_layer is the last layer
# model = GPT(mconf)

'''load the retrained model (no layer norm in the decoder)'''
retrained_path = "./ckpts/OthelloGPT-no_lrnorm/non_layernorm_gpt__at_20240810_100610.ckpt"
load_res = model.load_state_dict(torch.load(retrained_path))
if torch.cuda.is_available():
    device = torch.cuda.current_device()
    model = model.to(device)
_ = model.eval()
    # models[layer] = model

'''
decoder:
  (ln_f): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  (head): Linear(in_features=512, out_features=61, bias=False)
'''
# print(model)
decoder = nn.Sequential(model.ln_f, model.head)
device = torch.cuda.current_device()
decoder = decoder.to(device)
_ = decoder.eval()


In [6]:
import json
'''load the probe dataset for the last layer'''
path = "./probe_epsilon_dataset.json"
with open(path, 'r') as file:
    json_data = file.read()
data = json.loads(json_data)

In [7]:
samples = data['layer_8']

x = samples[777]['x']
x = torch.tensor(x).to(device)
print('x shape: ', torch.tensor(x).shape)
y = samples[777]['y']
print('y shape: ', torch.tensor(y).shape)
pre_intv_pred = decoder(x)
print("pre_intv_pred shape: ", pre_intv_pred.shape)  #[1, 61]
pre_intv_pred = pre_intv_pred.view(-1, pre_intv_pred.size(-1))
print("pre_intv_pred shape: ", pre_intv_pred.shape)  #
pre_intv_pred = torch.softmax(pre_intv_pred, dim=0)
padding = torch.zeros(2).cuda()
pre_intv_pred = torch.cat([pre_intv_pred[:27], padding, pre_intv_pred[27:33], padding, pre_intv_pred[33:]], dim=0) 
print("pre_intv_pred shape: ", pre_intv_pred.shape)

  print('x shape: ', torch.tensor(x).shape)


x shape:  torch.Size([1, 512])
y shape:  torch.Size([64])
pre_intv_pred shape:  torch.Size([1, 61])
pre_intv_pred shape:  torch.Size([1, 61])


RuntimeError: Tensors must have same number of dimensions: got 2 and 1

In [13]:
'''Visualize the decoder's prediction'''
fig=plt.figure(figsize=(10, 6), dpi= 80, facecolor='w', edgecolor='k')
vv = 0.2
sns.heatmap(pre_intv_pred.detach().cpu().numpy().reshape(8, 8), vmin=0., vmax=vv, 
            yticklabels=list("ABCDEFGH"), xticklabels=list(range(1,9)), square=True, 
            annot=True, fmt=".2f")

ValueError: cannot reshape array of size 65 into shape (8,8)

<Figure size 800x480 with 0 Axes>