<a href="https://colab.research.google.com/github/YanaySoker/Specificity_of_ROME/blob/main/main_experimant.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<a href="https://colab.research.google.com/github/kmeng01/rome/blob/main/notebooks/causal_trace.ipynb"><img src="https://colab.research.google.com/assets/colab-badge.svg" align="left"/></a>&nbsp;or in a local notebook.

In [None]:
%%bash
# Colab-only setup: fetch the ROME repository and install its requirements.
# The guard below is intended to stop the cell when the google.colab package
# directory cannot be found (i.e. when not running on Colab).
!(stat -t /usr/local/lib/*/dist-packages/google/colab > /dev/null 2>&1) && exit
# Start from a clean checkout under /content.
cd /content && rm -rf /content/rome
# All command output is redirected into install.log to keep the cell quiet.
git clone https://github.com/kmeng01/rome rome > install.log 2>&1
pip install -r /content/rome/scripts/colab_reqs/rome.txt >> install.log 2>&1
pip install --upgrade google-cloud-storage >> install.log 2>&1

In [None]:
%cd rome

/content/rome


In [None]:
%%writefile ./experiments/py/demo.py
# New demo.py
import os
from pathlib import Path
from typing import Dict, List, Tuple

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from baselines.ft import FTHyperParams, apply_ft_to_model
from rome import ROMEHyperParams, apply_rome_to_model
from util import nethook
from util.generate import generate_fast
from util.globals import *

LAYER_IDX = [17]

def demo_model_editing(
    model: AutoModelForCausalLM,
    tok: AutoTokenizer,
    requests: List[Dict],
    generation_prompts: List[str],
    alg_name: str = "ROME",
) -> Tuple[AutoModelForCausalLM, Dict[str, torch.Tensor]]:
    """
    Applies the selected model editing algorithm to `model`.

    The edit is restricted to the single layer `LAYER_IDX[0]`, and the
    hyperparameters are always read from "hparams/ROME/gpt2-xl.json".

    :param model: model to edit
    :param tok: tokenizer matching `model`
    :param requests: rewrite requests forwarded to the editing method
    :param generation_prompts: accepted for interface compatibility; not used
        by this version of the function
    :param alg_name: name of the editing algorithm (see `load_alg`)

    :return: (1) the updated model, (2) the original values of the weights
        that were changed
    """
    print(requests, "\n")
    nethook.set_requires_grad(True, model)

    RewritingParamsClass, apply_method, hparams_prefix, hparams_suffix = load_alg(
        alg_name
    )
    params_name = (
        HPARAMS_DIR
        / hparams_prefix
        / f"{model.config._name_or_path.replace('/', '_')}{hparams_suffix}.json"
    )
    # The path derived above is deliberately overridden: this experiment always
    # uses the GPT-2 XL ROME hyperparameters.
    params_name = "hparams/ROME/gpt2-xl.json"
    hparams = RewritingParamsClass.from_json(params_name)
    # Edit only the layer selected by the module-level LAYER_IDX constant.
    # (The previous code referenced an undefined name, LEVEL_IDX.)
    hparams.layers = [LAYER_IDX[0]]
    model_new, orig_weights = apply_method(
        model, tok, requests, hparams, return_orig_weights=True
    )

    return model_new, orig_weights

def load_alg(alg_name):
    """
    Loads dependencies for the desired algorithm.
    Implementation is slightly awkward to prevent unnecessary imports on Colab.

    The return value is a tuple of the following:
    1. Class for storing hyperparameters
    2. Method for applying rewrites
    3. Location of parameters
    4. Predefined suffix for the param file
    """
    # Reject unknown algorithm names up front.
    assert alg_name in [
        "FT",
        "FT-L",
        "FT-AttnEdit",
        "KN",
        "MEND",
        "MEND-CF",
        "MEND-zsRE",
        "KE",
        "KE-CF",
        "ROME",
    ]

    if alg_name == "ROME":
        return ROMEHyperParams, apply_rome_to_model, "ROME", ""
    elif "FT" in alg_name:
        # All fine-tuning variants share the same class and method; only the
        # hyperparameter file suffix differs.
        d = {
            "FT": (FTHyperParams, apply_ft_to_model, "FT", "_unconstr"),
            "FT-AttnEdit": (FTHyperParams, apply_ft_to_model, "FT", "_attn"),
            "FT-L": (FTHyperParams, apply_ft_to_model, "FT", "_constr"),
        }
        return d[alg_name]
    else:
        # Imported lazily so that the ROME/FT paths do not need these packages.
        from baselines.efk import EFKHyperParams, EfkRewriteExecutor
        from baselines.kn import KNHyperParams, apply_kn_to_model
        from baselines.mend import MENDHyperParams, MendRewriteExecutor

        # Note: every executor below is instantiated when the dict is built,
        # regardless of which entry is finally returned.
        d = {
            "KN": (KNHyperParams, apply_kn_to_model, "KN", ""),
            "MEND": (MENDHyperParams, MendRewriteExecutor().apply_to_model, "MEND", ""),
            "KE": (EFKHyperParams, EfkRewriteExecutor().apply_to_model, "KE", ""),
            "MEND-CF": (
                MENDHyperParams,
                MendRewriteExecutor().apply_to_model,
                "MEND",
                "_CF",
            ),
            "MEND-zsRE": (
                MENDHyperParams,
                MendRewriteExecutor().apply_to_model,
                "MEND",
                "_zsRE",
            ),
            # NOTE(review): the parameter location here is "MEND" although the
            # algorithm is KE; "KE" may have been intended - confirm against
            # the hparams directory layout.
            "KE-CF": (
                EFKHyperParams,
                EfkRewriteExecutor().apply_to_model,
                "MEND",
                "_CF",
            ),
        }
        return d[alg_name]

def print_loud(x, pad=3):
    """
    Prints a string framed by a box of '#' characters for emphasis.

    `pad` is the number of characters between the outer edge of the box and
    the text on each side (one '#' plus `pad - 1` spaces).

    Example:
    ############################
    #                          #
    #  Applying ROME to model  #
    #                          #
    ############################
    """

    print()

    width = len(x)
    border = "#" * (width + 2 * pad)
    blank_row = "#" + " " * (width + 2 * (pad - 1)) + "#"
    margin = " " * (pad - 1)
    text_row = "#" + margin + x + margin + "#"

    for row in (border, blank_row, text_row, blank_row, border):
        print(row)

class StopExecution(Exception):
    """Exception raised by `stop_execution` to abort the current cell."""

    def _render_traceback_(self):
        # Returns nothing; presumably this is the hook the notebook front end
        # consults so that no traceback is displayed - TODO confirm.
        return None

def stop_execution():
    """Abort execution by raising `StopExecution`."""
    raise StopExecution()


Overwriting ./experiments/py/demo.py


In [None]:
%%writefile ./rome/rome_main.py
# New rome_main.py
from copy import deepcopy
from typing import Dict, List, Tuple

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from util import nethook
from util.generate import generate_fast

from .compute_u import compute_u
from .compute_v import compute_v
from .rome_hparams import ROMEHyperParams

CONTEXT_TEMPLATES_CACHE = None

def apply_rome_to_model(
    model: AutoModelForCausalLM,
    tok: AutoTokenizer,
    requests: List[Dict],
    hparams: ROMEHyperParams,
    copy=False,
    return_orig_weights=False,
) -> Tuple[AutoModelForCausalLM, Dict[str, torch.Tensor]]:
    """
    Returns a model with the desired changes.

    Requests are applied one after another; each rank-one update is added to
    the model weights in place, so later requests are computed on top of the
    earlier edits.

    :param model: model to edit (edited in place unless `copy` is true)
    :param tok: tokenizer matching `model`
    :param requests: rewrite requests, each passed to `execute_rome`
    :param hparams: ROME hyperparameters
    :param copy: If true, will preserve the original model while creating a new one to edit.
        Note that you are responsible for deallocating the new model's memory to avoid leaks.
    :param return_orig_weights: If true, the second return value holds a clone
        of every edited weight as it was before the first edit; otherwise it
        is an empty dict.

    :return: (1) the updated model, (2) a dict mapping weight names to an
        original copy of the weights that changed
    """

    if copy:
        model = deepcopy(model)

    weights_copy = {}

    for i, request in enumerate(requests):
        deltas = execute_rome(model, tok, request, hparams)

        with torch.no_grad():
            for w_name, (delta_u, delta_v) in deltas.items():
                # Rank-one update: outer product of the two returned vectors.
                upd_matrix = delta_u.unsqueeze(1) @ delta_v.unsqueeze(0)
                w = nethook.get_parameter(model, w_name)
                upd_matrix = upd_matrix_match_shape(upd_matrix, w.shape)

                if return_orig_weights and w_name not in weights_copy:
                    # A weight can only be unseen during the first request;
                    # afterwards the stored copy would already include edits.
                    assert i == 0
                    weights_copy[w_name] = w.detach().clone()

                w[...] += upd_matrix

    return model, weights_copy

def execute_rome(
    model: AutoModelForCausalLM,
    tok: AutoTokenizer,
    request: Dict,
    hparams: ROMEHyperParams,
) -> Dict[str, Tuple[torch.Tensor]]:
    """
    Executes the ROME update algorithm for the specified update at the specified layer
    Invariant: model at beginning of function == model at end of function

    :param model: model whose weights are temporarily modified while the
        update vectors are computed
    :param tok: tokenizer matching `model`
    :param request: rewrite request; `request["target_new"]["str"]` is read
        here, and the (copied) request is forwarded to `compute_u`/`compute_v`
    :param hparams: supplies `layers`, `rewrite_module_tmp` and
        `context_template_length_params`

    :return: dict mapping each rewritten weight name to a pair
        (left_vector, right_vector) whose outer product is the update
    """

    # Update target and print info
    # Work on a copy so the caller's request dict is not modified.
    request = deepcopy(request)
    if request["target_new"]["str"][0] != " ":
        # Space required for correct tokenization
        request["target_new"]["str"] = " " + request["target_new"]["str"]

    # Retrieve weights that user desires to change
    weights = {
        f"{hparams.rewrite_module_tmp.format(layer)}.weight": nethook.get_parameter(
            model, f"{hparams.rewrite_module_tmp.format(layer)}.weight"
        )
        for layer in hparams.layers
    }
    # Save old weights for future restoration
    weights_copy = {k: v.detach().clone() for k, v in weights.items()}

    # Update loop: sequentially intervene at each specified layer
    deltas = {}
    for layer in sorted(hparams.layers):
        # Compute rank-1 update matrix
        left_vector: torch.Tensor = compute_u(
            model,
            tok,
            request,
            hparams,
            layer,
            get_context_templates(model, tok, hparams.context_template_length_params),
        )
        right_vector: torch.Tensor = compute_v(
            model,
            tok,
            request,
            hparams,
            layer,
            left_vector,
            get_context_templates(model, tok, hparams.context_template_length_params),
        )

        with torch.no_grad():
            # Determine correct transposition of delta matrix
            weight_name = f"{hparams.rewrite_module_tmp.format(layer)}.weight"
            upd_matrix = left_vector.unsqueeze(1) @ right_vector.unsqueeze(0)
            upd_matrix = upd_matrix_match_shape(upd_matrix, weights[weight_name].shape)

            # Update model weights and record desired changes in `deltas` variable
            # The in-place add means later layers in this loop are computed
            # against the already-edited earlier layers.
            weights[weight_name][...] += upd_matrix
            deltas[weight_name] = (
                left_vector.detach(),
                right_vector.detach(),
            )

    # Restore state of original model
    with torch.no_grad():
        for k, v in weights.items():
            v[...] = weights_copy[k]

    return deltas

def upd_matrix_match_shape(matrix: torch.Tensor, shape: torch.Size) -> torch.Tensor:
    """
    Orient an update matrix so that it fits a weight of the given shape.

    GPT-2 and GPT-J have transposed weight representations, so the update may
    need to be transposed before it can be added to the weight.

    :return: `matrix` itself if it already has `shape`, otherwise its transpose
    :raises ValueError: if neither orientation has the requested shape
    """

    if matrix.shape == shape:
        return matrix

    transposed = matrix.T
    if transposed.shape == shape:
        return transposed

    raise ValueError(
        "Update matrix computed by ROME does not match original weight shape. "
        "Check for bugs in the code?"
    )

def get_context_templates(model, tok, length_params):
    """
    Return the context templates, generating and caching them on first use.

    The result is stored in the module-level CONTEXT_TEMPLATES_CACHE, so the
    arguments only matter on the first call. The list starts with the bare
    template "{}" followed by one "<generated text>. {}" template per
    generated sample.

    :param length_params: iterable of (max_out_len, n_gen_per_prompt) pairs
    """
    global CONTEXT_TEMPLATES_CACHE

    if CONTEXT_TEMPLATES_CACHE is not None:
        return CONTEXT_TEMPLATES_CACHE

    generated = []
    for length, n_gen in length_params:
        batch = generate_fast(
            model,
            tok,
            ["<|endoftext|>"],
            n_gen_per_prompt=n_gen,
            max_out_len=length,
        )
        generated = generated + batch

    CONTEXT_TEMPLATES_CACHE = ["{}"] + [text + ". {}" for text in generated]
    print(f"Cached context templates {CONTEXT_TEMPLATES_CACHE}")

    return CONTEXT_TEMPLATES_CACHE


Overwriting ./rome/rome_main.py


In [None]:
from experiments.py.demo import LAYER_IDX

In [None]:
# Environment flags: IS_COLAB is switched on below when google.colab can be
# imported; ALL_DEPS records whether the optional MEND/KE dependencies have
# been installed (it is set in a later cell).
IS_COLAB = False
ALL_DEPS = False
try:
    import google.colab, torch, os

    IS_COLAB = True
    # On Colab the repository is expected at /content/rome.
    os.chdir("/content/rome")
    if not torch.cuda.is_available():
        raise Exception("Change runtime type to include a GPU.")
except ModuleNotFoundError as _:
    # A missing module (e.g. google.colab) means a local run; keep defaults.
    pass

## Causal Tracing

A demonstration of the double-intervention causal tracing method.

The strategy used by causal tracing is to understand important
states within a transformer by doing two interventions simultaneously:

1. Corrupt a subset of the input.  In our paper, we corrupt the subject tokens
   to frustrate the ability of the transformer to accurately complete factual
   prompts about the subject.
2. Restore a subset of the internal hidden states.  In our paper, we scan
   hidden states at all layers and all tokens, searching for individual states
   that carry the necessary information for the transformer to recover its
   capability to complete the factual prompt.

The traces of decisive states can be shown on a heatmap.  This notebook
demonstrates the code for conducting causal traces and creating these heatmaps.

In [None]:
%load_ext autoreload
%autoreload 2

The `experiments.causal_trace` module contains a set of functions for running causal traces.

In this notebook, we reproduce, demonstrate and discuss the interesting functions.

We begin by importing several utility functions that deal with tokens and transformer models.

In [None]:
# from rome file
from transformers import AutoModelForCausalLM, AutoTokenizer
from util.generate import generate_interactive, generate_fast

from experiments.py.demo import demo_model_editing, stop_execution

In [None]:
import os, re, json
import torch, numpy
from collections import defaultdict
from util import nethook
from util.globals import DATA_DIR
from experiments.causal_trace import (
    ModelAndTokenizer,
    layername,
    guess_subject,
    plot_trace_heatmap,
)
from experiments.causal_trace import (
    make_inputs,
    decode_tokens,
    find_token_range,
    # predict_token,
    predict_from_input,
    collect_embedding_std,
)
from dsets import KnownsDataset

torch.set_grad_enabled(True)

<torch.autograd.grad_mode.set_grad_enabled at 0x7ff12b6c3190>

In [None]:
import random
_seed = 1
random.seed(_seed)
numpy.random.seed(seed=_seed)
torch.manual_seed(_seed)

<torch._C.Generator at 0x7ff1c8137370>

Now we load a model and tokenizer, and show that it can complete a couple factual statements correctly.

In [None]:
model_name = "gpt2-xl"  # or "EleutherAI/gpt-j-6B" or "EleutherAI/gpt-neox-20b"
mt = ModelAndTokenizer(
    model_name,
    low_cpu_mem_usage=IS_COLAB,
    torch_dtype=(torch.float16 if "20b" in model_name else None),
)

Downloading:   0%|          | 0.00/689 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.99G [00:00<?, ?B/s]

To obfuscate the subject during Causal Tracing, we use noise sampled from a zero-centered spherical Gaussian, whose stddev is 3 times the stddev $\sigma$ of the model's embeddings. Let's compute that value.

In [None]:
knowns = KnownsDataset(DATA_DIR)  # Dataset of known facts
noise_level = 3 * collect_embedding_std(mt, [k["subject"] for k in knowns])
print(f"Using noise level {noise_level}")

data/known_1000.json does not exist. Downloading from https://rome.baulab.info/data/dsets/known_1000.json


  0%|          | 0.00/335k [00:00<?, ?B/s]

Loaded dataset with 1209 elements
Using noise level 0.13462981581687927


## Tracing a single location

The core intervention in causal tracing is captured in this function:

`trace_with_patch` performs a single causal trace.

It enables running a batch of inferences with two interventions.

  1. Random noise can be added to corrupt the inputs of some of the batch.
  2. At any point, clean non-noised state can be copied over from an
     uncorrupted batch member to other batch members.
  
The convention used by this function is that the zeroth element of the
batch is the uncorrupted run, and the subsequent elements of the batch
are the corrupted runs.  The argument tokens_to_mix specifies a
(begin, end) range of tokens to be corrupted by adding Gaussian noise to the embedding for the batch
inputs other than the first element in the batch.  Alternately,
subsequent runs could be corrupted by simply providing different
input tokens via the passed input batch.

To ensure that corrupted behavior is representative, in practice, we
will actually run several (ten) corrupted runs in the same batch,
each with its own sample of noise.

Then when running, a specified set of hidden states will be uncorrupted
by restoring their values to the same vector that they had in the
zeroth uncorrupted run.  This set of hidden states is listed in
states_to_patch, by listing [(token_index, layername), ...] pairs.
To trace the effect of just a single state, this can be just a single
token/layer pair.  To trace the effect of restoring a set of states,
any number of token indices and layers can be listed.

Note that this function is also in experiments.causal_trace; the code
is shown here to show the logic.

In [None]:
def trace_with_patch(
    model,  # The model
    inp,  # A set of inputs
    states_to_patch,  # A list of (token index, layername) pairs to restore
    answers_t,  # Answer probabilities to collect
    tokens_to_mix,  # Range of tokens to corrupt (begin, end)
    noise=0.1,  # Level of noise to add
    trace_layers=None,  # List of traced outputs to return
):
    """
    Run one batch with corrupted embeddings and selectively restored states.

    Batch item 0 is left untouched; for items 1..N the embedding output in the
    token range `tokens_to_mix` receives additive Gaussian noise scaled by
    `noise`. For every (token index, layername) pair in `states_to_patch`, the
    layer output at that token is overwritten in items 1..N with the value
    from item 0.

    Returns the softmax probability of `answers_t` at the last position,
    averaged over items 1..N. When `trace_layers` is given, returns a tuple
    of that probability and the stacked outputs of those layers.
    """
    # NOTE(review): RandomState() is created without a seed, so the noise
    # differs between calls; pass a fixed seed here if reproducible noise is
    # required.
    prng = numpy.random.RandomState()  # Pseudorandom noise source (unseeded)
    # Group the requested token indices by layer name.
    patch_spec = defaultdict(list)
    for t, l in states_to_patch:
        patch_spec[l].append(t)
    embed_layername = layername(model, 0, "embed")

    def untuple(x):
        # Some layers return a tuple; the hidden state is its first element.
        return x[0] if isinstance(x, tuple) else x

    # Define the model-patching rule.
    def patch_rep(x, layer):
        if layer == embed_layername:
            # If requested, we corrupt a range of token embeddings on batch items x[1:]
            if tokens_to_mix is not None:
                b, e = tokens_to_mix
                x[1:, b:e] += noise * torch.from_numpy(
                    prng.randn(x.shape[0] - 1, e - b, x.shape[2])
                ).to(x.device)
            return x
        if layer not in patch_spec:
            return x
        # If this layer is in the patch_spec, restore the uncorrupted hidden state
        # for selected tokens.
        h = untuple(x)
        for t in patch_spec[layer]:
            h[1:, t] = h[0, t]
        return x

    # With the patching rules defined, run the patched model in inference.
    additional_layers = [] if trace_layers is None else trace_layers
    with torch.no_grad(), nethook.TraceDict(
        model,
        [embed_layername] + list(patch_spec.keys()) + additional_layers,
        edit_output=patch_rep,
    ) as td:
        outputs_exp = model(**inp)

    # We report softmax probabilities for the answers_t token predictions of interest.
    # Only the corrupted items (1..N) contribute to the mean.
    probs = torch.softmax(outputs_exp.logits[1:, -1, :], dim=1).mean(dim=0)[answers_t]

    # If tracing all layers, collect all activations together to return.
    if trace_layers is not None:
        all_traced = torch.stack(
            [untuple(td[layer].output).detach().cpu() for layer in trace_layers], dim=2
        )
        return probs, all_traced

    return probs

## Scanning all locations

A causal flow heatmap is created by repeating `trace_with_patch` at every individual hidden state, and measuring the impact of restoring state at each location.

The `calculate_hidden_flow` function does this loop.  It handles both the case of restoring a single hidden state, and also restoring MLP or attention states.  Because MLP and attention make small residual contributions, to observe a causal effect in those cases, we need to restore several layers of contributions at once, which is done by `trace_important_window`.

In [None]:
def calculate_hidden_flow(
    mt, prompt, subject, samples=10, noise=0.1, window=10, kind=None
):
    """
    Runs causal tracing over every token/layer combination in the network
    and returns a dictionary numerically summarizing the results.

    The prompt is repeated `samples + 1` times to form the batch. With a
    falsy `kind`, single states are restored (`trace_important_states`);
    otherwise a window of `kind` layers is restored
    (`trace_important_window`).
    """
    tokenizer, model = mt.tokenizer, mt.model

    inp = make_inputs(tokenizer, [prompt] * (samples + 1))
    with torch.no_grad():
        # Keep only the prediction and score of the first batch item.
        answer_t, base_score = [d[0] for d in predict_from_input(model, inp)]
    [answer] = decode_tokens(tokenizer, [answer_t])
    e_range = find_token_range(tokenizer, inp["input_ids"][0], subject)

    # Score with corrupted subject tokens and nothing restored.
    low_score = trace_with_patch(
        model, inp, [], answer_t, e_range, noise=noise
    ).item()

    if kind:
        differences = trace_important_window(
            model,
            mt.num_layers,
            inp,
            e_range,
            answer_t,
            noise=noise,
            window=window,
            kind=kind,
        )
    else:
        differences = trace_important_states(
            model, mt.num_layers, inp, e_range, answer_t, noise=noise
        )
    differences = differences.detach().cpu()

    return {
        "scores": differences,
        "low_score": low_score,
        "high_score": base_score,
        "input_ids": inp["input_ids"][0],
        "input_tokens": decode_tokens(tokenizer, inp["input_ids"][0]),
        "subject_range": e_range,
        "answer": answer,
        "window": window,
        "kind": kind or "",
    }


def trace_important_states(model, num_layers, inp, e_range, answer_t, noise=0.1):
    """
    Trace the effect of restoring each single (token, layer) state.

    Returns a stacked tensor with one row per token and one column per layer,
    holding the value returned by `trace_with_patch` for that state.
    """
    token_count = inp["input_ids"].shape[1]

    def scores_for_token(token_index):
        # One trace per layer, restoring only this token at that layer.
        return torch.stack(
            [
                trace_with_patch(
                    model,
                    inp,
                    [(token_index, layername(model, layer))],
                    answer_t,
                    tokens_to_mix=e_range,
                    noise=noise,
                )
                for layer in range(num_layers)
            ]
        )

    return torch.stack([scores_for_token(tnum) for tnum in range(token_count)])


def trace_important_window(
    model, num_layers, inp, e_range, answer_t, kind, window=10, noise=0.1
):
    """
    Trace the effect of restoring a window of `kind` layers around each layer.

    For every token and every centre layer, the states of that token are
    restored in all layers from `layer - window // 2` up to (but excluding)
    `layer - (-window // 2)`, clipped to the valid layer range. Returns a
    stacked tensor with one row per token and one column per centre layer.
    """
    token_count = inp["input_ids"].shape[1]

    def window_states(token_index, centre):
        first = max(0, centre - window // 2)
        stop = min(num_layers, centre - (-window // 2))
        return [
            (token_index, layername(model, L, kind)) for L in range(first, stop)
        ]

    rows = []
    for tnum in range(token_count):
        per_layer = [
            trace_with_patch(
                model,
                inp,
                window_states(tnum, layer),
                answer_t,
                tokens_to_mix=e_range,
                noise=noise,
            )
            for layer in range(num_layers)
        ]
        rows.append(torch.stack(per_layer))
    return torch.stack(rows)

## Plotting the results

The `plot_trace_heatmap` function draws the data on a heatmap.  That function is not shown here; it is in `experiments.causal_trace`.


In [None]:
## bdika
# model, tok = (
#     AutoModelForCausalLM.from_pretrained(model_name, low_cpu_mem_usage=IS_COLAB).to(
#         "cuda"
#     ),
#     AutoTokenizer.from_pretrained(model_name),
# )
tok = AutoTokenizer.from_pretrained(model_name)
tok.pad_token = tok.eos_token

In [None]:
# from rome file

ALG_NAME = "ROME"
# Colab-only: install deps for MEND* and KE*
if IS_COLAB and not ALL_DEPS and any(x in ALG_NAME for x in ["MEND", "KE"]):
    print("Installing additional dependencies required for MEND and KE")
    !pip install -r /content/rome/scripts/colab_reqs/additional.txt >> /content/install.log 2>&1
    print("Finished installing")
    ALL_DEPS = True

import copy
model = mt.model
mt2 = copy.deepcopy(mt)

In [None]:
def plot_hidden_flow(
    mt,
    prompt,
    subject=None,
    samples=10,
    noise=0.1,
    window=10,
    kind=None,
    modelname=None,
    savepdf=None,
):
    """
    Compute the hidden-flow summary for `prompt`, print it and plot it.

    When `subject` is None it is obtained from `guess_subject(prompt)`.
    """
    subject = guess_subject(prompt) if subject is None else subject
    flow = calculate_hidden_flow(
        mt, prompt, subject, samples=samples, noise=noise, window=window, kind=kind
    )
    print("result:\n", flow)
    plot_trace_heatmap(flow, savepdf, modelname=modelname)


def plot_all_flow(mt, prompt, subject=None, noise=0.1, modelname=None):
    """Plot the hidden flow for each traced kind (currently only "mlp")."""
    kinds = ["mlp"]
    for kind in kinds:
        plot_hidden_flow(
            mt,
            prompt,
            subject,
            modelname=modelname,
            noise=noise,
            kind=kind,
        )

In [None]:
mt2.model = model

# **New Code**

In [None]:
def combine_prompt(subject, relation):
  """
  Insert `subject` into the "{}" placeholder of `relation`.

  When `relation` is None, `subject` is returned unchanged (it is then taken
  to be the complete prompt already).
  """
  if relation is None:
    return subject

  prefix, suffix = relation.split("{}")
  return f"{prefix}{subject}{suffix}"

In [None]:
def is_in_prompt(prompt, sentence):
  """
  Tell whether `sentence` is an instance of the template `prompt`.

  `prompt` must contain exactly one "{}" placeholder. The sentence matches
  when it starts with the text before the placeholder, ends with the text
  after it, and is long enough for the two parts not to overlap.
  """
  pref, suff = prompt.split("{}")
  # Without this length check "aba" would match the template "ab{}ba".
  if len(sentence) < len(pref) + len(suff):
    return False
  # startswith/endswith also handle an empty prefix or suffix correctly; the
  # old slice sentence[-len(suff):] became the whole sentence when suff == "".
  return sentence.startswith(pref) and sentence.endswith(suff)


In [None]:
def get_subject(prompt, relation):
  """
  Extract the subject from `prompt`, given the template `relation`.

  The text before and after the "{}" placeholder of `relation` is cut off the
  two ends of `prompt` by length only; the prompt is not checked against the
  template.
  """
  prefix, suffix = relation.split("{}")
  return prompt[len(prefix):len(prompt) - len(suffix)]


In [None]:
def sum_matrices(A, B, alpha):
  """
  Add `alpha * B` to `A` element-wise, in place.

  `A` is modified row by row and nothing is returned.
  """
  for i, target_row in enumerate(A):
    for j in range(len(target_row)):
      target_row[j] += alpha * B[i][j]

In [None]:
## predict_token
def predict_all_from_input(model, inp):
    """
    Return the next-token probability distribution for every batch item.

    Calls `model(**inp)`, takes the "logits" entry at the last sequence
    position and applies a softmax over dimension 1.
    """
    logits = model(**inp)["logits"]
    last_position = logits[:, -1]
    return torch.softmax(last_position, dim=1)

def predict_token(mt, prompts, return_p=False, return_idx = False):
    """
    Predict the next token for each prompt and decode it to text.

    Returns the list of decoded predictions. With `return_p`, returns
    (decoded, p) where `p` is the second value from `predict_from_input`;
    otherwise, with `return_idx`, returns (decoded, preds[0]), i.e. the raw
    prediction of the first prompt only. `return_p` takes precedence.
    """
    inp = make_inputs(mt.tokenizer, prompts)
    preds, p = predict_from_input(mt.model, inp)
    decoded = [mt.tokenizer.decode(c) for c in preds]

    if return_p:
        return (decoded, p)
    if return_idx:
        return (decoded, preds[0])
    return decoded

def predict_by_idx(mt, prompt, idx):
  """
  Return the probability the model assigns to token `idx` as next token.

  model, str, int --> float

  :param mt: object exposing `.model` and `.tokenizer`
  :param prompt: prompt text
  :param idx: index of the token whose probability is wanted
  """
  inp = make_inputs(mt.tokenizer, [prompt])
  preds = predict_all_from_input(mt.model, inp)
  # .item() must be called: the bare attribute is a bound method, not a float.
  return preds[0][idx].item()

In [None]:
def naiv_predict(subject, relation = None, return_idx = False):
  """
  Predict the next token for a single prompt using the global `mt2`.

  The prompt is built with `combine_prompt(subject, relation)`. The first
  character of the decoded token is dropped (presumably the leading space of
  the token - verify against the tokenizer). With `return_idx`, the raw
  prediction returned by `predict_token` is returned as a second value.
  """
  prompt = combine_prompt(subject, relation)
  prediction = predict_token(
    mt2,
    [prompt],
    return_p=False,
    return_idx=return_idx,
  )

  if not return_idx:
    return prediction[0][1:]

  return prediction[0][0][1:], prediction[1]
  

def predict(subject, relation=None, return_idx = False):
  """
  Predict the next token, looking one token further after an article.

  If the predicted token is "the" or "a", the prompt is extended with that
  token and a second prediction is made; the result is then formatted as
  "[the] <next token>" (or just "[the]" if the second prediction fails).

  :param subject: subject text, or the complete prompt when `relation` is None
  :param relation: template containing one "{}" placeholder, or None
  :param return_idx: if true, return (text, idx) where idx is the index of
      the first predicted token as a plain Python number
  """
  if return_idx:
    next_tok, idx = naiv_predict(subject, relation, return_idx)
    # Call .item(); the bare attribute would be a bound method, not a number.
    idx = idx.item()
  else:
    next_tok = naiv_predict(subject, relation, return_idx)
    idx = None

  def _result(text):
    # Shape the return value according to return_idx.
    return (text, idx) if return_idx else text

  if next_tok not in ["the", "a"]:
    return _result(next_tok)

  # Continue after the article; re-querying the unchanged prompt would only
  # reproduce the article itself.
  extended_prompt = combine_prompt(subject, relation) + " " + next_tok
  try:
    next_next = naiv_predict(extended_prompt)
  except Exception:
    # Best effort: fall back to the article alone if the follow-up fails.
    return _result(f"[{next_tok}]")

  return _result(f"[{next_tok}] {next_next}")

In [None]:
# NEW_REMOTE_ROOT_URL = "https://rome.baulab.info"
# NEW_REMOTE_URL = f"{NEW_REMOTE_ROOT_URL}/data/dsets/zsre_mend_eval.json"

In [None]:
 import urllib, json

In [None]:
file1_name = "base_neighbors.py"   # All dictionaries, without splitting
file2_name = "new_neighborhood.py"   # Final dataset
minimal_neighborhood_len = 5

In [None]:
# `import urllib` alone does not guarantee that the `request` submodule is
# loaded, so import it explicitly.
import urllib.request

# Download the CounterFact dataset and parse it as JSON.
counterfacts_url = "https://rome.baulab.info/data/dsets/counterfact.json"
# The context manager closes the HTTP response once the body has been read.
with urllib.request.urlopen(counterfacts_url) as response:
  data = json.loads(response.read())

In [None]:
def print_dict(dict, file_name=None):
  """
  Emit `dict` as Python source of the form `d = {...}`.

  :param dict: mapping to emit; keys are written inside double quotes and
      values with their default string formatting
  :param file_name: if given, the text is written to this file (UTF-8,
      overwriting it); otherwise each piece is passed to `print`, which adds
      a newline after every piece
  """
  pieces = ["d = {\n"]
  pieces.extend(f'\t"{key}": {value},\n' for key, value in dict.items())
  pieces.append("}")

  if file_name:
    # `with` guarantees the file is closed even if a write fails.
    with open(file_name, "w", encoding="utf-8") as file:
      file.writelines(pieces)
  else:
    for piece in pieces:
      print(piece)

def print_list(list_input, file_name=None):
  """
  Emit `list_input` as Python source of the form `d = [...]`.

  :param list_input: items to emit, one per line, with their default string
      formatting
  :param file_name: if given, the text is written to this file (UTF-8,
      overwriting it); otherwise each piece is passed to `print`, which adds
      a newline after every piece
  """
  pieces = ["d = [\n"]
  pieces.extend(f"\t{item},\n" for item in list_input)
  pieces.append("]")

  if file_name:
    # `with` guarantees the file is closed even if a write fails.
    with open(file_name, "w", encoding="utf-8") as file:
      file.writelines(pieces)
  else:
    for piece in pieces:
      print(piece)

In [None]:
def subset(sorted_list):
  """
  Pick a contiguous range of indices whose values sum to about half the total.

  The window first grows from the left until its sum reaches half of the
  total, then shrinks from the left while its sum exceeds half. If the window
  becomes empty, the indices before its last element are returned instead.

  :param sorted_list: list of counts (the caller passes them sorted)
  :return: a `range` of indices into `sorted_list`
  """
  half_total = sum(sorted_list) / 2

  end = 0
  while sum(sorted_list[:end]) < half_total:
    end += 1

  start = 0
  while sum(sorted_list[start:end]) > half_total:
    start += 1

  return range(end - 1) if start == end else range(start, end)

Version 1: unfiltered, with known objects

In [None]:
# Collect all the relations as keys.
# neighborhood_dict has the form {relation: {object: [subjects]}}.
temp_counterfact = data
neighborhood_dict = dict()
for fact in temp_counterfact:
  neighborhood_dict.setdefault(fact["requested_rewrite"]['prompt'], dict())

print("keys: done")

# For every relation, collect the subjects whose object the model predicts
# correctly, grouped by object.
count = 0
for count, fact in enumerate(temp_counterfact, start=1):
  o_true = fact["requested_rewrite"]["target_true"]["str"]
  for neighbor in fact["neighborhood_prompts"]:
    o_pred = naiv_predict(neighbor)
    if o_true != o_pred:
      continue
    # File the neighbor under the first relation template it matches.
    for prompt in neighborhood_dict.keys():
      if not is_in_prompt(prompt, neighbor):
        continue
      known_subjects = neighborhood_dict[prompt].setdefault(o_true, [])
      neighbor_subject = get_subject(neighbor, prompt)
      if neighbor_subject not in known_subjects:
        known_subjects.append(neighbor_subject)
      break
  # Progress report every 100 facts.
  if count % 100 == 0:
    print(count)

print_dict(neighborhood_dict, file1_name)

keys: done
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
8300
8400
8500
8600
8700
8800
8900
9000
9100
9200
9300
9400
9500
9600
9700
9800
9900
10000
10100
10200
10300
10400
10500
10600
10700
10800
10900
11000
11100
11200
11300
11400
11500
11600
11700
11800
11900
12000
12100
12200
12300
12400
12500
12600
12700
12800
12900
13000
13100
13200
13300
13400
13500
13600
13700
13800
13900
14000
14100
14200
14300
14400
14500
14600
14700
14800
14900
15000
15100
15200
15300
15400
15500
15600
15700
15800
15900
16000
16100
16200
16300
16400
16500
16600
16700
16800
16900
17000
17100
17200
17300
17400
17500
17600
17700
17800
17900
18000
18100
18200
18300

In [None]:
from base_neighbors import d as nbrs
new_nbrs = {k: nbrs[k] for k in nbrs.keys() if sum([len(v) for v in nbrs[k].values()])>=minimal_neighborhood_len}

In [None]:
new_dataset = []

In [None]:
Ks = list(new_nbrs.keys())    # Ks: relations


# Split every relation group into two parts with disjoint object sets.
for i, k in enumerate(Ks):   # k: relation
  objects = new_nbrs[k]   # {object: [subjects]}

  o_list = list(objects.keys())
  o_counts = {ob: len(objects[ob]) for ob in o_list}
  sorted_counts = dict(sorted(o_counts.items(), key=lambda x: x[1]))
  # subset() returns positions in the count-sorted order, so they must be
  # looked up in the sorted object list rather than in o_list.
  sorted_objects = list(sorted_counts.keys())
  subset_indices = subset(list(sorted_counts.values()))
  subset1_objects = [sorted_objects[idx] for idx in subset_indices]
  subset2_objects = [ob for ob in o_list if ob not in subset1_objects]

  # The loop variable is named `ob` so that the builtin `object` is not
  # overwritten at notebook scope.
  subset1_subjects = []
  for ob in subset1_objects:
    subset1_subjects += objects[ob]

  subset2_subjects = []
  for ob in subset2_objects:
    subset2_subjects += objects[ob]

  if len(subset1_subjects)>=minimal_neighborhood_len and len(subset2_subjects)>=minimal_neighborhood_len:
    # Both halves are large enough: emit each with the other's objects listed
    # as its complementary object set.
    dataset1 = (k, subset1_subjects, subset1_objects, subset2_objects)
    dataset2 = (k, subset2_subjects, subset2_objects, subset1_objects)
    new_dataset.append(dataset1)
    new_dataset.append(dataset2)
  else:
    # Otherwise keep the relation as a single unsplit group.
    dataset = (k, subset1_subjects+subset2_subjects, o_list, [])
    new_dataset.append(dataset)

In [None]:
print_list(new_dataset, file2_name)

Version 2: filtered with unknown objects

In [None]:
from source import nbrs 

In [None]:
# Reset the accumulator before rebuilding the datasets from model predictions.
new_dataset = []

In [None]:
Ks = list(nbrs.keys())    # Ks: relations

# k = Ks[0]  #
# L = nbrs[k]  #

for k in Ks:
  L = nbrs[k]

  # Group the subjects of this relation by the object naiv_predict returns
  # for their prompt.
  objects = dict()      # objects = {object: [subjects]}
  for p in L:           # p: entire prompt (subject + relation)
    o = naiv_predict(p)
    s = get_subject(p, k)
    bucket = objects.setdefault(o, [])
    if s not in bucket:
      bucket.append(s)

  o_list = list(objects.keys())
  # Objects ordered by ascending subject count (stable for ties). `subset` is
  # handed the counts in this order, so the indices it returns are resolved
  # against `sorted_objects` rather than against the unsorted `o_list`.
  # NOTE(review): assumes `subset` returns positions into the list it was
  # given -- confirm against its definition.
  sorted_objects = sorted(o_list, key=lambda ob: len(objects[ob]))
  subset_indices = subset([len(objects[ob]) for ob in sorted_objects])
  subset1_objects = [sorted_objects[idx] for idx in subset_indices]
  chosen = set(subset1_objects)
  subset2_objects = [ob for ob in o_list if ob not in chosen]

  # Flatten the subjects belonging to each half.
  subset1_subjects = [subj for ob in subset1_objects for subj in objects[ob]]
  subset2_subjects = [subj for ob in subset2_objects for subj in objects[ob]]

  if len(subset1_subjects)>=minimal_neighborhood_len and len(subset2_subjects)>=minimal_neighborhood_len:
    # Each half becomes a dataset whose last element is the other half's objects.
    new_dataset.append((k, subset1_subjects, subset1_objects, subset2_objects))
    new_dataset.append((k, subset2_subjects, subset2_objects, subset1_objects))
  else:
    # Too small to split: one dataset with an empty last element.
    new_dataset.append((k, subset1_subjects+subset2_subjects, o_list, []))

In [None]:
# Hand the rebuilt datasets to print_list together with the target file name.
# NOTE(review): presumably print_list writes the list to that file -- confirm.
print_list(new_dataset, file2_name)

In [None]:
# new_neighborhood_list = dict()   # neighborhood_dict: dictionary in form {relation: {object: [subjects]}}:

# for relation, prompts, objects in nbrs:
#   subjects = []
#   for prompt in prompts:
#     new_subject = get_subject(prompt, relation)
#     subjects.append(new_subject)
#   dataset = (relation, subjects, objects)
#   new_neighborhood_list.append(dataset)

## Validation

In [None]:
# Spot check: for a few datasets and a few subject positions, rebuild the
# prompt, predict again and report whether the prediction is one of the
# dataset's objects.
dataset_semple = [2, 3, 6, 7, 11, 12, 15, 16]
promts_sample = [1, 2, 6, 42, 43, 45, 51, 93, 120]

for i in dataset_semple:
  ds = new_dataset[i]
  relation, prompts, objects = ds[0], ds[1], ds[2]
  for j in promts_sample:
    # Positions beyond the end of this dataset are skipped.
    if j >= len(prompts):
      continue
    subject = prompts[j]
    pmpt = combine_prompt(subject, relation)
    pred = naiv_predict(pmpt)
    is_known = pred in objects
    print(is_known)
    if not is_known:
      print(f"\ti={i}, j={j}, prompt={pmpt}, pred={pred}")

In [None]:
# check length:
# Every entry of new_dataset stores its subjects at position 1. Indexing by
# position avoids unpacking the entry into a fixed number of names and
# reports the entry's own subject count.
for ds in new_dataset:
  print(len(ds[1]))

print()

# Number of datasets with more than 5 subjects.
print(len([x for x in new_dataset if len(x[1])>5]))

## **Trials**

In [None]:
# New
# Trial run: apply a single edit, then undo it.
# NOTE(review): demo.py and neighborhood_results use the name LAYER_IDX;
# confirm LEVEL_IDX is defined and refers to the same list.
LEVEL_IDX[0] = 5

# The one edit request handed to demo_model_editing.
request = [
      {
          "prompt": "{} is ",
          "subject": "Yanay",
          "target_new": {"str": "genius"},
      }
  ]

    
# Store both values returned by demo_model_editing in M; clean() reads
# M["orig_weights"] to copy the saved parameters back into mt2.model.
M["model_new"], M["orig_weights"] = demo_model_editing(mt2.model, tok, request, ["a"], alg_name=ALG_NAME)
mt2.model = M["model_new"]

# Restore the saved weights and empty M.
clean()


[{'prompt': '{} is ', 'subject': 'Yanay', 'target_new': {'str': 'genius'}}] 


&&&
&&&
&&&
&&& hparams/ROME/gpt2-xl.json 
&&&
&&&
&&&
&&&
5

###hparams:
 [5] 
$$$
$$$
Cached context templates ['{}', 'The new season of. {}', 'The new generation of. {}', 'The first time you. {}', 'A man has been. {}', 'The U.S. {}', 'In the wake of. {}', 'The U.S. {}', '"The only thing. {}', '"I have been. {}', 'The first time you. {}', 'In an exclusive interview with Breitbart News, a. {}', 'I have to admit, I am not the. {}', 'I have a lot of experience with the Raspberry. {}', '"The only thing we have to fear is. {}', 'A man was shot dead in the city centre. {}', 'The U.S. Supreme Court ruled Thursday. {}', 'The first thing to understand about the new,. {}', '"I have a dream." That. {}', '"It\'s not about the money, it. {}', 'The UESPWiki – Your source for. {}']
Computing left vector (u)...
Selected u projection object Yanay
Retrieving inverse covariance statistics for gpt2-xl @ transformer.h.5.mlp.c_

  0%|          | 0.00/156M [00:00<?, ?B/s]

Successfully downloaded.
Loading cached data/stats/gpt2-xl/wikipedia_stats/transformer.h.5.mlp.c_proj_float32_mom2_100000.npz


  0%|          | 0/1000 [00:00<?, ?it/s]

Computing right vector (v)
Lookup index found: 1 | Sentence: Yanay is  | Token: ay
Rewrite layer is 5
Tying optimization objective to 47
Recording initial value of v*
loss 14.749 = 14.749 + 0.0 + 0.0 avg prob of [ genius] 6.15194721831358e-07
loss 13.928 = 13.816 + 0.025 + 0.087 avg prob of [ genius] 1.5049603234729148e-06
loss 13.067 = 12.867 + 0.057 + 0.143 avg prob of [ genius] 3.52335723619035e-06
loss 11.929 = 11.671 + 0.071 + 0.187 avg prob of [ genius] 1.1593338967941236e-05
loss 10.407 = 10.151 + 0.069 + 0.187 avg prob of [ genius] 5.442638212116435e-05
loss 7.996 = 7.743 + 0.066 + 0.187 avg prob of [ genius] 0.0006433093803934753
loss 6.325 = 6.011 + 0.126 + 0.187 avg prob of [ genius] 0.005619416479021311
loss 3.311 = 3.001 + 0.123 + 0.187 avg prob of [ genius] 0.08348087966442108
loss 0.858 = 0.533 + 0.137 + 0.187 avg prob of [ genius] 0.6867366433143616
loss 0.464 = 0.131 + 0.146 + 0.187 avg prob of [ genius] 0.8853286504745483
loss 0.438 = 0.098 + 0.153 + 0.187 avg prob of

In [None]:
# Print the prediction for every subject of one hand-copied dataset entry.
relation, subjects, _, _ = (
    'The original language of {} was',
    [
        "L'Atlantide",
        'Şarkıcı',
        'Zärtliche Chaoten',
        'Moordwijven',
        'Los Olvidados',
        'Tel Aviv-Los Angeles',
        'Adi Shankaracharya',
    ],
    ['a', 'written', 'Dutch', 'Spanish', 'Sanskrit'],
    [],
)
for subject in subjects:
  prompt = combine_prompt(subject, relation)
  prediction = naiv_predict(prompt)
  print(prompt, "  :  ", prediction)

The original language of L'Atlantide was   :   written
The original language of Şarkıcı was   :   written
The original language of Zärtliche Chaoten was   :   written
The original language of Moordwijven was   :   Dutch
The original language of Los Olvidados was   :   Spanish
The original language of Tel Aviv-Los Angeles was   :   a
The original language of Adi Shankaracharya was   :   Sanskrit


In [None]:
# First token id the tokenizer produces for the string "the".
mt2.tokenizer.encode("the")[0]

1169

In [None]:
# Empty subject: the complete prompt is passed as the second argument.
naiv_predict("", "The native language of Raymond Barre is")

'French'

In [None]:
# Subject and relation text passed as separate arguments.
naiv_predict("Alan Turing", "is a kind of")

'hero'

In [None]:
# Same subject with a different continuation text.
naiv_predict("Alan Turing", "is not a cat, but a")

'computer'

# **Updates experiment**

In [None]:
# Default number of layers swept by all_results and by the correlation analysis.
NUM_OF_LAYERS = 48
# Shared state of the current edit: holds "model_new" and "orig_weights"
# after an edit; clean() reads "orig_weights" and then empties the dict.
M = dict()


In [None]:
def neighbors_probs(main_subject, relation, neighbors, target_id):
  """Collect predict_by_idx results for every neighbor except main_subject.

  main_subject: subject to leave out, or None to keep every neighbor.
  relation:     relation text combined with each neighbor by combine_prompt.
  neighbors:    list of subject strings.
  target_id:    token id forwarded to predict_by_idx.

  Returns a list with one predict_by_idx result per kept neighbor, in the
  order of `neighbors` (presumably probabilities of the target token).
  """
  return [
      predict_by_idx(mt2, combine_prompt(name, relation), target_id)
      for name in neighbors
      if main_subject is None or name != main_subject
  ]

def neighboring(probs1, probs2):
  """Average, over positions, of how little probs2 moved away from probs1.

  For each position the absolute change |p1 - p2| is divided by
  0.5 + |p1 - 0.5| and subtracted from 1; the mean over all positions of
  probs1 is returned.

  probs1, probs2: sequences of floats; probs2 is indexed by the positions
  of probs1.
  """
  per_item = []
  for idx, before in enumerate(probs1):
    shift = abs(before - probs2[idx])
    scale = 0.5 + abs(before - 0.5)
    per_item.append(1 - shift / scale)
  return sum(per_item) / len(probs1)

In [None]:
def plot_all_flow(mt, prompt, subject=None, noise=0.1, modelname=None):
    """Call plot_hidden_flow once, for the "mlp" kind only."""
    plot_hidden_flow(
        mt, prompt, subject, modelname=modelname, noise=noise, kind="mlp"
    )

In [None]:
import math
def return_map(
    prompt,
    subject,
    mt=mt2,
    samples=10,
    noise=noise_level,
    window=10,
    kind="mlp",
    modelname=None,
    savepdf=None,
):
    """Return the calculate_hidden_flow result for prompt and subject.

    When subject is None it is obtained from guess_subject(prompt).
    modelname and savepdf are accepted but not used by this function.
    """
    effective_subject = guess_subject(prompt) if subject is None else subject
    return calculate_hidden_flow(
        mt,
        prompt,
        effective_subject,
        samples=samples,
        noise=noise,
        window=window,
        kind=kind,
    )


def generate_city_prompt(city):
  """Extend "is the capital city of" while the prediction is a filler word.

  Starting from the base prompt, the word predicted for `city` is appended
  (after a space) as long as it is one of the listed filler words; the
  first prompt whose prediction is not a filler word is returned.
  """
  filler_words = ("the", "state", "State", "of", "Republic", "province", "Province")
  prompt = "is the capital city of"
  while True:
    word = naiv_predict(city, prompt)
    if word not in filler_words:
      return prompt
    prompt = f"{prompt} {word}"


def entropy(tens):
    """Return the base-2 entropy of tens, divided by log2(len(tens)).

    tens is first divided by its sum; entries whose log2 is -inf contribute
    0 to the total.
    """
    distribution = tens / tens.sum()
    log_probs = torch.log2(distribution)
    # log2(0) is -inf; replace it by 0 so the product below is 0, not nan.
    log_probs = torch.where(log_probs == float("-inf"), 0, log_probs)
    raw_entropy = -(log_probs * distribution).sum().item()
    return raw_entropy / math.log2(len(tens))


def max_layer_and_entropy(prompt, subject, max_neighbors=(1,), effect_idx=()):
  """Summarise the return_map scores of the subject tokens of a prompt.

  prompt, subject: forwarded to return_map.
  max_neighbors:   layer offsets relative to the strongest layer for which a
                   "centralization" value is computed.
  effect_idx:      layer indices whose score on the strongest token is returned.

  Returns (layer, entropy, cent, _max, _min, avrg, effs):
    layer   - column index of the largest score within the subject range
    entropy - entropy() of the row that contains that largest score
    cent    - for each offset i: (row[layer] - row[layer+i]) / row[layer],
              or -1 when layer+i falls outside the row
    _max / _min - largest / smallest score within the subject range
    avrg    - mean of the row that contains the largest score
    effs    - row[idx] for every idx in effect_idx

  NOTE(review): scores is presumably laid out as (tokens, layers) -- confirm
  against calculate_hidden_flow.
  """
  print(f"prompt: {prompt}, subject: {subject}")
  result = return_map(prompt, subject)
  scores = result['scores']
  a, b = result['subject_range']
  n_layers = len(scores[0])
  subject_scores = scores[a:b]

  # argmax() is taken over the flattened subject rows; split the flat index
  # back into (row offset inside the subject range, column).
  token_offset, layer = divmod(subject_scores.argmax().item(), n_layers)
  relevant_token = scores[token_offset + a]

  _max = subject_scores.max().item()
  _min = subject_scores.min().item()
  avrg = relevant_token.sum().item() / n_layers
  effs = [relevant_token[idx].item() for idx in effect_idx]

  peak = relevant_token[layer]
  cent = []
  for i in max_neighbors:
    neighbor_layer = layer + i
    # Compare against the layer at the requested offset (previously always
    # layer+1) and reject offsets that leave the row on either side.
    if 0 <= neighbor_layer < n_layers:
      cent.append(((peak - relevant_token[neighbor_layer]) / peak).item())
    else:
      cent.append(-1)
  return layer, entropy(relevant_token), cent, _max, _min, avrg, effs


In [None]:
  def clean():
    """Copy the weights saved in M["orig_weights"] back into mt2.model, then empty M."""
    if "orig_weights" not in M:
        print("No model weights to restore")
    else:
        # Overwrite the parameters in place, without tracking gradients.
        with torch.no_grad():
            for param_name, saved_value in M["orig_weights"].items():
                nethook.get_parameter(mt2.model, param_name)[...] = saved_value
        print("Original model restored")

    M.clear()

In [None]:
# def change_and_check_ser(_subject, prompt, targets, affected, set_affected=None, count_flag=False):

def change_and_check(main_subject_idx, relation, new_target, neighborhood, orig_probs=None, orig_final=None, to_target_flag=False):
  """Edit one subject towards new_target and measure the effect on its neighbors.

  main_subject_idx: position in `neighborhood` of the subject to edit.
  relation:         relation text used as the request prompt and for predictions.
  new_target:       new object string for the edited subject.
  neighborhood:     list of subjects, including the edited one.
  orig_probs:       optional list, one pre-edit value per subject (edited
                    subject included); when given, the neighboring() score
                    between the pre- and post-edit values of the other
                    subjects is appended to the result.
  orig_final:       optional list, one pre-edit prediction per subject; when
                    given, the fraction of other subjects whose prediction
                    changed is appended to the result.
  to_target_flag:   if true, only predictions equal to new_target count as
                    changed; otherwise any prediction differing from
                    orig_final counts.

  Returns a list with one float per requested measure (probabilities first).
  The model is left edited; clean() is called at the start to undo a
  previous edit.
  """
  clean()

  main_subject = neighborhood[main_subject_idx]

  # Re-seed before every edit.
  random.seed(_seed)
  numpy.random.seed(seed=_seed)
  torch.manual_seed(_seed)

  # Only the first token of the target is tracked.
  tok_id = mt2.tokenizer.encode(new_target)[0]

  request = [
      {
          "prompt": relation,
          "subject": main_subject,
          "target_new": {"str": new_target},
      }
  ]

  M["model_new"], M["orig_weights"] = demo_model_editing(mt2.model, tok, request, ["a"], alg_name=ALG_NAME)
  mt2.model = M["model_new"]

  results = []

  if orig_probs is not None:
    # Drop the edited subject by position on both sides so the pre- and
    # post-edit lists stay aligned even if a subject string repeats.
    filtered_orig_probs = orig_probs[:main_subject_idx]+orig_probs[main_subject_idx+1:]
    other_subjects = neighborhood[:main_subject_idx]+neighborhood[main_subject_idx+1:]
    post_probs = neighbors_probs(None, relation, other_subjects, tok_id)
    results.append(neighboring(filtered_orig_probs, post_probs))

  if orig_final is not None:
    count = 0
    for i, neighbor in enumerate(neighborhood):
      if i == main_subject_idx:
        continue
      # Predict with the relation, matching how orig_final is produced in
      # neighboring_score_of_neighborhood.
      pred = naiv_predict(neighbor, relation)
      if to_target_flag:
        count += int(pred == new_target)
      else:
        count += int(pred != orig_final[i])
    results.append(count / (len(neighborhood)-1))

  # results: [single float] or [single float, single float]
  return results

In [None]:
def neighborhood_score_by_object(neighborhood_data, target, orig_probs=None, orig_final=None, to_target_flag=False):
  """Run change_and_check once per subject for a single target object.

  neighborhood_data: (relation, subjects, orig_objects, new_objects).
  target:            object string every subject is edited towards in turn.
  orig_probs / orig_final / to_target_flag: forwarded to change_and_check.

  Returns a list holding one score list per measure that was requested
  (orig_probs first, orig_final last); each score list has one entry per
  subject. change_and_check is called for every subject even when neither
  measure is requested.
  """
  relation, subjects, _orig_objects, _new_objects = neighborhood_data

  want_probs = orig_probs is not None
  want_finals = orig_final is not None
  prob_scores = []
  final_scores = []

  for subject_idx, _ in enumerate(subjects):
    current = change_and_check(subject_idx, relation, target, subjects, orig_probs, orig_final, to_target_flag)
    if want_probs:
      prob_scores.append(current[0])
    if want_finals:
      final_scores.append(current[-1])

  # Assemble: [[floats]] or [[floats], [floats]], each of length |subjects|.
  collected = []
  if want_probs:
    collected.append(prob_scores)
  if want_finals:
    collected.append(final_scores)
  return collected


def neighboring_score_of_neighborhood(neighborhood_data, orig_probs_flag=True, orig_final_flag=True, to_target_flag=False):
  """Accumulate neighborhood_score_by_object over every new object.

  neighborhood_data: (relation, subjects, orig_objects, new_objects).
  orig_probs_flag:   include the probability-based measure.
  orig_final_flag:   include the final-prediction-based measure.
  to_target_flag:    forwarded to neighborhood_score_by_object.

  Returns one list per enabled measure, each with one entry per subject,
  starting at 0 and updated through sum_matrices for every target.
  """
  relation, subjects, _, new_objects = neighborhood_data
  n_subjects = len(subjects)

  # Pre-edit predictions of all subjects, needed only by the "final" measure.
  orig_final = [naiv_predict(subject, relation) for subject in subjects] if orig_final_flag else None

  # One zero-filled accumulator per enabled measure.
  neighborhood_scores = []
  for enabled in (orig_probs_flag, orig_final_flag):
    if enabled:
      neighborhood_scores.append([0]*n_subjects)

  for target in new_objects:
    orig_probs = None
    if orig_probs_flag:
      target_id = mt2.tokenizer.encode(target)[0]
      orig_probs = neighbors_probs(main_subject=None, relation=relation, neighbors=subjects, target_id=target_id)

    current_scores = neighborhood_score_by_object(neighborhood_data, target, orig_probs, orig_final, to_target_flag)
    # NOTE(review): alpha is 1/len(subjects) although the accumulation runs
    # over new_objects -- confirm the intended weight against sum_matrices.
    sum_matrices(neighborhood_scores, current_scores, alpha=1/n_subjects)

  # neighborhood_scores: [[floats]] or [[floats], [floats]]
  return neighborhood_scores

In [None]:
def neighborhood_results(neighborhood_data, orig_probs_flag=True, orig_final_flag=True, to_target_flag=False, num_of_layers=48):
  """Collect causal features per subject and neighboring scores per layer.

  neighborhood_data: (relation, subjects, orig_objects, new_objects).
  orig_probs_flag / orig_final_flag / to_target_flag: forwarded to
      neighboring_score_of_neighborhood.
  num_of_layers: number of layer indices to sweep; also the number of
      per-layer effects requested from max_layer_and_entropy.

  Returns {"causal_features": {...one list per feature, one entry per
  subject...}, "neighborhood_scores": [one entry per layer]}.

  Side effect: LAYER_IDX[0] is overwritten with every swept layer index and
  keeps the last one.
  """
  causal_features = {"max layers": [], "entropies": [], "maxs": [], "mins": [], "avrgs": [], "effs": []}
  results = {"causal_features": causal_features, "neighborhood_scores": []}
  relation, subjects, _, _ = neighborhood_data
  for subject in subjects:
    prompt = combine_prompt(subject, relation)
    max_layer, subject_entropy, _, _max, _min, avrg, effs = max_layer_and_entropy(prompt, subject, max_neighbors=[], effect_idx=range(num_of_layers))

    causal_features["max layers"].append(max_layer)
    causal_features["entropies"].append(subject_entropy)
    causal_features["maxs"].append(_max)
    causal_features["mins"].append(_min)
    causal_features["avrgs"].append(avrg)
    causal_features["effs"].append(effs)

  for layer_idx in range(num_of_layers):
    # NOTE(review): LAYER_IDX presumably selects the layer rewritten by
    # demo_model_editing -- confirm in demo.py.
    LAYER_IDX[0] = layer_idx
    layer_scores = neighboring_score_of_neighborhood(neighborhood_data, orig_probs_flag, orig_final_flag, to_target_flag)
    results["neighborhood_scores"].append(layer_scores)

  # Callers (all_results) store the returned value, so it must be returned.
  return results

In [None]:
def all_results(neighborhood_list, orig_probs_flag=True, orig_final_flag=True, to_target_flag=False, num_of_layers=NUM_OF_LAYERS):
  """Run neighborhood_results on every neighborhood, printing its index first.

  Returns the list of neighborhood_results return values, in input order.
  """
  collected = []
  for position, neighborhood_data in enumerate(neighborhood_list):
    print(position)
    collected.append(
        neighborhood_results(neighborhood_data, orig_probs_flag, orig_final_flag, to_target_flag, num_of_layers)
    )
  return collected

In [None]:
from neighborhood import d

# Run the full experiment on the imported neighborhoods with default flags.
results = all_results(d)
# Pass the results to print_list under the module name that the analysis
# section later imports `d` from.
print_list(list_input = results, file_name="attractions_and_features.py")

0
prompt: The mother tongue of Odysseas Elytis is, subject: Odysseas Elytis
prompt: The mother tongue of Andreas Papandreou is, subject: Andreas Papandreou
prompt: The mother tongue of Konstantinos Karamanlis is, subject: Konstantinos Karamanlis
prompt: The mother tongue of Georgios Rallis is, subject: Georgios Rallis
prompt: The mother tongue of Alexandros Papadiamantis is, subject: Alexandros Papadiamantis
prompt: The mother tongue of Yannis Kounellis is, subject: Yannis Kounellis
prompt: The mother tongue of Charlie Chaplin is, subject: Charlie Chaplin
prompt: The mother tongue of George Washington is, subject: George Washington
prompt: The mother tongue of Cyndi Lauper is, subject: Cyndi Lauper
prompt: The mother tongue of Elton John is, subject: Elton John
prompt: The mother tongue of Bob Dylan is, subject: Bob Dylan
prompt: The mother tongue of Paul McCartney is, subject: Paul McCartney
prompt: The mother tongue of George Orwell is, subject: George Orwell
prompt: The mother tongu

ValueError: ignored

## **Results Analysis**

In [None]:
from attractions_and_features import d as results_list 
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Which of the two neighboring measures to correlate.
neighboring_by_logits = True
neighboring_by_finals = True

correlations = []

for results_dict in results_list:
  causal = results_dict["causal_features"]
  max_layers = causal["max layers"]     # [ints]
  effect_values = causal["effs"]        # [[floats], [floats]...]
  mins = causal["mins"]                 # [floats]
  maxs = causal["maxs"]                 # [floats]
  n_neighbors = len(mins)

  scores = results_dict["neighborhood_scores"]  # [[[floats]], [[floats]]...] or [[[floats], [floats]], [[floats], [floats]]...]

  # One list of per-layer correlation coefficients for each enabled pairing.
  correlation_dict = dict()
  if neighboring_by_logits:
    correlation_dict["logits and distance"] = []
    correlation_dict["logits and relative effect"] = []
  if neighboring_by_finals:
    correlation_dict["finals and distance"] = []
    correlation_dict["finals and relative effect"] = []

  # Calculate correlations
  for layer_idx in range(NUM_OF_LAYERS):
    current_scores = scores[layer_idx]  # [[floats]] or [[floats], [floats]]
    # Distance of the updated layer from each subject's strongest layer.
    distances = [abs(layer_idx-max_layer) for max_layer in max_layers]
    # Effect at the updated layer, min-max scaled per subject.
    relative_effects = [
        (effect_values[i][layer_idx] - mins[i]) / (maxs[i] - mins[i])
        for i in range(n_neighbors)
    ]

    if neighboring_by_logits:
      logit_scores = current_scores[0]
      correlation_dict["logits and distance"].append(np.corrcoef(logit_scores, distances)[0][1])
      correlation_dict["logits and relative effect"].append(np.corrcoef(logit_scores, relative_effects)[0][1])

    if neighboring_by_finals:
      final_scores = current_scores[-1]
      correlation_dict["finals and distance"].append(np.corrcoef(final_scores, distances)[0][1])
      correlation_dict["finals and relative effect"].append(np.corrcoef(final_scores, relative_effects)[0][1])

  correlations.append(correlation_dict)

In [None]:
# Pass the correlations to print_list under the module name that the
# plotting cell imports `d` from.
print_list(list_input = correlations, file_name="correlations.py")

In [None]:
from correlations import d as correlations_list

# One figure per enabled pairing, with one curve per dataset. Each curve is
# read from the dataset's own entry (`data_set_cor`), not from the
# `correlation_dict` variable left over from the computation loop.
plot_specs = []
if neighboring_by_logits:
  plot_specs.append(("logits and distance", "Correlation Between neighboring_by_logits and Distances"))
  plot_specs.append(("logits and relative effect", "Correlation Between neighboring_by_logits and Relative Effects"))
if neighboring_by_finals:
  plot_specs.append(("finals and distance", "Correlation Between neighboring_by_finals and Distances"))
  plot_specs.append(("finals and relative effect", "Correlation Between neighboring_by_finals and Relative Effects"))

for key, title in plot_specs:
  for data_set_cor in correlations_list:
    plt.plot(data_set_cor[key])
  plt.title(title)
  plt.xlabel("updated layer")
  plt.ylabel("correlation")
  plt.show()



# Old code: serial update

In [None]:
def change_and_check_ser(_subject, prompt, targets, affected, set_affected=None, count_flag=False):
  """Apply a series of edits to one subject without restoring in between.

  _subject:     subject whose object is rewritten once per entry of `targets`.
  prompt:       relation text; the request prompt is "{} " followed by it.
  targets:      target strings; the entry "origin" stands for the object
                predicted for _subject before the series starts.
  affected:     words that are first edited towards `set_affected` (when
                given) and, with count_flag, predicted once up front.
  set_affected: optional target applied to every word of `affected` first.
  count_flag:   when true, pre-edit predictions of `affected` are computed.

  Returns a list with one neighboring() score per target (pre- vs post-edit
  neighbors_probs). Only the weights reported by the first edit are stored
  under M["orig_weights"]; later ones go to M["_"].

  NOTE(review): `neighborhood` is read as a global that is not defined in
  the visible code, and tok_id is the full encode() list here whereas
  change_and_check passes its first element -- confirm both.
  """
  clean()

  temp_name = "orig_weights"

  if set_affected is not None:
    print("Change affected:")
    for word in affected:
      print("changing", word)
      random.seed(_seed)
      numpy.random.seed(seed=_seed)
      torch.manual_seed(_seed)

      request = [
        {
            "prompt": f"\u007b\u007d {prompt}",
            "subject": word,
            "target_new": {"str": set_affected},
        }
      ]

      M["model_new"], M[temp_name] = demo_model_editing(mt2.model, tok, request, ["a"], alg_name=ALG_NAME)
      mt2.model = M["model_new"]

      # Later edits must not overwrite the weights saved by the first one.
      temp_name = "_"

  orig_object = predict(_subject, prompt)
  print(f"Pre check:\n{_subject} {prompt} {orig_object}")

  change_index = 1

  # Always defined, so the function also works when count_flag is False.
  counts = []

  if count_flag:
    # NOTE(review): prev_line is filled but never read afterwards.
    prev_line = {}
    for word in affected:
      prev_line[word]=predict(word, prompt)

  for t in targets:
    target=orig_object if t=="origin" else t

    random.seed(_seed)
    numpy.random.seed(seed=_seed)
    torch.manual_seed(_seed)

    tok_id = mt2.tokenizer.encode(target)
    pre_probs = neighbors_probs(_subject, prompt, neighborhood, tok_id)

    print("CHANGE:", change_index, ":", target)
    change_index+=1

    request = [
        {
            "prompt": f"\u007b\u007d {prompt}",
            "subject": _subject,
            "target_new": {"str": target},
        }
    ]

    M["model_new"], M[temp_name] = demo_model_editing(mt2.model, tok, request, ["a"], alg_name=ALG_NAME)

    mt2.model = M["model_new"]

    temp_name = "_"

    post_probs = neighbors_probs(_subject, prompt, neighborhood, tok_id)
    counts.append(neighboring(pre_probs, post_probs))

  return counts


def drag_animals(_subject, targets):
  """Run change_and_check_ser with the "is a kind of" prompt on the animals list."""
  return change_and_check_ser(
      _subject,
      "is a kind of",
      targets,
      affected=animals,
  )

def drag_cities(_subject, targets, count_flag=False):
  """Run change_and_check_ser with the "is the capital city of" prompt on the cities list."""
  return change_and_check_ser(
      _subject,
      "is the capital city of",
      targets,
      affected=cities,
      count_flag=count_flag,
  )

In [None]:
def print_drags(subjects, targets=(), print_map=True, cents=(-2, -1, 1, 2), idx=17):
  """Print causal features and drag counts for a list of city subjects.

  subjects:  city names.
  targets:   targets passed to drag_cities for every subject (may be empty).
  print_map: when true, collect the max_layer_and_entropy features first.
  cents:     layer offsets for the "centralization" columns.
  idx:       layer whose effect is recorded under "effect in <idx>".

  Prints every collected column as "key = values"; returns None.
  """
  clean()

  all_counts = {}

  if print_map:
    for offset in cents:
      all_counts[f"centralization {offset}"] = []
    all_counts["layers"] = []
    all_counts["entropies"] = []
    all_counts["max value"] = []
    all_counts["min value"] = []
    all_counts["average"] = []
    all_counts[f"effect in {idx}"] = []

    for position, subject in enumerate(subjects):
      print(position, ": ", end="")
      # Request the effect at `idx` explicitly; without effect_idx the
      # returned effect list is always empty.
      layer_idx, subject_entropy, cent, _max, _min, avrg, effs = max_layer_and_entropy(
          f"{subject} {generate_city_prompt(subject)}",
          subject,
          max_neighbors=cents,
          effect_idx=[idx],
      )
      all_counts["layers"].append(layer_idx)
      all_counts["entropies"].append(subject_entropy)
      all_counts["max value"].append(_max)
      all_counts["min value"].append(_min)
      all_counts["average"].append(avrg)
      all_counts[f"effect in {idx}"].append(effs[0])

      for offset, value in zip(cents, cent):
        all_counts[f"centralization {offset}"].append(value)
      print("done")

  for target_no in range(1, len(targets)+1):
    all_counts[f"drag_{target_no}"] = []
    all_counts[f"change_{target_no}"] = []

  if len(targets)>0:
    for subject in subjects:
      counts = drag_cities(subject, targets, True)
      # NOTE(review): this indexing expects every entry of `counts` to be a
      # pair, while change_and_check_ser appends a single neighboring()
      # value per target -- confirm which shape is intended.
      for i in range(len(counts)):
        all_counts[f"drag_{i+1}"].append(counts[i][0])
        all_counts[f"change_{i+1}"].append(counts[i][0]+counts[i][1])
      print(len(all_counts["drag_1"]), ", until", subject)
      for key in all_counts.keys():
        print(key, "=", all_counts[key])
  else:
    for key in all_counts.keys():
      print(key, "=", all_counts[key])

## **Draft**

In [None]:
# prcs = [
#     ["Ghana", "China", "Algiers", "Greece", "Japan", "Ethiopia", "Niue", "Switzerland", "Jordan", "Turkey", "Samoa"],
#     ["China", "Greece", "Ethiopia", "Switzerland", "Turkey", "Ghana", "Algiers", "Japan", "Niue", "Samoa", "Jordan"],
#     ["Algiers", "Ethiopia", "Jordan", "China", "Switzerland", "Japan", "Samoa", "Ghana", "Greece", "Niue", "Turkey"],
#     ["Greece", "Switzerland", "Ghana", "Japan", "Jordan", "Niue", "China", "Samoa", "Turkey", "Ethiopia", "Algiers"],
#     ["Switzerland", "Niue", "Japan", "Ghana", "Ethiopia", "Turkey", "Greece", "Jordan", "Samoa", "Algiers", "China"],
#     ["Samoa", "Japan", "Switzerland", "Algiers", "Niue", "Greece", "Ghana", "Turkey", "China", "Jordan", "Ethiopia"],
#     ["Japan", "Algiers", "Turkey", "Jordan", "Greece", "Samoa", "Switzerland", "China", "Ethiopia", "Ghana", "Niue"]
# ]

In [None]:
# print_drags(cities[:2], ["China", "T"], False)

In [None]:
# neighborhood = {
#     "Paris": ["Bangkok", "Stockholm", "Moscow", "Bucharest", "Kigali", "Zagreb"],
#     "Nicosia": ["Nairobi", "Ottawa", "Phnom Penh", "Bishkek", "Doha", "Seoul", "Havana"]
# }

In [None]:
# cities = ["Paris", "Bangkok", "Stockholm", "Moscow", "Bucharest", "Kigali", "Zagreb", "Nicosia", "Nairobi", "Ottawa", "Phnom Penh", "Bishkek", "Doha", "Seoul", "Havana", "Prague", "Lima", "Islamabad", "Port Moresby", "Helsinki", "Suva", "Lisbon", "Warsaw", "San Juan", "Riyadh", "Baghdad", "Muscat", "Belgrade", "Madrid", "Dakar", "Bratislava", "Ljubljana", "Freetown", "Damascus", "Mogadishu", "Khartoum", "Kathmandu", "Managua", "Niamey", "Wellington", "Abuja", "Kingston", "Oslo", "Rabat", "Skopje", "Cairo", "Kyiv", "Montevideo","Abu Dhabi", "Tehran", "Buenos Aires", "Berlin", "Amsterdam", "Astana", "Naypyidaw", "Lilongwe", "Kuala Lumpur","Ulaanbaatar", "Bamako", "Nouakchott", "Vilnius", "Monrovia", "Riga", "Tripoli", "Beirut", "Jerusalem"]

In [None]:
# animals = ["grizzly", "poodle", "terrier", "collie", "border collie", "Schnauzer", "bird", "sparrow", "pale rockfinch", "corvus", "jackdaw", "magpie-jay", "european goldfinch", "chaffinch", 
#            "pine grosbeak", "carpornis", "atlantic royal flycatcher","pacific royal flycatcher","northern royal flycatcher", "pigeon", "parrot", "cockatiel", "eagle", "owl", "penguin", "chameleon"]



In [None]:
# def neighborhood_score_of_subject(main_subject_idx, neighborhood_data, orig_probs=None, orig_final=None, to_target_flag=False):
#   # Calculate average neighborhood_score of specific subject (given specific relation) over all new_objects.
#   # neighborhood_data: (relation: str, subjects: list(str), orig_objects: list(str), new_objects: list(str))

#   relation, subjects, _, objects = neighborhood_data

#   neighborhood_scores = []
#   if orig_probs is not None:
#     filtered_orig_probs = orig_probs[:main_subject_idx]+orig_probs[main_subject_idx+1:]
#     neighborhood_scores.append(0)
#   if orig_final is not None:
#     filtered_orig_final = orig_final[:main_subject_idx]+orig_final[main_subject_idx+1:]
#     neighborhood_scores.append(0)

#   for object in objects:
#     current_scores = change_and_check(main_subject_idx, relation, new_target, subjects, filtered_orig_probs, filtered_orig_final, to_target_flag)
#     if orig_probs is not None:
#       neighborhood_scores[0]+=current_scores[0]
#     if orig_final is not None:
#       neighborhood_scores[-1]+=current_scores[-1]
  
#   for i in range(len(neighborhood_scores)):
#     neighborhood_scores[i]/=len(objects)
#   return neighborhood_scores

In [None]:
# def prdict_city()

In [None]:
# print(predict("corvus"))

In [None]:
# request = [
#     {
#         "prompt": "{} is the capital city of",
#         "subject": "Paris",
#         "target_new": {"str": "China"},
#     }
# ]

# # Execute rewrite
# model_new, orig_weights = demo_model_editing(model, tok, request, ["a"], alg_name=ALG_NAME)

# mt2.model = model_new


In [None]:
# for city in ["Suva", "Lisbon", "Warsaw", "San Juan", "Riyadh", "Baghdad", "Muscat", "Belgrade", "Madrid", "Dakar", "Bratislava", "Ljubljana", "Freetown", "Damascus", "Mogadishu"]:
#   print(city)
#   drag_cities(city, ["Japan", "China"], True)

In [None]:
# for city in ["Kigali", "Bishkek","Nicosia", "Bucharest", "Paris", "Moscow", "Stockholm", "Bangkok", "Prague"]:
#   print(max_layer_and_entropy(f"{city} {generate_city_prompt(city)}", city))

In [None]:
# predict("Adamstown", "is the capital city of the state of")

In [None]:
# print(generate_city_prompt("Moscow"))
# print(generate_city_prompt("Prague"))
# print(generate_city_prompt("Paris"))
# print(generate_city_prompt("Papeete"))
# print(generate_city_prompt("Adamstown"))



In [None]:
# print(max_layer_and_entropy("Stockholm is the capital city of", "Stockholm"))

In [None]:
# for city in cities:
#   print(city," | ", predict(city, "is the capital city of"))

In [None]:
# drag_animals("sparrow", ["dog", "lizard", "bird"])
# change_and_check("TTTTT", ["JJ", "KK", "LL"])

In [None]:
# drag_cities("Paris", ["Japan", "China", "France"], True)

In [None]:
# nonsense = ["kv", "fg", "de", "oj", "mdo", "mzv", "ahz", "zjx", "oxzz", "wdcp", "rfvn", "dwgq", "ofkcn", "krzrw", "zlaiq", "arzdp", "yraxjo", "edjxpa", "jdrhdq", "vjulqc", "iyapuql", "jglwuos", "bljjgzv", "ibryurx", "cxmvyvat", "twyzhcpr", "fnfvvluj", "vjrknbpp", "ftrbwywac", "swjwniqas", "ddssywine", "jgrpttwbn", "oybmpearnv", "vapkrtajcn", "coltptglwa", "mebtlpozkb"]

# def drag_nonsense(_subject, targets, _set_affected):
#   change_and_check(_subject, "is a kind of", targets, affected=nonsense, set_affected=_set_affected)

In [None]:
# plot_all_flow(mt2, f"Suva is the capital city of the Republic of the", noise=noise_level, subject="Suva")

In [None]:
# for city in [ "Lisbon", "Warsaw", "San Juan", "Riyadh", "Baghdad", "Muscat", "Belgrade", "Madrid", "Dakar", "Bratislava", "Ljubljana", "Freetown", "Damascus", "Mogadishu"]:
#   plot_all_flow(mt2, f"{city} is the capital city of", noise=noise_level, subject=city)

In [None]:
# plot_all_flow(mt2, f"Beirut is the capital city of", noise=noise_level, subject="Beirut")
# plot_all_flow(mt2, f"Tripoli is the capital city of", noise=noise_level, subject="Tripoli")
# plot_all_flow(mt2, f"Oslo is the capital city of", noise=noise_level, subject="Oslo")

In [None]:
# plot_all_flow(mt2, "grizzly is a kind of", noise=noise_level, subject="grizzly")

# plot_all_flow(mt2, "poodle is a kind of", noise=noise_level, subject="poodle")
# plot_all_flow(mt2, "terrier is a kind of", noise=noise_level, subject="terrier")
# plot_all_flow(mt2, "collie is a kind of", noise=noise_level, subject="collie")
# plot_all_flow(mt2, "border collie is a kind of", noise=noise_level, subject="border collie")
# plot_all_flow(mt2, "Schnauzer is a kind of", noise=noise_level, subject="Schnauzer")

# # taxonomy:
# ##### class
# #### order
# ### suborder
# ## family

# ##### birds
# plot_all_flow(mt2, "bird is a kind of", noise=noise_level, subject="bird")

# #### Passerine

# ### Songbird
# ##
# plot_all_flow(mt2, "sparrow is a kind of", noise=noise_level, subject="sparrow")
# plot_all_flow(mt2, "pale rockfinch is a kind of", noise=noise_level, subject="pale rockfinch")
# ##
# plot_all_flow(mt2, "corvus is a kind of", noise=noise_level, subject="corvus")
# plot_all_flow(mt2, "jackdaw is a kind of", noise=noise_level, subject="jackdaw")
# plot_all_flow(mt2, "magpie-jay is a kind of", noise=noise_level, subject="magpie-jay")
# ##
# plot_all_flow(mt2, "european goldfinch is a kind of", noise=noise_level, subject="european goldfinch")
# plot_all_flow(mt2, "chaffinch is a kind of", noise=noise_level, subject="chaffinch")
# plot_all_flow(mt2, "pine grosbeak is a kind of", noise=noise_level, subject="pine grosbeak")

# ### Tyranni
# ##
# plot_all_flow(mt2, "carpornis is a kind of", noise=noise_level, subject="carpornis")
# ##
# plot_all_flow(mt2, "atlantic royal flycatcher is a kind of", noise=noise_level, subject="atlantic royal flycatcher")
# plot_all_flow(mt2, "pacific royal flycatcher is a kind of", noise=noise_level, subject="pacific royal flycatcher")
# plot_all_flow(mt2, "northern royal flycatcher is a kind of", noise=noise_level, subject="northern royal flycatcher")


# ####
# ## Columbidae
# plot_all_flow(mt2, "pigeon is a kind of", noise=noise_level, subject="pigeon")

# #### parrot
# plot_all_flow(mt2, "parrot is a kind of", noise=noise_level, subject="parrot")
# ##
# plot_all_flow(mt2, "cockatiel is a kind of", noise=noise_level, subject="cockatiel")

# ####
# ## eagle
# plot_all_flow(mt2, "eagle is a kind of", noise=noise_level, subject="eagle")

# plot_all_flow(mt2, "owl is a kind of", noise=noise_level, subject="owl")

# ####
# ## Penguin
# plot_all_flow(mt2, "penguin is a kind of", noise=noise_level, subject="penguin")


# plot_all_flow(mt2, "chameleon is a kind of", noise=noise_level, subject="chameleon")





In [None]:
# drag_nonsense("sparrow", ["dog", "dog", "dog", "dog"], "bird")

In [None]:
# request = [
#     {
#         "prompt": "{} is a kind of",
#         "subject": "zjx",
#         "target_new": {"str": "bird"},
#     }
# ]

# # Execute rewrite
# model_new, orig_weights = demo_model_editing(model, tok, request, ["a"], alg_name=ALG_NAME)

# mt2.model = model_new

In [None]:
# request = [
#     {
#         "prompt": "{} is a kind of",
#         "subject": "pigeon",
#         "target_new": {"str": "bird"},
#     }
# ]

# # Execute rewrite
# model_new, orig_weights = demo_model_editing(model, tok, request, ["a"], alg_name=ALG_NAME)

# mt2.model = model_new
# print(200)