# Imports & Installs

In [None]:
from IPython import get_ipython
# Programmatic equivalent of the `%load_ext autoreload` / `%autoreload 2` magics:
# load the autoreload extension first, then switch it to mode "2" so that edits
# to imported modules are picked up without restarting the kernel.
ipython = get_ipython()
ipython.run_line_magic("load_ext", "autoreload")
ipython.run_line_magic("autoreload", "2")

from IPython.display import display, HTML
import torch
from datasets import load_dataset
import pickle
import webbrowser
import os
from huggingface_hub import hf_hub_download

from sae_vis.model_fns import AutoEncoder, DemoTransformer, DemoTransformerConfig
from sae_vis.data_fetching_fns import get_feature_data, get_prompt_data
from sae_vis.data_storing_fns import FeatureVizParams, MultiFeatureData, MultiPromptData
from sae_vis.utils_fns import create_vocab_dict, tokenize_and_concatenate

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Disable gradient tracking globally; the trailing `;` suppresses the cell's output repr.
torch.set_grad_enabled(False);

# Setup

## Autoencoders

<!-- We're being a bit lazy here, and slicing our autoencoder so that we only take the first 2048 features (i.e. `dict_mult = 1`) rather than all 16384 features. This is literally just to avoid OOMs; you can increase the `DICT_MULT` parameter up to 8 if you'd like. -->

We set up our autoencoder here. You can use your own autoencoder, as long as it has the same parameters `W_enc`, `W_dec`, `b_enc` and `b_dec` (used in the same way) and has a `cfg` attribute which itself is a dataclass with attributes `d_mlp` and `dict_mult`. The forward pass method doesn't matter; we only ever use the weights directly in this codebase.

In [None]:
# Load two autoencoders via `AutoEncoder.load_from_hf`, one per training run.
encoder = AutoEncoder.load_from_hf(version="run1")
encoder_B = AutoEncoder.load_from_hf(version="run2")

# List every parameter of the first encoder together with its shape.
for param_name, param in encoder.named_parameters():
    print(f"{param_name}: {tuple(param.shape)}")

## Models

The code below loads in our GELU-1l transformer model. You can create your transformer model any way you like; all that matters is that:

* Your model has a `forward` method which takes `tokens` and returns a tuple of `(logits, residual, post_activations)`.
* This forward method has a parameter `return_logits`, which is by default `True`, and when `False` it only returns `(residual, post_activations)`.

Provided this is the case, all other code here (including calculating the effect of ablating certain features) doesn't rely on any specific implementation details of the model.

If you're trying to use a particular model, we recommend **creating a wrapper class around your model which has an altered `forward` method** to match the required behaviour. In the case of this notebook, to make it clear that a `HookedTransformer` model is not necessary, we're using a `DemoTransformer` model (code in this repository), which is a very minimal version of the `HookedTransformer` model lacking features like hooks, caches, etc.

In [None]:
from transformer_lens import HookedTransformer

# Load the reference `HookedTransformer` for "gelu-1l" and take its tokenizer so
# that it can be exported as a pickle file.
model = HookedTransformer.from_pretrained("gelu-1l")
tokenizer = model.tokenizer

# Directory the tokenizer pickle is exported to. The default is the original
# author's local checkout; set the GELU_1L_SAE_DIR environment variable to
# export somewhere else instead of editing this cell.
path = os.environ.get(
    "GELU_1L_SAE_DIR",
    "C:/Users/calsm/Documents/AI Alignment/hf/gelu-1l-sae",
)

# save tokenizer as pkl
# Only export when the target directory exists, so that a fresh
# "Restart & Run All" on another machine does not die with FileNotFoundError.
if os.path.isdir(path):
    with open(os.path.join(path, "tokenizer.pkl"), "wb") as f:
        pickle.dump(tokenizer, f)
else:
    print(
        f"Skipping tokenizer export: directory {path!r} does not exist "
        "(set GELU_1L_SAE_DIR to an existing directory to enable it)."
    )

In [None]:
# Download the pickled tokenizer from the HuggingFace Hub repo and unpickle it.
# NOTE(review): `pickle.load` executes arbitrary code contained in the file, so
# this is only safe as long as REPO_ID is a repository you trust.
REPO_ID = "callummcdougall/gelu-1l"
tokenizer_path = hf_hub_download(repo_id=REPO_ID, filename="tokenizer.pkl")
with open(tokenizer_path, "rb") as f:
    tokenizer = pickle.load(f)

# Download the model's state dict from the same repo and load it onto `device`.
# NOTE(review): `torch.load` is also pickle-based; consider `weights_only=True`
# if the installed torch version supports it — confirm before changing.
weights_path = hf_hub_download(repo_id=REPO_ID, filename="gelu-1l-state-dict.pt")
state_dict = torch.load(weights_path, map_location=device)

# Create the config object for our model (not the tokenizer)
# (see model_fns.py for an explanation of this, and to understand the architecture)
cfg = DemoTransformerConfig(
    act_fn = 'gelu',
    d_head = 64,
    d_mlp = 2048,
    d_model = 512,
    d_vocab = 48262,
    n_ctx = 1024,
    n_heads = 8,
    n_layers = 1,
    device = device,
    dtype = torch.float32,
    normalization_type ='LNPre',
)

# Create our model, and load in the state dict.
# The return value of `load_state_dict` is bound to `_` so the cell shows no output.
model = DemoTransformer(cfg, tokenizer)
_ = model.load_state_dict(state_dict)

## Data

Obviously you can replace this code with your own data loading code. You should eventually have a 2D tensor of token ids.

In [None]:
# Passed as `max_length` to the tokenisation helper below.
SEQ_LEN = 128

# Fetch the training split of the dataset.
data = load_dataset("NeelNanda/c4-code-20k", split="train")

# Tokenise with the model's tokenizer, then shuffle the result.
# 42 is passed positionally to `.shuffle` — presumably the seed; confirm against
# the type returned by `tokenize_and_concatenate`.
tokenized_data = tokenize_and_concatenate(
    data, model.tokenizer, max_length=SEQ_LEN
).shuffle(42)

# Pull out the "tokens" column and report its shape.
all_tokens: torch.Tensor = tokenized_data["tokens"]

print(all_tokens.shape)

# Creating visualisations #1 (feature-centric)

First, we have a dataclass which contains all the relevant hyperparameters for creating our visualization. 

In [21]:
# Visualisation hyperparameters: select features 5-14 and turn on verbose mode.
feature_viz_params = FeatureVizParams(
    features=range(5, 15),
    verbose=True,
)
# Presumably prints a description of the available parameters — confirm in data_storing_fns.
feature_viz_params.help()

Next, we actually get the feature data. This should only take a few seconds, because we're only computing 10 features (indices 5–14, as selected by `features=range(5, 15)` above).

In [22]:
# Compute the visualisation data for the features selected in `feature_viz_params`,
# using both autoencoders, the model and the token tensor built above.
feature_data = get_feature_data(
    encoder=encoder,
    encoder_B=encoder_B,
    model=model,
    tokens=all_tokens,
    fvp=feature_viz_params,
)

                                                                       

Estimated time for all 16384 features = 14 minutes



Now, we generate the HTML. **The `webbrowser` command will not work for you in Colab; you'll need to manually download & open the HTML file from your Colab file storage.**

In [23]:
# Index into `feature_data` selecting which entry's visualisation is rendered below.
test_idx = 8
# File the generated HTML is written to (relative to the current working directory).
filepath = "feature_viz_demo.html"

In [27]:
# Build the HTML for `feature_data[test_idx]` and show it inline in the notebook.
html_str = feature_data[test_idx].get_html()

display(HTML(html_str))

# Write the file as UTF-8 explicitly. Without `encoding`, `open` uses the
# platform's locale encoding (e.g. cp1252 on Windows), which cannot represent
# every token string in the visualisation and raises UnicodeEncodeError.
with open(filepath, "w", encoding="utf-8") as f:
    f.write(html_str)

# Binding the return value to `result` keeps it out of the cell output.
result = webbrowser.open(filepath)

0,1,2
Index,Value,% of L1
1221,+0.41,0.7%
1884,+0.33,0.6%
1779,+0.30,0.5%

0,1,2
Index,Pearson Corr.,Cosine Sim.
1221,+0.16,+0.15
917,+0.12,+0.12
803,+0.11,+0.11

0,1,2
Index,Pearson Corr.,Cosine Sim.
3094,+0.87,+0.87
4814,+0.22,+0.22
14546,+0.22,+0.22

0,1
Number,-1.32
Forty,-1.17
ignon,-1.11
Poly,-1.1
Discussion,-1.1
Memory,-1.1
witz,-1.06
Standard,-1.05
ograf,-1.05
Cart,-1.01

0,1
'll,1.88
eding,1.56
certainly,1.52
hope,1.52
OULD,1.51
wouldn,1.48
cheon,1.44
definitely,1.4
seem,1.39
sincerely,1.36

0,1
Token,to
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,me
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,as
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,well
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,and
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,I
Feature activation,+1.971
Loss contribution,-0.000

0,1
Token,have
Feature activation,+0.294
Loss contribution,+0.162

0,1
Pos logit contributions,Pos logit contributions
'll,+1.670
eding,+1.407
OULD,+1.398
eled,+1.354
cheon,+1.242

0,1
Neg logit contributions,Neg logit contributions
Number,-3.069
Forty,-2.813
Discussion,-2.789
Memory,-2.777
Poly,-2.740

0,1
Token,yet
Feature activation,+0.000
Loss contribution,-0.017

0,1
Pos logit contributions,Pos logit contributions
'll,+0.415
eding,+0.332
OULD,+0.319
certainly,+0.318
hope,+0.317

0,1
Neg logit contributions,Neg logit contributions
Number,-0.432
Forty,-0.393
Discussion,-0.375
Poly,-0.374
ignon,-0.373

0,1
Token,to
Feature activation,+0.021
Loss contribution,-0.000

0,1
Token,have
Feature activation,+0.000
Loss contribution,-0.003

0,1
Pos logit contributions,Pos logit contributions
'll,+0.029
eding,+0.022
OULD,+0.022
wouldn,+0.020
certainly,+0.020

0,1
Neg logit contributions,Neg logit contributions
Number,-0.036
Forty,-0.033
Discussion,-0.032
Memory,-0.031
Poly,-0.031

0,1
Token,a
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,intense
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,w
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,agers
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,","
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,and
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,we
Feature activation,+1.876
Loss contribution,-0.000

0,1
Token,get
Feature activation,+0.168
Loss contribution,-0.303

0,1
Pos logit contributions,Pos logit contributions
'll,+1.833
OULD,+1.423
cheon,+1.378
certainly,+1.372
eding,+1.310

0,1
Neg logit contributions,Neg logit contributions
Number,-2.807
Forty,-2.636
Discussion,-2.549
Memory,-2.478
ograf,-2.441

0,1
Token,our
Feature activation,+0.000
Loss contribution,+0.006

0,1
Pos logit contributions,Pos logit contributions
'll,+0.237
eding,+0.194
certainly,+0.188
hope,+0.188
OULD,+0.187

0,1
Neg logit contributions,Neg logit contributions
Number,-0.203
Forty,-0.184
ignon,-0.173
Discussion,-0.173
Poly,-0.173

0,1
Token,vitality
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,from
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,con
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,working
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,for
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,me
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,perfectly
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,and
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,I
Feature activation,+1.870
Loss contribution,-0.000

0,1
Token,can
Feature activation,+0.127
Loss contribution,-0.377

0,1
Pos logit contributions,Pos logit contributions
'll,+1.638
eding,+1.431
OULD,+1.376
eled,+1.245
certainly,+1.242

0,1
Neg logit contributions,Neg logit contributions
Number,-2.813
Forty,-2.552
Memory,-2.454
Poly,-2.428
Import,-2.425

0,1
Token,trace
Feature activation,+0.000
Loss contribution,+0.002

0,1
Pos logit contributions,Pos logit contributions
'll,+0.151
eding,+0.124
OULD,+0.121
wouldn,+0.110
cheon,+0.107

0,1
Neg logit contributions,Neg logit contributions
Number,-0.202
Forty,-0.182
Poly,-0.177
Memory,-0.176
Discussion,-0.175

0,1
Token,the
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,payments
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,from
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,is
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,almost
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,done
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,","
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,and
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,we
Feature activation,+1.812
Loss contribution,-0.000

0,1
Token,got
Feature activation,+0.158
Loss contribution,+0.035

0,1
Pos logit contributions,Pos logit contributions
'll,+1.858
OULD,+1.456
cheon,+1.453
certainly,+1.325
eding,+1.300

0,1
Neg logit contributions,Neg logit contributions
Number,-2.664
Forty,-2.551
Memory,-2.405
Poly,-2.400
Discussion,-2.382

0,1
Token,a
Feature activation,+0.000
Loss contribution,+0.023

0,1
Pos logit contributions,Pos logit contributions
'll,+0.229
eding,+0.185
certainly,+0.180
hope,+0.179
OULD,+0.178

0,1
Neg logit contributions,Neg logit contributions
Number,-0.214
Forty,-0.194
ignon,-0.184
Poly,-0.183
Discussion,-0.183

0,1
Token,pool
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,fence
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,installed
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,and
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,sales
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,processes
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,","
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,and
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,we
Feature activation,+1.805
Loss contribution,-0.000

0,1
Token,tailor
Feature activation,+0.000
Loss contribution,+0.536

0,1
Pos logit contributions,Pos logit contributions
'll,+2.096
cheon,+1.769
OULD,+1.700
certainly,+1.651
eding,+1.603

0,1
Neg logit contributions,Neg logit contributions
Number,-2.943
Forty,-2.772
Discussion,-2.726
Memory,-2.672
Poly,-2.653

0,1
Token,service
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,models
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,to
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,your
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,0.0
Feature activation,0.0
Loss contribution,-0.0

0,1
Token,1.0
Feature activation,0.0
Loss contribution,-0.0

0,1
Token,6.0
Feature activation,0.0
Loss contribution,-0.0

0,1
Token,","
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,and
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,I
Feature activation,+1.800
Loss contribution,-0.000

0,1
Token,find
Feature activation,+0.043
Loss contribution,-0.523

0,1
Pos logit contributions,Pos logit contributions
'll,+1.926
eding,+1.507
certainly,+1.464
OULD,+1.463
hope,+1.398

0,1
Neg logit contributions,Neg logit contributions
Number,-2.177
Forty,-1.988
Poly,-1.885
Memory,-1.872
Discussion,-1.844

0,1
Token,the
Feature activation,+0.000
Loss contribution,+0.011

0,1
Pos logit contributions,Pos logit contributions
'll,+0.056
eding,+0.044
certainly,+0.042
hope,+0.042
OULD,+0.042

0,1
Neg logit contributions,Neg logit contributions
Number,-0.068
Forty,-0.062
ignon,-0.059
Poly,-0.059
Discussion,-0.059

0,1
Token,fact
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,that
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,El
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,our
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,residents
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,home
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,","
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,and
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,you
Feature activation,+1.733
Loss contribution,-0.000

0,1
Token,will
Feature activation,+0.286
Loss contribution,-0.397

0,1
Pos logit contributions,Pos logit contributions
'll,+1.697
cheon,+1.544
eding,+1.531
atas,+1.432
eled,+1.400

0,1
Neg logit contributions,Neg logit contributions
Number,-4.313
Forty,-4.003
Memory,-3.818
Poly,-3.760
Standard,-3.742

0,1
Token,have
Feature activation,+0.000
Loss contribution,+0.021

0,1
Pos logit contributions,Pos logit contributions
'll,+0.386
OULD,+0.320
eding,+0.316
cheon,+0.288
eled,+0.283

0,1
Neg logit contributions,Neg logit contributions
Number,-0.598
Forty,-0.562
Memory,-0.528
Poly,-0.520
Discussion,-0.506

0,1
Token,the
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,autonomy
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,to
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,it
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,will
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,be
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,organized
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,and
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,you
Feature activation,+1.713
Loss contribution,-0.000

0,1
Token,know
Feature activation,+0.000
Loss contribution,-0.176

0,1
Pos logit contributions,Pos logit contributions
'll,+1.875
eding,+1.635
cheon,+1.576
hope,+1.463
OULD,+1.460

0,1
Neg logit contributions,Neg logit contributions
Number,-2.772
Forty,-2.530
Discussion,-2.485
Poly,-2.449
Memory,-2.431

0,1
Token,exactly
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,what
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,is
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,in
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,either
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,be
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,repaired
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,","
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,or
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,we
Feature activation,+1.646
Loss contribution,-0.000

0,1
Token,âĢĻ
Feature activation,+0.291
Loss contribution,-0.577

0,1
Pos logit contributions,Pos logit contributions
'll,+1.753
OULD,+1.308
cheon,+1.302
certainly,+1.261
hope,+1.190

0,1
Neg logit contributions,Neg logit contributions
Number,-3.147
Forty,-2.975
Poly,-2.889
Discussion,-2.873
Memory,-2.866

0,1
Token,ll
Feature activation,+0.092
Loss contribution,-0.012

0,1
Pos logit contributions,Pos logit contributions
'll,+0.294
eding,+0.210
hope,+0.203
certainly,+0.202
OULD,+0.191

0,1
Neg logit contributions,Neg logit contributions
Number,-0.538
Forty,-0.495
Discussion,-0.479
ignon,-0.477
Memory,-0.477

0,1
Token,evaluate
Feature activation,+0.000
Loss contribution,-0.001

0,1
Pos logit contributions,Pos logit contributions
'll,+0.119
OULD,+0.093
eding,+0.092
cheon,+0.084
wouldn,+0.084

0,1
Neg logit contributions,Neg logit contributions
Number,-0.146
Forty,-0.133
Poly,-0.128
Memory,-0.126
Standard,-0.125

0,1
Token,if
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,the
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,they
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,are
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,so
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,cute
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,and
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,I
Feature activation,+1.630
Loss contribution,-0.000

0,1
Token,get
Feature activation,+0.090
Loss contribution,-0.161

0,1
Pos logit contributions,Pos logit contributions
'll,+1.422
OULD,+1.305
eding,+1.245
cheon,+1.133
abouts,+1.106

0,1
Neg logit contributions,Neg logit contributions
Number,-2.424
Forty,-2.235
Memory,-2.179
Poly,-2.172
Cart,-2.034

0,1
Token,lots
Feature activation,+0.000
Loss contribution,+0.009

0,1
Pos logit contributions,Pos logit contributions
'll,+0.134
eding,+0.109
OULD,+0.106
certainly,+0.106
hope,+0.103

0,1
Neg logit contributions,Neg logit contributions
Number,-0.122
Forty,-0.111
Memory,-0.104
Poly,-0.104
ignon,-0.103

0,1
Token,of
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,compl
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,iments
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,touched
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,by
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,their
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,humans
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,and
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,they
Feature activation,+1.629
Loss contribution,-0.000

0,1
Token,love
Feature activation,+0.000
Loss contribution,-0.135

0,1
Pos logit contributions,Pos logit contributions
'll,+1.498
cheon,+1.287
OULD,+1.236
eding,+1.217
certainly,+1.067

0,1
Neg logit contributions,Neg logit contributions
Number,-2.599
Forty,-2.419
Poly,-2.329
ignon,-2.326
Memory,-2.321

0,1
Token,massage
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,as
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,much
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,as
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,for
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,the
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,collar
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,less
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,and
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,I
Feature activation,+1.592
Loss contribution,-0.000

0,1
Token,love
Feature activation,+0.000
Loss contribution,-0.751

0,1
Pos logit contributions,Pos logit contributions
'll,+2.116
OULD,+1.788
eding,+1.783
certainly,+1.694
hope,+1.603

0,1
Neg logit contributions,Neg logit contributions
Number,-2.311
Forty,-2.117
Poly,-2.072
ignon,-1.966
Memory,-1.950

0,1
Token,it
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,!!
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,This
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,s
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,a
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,trunc
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,ate
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,","
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,and
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,we
Feature activation,+1.565
Loss contribution,-0.000

0,1
Token,just
Feature activation,+0.397
Loss contribution,-0.453

0,1
Pos logit contributions,Pos logit contributions
'll,+1.693
certainly,+1.394
hope,+1.355
eding,+1.345
cheon,+1.344

0,1
Neg logit contributions,Neg logit contributions
Number,-1.896
Discussion,-1.673
Forty,-1.631
witz,-1.626
Memory,-1.616

0,1
Token,want
Feature activation,+0.000
Loss contribution,-0.181

0,1
Pos logit contributions,Pos logit contributions
'll,+0.490
eding,+0.405
certainly,+0.389
OULD,+0.382
hope,+0.376

0,1
Neg logit contributions,Neg logit contributions
Number,-0.460
Forty,-0.408
Discussion,-0.386
Memory,-0.383
Poly,-0.379

0,1
Token,to
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,test
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,\n
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,Don
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,Ja
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,ime
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,I
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,and
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,you
Feature activation,+1.560
Loss contribution,-0.000

0,1
Token,will
Feature activation,+0.230
Loss contribution,-1.231

0,1
Pos logit contributions,Pos logit contributions
'll,+2.195
eding,+2.025
OULD,+1.890
certainly,+1.764
hope,+1.735

0,1
Neg logit contributions,Neg logit contributions
Number,-2.338
Forty,-2.156
Memory,-2.002
Discussion,-2.000
Poly,-1.966

0,1
Token,arrive
Feature activation,+0.000
Loss contribution,+0.055

0,1
Pos logit contributions,Pos logit contributions
'll,+0.302
eding,+0.245
OULD,+0.243
wouldn,+0.209
hope,+0.202

0,1
Neg logit contributions,Neg logit contributions
Number,-0.419
Forty,-0.390
Poly,-0.368
Memory,-0.368
ignon,-0.362

0,1
Token,in
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,Plaza
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,del
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,very
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,thick
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,beard
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,","
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,and
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,he
Feature activation,+1.543
Loss contribution,-0.000

0,1
Token,looked
Feature activation,+0.000
Loss contribution,+0.136

0,1
Pos logit contributions,Pos logit contributions
'll,+1.519
hope,+1.213
cheon,+1.169
OULD,+1.166
certainly,+1.082

0,1
Neg logit contributions,Neg logit contributions
Number,-2.109
Forty,-1.936
ignon,-1.936
Poly,-1.902
witz,-1.858

0,1
Token,approximately
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,forty
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,years
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,old
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,MPs
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,away
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,from
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,power
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,and
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,we
Feature activation,+1.501
Loss contribution,-0.000

0,1
Token,cannot
Feature activation,+0.000
Loss contribution,-0.274

0,1
Pos logit contributions,Pos logit contributions
'll,+1.537
OULD,+1.275
cheon,+1.230
eding,+1.164
certainly,+1.162

0,1
Neg logit contributions,Neg logit contributions
Number,-2.259
Forty,-2.105
Discussion,-2.046
Poly,-2.033
Memory,-1.991

0,1
Token,afford
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,to
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,have
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,the
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,1.0
Feature activation,0.0
Loss contribution,-0.0

0,1
Token,9.0
Feature activation,0.0
Loss contribution,-0.0

0,1
Token,4.0
Feature activation,0.0
Loss contribution,-0.0

0,1
Token,1.0
Feature activation,0.0
Loss contribution,-0.0

0,1
Token,and
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,he
Feature activation,+1.495
Loss contribution,-0.000

0,1
Token,registered
Feature activation,+0.000
Loss contribution,+0.187

0,1
Pos logit contributions,Pos logit contributions
'll,+1.734
hope,+1.426
cheon,+1.416
OULD,+1.408
eding,+1.393

0,1
Neg logit contributions,Neg logit contributions
Number,-2.118
Forty,-1.947
witz,-1.888
Poly,-1.868
ignon,-1.861

0,1
Token,to
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,vote
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,","
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,giving
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,roles
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,to
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,each
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,member
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,and
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,they
Feature activation,+1.468
Loss contribution,-0.000

0,1
Token,will
Feature activation,+0.130
Loss contribution,-0.907

0,1
Pos logit contributions,Pos logit contributions
'll,+1.773
eding,+1.535
cheon,+1.512
OULD,+1.460
hope,+1.439

0,1
Neg logit contributions,Neg logit contributions
Number,-2.385
Forty,-2.203
Discussion,-2.118
Memory,-2.099
Poly,-2.076

0,1
Token,be
Feature activation,+0.000
Loss contribution,+0.034

0,1
Pos logit contributions,Pos logit contributions
'll,+0.160
eding,+0.131
OULD,+0.126
cheon,+0.120
wouldn,+0.115

0,1
Neg logit contributions,Neg logit contributions
Number,-0.206
Forty,-0.191
Poly,-0.181
Memory,-0.181
Discussion,-0.178

0,1
Token,able
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,to
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,access
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,three
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,months
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,ago
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,","
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,and
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,you
Feature activation,+1.463
Loss contribution,-0.000

0,1
Token,now
Feature activation,+0.469
Loss contribution,+0.288

0,1
Pos logit contributions,Pos logit contributions
'll,+1.636
eding,+1.431
cheon,+1.402
OULD,+1.343
hope,+1.299

0,1
Neg logit contributions,Neg logit contributions
Number,-2.706
Forty,-2.494
Discussion,-2.409
Standard,-2.333
Memory,-2.302

0,1
Token,want
Feature activation,+0.000
Loss contribution,-0.159

0,1
Pos logit contributions,Pos logit contributions
'll,+0.605
eding,+0.526
cheon,+0.494
OULD,+0.486
certainly,+0.459

0,1
Neg logit contributions,Neg logit contributions
Number,-0.783
Forty,-0.715
Discussion,-0.679
Memory,-0.667
Standard,-0.660

0,1
Token,to
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,review
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,the
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,it
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,in
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,your
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,heart
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,and
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,you
Feature activation,+1.463
Loss contribution,-0.000

0,1
Token,don
Feature activation,+0.169
Loss contribution,+0.176

0,1
Pos logit contributions,Pos logit contributions
'll,+1.066
eding,+0.959
cheon,+0.828
OULD,+0.798
atas,+0.790

0,1
Neg logit contributions,Neg logit contributions
Number,-3.074
Forty,-2.832
Discussion,-2.765
Poly,-2.754
Memory,-2.737

0,1
Token,'t
Feature activation,+0.000
Loss contribution,-0.000

0,1
Pos logit contributions,Pos logit contributions
'll,+0.169
eding,+0.128
OULD,+0.124
hope,+0.121
certainly,+0.121

0,1
Neg logit contributions,Neg logit contributions
Number,-0.308
Forty,-0.287
Discussion,-0.280
Memory,-0.278
ignon,-0.278

0,1
Token,have
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,compassion
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,","
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,intense
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,w
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,agers
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,","
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,and
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,we
Feature activation,+1.876
Loss contribution,-0.000

0,1
Token,get
Feature activation,+0.168
Loss contribution,-0.303

0,1
Pos logit contributions,Pos logit contributions
'll,+1.833
OULD,+1.423
cheon,+1.378
certainly,+1.372
eding,+1.310

0,1
Neg logit contributions,Neg logit contributions
Number,-2.807
Forty,-2.636
Discussion,-2.549
Memory,-2.478
ograf,-2.441

0,1
Token,our
Feature activation,+0.000
Loss contribution,+0.006

0,1
Pos logit contributions,Pos logit contributions
'll,+0.237
eding,+0.194
certainly,+0.188
hope,+0.188
OULD,+0.187

0,1
Neg logit contributions,Neg logit contributions
Number,-0.203
Forty,-0.184
ignon,-0.173
Discussion,-0.173
Poly,-0.173

0,1
Token,vitality
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,from
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,con
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,to
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,me
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,as
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,well
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,and
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,I
Feature activation,+1.971
Loss contribution,-0.000

0,1
Token,have
Feature activation,+0.294
Loss contribution,+0.162

0,1
Pos logit contributions,Pos logit contributions
'll,+1.670
eding,+1.407
OULD,+1.398
eled,+1.354
cheon,+1.242

0,1
Neg logit contributions,Neg logit contributions
Number,-3.069
Forty,-2.813
Discussion,-2.789
Memory,-2.777
Poly,-2.740

0,1
Token,yet
Feature activation,+0.000
Loss contribution,-0.017

0,1
Pos logit contributions,Pos logit contributions
'll,+0.415
eding,+0.332
OULD,+0.319
certainly,+0.318
hope,+0.317

0,1
Neg logit contributions,Neg logit contributions
Number,-0.432
Forty,-0.393
Discussion,-0.375
Poly,-0.374
ignon,-0.373

0,1
Token,to
Feature activation,+0.021
Loss contribution,-0.000

0,1
Token,have
Feature activation,+0.000
Loss contribution,-0.003

0,1
Pos logit contributions,Pos logit contributions
'll,+0.029
eding,+0.022
OULD,+0.022
wouldn,+0.020
certainly,+0.020

0,1
Neg logit contributions,Neg logit contributions
Number,-0.036
Forty,-0.033
Discussion,-0.032
Memory,-0.031
Poly,-0.031

0,1
Token,a
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,working
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,for
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,me
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,perfectly
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,and
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,I
Feature activation,+1.870
Loss contribution,-0.000

0,1
Token,can
Feature activation,+0.127
Loss contribution,-0.377

0,1
Pos logit contributions,Pos logit contributions
'll,+1.638
eding,+1.431
OULD,+1.376
eled,+1.245
certainly,+1.242

0,1
Neg logit contributions,Neg logit contributions
Number,-2.813
Forty,-2.552
Memory,-2.454
Poly,-2.428
Import,-2.425

0,1
Token,trace
Feature activation,+0.000
Loss contribution,+0.002

0,1
Pos logit contributions,Pos logit contributions
'll,+0.151
eding,+0.124
OULD,+0.121
wouldn,+0.110
cheon,+0.107

0,1
Neg logit contributions,Neg logit contributions
Number,-0.202
Forty,-0.182
Poly,-0.177
Memory,-0.176
Discussion,-0.175

0,1
Token,the
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,payments
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,from
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,is
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,almost
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,done
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,","
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,and
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,we
Feature activation,+1.812
Loss contribution,-0.000

0,1
Token,got
Feature activation,+0.158
Loss contribution,+0.035

0,1
Pos logit contributions,Pos logit contributions
'll,+1.858
OULD,+1.456
cheon,+1.453
certainly,+1.325
eding,+1.300

0,1
Neg logit contributions,Neg logit contributions
Number,-2.664
Forty,-2.551
Memory,-2.405
Poly,-2.400
Discussion,-2.382

0,1
Token,a
Feature activation,+0.000
Loss contribution,+0.023

0,1
Pos logit contributions,Pos logit contributions
'll,+0.229
eding,+0.185
certainly,+0.180
hope,+0.179
OULD,+0.178

0,1
Neg logit contributions,Neg logit contributions
Number,-0.214
Forty,-0.194
ignon,-0.184
Poly,-0.183
Discussion,-0.183

0,1
Token,pool
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,fence
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,installed
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,and
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,sales
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,processes
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,","
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,and
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,we
Feature activation,+1.805
Loss contribution,-0.000

0,1
Token,tailor
Feature activation,+0.000
Loss contribution,+0.536

0,1
Pos logit contributions,Pos logit contributions
'll,+2.096
cheon,+1.769
OULD,+1.700
certainly,+1.651
eding,+1.603

0,1
Neg logit contributions,Neg logit contributions
Number,-2.943
Forty,-2.772
Discussion,-2.726
Memory,-2.672
Poly,-2.653

0,1
Token,service
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,models
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,to
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,your
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,they
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,are
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,so
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,cute
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,and
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,I
Feature activation,+1.630
Loss contribution,-0.000

0,1
Token,get
Feature activation,+0.090
Loss contribution,-0.161

0,1
Pos logit contributions,Pos logit contributions
'll,+1.422
OULD,+1.305
eding,+1.245
cheon,+1.133
abouts,+1.106

0,1
Neg logit contributions,Neg logit contributions
Number,-2.424
Forty,-2.235
Memory,-2.179
Poly,-2.172
Cart,-2.034

0,1
Token,lots
Feature activation,+0.000
Loss contribution,+0.009

0,1
Pos logit contributions,Pos logit contributions
'll,+0.134
eding,+0.109
OULD,+0.106
certainly,+0.106
hope,+0.103

0,1
Neg logit contributions,Neg logit contributions
Number,-0.122
Forty,-0.111
Memory,-0.104
Poly,-0.104
ignon,-0.103

0,1
Token,of
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,compl
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,iments
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,it
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,will
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,be
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,organized
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,and
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,you
Feature activation,+1.713
Loss contribution,-0.000

0,1
Token,know
Feature activation,+0.000
Loss contribution,-0.176

0,1
Pos logit contributions,Pos logit contributions
'll,+1.875
eding,+1.635
cheon,+1.576
hope,+1.463
OULD,+1.460

0,1
Neg logit contributions,Neg logit contributions
Number,-2.772
Forty,-2.530
Discussion,-2.485
Poly,-2.449
Memory,-2.431

0,1
Token,exactly
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,what
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,is
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,in
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,our
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,residents
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,home
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,","
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,and
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,you
Feature activation,+1.733
Loss contribution,-0.000

0,1
Token,will
Feature activation,+0.286
Loss contribution,-0.397

0,1
Pos logit contributions,Pos logit contributions
'll,+1.697
cheon,+1.544
eding,+1.531
atas,+1.432
eled,+1.400

0,1
Neg logit contributions,Neg logit contributions
Number,-4.313
Forty,-4.003
Memory,-3.818
Poly,-3.760
Standard,-3.742

0,1
Token,have
Feature activation,+0.000
Loss contribution,+0.021

0,1
Pos logit contributions,Pos logit contributions
'll,+0.386
OULD,+0.320
eding,+0.316
cheon,+0.288
eled,+0.283

0,1
Neg logit contributions,Neg logit contributions
Number,-0.598
Forty,-0.562
Memory,-0.528
Poly,-0.520
Discussion,-0.506

0,1
Token,the
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,autonomy
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,to
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,either
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,be
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,repaired
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,","
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,or
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,we
Feature activation,+1.646
Loss contribution,-0.000

0,1
Token,âĢĻ
Feature activation,+0.291
Loss contribution,-0.577

0,1
Pos logit contributions,Pos logit contributions
'll,+1.753
OULD,+1.308
cheon,+1.302
certainly,+1.261
hope,+1.190

0,1
Neg logit contributions,Neg logit contributions
Number,-3.147
Forty,-2.975
Poly,-2.889
Discussion,-2.873
Memory,-2.866

0,1
Token,ll
Feature activation,+0.092
Loss contribution,-0.012

0,1
Pos logit contributions,Pos logit contributions
'll,+0.294
eding,+0.210
hope,+0.203
certainly,+0.202
OULD,+0.191

0,1
Neg logit contributions,Neg logit contributions
Number,-0.538
Forty,-0.495
Discussion,-0.479
ignon,-0.477
Memory,-0.477

0,1
Token,evaluate
Feature activation,+0.000
Loss contribution,-0.001

0,1
Pos logit contributions,Pos logit contributions
'll,+0.119
OULD,+0.093
eding,+0.092
cheon,+0.084
wouldn,+0.084

0,1
Neg logit contributions,Neg logit contributions
Number,-0.146
Forty,-0.133
Poly,-0.128
Memory,-0.126
Standard,-0.125

0,1
Token,if
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,the
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,touched
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,by
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,their
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,humans
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,and
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,they
Feature activation,+1.629
Loss contribution,-0.000

0,1
Token,love
Feature activation,+0.000
Loss contribution,-0.135

0,1
Pos logit contributions,Pos logit contributions
'll,+1.498
cheon,+1.287
OULD,+1.236
eding,+1.217
certainly,+1.067

0,1
Neg logit contributions,Neg logit contributions
Number,-2.599
Forty,-2.419
Poly,-2.329
ignon,-2.326
Memory,-2.321

0,1
Token,massage
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,as
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,much
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,as
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,Don
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,Ja
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,ime
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,I
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,and
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,you
Feature activation,+1.560
Loss contribution,-0.000

0,1
Token,will
Feature activation,+0.230
Loss contribution,-1.231

0,1
Pos logit contributions,Pos logit contributions
'll,+2.195
eding,+2.025
OULD,+1.890
certainly,+1.764
hope,+1.735

0,1
Neg logit contributions,Neg logit contributions
Number,-2.338
Forty,-2.156
Memory,-2.002
Discussion,-2.000
Poly,-1.966

0,1
Token,arrive
Feature activation,+0.000
Loss contribution,+0.055

0,1
Pos logit contributions,Pos logit contributions
'll,+0.302
eding,+0.245
OULD,+0.243
wouldn,+0.209
hope,+0.202

0,1
Neg logit contributions,Neg logit contributions
Number,-0.419
Forty,-0.390
Poly,-0.368
Memory,-0.368
ignon,-0.362

0,1
Token,in
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,Plaza
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,del
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,three
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,months
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,ago
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,","
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,and
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,you
Feature activation,+1.463
Loss contribution,-0.000

0,1
Token,now
Feature activation,+0.469
Loss contribution,+0.288

0,1
Pos logit contributions,Pos logit contributions
'll,+1.636
eding,+1.431
cheon,+1.402
OULD,+1.343
hope,+1.299

0,1
Neg logit contributions,Neg logit contributions
Number,-2.706
Forty,-2.494
Discussion,-2.409
Standard,-2.333
Memory,-2.302

0,1
Token,want
Feature activation,+0.000
Loss contribution,-0.159

0,1
Pos logit contributions,Pos logit contributions
'll,+0.605
eding,+0.526
cheon,+0.494
OULD,+0.486
certainly,+0.459

0,1
Neg logit contributions,Neg logit contributions
Number,-0.783
Forty,-0.715
Discussion,-0.679
Memory,-0.667
Standard,-0.660

0,1
Token,to
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,review
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,the
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,stack
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,","
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,addressing
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,","
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,and
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,I
Feature activation,+1.448
Loss contribution,-0.000

0,1
Token,/
Feature activation,+0.344
Loss contribution,-0.042

0,1
Pos logit contributions,Pos logit contributions
'll,+1.960
eding,+1.614
OULD,+1.608
certainly,+1.598
hope,+1.554

0,1
Neg logit contributions,Neg logit contributions
Number,-2.028
Memory,-1.806
Forty,-1.790
Discussion,-1.730
witz,-1.699

0,1
Token,O
Feature activation,+0.000
Loss contribution,-0.030

0,1
Pos logit contributions,Pos logit contributions
'll,+0.644
eding,+0.524
hope,+0.519
certainly,+0.519
wouldn,+0.505

0,1
Neg logit contributions,Neg logit contributions
Number,-0.497
Forty,-0.444
Discussion,-0.429
Memory,-0.427
Poly,-0.419

0,1
Token,facilities
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,are
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,available
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,a
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,trunc
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,ate
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,","
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,and
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,we
Feature activation,+1.565
Loss contribution,-0.000

0,1
Token,just
Feature activation,+0.397
Loss contribution,-0.453

0,1
Pos logit contributions,Pos logit contributions
'll,+1.693
certainly,+1.394
hope,+1.355
eding,+1.345
cheon,+1.344

0,1
Neg logit contributions,Neg logit contributions
Number,-1.896
Discussion,-1.673
Forty,-1.631
witz,-1.626
Memory,-1.616

0,1
Token,want
Feature activation,+0.000
Loss contribution,-0.181

0,1
Pos logit contributions,Pos logit contributions
'll,+0.490
eding,+0.405
certainly,+0.389
OULD,+0.382
hope,+0.376

0,1
Neg logit contributions,Neg logit contributions
Number,-0.460
Forty,-0.408
Discussion,-0.386
Memory,-0.383
Poly,-0.379

0,1
Token,to
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,test
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,\n
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,MPs
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,away
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,from
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,power
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,and
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,we
Feature activation,+1.501
Loss contribution,-0.000

0,1
Token,cannot
Feature activation,+0.000
Loss contribution,-0.274

0,1
Pos logit contributions,Pos logit contributions
'll,+1.537
OULD,+1.275
cheon,+1.230
eding,+1.164
certainly,+1.162

0,1
Neg logit contributions,Neg logit contributions
Number,-2.259
Forty,-2.105
Discussion,-2.046
Poly,-2.033
Memory,-1.991

0,1
Token,afford
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,to
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,have
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,the
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,my
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,company
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,completely
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,fails
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,and
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,I
Feature activation,+1.359
Loss contribution,-0.000

0,1
Token,fall
Feature activation,+0.000
Loss contribution,-0.314

0,1
Pos logit contributions,Pos logit contributions
'll,+1.237
eding,+1.028
OULD,+0.977
eled,+0.960
cheon,+0.944

0,1
Neg logit contributions,Neg logit contributions
Number,-2.201
Forty,-2.060
Memory,-1.982
Discussion,-1.927
Poly,-1.919

0,1
Token,flat
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,on
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,my
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,face
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,spirits
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,are
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,dece
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,ivers
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,and
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,they
Feature activation,+1.292
Loss contribution,-0.000

0,1
Token,cannot
Feature activation,+0.025
Loss contribution,-0.344

0,1
Pos logit contributions,Pos logit contributions
'll,+1.353
OULD,+1.089
eding,+1.080
cheon,+1.042
hope,+1.025

0,1
Neg logit contributions,Neg logit contributions
Number,-1.605
Forty,-1.487
Poly,-1.436
Discussion,-1.400
witz,-1.398

0,1
Token,be
Feature activation,+0.000
Loss contribution,+0.004

0,1
Pos logit contributions,Pos logit contributions
'll,+0.028
OULD,+0.023
eding,+0.023
wouldn,+0.020
cheon,+0.020

0,1
Neg logit contributions,Neg logit contributions
Number,-0.031
Forty,-0.029
Poly,-0.028
ignon,-0.028
Memory,-0.027

0,1
Token,trusted
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,.
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,This
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,to
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,make
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,a
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,splash
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,and
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,he
Feature activation,+1.345
Loss contribution,-0.000

0,1
Token,âĢĻ
Feature activation,+0.515
Loss contribution,-0.687

0,1
Pos logit contributions,Pos logit contributions
'll,+1.517
cheon,+1.247
hope,+1.226
OULD,+1.160
eding,+1.147

0,1
Neg logit contributions,Neg logit contributions
Number,-1.877
Forty,-1.676
Poly,-1.675
witz,-1.641
Discussion,-1.600

0,1
Token,s
Feature activation,+0.254
Loss contribution,+0.269

0,1
Pos logit contributions,Pos logit contributions
'll,+0.631
eding,+0.477
hope,+0.476
certainly,+0.455
OULD,+0.445

0,1
Neg logit contributions,Neg logit contributions
Number,-0.858
Forty,-0.771
ignon,-0.766
witz,-0.751
Poly,-0.744

0,1
Token,playing
Feature activation,+0.000
Loss contribution,+0.065

0,1
Pos logit contributions,Pos logit contributions
'll,+0.390
eding,+0.317
OULD,+0.304
hope,+0.301
certainly,+0.298

0,1
Neg logit contributions,Neg logit contributions
Number,-0.360
Forty,-0.324
Poly,-0.308
ignon,-0.304
Discussion,-0.303

0,1
Token,a
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,risky
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,1.0
Feature activation,0.0
Loss contribution,-0.0

0,1
Token,9.0
Feature activation,0.0
Loss contribution,-0.0

0,1
Token,4.0
Feature activation,0.0
Loss contribution,-0.0

0,1
Token,0.0
Feature activation,0.0
Loss contribution,-0.0

0,1
Token,and
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,she
Feature activation,+1.140
Loss contribution,-0.000

0,1
Token,registered
Feature activation,+0.000
Loss contribution,+0.054

0,1
Pos logit contributions,Pos logit contributions
'll,+2.532
hope,+2.071
eding,+2.012
certainly,+1.977
OULD,+1.930

0,1
Neg logit contributions,Neg logit contributions
Number,-2.350
Forty,-2.079
ignon,-2.048
Poly,-1.996
Memory,-1.936

0,1
Token,to
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,vote
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,","
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,giving
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,1.0
Feature activation,0.0
Loss contribution,-0.0

0,1
Token,9.0
Feature activation,0.0
Loss contribution,-0.0

0,1
Token,9.0
Feature activation,0.0
Loss contribution,-0.0

0,1
Token,6.0
Feature activation,0.0
Loss contribution,-0.0

0,1
Token,and
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,he
Feature activation,+1.107
Loss contribution,-0.000

0,1
Token,registered
Feature activation,+0.000
Loss contribution,+0.025

0,1
Pos logit contributions,Pos logit contributions
'll,+2.557
hope,+2.084
certainly,+1.998
eding,+1.994
cheon,+1.973

0,1
Neg logit contributions,Neg logit contributions
Number,-2.299
Forty,-2.071
ignon,-2.054
Poly,-1.969
witz,-1.914

0,1
Token,to
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,vote
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,","
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,giving
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,<|BOS|>
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,in
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,is
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,True
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,and
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,it
Feature activation,+1.156
Loss contribution,-0.000

0,1
Token,is
Feature activation,+0.000
Loss contribution,-0.005

0,1
Pos logit contributions,Pos logit contributions
'll,+1.301
eding,+1.141
hope,+1.113
OULD,+1.046
cheon,+1.019

0,1
Neg logit contributions,Neg logit contributions
Number,-1.639
Forty,-1.493
Discussion,-1.477
Poly,-1.461
Memory,-1.437

0,1
Token,the
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,same
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,user
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,name
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,revenues
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,and
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,margins
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,âĢĶ
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,and
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,it
Feature activation,+1.051
Loss contribution,-0.000

0,1
Token,âĢĻ
Feature activation,+0.342
Loss contribution,-0.630

0,1
Pos logit contributions,Pos logit contributions
'll,+1.408
hope,+1.160
eding,+1.139
OULD,+1.102
cheon,+1.048

0,1
Neg logit contributions,Neg logit contributions
Number,-1.560
Forty,-1.452
Poly,-1.416
Discussion,-1.368
Standard,-1.344

0,1
Token,s
Feature activation,+0.246
Loss contribution,+0.017

0,1
Pos logit contributions,Pos logit contributions
'll,+0.573
eding,+0.463
hope,+0.460
OULD,+0.452
certainly,+0.452

0,1
Neg logit contributions,Neg logit contributions
Number,-0.442
Forty,-0.396
ignon,-0.381
Poly,-0.371
Discussion,-0.371

0,1
Token,applicable
Feature activation,+0.000
Loss contribution,+0.010

0,1
Pos logit contributions,Pos logit contributions
'll,+0.395
eding,+0.323
OULD,+0.311
hope,+0.311
certainly,+0.308

0,1
Neg logit contributions,Neg logit contributions
Number,-0.322
Forty,-0.291
Poly,-0.273
Discussion,-0.272
ignon,-0.271

0,1
Token,in
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,a
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,ci
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,inside
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,it
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,","
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,and
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,it
Feature activation,+1.082
Loss contribution,-0.000

0,1
Token,was
Feature activation,+0.072
Loss contribution,-0.189

0,1
Pos logit contributions,Pos logit contributions
'll,+1.155
eding,+1.005
hope,+0.989
OULD,+0.946
cheon,+0.877

0,1
Neg logit contributions,Neg logit contributions
Number,-1.480
Forty,-1.381
Poly,-1.348
Discussion,-1.345
Memory,-1.310

0,1
Token,recently
Feature activation,+0.000
Loss contribution,-0.016

0,1
Pos logit contributions,Pos logit contributions
'll,+0.110
eding,+0.090
hope,+0.086
OULD,+0.086
certainly,+0.085

0,1
Neg logit contributions,Neg logit contributions
Number,-0.099
Forty,-0.090
Poly,-0.085
Discussion,-0.085
Memory,-0.085

0,1
Token,used
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,as
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,a
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,1.0
Feature activation,0.0
Loss contribution,-0.0

0,1
Token,9.0
Feature activation,0.0
Loss contribution,-0.0

0,1
Token,7.0
Feature activation,0.0
Loss contribution,-0.0

0,1
Token,7.0
Feature activation,0.0
Loss contribution,-0.0

0,1
Token,and
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,he
Feature activation,+0.977
Loss contribution,-0.000

0,1
Token,registered
Feature activation,+0.000
Loss contribution,+0.011

0,1
Pos logit contributions,Pos logit contributions
'll,+2.316
hope,+1.879
eding,+1.816
cheon,+1.810
certainly,+1.809

0,1
Neg logit contributions,Neg logit contributions
Number,-2.004
Forty,-1.798
ignon,-1.791
Poly,-1.716
witz,-1.653

0,1
Token,to
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,vote
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,","
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,giving
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,meets
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,the
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,new
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,one
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,and
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,it
Feature activation,+0.901
Loss contribution,-0.000

0,1
Token,âĢĻ
Feature activation,+0.254
Loss contribution,-0.458

0,1
Pos logit contributions,Pos logit contributions
'll,+0.960
eding,+0.864
hope,+0.842
OULD,+0.833
cheon,+0.741

0,1
Neg logit contributions,Neg logit contributions
Number,-1.285
Forty,-1.213
Discussion,-1.161
Poly,-1.161
Memory,-1.119

0,1
Token,s
Feature activation,+0.033
Loss contribution,+0.103

0,1
Pos logit contributions,Pos logit contributions
'll,+0.328
eding,+0.256
hope,+0.253
OULD,+0.244
certainly,+0.242

0,1
Neg logit contributions,Neg logit contributions
Number,-0.407
Forty,-0.378
ignon,-0.370
Poly,-0.358
Discussion,-0.358

0,1
Token,the
Feature activation,+0.000
Loss contribution,-0.001

0,1
Pos logit contributions,Pos logit contributions
'll,+0.052
eding,+0.043
hope,+0.041
OULD,+0.041
certainly,+0.041

0,1
Neg logit contributions,Neg logit contributions
Number,-0.043
Forty,-0.039
Poly,-0.036
ignon,-0.036
Memory,-0.036

0,1
Token,hub
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,of
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,itt
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,ad
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,ini
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,","
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,and
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,it
Feature activation,+0.962
Loss contribution,-0.000

0,1
Token,is
Feature activation,+0.000
Loss contribution,+0.004

0,1
Pos logit contributions,Pos logit contributions
'll,+1.200
eding,+0.969
hope,+0.963
OULD,+0.946
cheon,+0.868

0,1
Neg logit contributions,Neg logit contributions
Number,-1.461
Forty,-1.385
Poly,-1.333
Discussion,-1.311
ignon,-1.282

0,1
Token,currently
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,owned
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,by
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,She
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,<|BOS|>
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,is
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,the
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,same
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,but
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,I
Feature activation,+0.840
Loss contribution,-0.000

0,1
Token,think
Feature activation,+0.000
Loss contribution,-0.197

0,1
Pos logit contributions,Pos logit contributions
'll,+0.627
eding,+0.527
OULD,+0.498
eled,+0.464
cheon,+0.460

0,1
Neg logit contributions,Neg logit contributions
Number,-1.169
Forty,-1.069
Memory,-1.051
Poly,-1.042
Discussion,-1.020

0,1
Token,this
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,is
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,quite
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,clever
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,s
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,H
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,ats
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,âĢĿ
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,and
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,it
Feature activation,+0.889
Loss contribution,-0.000

0,1
Token,felt
Feature activation,+0.000
Loss contribution,-0.197

0,1
Pos logit contributions,Pos logit contributions
'll,+1.072
eding,+0.845
hope,+0.806
OULD,+0.788
cheon,+0.724

0,1
Neg logit contributions,Neg logit contributions
Number,-1.424
Forty,-1.333
Discussion,-1.295
Poly,-1.259
Memory,-1.237

0,1
Token,GOOD
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,to
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,be
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,on
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,/
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,CF
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,S
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,","
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,and
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,someone
Feature activation,+0.626
Loss contribution,-0.000

0,1
Token,asked
Feature activation,+0.000
Loss contribution,-0.007

0,1
Pos logit contributions,Pos logit contributions
'll,+0.674
eding,+0.512
hope,+0.485
OULD,+0.478
cheon,+0.470

0,1
Neg logit contributions,Neg logit contributions
Number,-0.962
Forty,-0.888
Discussion,-0.846
Memory,-0.843
Poly,-0.839

0,1
Token,what
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,I
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,wanted
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,for
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,possible
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,comments
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,array
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,","
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,but
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,we
Feature activation,+0.619
Loss contribution,-0.000

0,1
Token,haven
Feature activation,+0.000
Loss contribution,+0.053

0,1
Pos logit contributions,Pos logit contributions
'll,+0.303
cheon,+0.253
eled,+0.236
eding,+0.230
OULD,+0.218

0,1
Neg logit contributions,Neg logit contributions
Number,-1.059
Forty,-0.992
Discussion,-0.970
Poly,-0.960
Memory,-0.960

0,1
Token,'t
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,defined
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,how
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,to
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,me
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,the
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,interpretation
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,;
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,but
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,you
Feature activation,+0.655
Loss contribution,-0.000

0,1
Token,are
Feature activation,+0.000
Loss contribution,-0.225

0,1
Pos logit contributions,Pos logit contributions
'll,+0.618
eding,+0.502
cheon,+0.471
hope,+0.454
OULD,+0.452

0,1
Neg logit contributions,Neg logit contributions
Number,-0.888
Forty,-0.804
witz,-0.781
Poly,-0.771
Discussion,-0.771

0,1
Token,able
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,","
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,for
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,the
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,of
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,toy
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,materials
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,","
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,and
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,it
Feature activation,+0.688
Loss contribution,-0.000

0,1
Token,won
Feature activation,+0.000
Loss contribution,-0.364

0,1
Pos logit contributions,Pos logit contributions
'll,+0.911
hope,+0.787
eding,+0.777
OULD,+0.768
cheon,+0.716

0,1
Neg logit contributions,Neg logit contributions
Number,-1.052
Forty,-1.000
Poly,-0.997
Memory,-0.947
Discussion,-0.934

0,1
Token,âĢĻ
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,t
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,stain
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,your
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,erm
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,a
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,ids
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,","
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,but
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,you
Feature activation,+0.638
Loss contribution,-0.000

0,1
Token,find
Feature activation,+0.000
Loss contribution,-0.022

0,1
Pos logit contributions,Pos logit contributions
'll,+0.537
eding,+0.468
hope,+0.428
cheon,+0.416
certainly,+0.391

0,1
Neg logit contributions,Neg logit contributions
Number,-1.018
Forty,-0.910
Memory,-0.895
Discussion,-0.892
Poly,-0.883

0,1
Token,a
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,small
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,fortune
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,""""
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,trunc
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,ate
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,","
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,and
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,we
Feature activation,+1.565
Loss contribution,-0.000

0,1
Token,just
Feature activation,+0.397
Loss contribution,-0.453

0,1
Pos logit contributions,Pos logit contributions
'll,+1.693
certainly,+1.394
hope,+1.355
eding,+1.345
cheon,+1.344

0,1
Neg logit contributions,Neg logit contributions
Number,-1.896
Discussion,-1.673
Forty,-1.631
witz,-1.626
Memory,-1.616

0,1
Token,want
Feature activation,+0.000
Loss contribution,-0.181

0,1
Pos logit contributions,Pos logit contributions
'll,+0.490
eding,+0.405
certainly,+0.389
OULD,+0.382
hope,+0.376

0,1
Neg logit contributions,Neg logit contributions
Number,-0.460
Forty,-0.408
Discussion,-0.386
Memory,-0.383
Poly,-0.379

0,1
Token,to
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,test
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,\n
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,#
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,serving
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,the
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,business
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,partners
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,and
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,customers
Feature activation,+0.478
Loss contribution,-0.000

0,1
Token,.
Feature activation,+0.000
Loss contribution,+0.049

0,1
Pos logit contributions,Pos logit contributions
'll,+0.750
eding,+0.603
OULD,+0.582
hope,+0.581
certainly,+0.580

0,1
Neg logit contributions,Neg logit contributions
Number,-0.761
Forty,-0.691
Poly,-0.660
ignon,-0.659
Memory,-0.656

0,1
Token,There
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,are
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,numerous
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,corporate
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,same
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,products
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,online
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,","
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,then
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,I
Feature activation,+0.474
Loss contribution,-0.000

0,1
Token,google
Feature activation,+0.000
Loss contribution,+0.286

0,1
Pos logit contributions,Pos logit contributions
'll,+0.468
eding,+0.405
OULD,+0.395
eled,+0.365
certainly,+0.355

0,1
Neg logit contributions,Neg logit contributions
Number,-0.719
Forty,-0.672
Memory,-0.635
Discussion,-0.633
Poly,-0.632

0,1
Token,for
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,prom
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,oc
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,odes
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,the
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,job
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,","
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,and
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,he
Feature activation,+1.392
Loss contribution,-0.000

0,1
Token,âĢĻ
Feature activation,+0.488
Loss contribution,-0.614

0,1
Pos logit contributions,Pos logit contributions
'll,+1.558
cheon,+1.291
atas,+1.233
hope,+1.209
OULD,+1.191

0,1
Neg logit contributions,Neg logit contributions
Number,-1.848
Forty,-1.653
Poly,-1.633
witz,-1.615
Discussion,-1.609

0,1
Token,ll
Feature activation,+0.203
Loss contribution,-0.051

0,1
Pos logit contributions,Pos logit contributions
'll,+0.576
hope,+0.430
eding,+0.422
certainly,+0.409
OULD,+0.399

0,1
Neg logit contributions,Neg logit contributions
Number,-0.797
ignon,-0.717
Forty,-0.716
witz,-0.706
Poly,-0.685

0,1
Token,be
Feature activation,+0.000
Loss contribution,+0.066

0,1
Pos logit contributions,Pos logit contributions
'll,+0.234
eding,+0.182
OULD,+0.173
cheon,+0.163
hope,+0.155

0,1
Neg logit contributions,Neg logit contributions
Number,-0.316
Forty,-0.282
ieurs,-0.272
Poly,-0.271
witz,-0.268

0,1
Token,fired
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,if
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,he
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,you
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,hear
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,bats
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,","
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,but
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,you
Feature activation,+0.548
Loss contribution,-0.000

0,1
Token,also
Feature activation,+0.119
Loss contribution,-0.123

0,1
Pos logit contributions,Pos logit contributions
'll,+0.454
eding,+0.398
cheon,+0.384
hope,+0.355
OULD,+0.328

0,1
Neg logit contributions,Neg logit contributions
Number,-0.859
Forty,-0.777
witz,-0.758
Memory,-0.751
Discussion,-0.737

0,1
Token,think
Feature activation,+0.000
Loss contribution,-0.078

0,1
Pos logit contributions,Pos logit contributions
'll,+0.157
eding,+0.133
certainly,+0.120
hope,+0.120
cheon,+0.120

0,1
Neg logit contributions,Neg logit contributions
Number,-0.171
Forty,-0.156
Memory,-0.148
ieurs,-0.147
Discussion,-0.146

0,1
Token,you
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,might
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,be
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,m
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,ire
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,and
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,clay
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,;
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,he
Feature activation,+0.394
Loss contribution,-0.000

0,1
Token,set
Feature activation,+0.000
Loss contribution,+0.021

0,1
Pos logit contributions,Pos logit contributions
'll,+0.405
OULD,+0.338
hope,+0.332
cheon,+0.322
certainly,+0.312

0,1
Neg logit contributions,Neg logit contributions
Number,-0.479
ignon,-0.445
Forty,-0.443
witz,-0.438
Poly,-0.432

0,1
Token,my
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,feet
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,upon
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,a
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,people
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,love
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,him
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,","
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,while
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,it
Feature activation,+0.235
Loss contribution,-0.000

0,1
Token,seems
Feature activation,+0.000
Loss contribution,-0.027

0,1
Pos logit contributions,Pos logit contributions
'll,+0.290
OULD,+0.252
hope,+0.245
eding,+0.243
cheon,+0.228

0,1
Neg logit contributions,Neg logit contributions
Number,-0.360
Forty,-0.337
Poly,-0.328
Discussion,-0.316
witz,-0.307

0,1
Token,almost
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,as
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,many
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,det
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,home
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,","
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,duration
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,exceeded
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,or
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,there
Feature activation,+0.229
Loss contribution,-0.000

0,1
Token,is
Feature activation,+0.000
Loss contribution,+0.056

0,1
Pos logit contributions,Pos logit contributions
'll,+0.195
eding,+0.141
hope,+0.134
OULD,+0.132
certainly,+0.131

0,1
Neg logit contributions,Neg logit contributions
Number,-0.408
Forty,-0.376
Discussion,-0.363
Memory,-0.360
ignon,-0.360

0,1
Token,no
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,Battery
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,""")"
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,#
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,with
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,or
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,affects
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,","
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,and
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,which
Feature activation,+0.346
Loss contribution,-0.000

0,1
Token,require
Feature activation,+0.000
Loss contribution,-0.108

0,1
Pos logit contributions,Pos logit contributions
'll,+0.548
eding,+0.446
hope,+0.431
certainly,+0.428
OULD,+0.427

0,1
Neg logit contributions,Neg logit contributions
Number,-0.498
Forty,-0.448
Discussion,-0.428
Memory,-0.426
Poly,-0.425

0,1
Token,thought
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,","
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,del
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,ib
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,):
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,\n
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,#
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,
Feature activation,0.0
Loss contribution,-0.0

0,1
Token,""""""""
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,It
Feature activation,+0.237
Loss contribution,-0.000

0,1
Token,will
Feature activation,+0.000
Loss contribution,-0.181

0,1
Pos logit contributions,Pos logit contributions
'll,+0.301
hope,+0.248
eding,+0.246
certainly,+0.239
OULD,+0.236

0,1
Neg logit contributions,Neg logit contributions
Number,-0.298
Forty,-0.265
Discussion,-0.264
witz,-0.256
ignon,-0.256

0,1
Token,not
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,allow
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,birth
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,dates
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,\n
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,f
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,1.0
Feature activation,0.0
Loss contribution,-0.0

0,1
Token,.
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,create
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,Dim
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,ension
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,('
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,eta
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,_
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,v
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,oe
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,ev
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,oking
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,le
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,et
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,Al
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,p
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,raz
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,ol
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,am
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,Order
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,(
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,""">"
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,I
Feature activation,+0.028
Loss contribution,-0.000

0,1
Token,""","
Feature activation,+0.000
Loss contribution,+0.000

0,1
Pos logit contributions,Pos logit contributions
'll,+0.046
eding,+0.039
certainly,+0.038
OULD,+0.038
hope,+0.038

0,1
Neg logit contributions,Neg logit contributions
Number,-0.034
Forty,-0.029
Memory,-0.028
Poly,-0.028
Discussion,-0.028

0,1
Token,b
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,"""\\"
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,x
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,0.0
Feature activation,0.0
Loss contribution,-0.0

0,1
Token,0.0
Feature activation,0.0
Loss contribution,-0.0

0,1
Token,""""
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,+
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,args
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,.
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,ud
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,p
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,:
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,\nĉĉ
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,ĉ
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,client
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,=
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,socket
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,.
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,'
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,c
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,++
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,'.
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,\n
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,'-
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,x
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,"',"
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,\n
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,'
Feature activation,+0.000
Loss contribution,-0.000

0,1
Token,c
Feature activation,+0.000
Loss contribution,-0.000


You can also generate smaller plots. If you don't care about getting the sequences in the activation quantiles (which are the things that take the most time to generate), you can pass `n_groups=0` into the `FeatureVizParams` dataclass. This roughly halves the time taken to generate the visualisation.

In [None]:
# n_groups=0 drops the activation-quantile sequence groups from the visualisation,
# which are the most time-consuming part to generate.
feature_viz_params = FeatureVizParams(n_groups=0)

# Regenerate the feature data with these lighter-weight settings.
feature_data = get_feature_data(
    encoder=encoder,
    encoder_B=encoder_B,
    model=model,
    tokens=all_tokens,
    fvp=feature_viz_params,
)

In [None]:
# Render the test feature with an explicit width of 320 and show it inline.
html_str = feature_data.feature_data_dict[test_idx].get_html(width=320)
display(HTML(html_str))

# Write the file as UTF-8 explicitly: the HTML can contain non-ASCII token characters
# (e.g. "Ġ"), and the platform-default encoding (e.g. cp1252 on Windows) may fail to
# encode them or produce a page the browser decodes incorrectly.
with open(filepath, "w", encoding="utf-8") as f:
    f.write(html_str)

# Open the saved page in the default browser; assigning the return value keeps it
# out of the cell output.
result = webbrowser.open(filepath)

And if you want to be even more minimal, you can remove the tables on the left-hand side. When you do this, the middle column will be rearranged by default, to make everything more compact. This visualization also shows what `border = False` looks like.

In [None]:
# Most compact layout: no quantile groups, no left-hand tables, and no border.
feature_viz_params = FeatureVizParams(
    n_groups=0,
    first_group_size=15,
    include_left_tables=False,
    border=False,
)

# Regenerate the feature data for the compact layout.
feature_data = get_feature_data(
    encoder=encoder,
    encoder_B=encoder_B,
    model=model,
    tokens=all_tokens,
    fvp=feature_viz_params,
)

In [None]:
# Render the test feature with the compact settings and show it inline.
html_str = feature_data[test_idx].get_html()
display(HTML(html_str))

# Write the file as UTF-8 explicitly: the HTML can contain non-ASCII token characters
# (e.g. "Ġ"), and the platform-default encoding (e.g. cp1252 on Windows) may fail to
# encode them or produce a page the browser decodes incorrectly.
with open(filepath, "w", encoding="utf-8") as f:
    f.write(html_str)

# Open the saved page in the default browser; assigning the return value keeps it
# out of the cell output.
result = webbrowser.open(filepath)

# Creating visualisations #2 (prompt-centric)

First we create our vocab dict, via a helper function which allows us to get nice HTML representations of our tokens (rather than things which mess up our HTML, e.g. actual line breaks). You should do this on your model's tokenizer, since this `vocab_dict` will be used in subsequent functions. I've only worked with the GPT2 tokenizer, so if this code fails in some way for a different tokenizer, please let me know!

In [None]:
# Build the vocab dict from the model's tokenizer; it is passed to `get_html` in the
# prompt-centric cells below so tokens are shown in an HTML-friendly form.
vocab_dict = create_vocab_dict(model.tokenizer)

Next, we pick a prompt and generate the data for it. The `get_prompt_data` function requires `feature_data` as input, because it needs things like the max-activating sequences for each feature. Note, we're using the `feature_data` object with `n_groups=0` and `include_left_tables=False` - this is because we don't actually need these for the prompt-centric visualization. If you're only trying to generate the prompt-centric view, it's a good idea to have these parameters set to these values, because it will speed up the process.

We don't have an extra dataclass like `FeatureVizParams` to wrap our arguments in, because there are very few. Some of them (e.g. `first_group_size`) are inherited from the `FeatureVizParams` object which was used to generate the `feature_data` which is supplied. The only important argument we need to use is `num_top_features`, which is the max number of top-scoring features which are displayed for any given prompt & metric. There's also the argument `verbose` (default False) which controls whether progress bars are printed.

In [None]:
prompt = "'first_name': ('django.db.models.fields"

# String tokens of the prompt; printed so you can see which token strings are
# available when choosing a sequence position in the cells below.
str_toks = model.tokenizer.tokenize(prompt)
print(str_toks)

# Compute the prompt-centric data, keeping at most 10 top-scoring features.
prompt_data = get_prompt_data(
    encoder=encoder,
    model=model,
    prompt=prompt,
    feature_data=feature_data,
    num_top_features=10,
)

Lastly, from this data we create our visualization. We've chosen to examine the `"loss_effect"` metric on the `django` token, i.e. showing the features whose contributions most reduce the loss on this token.

In [None]:
# Rank features by the "loss_effect" metric at the position of the "django" token.
str_score = "loss_effect"
seq_pos = str_toks.index("django")

html_str = prompt_data.get_html(seq_pos, str_score, vocab_dict)

display(HTML(html_str))

# Write the file as UTF-8 explicitly: the HTML can contain non-ASCII token characters
# (e.g. "Ġ"), and the platform-default encoding (e.g. cp1252 on Windows) may fail to
# encode them or produce a page the browser decodes incorrectly.
filepath = "prompt_viz_demo.html"
with open(filepath, "w", encoding="utf-8") as f:
    f.write(html_str)

# Open the saved page in the default browser; assigning the return value keeps it
# out of the cell output.
result = webbrowser.open(filepath)

Alternatively, you can use the `"act_size"` or `"act_quantile"` metrics (we recommend the latter) on the `Ġ('` token, i.e. the token immediately before `django`. Remember, we have to include this `Ġ` character at the front of the token (which represents the space character), although this will depend on what tokenizer your model is using.

In [None]:
# Rank features by the "act_quantile" metric at the "Ġ('" token, i.e. the token
# immediately before "django" (the leading "Ġ" stands for the space character).
str_score = "act_quantile"
seq_pos = str_toks.index("Ġ('")

html_str = prompt_data.get_html(seq_pos, str_score, vocab_dict)

display(HTML(html_str))

# Write the file as UTF-8 explicitly: the HTML can contain non-ASCII token characters
# (e.g. "Ġ"), and the platform-default encoding (e.g. cp1252 on Windows) may fail to
# encode them or produce a page the browser decodes incorrectly.
filepath = "user_prompt.html"
with open(filepath, "w", encoding="utf-8") as f:
    f.write(html_str)

# Open the saved page in the default browser; assigning the return value keeps it
# out of the cell output.
result = webbrowser.open(filepath)

# Saving data

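Obviously the HTML strings can be saved, either as strings or as regular HTML files. If you want something more compact, you can pickle the dataclasses. (Only unpickle files that you created yourself or otherwise trust, since `pickle.load` can execute arbitrary code.)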

In [None]:
# Write `feature_data` out to a pickle file ...
with open("feature_data.pkl", "wb") as pkl_file:
    pickle.dump(feature_data, pkl_file)

# ... and read it straight back in.
with open("feature_data.pkl", "rb") as pkl_file:
    feature_data: MultiFeatureData = pickle.load(pkl_file)

# The file was only needed for this demonstration, so remove it again.
os.remove("feature_data.pkl")

# Render the reloaded object to confirm that it survived the round trip.
html_str = feature_data[test_idx].get_html()
display(HTML(html_str))

And for the prompt-centric visualisation:

In [None]:
# Write `prompt_data` out to a pickle file ...
with open("prompt_data.pkl", "wb") as pkl_file:
    pickle.dump(prompt_data, pkl_file)

# ... and read it straight back in.
with open("prompt_data.pkl", "rb") as pkl_file:
    prompt_data: MultiPromptData = pickle.load(pkl_file)

# The file was only needed for this demonstration, so remove it again.
os.remove("prompt_data.pkl")

# Render the reloaded object (same position and metric as the previous cell) to
# confirm that it survived the round trip.
html_str = prompt_data.get_html(seq_pos, str_score, vocab_dict)
display(HTML(html_str))