In [1]:
from transformers import MambaConfig, MambaForCausalLM, AutoTokenizer, AutoModelForCausalLM
import torch
import sentencepiece as spm
import pandas as pd

#mamba tokenizer
mamba_tokenizer = AutoTokenizer.from_pretrained("state-spaces/mamba-130m-hf")

#gpt tokenizer
gpt2_tokenizer = AutoTokenizer.from_pretrained("gpt2")

## Mamba model
mamba_model = MambaForCausalLM.from_pretrained("state-spaces/mamba-130m-hf")

## gpt2 model

gpt2_model = AutoModelForCausalLM.from_pretrained("gpt2")

mamba_model.eval()
gpt2_model.eval()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/4.79k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/895 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/517M [00:00<?, ?B/s]

The fast path is not available because one of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)` is None. Falling back to the sequential implementation of Mamba, as use_mambapy is set to False. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1d. For the mamba.py backend, follow https://github.com/alxndrTL/mamba.py.


generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [3]:
sentences = [
    "I was a mat in France",
    "I was a matron in France",
    "The unwelcome guest refused to leave despite multiple hints.",
    "Eating a hotdog at the carnival was the highlight of my day.",
    "The spacecraft hovered silently before vanishing into deep space.",
    "Our company’s entire customer database was leaked online.",
    "The sound of the rushing waterfall was calming after a long hike.",
    "The old man the boat.",
    "The horse raced past the barn fell."
]

In [4]:
def calculate_surprisal_causal_mamba(sentence):
    inputs = mamba_tokenizer(sentence, return_tensors="pt")
    with torch.no_grad():
        outputs = mamba_model(**inputs, labels=inputs["input_ids"])

    # Get token log probabilities
    log_probs = outputs.logits.log_softmax(dim=-1)
    input_ids = inputs["input_ids"][0]

    print(f"\nSentence: {sentence}")
    print("-" * 50)
    print(f"{'Token':<15} | {'Surprisal':>10}")
    print("-" * 50)

    for i in range(len(input_ids)):
        token_id = input_ids[i]
        if i == 0:
            log_prob = log_probs[0, i, token_id].item()
            surprisal = -log_prob
        else:
            prev_log_prob = log_probs[0, i - 1, token_id].item()
            surprisal = -prev_log_prob

        token_text = mamba_tokenizer.decode([token_id])
        print(f"{token_text:<15} | {surprisal:>10.4f}")

# Run the experiment for each sentence
for sentence in sentences:
    calculate_surprisal_causal_mamba(sentence)

The 'batch_size' argument of MambaCache is deprecated and will be removed in v4.49. Use the more precisely named 'max_batch_size' argument instead.



Sentence: I was a mat in France
--------------------------------------------------
Token           |  Surprisal
--------------------------------------------------
I               |     9.9123
 was            |     3.3943
 a              |     3.1621
 mat            |     9.5610
 in             |     6.4746
 France         |     6.9136

Sentence: I was a matron in France
--------------------------------------------------
Token           |  Surprisal
--------------------------------------------------
I               |     9.9123
 was            |     3.3943
 a              |     3.1621
 mat            |     9.5610
ron             |     2.1427
 in             |     3.3055
 France         |     5.8759

Sentence: The unwelcome guest refused to leave despite multiple hints.
--------------------------------------------------
Token           |  Surprisal
--------------------------------------------------
The             |    12.1457
 unw            |    11.3258
elcome          |     2.9464
 g

In [5]:
def calculate_surprisal_gpt2_mamba(sentence):
    inputs = gpt2_tokenizer(sentence, return_tensors="pt")
    token_ids = inputs["input_ids"][0]
    tokens = gpt2_tokenizer.convert_ids_to_tokens(token_ids)

    with torch.no_grad():
        outputs = mamba_model(**inputs, labels=inputs["input_ids"])

    # Get token log probabilities
    log_probs = outputs.logits.log_softmax(dim=-1)

    print(f"\nSentence: {sentence}")
    print("-" * 50)
    print(f"{'Token':<15} | {'Surprisal':>10}")
    print("-" * 50)

    for i in range(len(token_ids)):
        token_id = token_ids[i]
        if i == 0:
            log_prob = log_probs[0, i, token_id].item()
            surprisal = -log_prob
        else:
            prev_log_prob = log_probs[0, i - 1, token_id].item()
            surprisal = -prev_log_prob

        token_text = tokens[i]
        print(f"{token_text:<15} | {surprisal:>10.4f}")

# Run the experiment for each sentence
for sentence in sentences:
    calculate_surprisal_gpt2_mamba(sentence)


Sentence: I was a mat in France
--------------------------------------------------
Token           |  Surprisal
--------------------------------------------------
I               |     9.4180
Ġwas            |     9.2078
Ġa              |     3.9441
Ġmat            |    13.7404
Ġin             |    14.2818
ĠFrance         |    10.5058

Sentence: I was a matron in France
--------------------------------------------------
Token           |  Surprisal
--------------------------------------------------
I               |     9.4180
Ġwas            |     9.2078
Ġa              |     3.9441
Ġmat            |    13.7404
ron             |     8.8042
Ġin             |     8.0218
ĠFrance         |    11.7626

Sentence: The unwelcome guest refused to leave despite multiple hints.
--------------------------------------------------
Token           |  Surprisal
--------------------------------------------------
The             |     8.1644
Ġunwelcome      |    23.2462
Ġguest          |    10.9223
Ġr

In [6]:
def calculate_surprisal_causal_gpt2(sentence):
    inputs = gpt2_tokenizer(sentence, return_tensors="pt")
    with torch.no_grad():
        outputs = gpt2_model(**inputs, labels=inputs["input_ids"])

    # Get token log probabilities
    log_probs = outputs.logits.log_softmax(dim=-1)
    input_ids = inputs["input_ids"][0]

    print(f"\nSentence: {sentence}")
    print("-" * 50)
    print(f"{'Token':<15} | {'Surprisal':>10}")
    print("-" * 50)

    for i in range(len(input_ids)):
        token_id = input_ids[i]
        if i == 0:
            log_prob = log_probs[0, i, token_id].item()
            surprisal = -log_prob
        else:
            prev_log_prob = log_probs[0, i - 1, token_id].item()
            surprisal = -prev_log_prob

        token_text = gpt2_tokenizer.decode([token_id])
        print(f"{token_text:<15} | {surprisal:>10.4f}")

# Run the experiment for each sentence
for sentence in sentences:
    calculate_surprisal_causal_gpt2(sentence)

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.



Sentence: I was a mat in France
--------------------------------------------------
Token           |  Surprisal
--------------------------------------------------
I               |     5.9292
 was            |     4.2507
 a              |     3.4783
 mat            |     9.9763
 in             |     8.0515
 France         |     6.8520

Sentence: I was a matron in France
--------------------------------------------------
Token           |  Surprisal
--------------------------------------------------
I               |     5.9292
 was            |     4.2507
 a              |     3.4783
 mat            |     9.9763
ron             |     1.8520
 in             |     2.3505
 France         |     6.1017

Sentence: The unwelcome guest refused to leave despite multiple hints.
--------------------------------------------------
Token           |  Surprisal
--------------------------------------------------
The             |     7.7088
 unwelcome      |    12.6304
 guest          |     4.8563
 r