In [4]:
import numpy as np
from tqdm import tqdm
import time
import math
import gc
import torch
from datasets import load_dataset
from transformers import GPTNeoXForCausalLM, GPTNeoXTokenizerFast, DataCollatorForLanguageModeling

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Run on the GPU when CUDA is available; otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Forward model: the "step3000" revision of EleutherAI/pythia-70m-deduped.
model = GPTNeoXForCausalLM.from_pretrained(
    "EleutherAI/pythia-70m-deduped", revision="step3000"
)
model = model.to(device)

# The tokenizer is loaded from the GPT-NeoX-20B repository.
tokenizer = GPTNeoXTokenizerFast.from_pretrained("EleutherAI/gpt-neox-20b")

# Second model, loaded from the "afterless/reverse-pythia-160m" checkpoint.
reverse_model = GPTNeoXForCausalLM.from_pretrained("afterless/reverse-pythia-160m")
reverse_model = reverse_model.to(device)

Here I want to test whether `get_cond_logprob` and `log_prob` are in fact working properly.

In [143]:
def get_cond_logprob_old(input_ids, model):
    """Sum the log-probabilities `model` assigns to every token after the first.

    The model is run on the *full* `input_ids`, so it also produces a
    distribution at the last position. That final distribution is never read:
    the gather index is one step shorter than the logits along dim 1, and
    `gather` only visits the positions covered by the index.

    Args:
        input_ids: 2-D integer tensor of token ids, indexed as [batch, position].
        model: callable accepting `input_ids=` and returning an object with a
            `.logits` tensor indexed as [batch, position, vocab].

    Returns:
        1-D tensor with one summed log-probability per batch row.
    """
    # Token at position t+1 is scored by the distribution produced at position t.
    next_tokens = input_ids[:, 1:].unsqueeze(-1)

    with torch.no_grad():
        logits = model(input_ids=input_ids).logits
        all_logprobs = logits.log_softmax(dim=-1)

    picked = all_logprobs.gather(2, next_tokens).squeeze(-1)
    return picked.sum(dim=1)

def get_cond_logprob_test(input_ids, model):
    """Sum the log-probabilities `model` assigns to every token after the first.

    Unlike `get_cond_logprob_old`, the model is only run on `input_ids[:, :-1]`,
    so the logits and the gather index have the same length along dim 1.

    Args:
        input_ids: 2-D integer tensor of token ids, indexed as [batch, position].
        model: callable accepting `input_ids=` and returning an object with a
            `.logits` tensor indexed as [batch, position, vocab].

    Returns:
        1-D tensor with one summed log-probability per batch row.
    """
    context = input_ids[:, :-1]                    # everything except the last token
    next_tokens = input_ids[:, 1:].unsqueeze(-1)   # everything except the first token

    with torch.no_grad():
        logits = model(input_ids=context).logits
        all_logprobs = logits.log_softmax(dim=-1)

    picked = all_logprobs.gather(2, next_tokens).squeeze(-1)
    return picked.sum(dim=1)

In [146]:
# Score the same short string with both implementations; the two printed
# values should agree if the implementations are equivalent.
suffix = "Yoooooo"
tokenized_suffix = tokenizer.encode(suffix, return_tensors="pt")
tokenized_suffix = tokenized_suffix.to(device)

print(get_cond_logprob_old(input_ids=tokenized_suffix, model=model))
print(get_cond_logprob_test(input_ids=tokenized_suffix, model=model))

tensor([-13.3137])
tensor([-13.3137])


In [125]:
# Inline copy of the body of get_cond_logprob_old, kept at top level so the
# intermediate tensors can be inspected in the following cells.
with torch.no_grad():
    logprobs = model(input_ids=tokenized_suffix).logits.log_softmax(dim=-1)

# Pick, at each position, the log-probability of the token that follows it.
relevant_logprobs = logprobs.gather(
    2, tokenized_suffix[:, 1:].unsqueeze(-1)
).squeeze(-1)

# Collapse the position axis into one score per row.
sum_log_probs = relevant_logprobs.sum(dim=1)


In [130]:
# Inspect the shape of the token-id tensor.
tokenized_suffix.shape


torch.Size([1, 4])

In [127]:
# Inspect the shape of the log-softmaxed model logits.
logprobs.shape

torch.Size([1, 4, 50304])

In [18]:
# Shape of the gather index: the token ids after the first one, with a
# trailing singleton axis added by unsqueeze(-1).
tokenized_suffix.unsqueeze(-1)[:, 1:].shape

torch.Size([1, 3, 1])

In [None]:
# Same gather as in the scoring code, but without the trailing squeeze(-1),
# so the result keeps the singleton last axis of the index.
relevant_logprobs = torch.gather(
    logprobs, 2, tokenized_suffix.unsqueeze(-1)[:, 1:]
)

In [108]:
# Minimal torch.gather example on a 2x2 tensor.
# The tensor is named `source` rather than `input` so that the built-in
# input() function is not shadowed for the rest of the kernel session.
source = torch.tensor([[1, 2], [3, 4]])
# a = torch.gather(t, 0, torch.tensor([[0, 0], [1, 0]]))
# Along dim 1, row 0 of the index selects column 0 twice -> tensor([[1, 1]]).
index = torch.tensor([[0, 0]])
out = torch.gather(source, 1, index)

In [128]:
# Display the result of the gather demo.
out

tensor([[1, 1]])

In [61]:
# NOTE(review): `a` is not assigned in any cell visible here, so this display
# depends on leftover kernel state — confirm where it came from or remove.
a

tensor([[1],
        [1]])

In [51]:
# Build and display a 2x2 integer tensor.
torch.tensor([[0, 0], [1, 0]])

tensor([[0, 0],
        [1, 0]])

In [25]:
# NOTE(review): `a` is not assigned in any cell visible here, so this display
# depends on leftover kernel state — confirm where it came from or remove.
a

tensor([[1, 1],
        [4, 3]])

In [10]:
# NOTE(review): the star import hides which names come from `utils`; only
# create_chunked_dataset_from_full_sequences is used in this cell.
from utils import *
# Build a dataset from the "pile_val" source using the shared tokenizer.
# NOTE(review): the meaning of the positional arguments 10 and 2048 is not
# visible here — confirm against the helper's signature in utils.
test = create_chunked_dataset_from_full_sequences(
        "pile_val",
        tokenizer,
        10,
        2048,
        suffix_length=1,
        batch_size=1,
        seed=23,
        return_all = False
        )

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Map: 100%|██████████| 10/10 [00:00<00:00, 145.90 examples/s]


In [4]:
# prefix_length is passed to the reverse-sampling calls in other cells.
prefix_length = 1
# Suffix with a leading space; encode it and move the ids to the model device.
suffix = " Obama"
tokenized_suffix = tokenizer.encode(suffix, return_tensors="pt").to(device)
print(tokenized_suffix)

tensor([[6729]])


In [56]:
# Reverse the token order along the sequence axis before feeding the reverse model.
reverse_suffix = tokenized_suffix.flip(dims=[1]).to(device)

# Shift by one: every token is the target for the tokens before it
# (in reversed order).
reverse_input_ids, reverse_targets = reverse_suffix[:, :-1], reverse_suffix[:, 1:]

with torch.no_grad():
    reverse_outputs = reverse_model(input_ids=reverse_input_ids)
reverse_logits = reverse_outputs.logits

# Mean cross-entropy over the positions of the single batch row.
torch.nn.functional.cross_entropy(
    reverse_logits.squeeze(0), reverse_targets.squeeze(0)
)

tensor(3.9799)

In [57]:
# Shift by one: every token is the target for the tokens before it.
input_ids, targets = tokenized_suffix[:, :-1], tokenized_suffix[:, 1:]

with torch.no_grad():
    outputs = model(input_ids=input_ids)
logits = outputs.logits

# Mean cross-entropy over the positions of the single batch row.
torch.nn.functional.cross_entropy(logits.squeeze(0), targets.squeeze(0))

tensor(4.6060)

In [58]:
# Forward pass over a single row of 6000 copies of token id 1, with the model
# in eval mode and gradients disabled.
model.eval()
input_ids = torch.ones((1, 6000), dtype=torch.long).to(device)

with torch.no_grad():
    model_outputs = model(input_ids=input_ids)

In [57]:
# Inspect the logits shape of the long-input forward pass.
# NOTE(review): the recorded output shows a length of 4000 while the cell that
# builds input_ids uses 6000, so this output appears to be from an earlier run.
model_outputs.logits.shape

torch.Size([1, 4000, 50304])

In [None]:
from reverse_sampling import sample_reverse_dynamics
# One-token suffix holding token id 1.
test_suffix = torch.ones((1, 1), dtype=torch.long).to(device)
# NOTE(review): empirical_dist is not assigned in any cell visible here, so
# this call depends on earlier kernel state — confirm where it is loaded.
output1, logits1 = sample_reverse_dynamics(
    model,
    empirical_dist,
    prefix_length,
    test_suffix,
    vocab_batch_size=1000,
    temperature=1.0,
)

In [5]:
from reverse_sampling import sample_reverse_dynamics_reverse_prior

# Same kind of sampling call, but passing reverse_model where the other
# sampling cells pass empirical_dist. test_suffix and prefix_length are
# defined in other cells.
output1, logits1 = sample_reverse_dynamics_reverse_prior(
    model,
    reverse_model,
    prefix_length,
    test_suffix,
    vocab_batch_size=300,
    temperature=0.7,
)

100%|██████████| 393/393 [00:18<00:00, 21.33it/s]
100%|██████████| 393/393 [00:20<00:00, 19.16it/s]
100%|██████████| 393/393 [00:27<00:00, 14.46it/s]
100%|██████████| 393/393 [00:32<00:00, 12.13it/s]
100%|██████████| 393/393 [00:36<00:00, 10.82it/s]
100%|██████████| 393/393 [00:41<00:00,  9.38it/s]
100%|██████████| 393/393 [00:45<00:00,  8.64it/s]
100%|██████████| 393/393 [00:33<00:00, 11.67it/s]
100%|██████████| 393/393 [00:34<00:00, 11.53it/s]
100%|██████████| 393/393 [01:04<00:00,  6.08it/s]


In [60]:
# Encode a full sentence as the suffix and check how many tokens it yields.
suffix = " President Donald Trump filed a lawsuit against former President Barack Obama"
tokenized_suffix= tokenizer.encode(suffix, return_tensors="pt").to(device)
tokenized_suffix.shape

torch.Size([1, 11])

In [16]:
from reverse_sampling import compute_loss_reverse_dynamics_reverse_prior

# Encode the sentence and move the ids to the model device.
suffix = " President Donald Trump filed a lawsuit against former President Barack Obama"
tokenized_suffix = tokenizer.encode(suffix, return_tensors="pt").to(device)

# Loss computed from the forward model and the reverse model on that suffix.
loss = compute_loss_reverse_dynamics_reverse_prior(
    model, reverse_model, tokenized_suffix, vocab_batch_size=128, device=device
)

  0%|          | 0/393 [00:00<?, ?it/s]


RuntimeError: Placeholder storage has not been allocated on MPS device!

In [10]:
# Score a second sentence with the forward model and the reverse model.
suffix = " The House Democrats oppose the impeachment of former President Barack Obama"
# Move the token ids onto `device`, as every other tokenization cell in this
# notebook does, so the inputs live on the same device as the models.
# NOTE(review): the RuntimeError recorded under this cell also appears under
# the cell that already moved its inputs, so the helper itself may need a
# device fix as well — check reverse_sampling.
tokenized_suffix = tokenizer.encode(suffix, return_tensors="pt").to(device)

loss = compute_loss_reverse_dynamics_reverse_prior(
    model,
    reverse_model,
    tokenized_suffix,
    vocab_batch_size=128,
    device=device,
)

  0%|          | 0/393 [00:00<?, ?it/s]

  0%|          | 0/393 [00:00<?, ?it/s]


RuntimeError: Placeholder storage has not been allocated on MPS device!

In [18]:
# Show the device the forward model reports.
model.device

device(type='mps', index=0)

In [1]:
from reverse_sampling import compute_loss_reverse_dynamics

# Encode the sentence and move the ids to the model device.
suffix = " President Donald Trump filed a lawsuit against former President Barack Obama"
tokenized_suffix= tokenizer.encode(suffix, return_tensors="pt").to(device)

# NOTE(review): empirical_dist is not assigned in any cell visible here, so
# this call depends on earlier kernel state — confirm where it is loaded.
loss = compute_loss_reverse_dynamics(
    model,
    empirical_dist,
    tokenized_suffix,
    dilution=1.0,
    vocab_batch_size=128,
)

### Check Posterior vs Stationary Reversal

In [5]:
# Every entry of uniform_dist is 1 / empirical_dist.shape[0].
uniform_dist = torch.ones_like(empirical_dist) / empirical_dist.shape[0]
# Blend: 70% of the existing values plus 30% of the uniform values.
# NOTE(review): this rebinds empirical_dist to a function of itself, so each
# re-run of the cell blends in uniform mass again; the value then depends on
# how many times the cell has been executed.
empirical_dist = empirical_dist * 0.7 + uniform_dist * 0.3

In [6]:
from reverse_sampling import sample_reverse_dynamics

# Sample using the blended empirical_dist and the current tokenized_suffix;
# both, and prefix_length, are defined in other cells.
output1, logits1 = sample_reverse_dynamics(
    model,
    empirical_dist,
    prefix_length,
    tokenized_suffix,
    temperature=0.7,
    vocab_batch_size=512
)

100%|██████████| 99/99 [00:13<00:00,  7.47it/s]
100%|██████████| 99/99 [00:14<00:00,  7.02it/s]
100%|██████████| 99/99 [00:17<00:00,  5.76it/s]
100%|██████████| 99/99 [00:18<00:00,  5.39it/s]
100%|██████████| 99/99 [00:24<00:00,  3.97it/s]


In [8]:
# Decode the first entry of the sampled output back into text.
tokenizer.decode(output1[0])

' In thiserior pair, Obama'

In [10]:
# NOTE(review): `sr` is not imported or assigned in any cell visible here —
# presumably a module alias created elsewhere; confirm and import it explicitly.
logits2 = sr.stationary_reverse_full_dist_suffix_calculation(model, empirical_dist, output1,)

i= 0


100%|██████████| 32/32 [00:25<00:00,  1.27it/s]


i= 0


100%|██████████| 32/32 [00:25<00:00,  1.27it/s]


i= 0


100%|██████████| 32/32 [00:22<00:00,  1.42it/s]


i= 0


100%|██████████| 32/32 [00:18<00:00,  1.73it/s]


i= 0


100%|██████████| 32/32 [00:14<00:00,  2.25it/s]


In [11]:
# Normalise logits1 over its last axis into log-probabilities for inspection.
logits1.log_softmax(dim=-1)

tensor([[-14.1750, -14.4474, -15.0450,  ..., -13.0240, -12.8964, -13.3337],
        [-12.3171, -12.7534, -13.0377,  ..., -12.1253,  -9.1679, -14.2333],
        [-13.4286, -12.3321, -11.0569,  ..., -10.6190, -11.5005, -12.7546],
        [-11.3032, -11.4353, -13.2914,  ..., -12.1252, -11.5411, -13.5619],
        [-13.5377, -15.0662,  -8.3655,  ..., -14.6411, -13.6813, -13.2714]],
       device='cuda:0')

In [12]:
# Largest absolute element-wise gap between logits2 and the log-softmax of
# logits1; a value near zero means the two computations agree.
torch.abs(logits2 - logits1.log_softmax(dim=-1)).max()

tensor(3.0518e-05, device='cuda:0')

In [None]:
# Import only the helper this cell calls instead of `import *`, so it is
# clear where the name comes from.
from reverse_sampling import compute_loss_reverse_dynamics_reverse_prior

# 40-token suffix made entirely of token id 1, placed on the shared device.
test_suffix = torch.ones((1, 40), dtype=torch.long).to(device)
compute_loss_reverse_dynamics_reverse_prior(
    model,
    reverse_model,
    test_suffix,
    vocab_batch_size=5000,
    dilution=0.0,  # 0.3
    # Follow the notebook-wide `device` instead of hard-coding "cuda", so the
    # helper targets the same device the models and test_suffix are on.
    # device.type is the plain string ("cuda" on a CUDA machine).
    device=device.type,
    loss=torch.nn.CrossEntropyLoss(),
)

In [None]:
# Import only the helper this cell calls instead of `import *`, so it is
# clear where the name comes from.
from reverse_sampling import compute_loss_reverse_dynamics_reverse_prior_target_memory

# test_suffix is defined in another cell.
compute_loss_reverse_dynamics_reverse_prior_target_memory(
    model,
    reverse_model,
    test_suffix,
    target_memory=1,
    dilution=0.0,  # 0.3
    # Follow the notebook-wide `device` instead of hard-coding "cuda", so the
    # helper targets the same device the models are on. device.type is the
    # plain string ("cuda" on a CUDA machine).
    device=device.type,
    loss=torch.nn.CrossEntropyLoss(),
)

In [None]:
# Scratch loop: prints "test" once for each value in range(1, 10), i.e. nine times.
for i in range(1,10):
  print("test")

In [3]:
# Two constant tensors of shape [1000]: all ones and all twos (not random).
import torch
x = torch.ones(1000)
y = torch.ones(1000)+1
# This line raises TypeError: `+` on a list requires another list, not a
# Tensor. Wrapping the tensor in a list first (e.g. [x] + [y]) avoids it.
# NOTE(review): `test` is also the name bound to the dataset in another cell.
test = [x,] + y


TypeError: can only concatenate list (not "Tensor") to list

In [8]:
# Start from three 0-d tensors, then put a fourth one at the front of the list.
test_list = [torch.tensor(value) for value in (1, 2, 3)]
test_list.insert(0, torch.tensor(4))

In [9]:
# Display the list after prepending.
test_list

[tensor(4), tensor(1), tensor(2), tensor(3)]

In [10]:
# Stack the list of 0-d tensors into a single 1-D tensor.
torch.stack(test_list)

tensor([4, 1, 2, 3])

In [24]:
def cost_estimator_1B(suffix_length):
    """Estimated cost, in seconds, of one run of the 1B model.

    Linear in `suffix_length`: 10 seconds at length 1, rising by 140/29
    seconds for each additional token (150 seconds at length 30).
    """
    base_seconds = 10
    seconds_per_extra_token = 140 / 29
    extra_tokens = suffix_length - 1
    return base_seconds + seconds_per_extra_token * extra_tokens

In [31]:
def cost_estimator_410m(suffix_length):
    """Linear cost estimate for the 410m model, mirroring cost_estimator_1B.

    Evaluates to 5 at length 1 and grows by 58/29 for each additional token.
    """
    base_seconds = 5
    seconds_per_extra_token = 58 / 29
    extra_tokens = suffix_length - 1
    return base_seconds + seconds_per_extra_token * extra_tokens

In [13]:
# Estimated cost of the 1B model for a suffix length of 30.
cost_estimator_1B(30)


150.0

In [62]:
# total runtime in hours
num_examples = 50
suffix_length = 30
print((num_examples*sum([cost_estimator_1B(i) for i in range(1,suffix_length)]))/(60*60))
print((num_examples*sum([cost_estimator_410m(i) for i in range(1,suffix_length)]))/(60*60))


31.249999999999993
13.291666666666666


21.666666666666668