In [1]:
#%%
import numpy as np
from tqdm import tqdm
import time
import math
import gc

#%%
import torch
from datasets import load_dataset
from transformers import (
    GPTNeoXForCausalLM,
    GPTNeoXTokenizerFast,
    DataCollatorForLanguageModeling,
)

In [2]:
# Prefer the GPU when CUDA is available; otherwise everything stays on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the causal LM checkpoint, then move its weights onto the chosen device.
model = GPTNeoXForCausalLM.from_pretrained("EleutherAI/pythia-70m-deduped-v0")
model = model.to(device)

# The tokenizer comes from a different repository than the model weights.
# NOTE(review): this presumes the two share a vocabulary — confirm against the model card.
tokenizer = GPTNeoXTokenizerFast.from_pretrained("EleutherAI/gpt-neox-20b")

In [7]:
def get_cond_logprob1(input_ids, model):
    """Sum, per sequence, the log-probability of every token given its prefix.

    The distribution produced at position ``t`` is used to score the token at
    position ``t + 1``; the first token of each row is therefore never scored.

    Args:
        input_ids: integer token ids indexed as (batch, position).
        model: callable invoked as ``model(input_ids=...)`` whose result exposes
            a ``.logits`` tensor indexed as (batch, position, vocabulary).

    Returns:
        A 1-D tensor holding one summed log-probability per batch row.
    """
    # Forward pass and normalisation both run without autograd tracking.
    with torch.no_grad():
        raw_scores = model(input_ids=input_ids).logits
        token_logprobs = torch.nn.functional.log_softmax(raw_scores, dim=-1)

    # Targets are the inputs shifted left by one: position t predicts token t + 1.
    next_tokens = input_ids[:, 1:].unsqueeze(-1)

    # Keep only the leading positions that have a successor token, then pick
    # the log-probability assigned to that successor along the vocabulary axis.
    predicting = token_logprobs[:, : next_tokens.shape[1]]
    picked = predicting.gather(2, next_tokens).squeeze(-1)

    # Collapse the position axis to obtain one value per sequence.
    return picked.sum(dim=1)

def get_cond_logprob2(input_ids, model):
    """Return the summed next-token cross-entropy of the FIRST sequence in the batch.

    Note the sign: the result is a cross-entropy sum, i.e. the *negative* of the
    sequence log-probability, so it is non-negative. Only row 0 of ``input_ids``
    is scored; any further batch rows are ignored.

    Args:
        input_ids: integer token ids indexed as (batch, position).
        model: callable invoked as ``model(input_ids=...)`` whose result exposes
            a ``.logits`` tensor indexed as (batch, position, vocabulary).

    Returns:
        A 0-dim tensor: the sum over positions of -log p(token[t+1] | token[:t+1])
        for the first sequence.
    """
    with torch.no_grad():
        # Raw, unnormalised scores; cross-entropy applies log-softmax itself.
        raw_scores = model(input_ids=input_ids).logits

    # Position t predicts token t + 1: drop the last prediction and the first target.
    predictions = raw_scores[0, :-1, :]
    targets = input_ids[0, 1:]

    return torch.nn.functional.cross_entropy(predictions, targets, reduction="sum")



In [4]:
# Tokenize one short probe sentence and keep only the PyTorch tensor of token ids.
input_ids = tokenizer("This is a test hello world", return_tensors="pt")["input_ids"]

In [8]:
# Summed cross-entropy (negative log-likelihood) of the probe sentence.
# Move the ids to the configured `device` rather than calling .cuda(), so the
# cell also runs when the model was placed on the CPU fallback.
get_cond_logprob2(input_ids.to(device), model)

tensor(28.4848, device='cuda:0')

In [6]:
# Summed log-probability of the probe sentence, one value per batch row.
# Move the ids to the configured `device` rather than calling .cuda(), so the
# cell also runs when the model was placed on the CPU fallback.
get_cond_logprob1(input_ids.to(device), model)

tensor([-28.4848], device='cuda:0')