In [9]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Checkpoint shared by the tokenizer and the classification model so that the
# vocabulary and the weights are guaranteed to match.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this so much!",
]

# Rebuild by hand what `tokenizer(...)` does internally: tokenize, map tokens to
# IDs, then wrap each sequence in the start/end special tokens.
batched_ids = []
max_len = 0  # length of the longest encoded sequence; used later for padding

for sequence in sequences:
    tokens = tokenizer.tokenize(sequence)
    # Take the special-token IDs from the tokenizer instead of hard-coding
    # 101 / 102: those literals are only valid for this particular vocabulary
    # and would silently produce wrong inputs if `checkpoint` were changed.
    ids = (
        [tokenizer.cls_token_id]
        + tokenizer.convert_tokens_to_ids(tokens)
        + [tokenizer.sep_token_id]
    )

    batched_ids.append(ids)
    max_len = max(max_len, len(ids))

# Reference encoding from the tokenizer (padded tensor), followed by the
# hand-built, still unpadded ID lists, so the two can be compared by eye.
print(tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")["input_ids"])
print(batched_ids)

tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102],
        [  101,  1045,  5223,  2023,  2061,  2172,   999,   102,     0,     0,
             0,     0,     0,     0,     0,     0]])
[[101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102], [101, 1045, 5223, 2023, 2061, 2172, 999, 102]]


In [10]:
# Pad every hand-built sequence to a common length and build the matching
# attention mask (1 = real token, 0 = padding) so the model ignores the padding.
#
# The pad ID comes from the tokenizer rather than a literal 0, so the cell stays
# correct if the checkpoint (and therefore the vocabulary) is changed.
pad_id = tokenizer.pad_token_id

# Derive the target length from the data in hand instead of relying on a loop
# accumulator left behind by another cell.
max_len = max((len(ids) for ids in batched_ids), default=0)

padded_ids = []
attention_mask = []
for ids in batched_ids:
    n_pad = max_len - len(ids)
    padded_ids.append(ids + [pad_id] * n_pad)
    attention_mask.append([1] * len(ids) + [0] * n_pad)

# NOTE: for pure inference this call is normally wrapped in `torch.no_grad()`
# to avoid building the autograd graph; it is left as-is here so the printed
# logits keep the `grad_fn` shown in the recorded output.
outputs = model(torch.tensor(padded_ids), attention_mask=torch.tensor(attention_mask))

print(outputs.logits)

tensor([[-1.5607,  1.6123],
        [ 4.1692, -3.3464]], grad_fn=<AddmmBackward0>)
