# Handling multiple sequences (PyTorch)

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [109]:
# Install necessary libraries: Transformers, Datasets, and Evaluate
!pip install datasets evaluate transformers[sentencepiece]



In [137]:
# Import libraries and load a pre-trained model and tokenizer
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence = "I've been waiting for a HuggingFace course my whole life."

# Tokenize by hand: split the text into tokens, then map each token to its id.
tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)
# Wrapping `ids` in a list gives the tensor a leading batch dimension (one row).
input_ids = torch.tensor([ids])
# This call succeeds (see the recorded output): `input_ids` is 2-D, a batch of one.
# NOTE(review): an un-batched `torch.tensor(ids)` is presumably the variant that
# fails when passed to the model - confirm.
model(input_ids)

SequenceClassifierOutput(loss=None, logits=tensor([[-2.7276,  2.8789]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [138]:
# Tokenize a sequence and return PyTorch tensors.
# Calling the tokenizer directly (instead of tokenize + convert_tokens_to_ids)
# returns a 2-D tensor. In the recorded output it starts with 101 and ends with
# 102, ids that the manual conversion does not produce - presumably the special
# [CLS]/[SEP] tokens (confirm against the vocabulary).
tokenized_inputs = tokenizer(sequence, return_tensors="pt")
print(tokenized_inputs["input_ids"])

tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102]])


In [139]:
# Process a single sequence through the model.
# Reuses `torch`, `tokenizer`, and `model` from the setup cell at the top of the
# notebook rather than importing and loading the same checkpoint a second time.
sequence = "I've been waiting for a HuggingFace course my whole life."

# Manual tokenization: text -> tokens -> vocabulary ids.
tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)

# The extra list level creates the batch dimension (a batch of one sequence).
input_ids = torch.tensor([ids])
print("Input IDs:", input_ids)

output = model(input_ids)
print("Logits:", output.logits)

Input IDs: tensor([[ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
          2026,  2878,  2166,  1012]])
Logits: tensor([[-2.7276,  2.8789]], grad_fn=<AddmmBackward0>)


In [140]:
# Demonstrate padding WITHOUT an attention mask: the 2-token sequence is padded
# to length 3 so both sequences fit in one batch, but nothing marks the pad
# position, so the padded row is scored differently from the same sequence
# passed on its own.
# Reuses the `model` loaded in the setup cell (no need to load the checkpoint again).
sequence1_ids = [[200, 200, 200]]
sequence2_ids = [[200, 200]]
batched_ids = [
    [200, 200, 200],
    [200, 200, tokenizer.pad_token_id],
]

print("Logits for sequence1_ids:", model(torch.tensor(sequence1_ids)).logits)
print("Logits for sequence2_ids:", model(torch.tensor(sequence2_ids)).logits)
print("Logits for batched_ids without attention mask:", model(torch.tensor(batched_ids)).logits)

Logits for sequence1_ids: tensor([[ 1.5694, -1.3895]], grad_fn=<AddmmBackward0>)
Logits for sequence2_ids: tensor([[ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)
Logits for batched_ids without attention mask: tensor([[ 1.5694, -1.3895],
        [ 1.3374, -1.2163]], grad_fn=<AddmmBackward0>)


In [141]:
# Run the padded batch again, this time together with an attention mask.
# Each mask row lines up with the matching row of ids: 1 marks a real token,
# 0 marks the position filled with the pad token.
batched_ids = [[200, 200, 200], [200, 200, tokenizer.pad_token_id]]
attention_mask = [[1, 1, 1], [1, 1, 0]]

outputs = model(
    torch.tensor(batched_ids),
    attention_mask=torch.tensor(attention_mask),
)
print("Logits for batched_ids with attention mask:", outputs.logits)

Logits for batched_ids with attention mask: tensor([[ 1.5694, -1.3895],
        [ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)


In [142]:
# Custom function to add padding and create attention mask
def add_padding(pad_token, input_ids):
  """Right-pad every sequence to the longest length and build the attention mask.

  Args:
    pad_token: Id appended to sequences shorter than the longest one.
    input_ids: List of token-id lists, possibly of different lengths. The
      argument is not modified; padding is applied to copies.

  Returns:
    A tuple ``(padded_ids, attention_mask)`` of equally shaped nested lists.
    ``attention_mask`` holds 1 for every original token and 0 for every
    padding position. Both lists are empty when ``input_ids`` is empty.
  """
  # `default=0` keeps an empty batch from raising ValueError inside max().
  max_len = max((len(seq) for seq in input_ids), default=0)

  padded_ids = []
  attention_mask = []
  for seq in input_ids:
    pad_count = max_len - len(seq)
    # Build new lists instead of extending `seq` in place, so the caller's data
    # (and any cell that reads it again) is not silently changed.
    padded_ids.append(list(seq) + [pad_token] * pad_count)
    attention_mask.append([1] * len(seq) + [0] * pad_count)

  return (padded_ids, attention_mask)

In [150]:
# Prepare raw input sentences and tokenize them by hand, adding the
# "[CLS]" / "[SEP]" markers around each token list before converting to ids.
raw_inputs = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this so much!",
]

input_ids2 = []
# The loop variable `sequence` stays bound to the last raw input after the loop.
for sequence in raw_inputs:
  seq_tokens = ["[CLS]", *tokenizer.tokenize(sequence), "[SEP]"]
  seq_ids = tokenizer.convert_tokens_to_ids(seq_tokens)
  input_ids2.append(seq_ids)

# Decode the first id list back to text to check the round trip.
tokenizer.decode(input_ids2[0])

"[CLS] i've been waiting for a huggingface course my whole life. [SEP]"

In [151]:
# Score each hand-tokenized sequence on its own, as a batch of one.
# Nesting the id list inside another list supplies the batch dimension.
seq1_output = model(torch.tensor([input_ids2[0]]))
seq2_output = model(torch.tensor([input_ids2[1]]))

print("Logits for sequence 1:", seq1_output.logits)
print("Logits for sequence 2:", seq2_output.logits)

Logits for sequence 1: tensor([[-1.5607,  1.6123]], grad_fn=<AddmmBackward0>)
Logits for sequence 2: tensor([[ 4.1692, -3.3464]], grad_fn=<AddmmBackward0>)


In [152]:
# Pad the hand-tokenized sequences to a common length with add_padding, then
# run them through the model as a single batch together with the mask.
padded_inputs, attention_mask = add_padding(tokenizer.pad_token_id, input_ids2)
print(attention_mask)

batch_seq_output = model(
    torch.tensor(padded_inputs),
    attention_mask=torch.tensor(attention_mask),
)
print(batch_seq_output.logits)

[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]
tensor([[-1.5607,  1.6123],
        [ 4.1692, -3.3464]], grad_fn=<AddmmBackward0>)


In [119]:
# Get the maximum sequence length for a single sentence from the tokenizer
max_sequence_length = tokenizer.max_len_single_sentence
print("Maximum sequence length:", max_sequence_length)

# Truncate a sequence to the maximum length (example).
# The limit is a number of tokens, so the slice is applied to the token ids;
# slicing the raw string would cut characters instead.
sequence = raw_inputs[-1]  # explicit input rather than a variable left over from a loop
sequence_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sequence))
truncated_ids = sequence_ids[:max_sequence_length]
tokenizer.decode(truncated_ids)

Maximum sequence length: 510


'I hate this so much!'

*Note: This notebook is based on content from the Hugging Face LLM Course.*