# Putting It All Together

In [1]:
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Checkpoint identifier handed to AutoTokenizer.from_pretrained in the next cell.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"

In [3]:
# Load the tokenizer associated with `checkpoint`.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [10]:
# Tokenize a single sentence; the result holds `input_ids` and `attention_mask`.
# NOTE(review): this cell is In [10] but sits before In [8] -- the notebook was
# executed out of order; confirm it survives Restart Kernel -> Run All.
sequence = "A watched pot never boils"

model_inputs = tokenizer(sequence)

model_inputs

{'input_ids': [101, 1037, 3427, 8962, 2196, 26077, 2015, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [8]:
# Two proverbs that encode to different token counts (8 and 9); they form the
# batch used by the padding and truncation cells.
sequences = [
    "A stitch in time saves nine",
    "All lay loads on a willing horse",
]

## No Padding

In [11]:
# No `padding` argument: each sentence keeps its own token count, so the two
# `input_ids` lists have different lengths.
model_inputs = tokenizer(sequences)
model_inputs

{'input_ids': [[101, 1037, 26035, 1999, 2051, 13169, 3157, 102], [101, 2035, 3913, 15665, 2006, 1037, 5627, 3586, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [12]:
# Token count of each encoded sentence in the batch.
tuple(map(len, model_inputs.input_ids))

(8, 9)

## Longest Padding

In [13]:
# padding="longest": the shorter sentence is padded with id 0 (and
# attention_mask 0) up to the length of the longest sentence in the batch.
model_inputs = tokenizer(sequences, padding="longest")
model_inputs

{'input_ids': [[101, 1037, 26035, 1999, 2051, 13169, 3157, 102, 0], [101, 2035, 3913, 15665, 2006, 1037, 5627, 3586, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [14]:
# Both sentences should now report the same token count.
tuple(map(len, model_inputs.input_ids))

(9, 9)

## Model Max Length Padding

In [16]:
# padding="max_length" without an explicit `max_length`: every sentence is
# padded out to the model maximum, which is 512 tokens for this checkpoint.
model_inputs = tokenizer(sequences, padding="max_length")
model_inputs

{'input_ids': [[101, 1037, 26035, 1999, 2051, 13169, 3157, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [17]:
# Token count per sentence after padding to the model maximum.
tuple(map(len, model_inputs.input_ids))

(512, 512)

## Max Length Padding

In [18]:
# max_length=8 only pads *up to* 8 tokens. Truncation is not enabled, so the
# 9-token sentence is left at 9 tokens.
model_inputs = tokenizer(sequences, padding="max_length", max_length=8)
model_inputs

{'input_ids': [[101, 1037, 26035, 1999, 2051, 13169, 3157, 102], [101, 2035, 3913, 15665, 2006, 1037, 5627, 3586, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [19]:
# Lengths stay unequal here because padding alone never shortens a sentence.
tuple(map(len, model_inputs.input_ids))

(8, 9)

In [20]:
# max_length=9 matches the longer sentence, so the 8-token sentence gets one
# pad token and both end up 9 tokens long.
model_inputs = tokenizer(sequences, padding="max_length", max_length=9)
model_inputs

{'input_ids': [[101, 1037, 26035, 1999, 2051, 13169, 3157, 102, 0], [101, 2035, 3913, 15665, 2006, 1037, 5627, 3586, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [21]:
# Token count per sentence after padding to max_length=9.
tuple(map(len, model_inputs.input_ids))

(9, 9)

## Padding as `True`

In [22]:
# padding=True produces the same encoding as padding="longest" for this batch.
model_inputs = tokenizer(sequences, padding=True)
model_inputs

{'input_ids': [[101, 1037, 26035, 1999, 2051, 13169, 3157, 102, 0], [101, 2035, 3913, 15665, 2006, 1037, 5627, 3586, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [23]:
# Token count per sentence after padding=True.
tuple(map(len, model_inputs.input_ids))

(9, 9)

In [27]:
# Whitespace word count of the single example sentence, for comparison with its
# token count: the tokenizer call produced 8 ids for these 5 words.
len(sequence.split(' '))

5

## Truncation as `True`

In [32]:
# truncation=True with no `max_length`: the first sentence still has 8 tokens,
# so these short inputs are not cut.
# NOTE(review): the limit presumably defaults to the model maximum -- confirm
# against the tokenizer documentation.
model_inputs = tokenizer(sequences, truncation=True)

In [34]:
# Token count of the first sentence only; the second sentence is not inspected here.
len(model_inputs.input_ids[0])

8

In [35]:
# With max_length=8 and truncation=True the 9-token sentence is cut to 8 tokens:
# its last word piece is dropped and the closing special token (id 102) is kept.
model_inputs = tokenizer(sequences, max_length=8, truncation=True)

In [36]:
# Token count per sentence after truncating to max_length=8.
tuple(map(len, model_inputs.input_ids))

(8, 8)

In [37]:
# Show the truncated ids; each row still starts with 101 and ends with 102.
model_inputs["input_ids"]

[[101, 1037, 26035, 1999, 2051, 13169, 3157, 102],
 [101, 2035, 3913, 15665, 2006, 1037, 5627, 102]]

## Return as PyTorch Tensor

In [38]:
# return_tensors="pt" returns PyTorch tensors instead of Python lists; padding
# is needed so the batch is rectangular.
model_inputs = tokenizer(sequences, padding=True, return_tensors="pt")
model_inputs

{'input_ids': tensor([[  101,  1037, 26035,  1999,  2051, 13169,  3157,   102,     0],
        [  101,  2035,  3913, 15665,  2006,  1037,  5627,  3586,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1]])}

## Return as NumPy Array

In [40]:
# return_tensors="np" returns NumPy arrays with the same padded contents.
model_inputs = tokenizer(sequences, padding=True, return_tensors="np")
model_inputs

{'input_ids': array([[  101,  1037, 26035,  1999,  2051, 13169,  3157,   102,     0],
       [  101,  2035,  3913, 15665,  2006,  1037,  5627,  3586,   102]]), 'attention_mask': array([[1, 1, 1, 1, 1, 1, 1, 1, 0],
       [1, 1, 1, 1, 1, 1, 1, 1, 1]])}

## Special Tokens

In [41]:
# Calling the tokenizer directly wraps the sentence in special tokens: the ids
# begin with 101 and end with 102.
model_inputs = tokenizer(sequence)
model_inputs

{'input_ids': [101, 1037, 3427, 8962, 2196, 26077, 2015, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [43]:
# Manual two-step path: split into tokens, then map tokens to ids.
# Unlike the direct tokenizer call, this yields no leading 101 / trailing 102.
tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)
ids

[1037, 3427, 8962, 2196, 26077, 2015]

In [44]:
# Decoding the ids from the direct tokenizer call reveals the added [CLS] and [SEP] tokens.
tokenizer.decode(model_inputs.input_ids)

'[CLS] a watched pot never boils [SEP]'

In [45]:
# Decoding the manually converted ids gives the plain lower-cased sentence, with no special tokens.
tokenizer.decode(ids)

'a watched pot never boils'

## Summary

In [71]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [72]:
# Same checkpoint identifier as at the top of the notebook, repeated for the summary.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"

In [73]:
# Reload the tokenizer for `checkpoint` (repeats the earlier load).
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [74]:
# Load the sequence-classification model from the same checkpoint as the tokenizer.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

In [75]:
# Full preprocessing in one call: pad to the longest sentence, allow
# truncation, and return PyTorch tensors ready to feed to the model.
model_inputs = tokenizer(
    sequences,
    padding=True,
    truncation=True,
    return_tensors="pt",
)

In [76]:
# Display the padded batch: `input_ids` and `attention_mask` tensors, one row per sentence.
model_inputs

{'input_ids': tensor([[  101,  1037, 26035,  1999,  2051, 13169,  3157,   102,     0],
        [  101,  2035,  3913, 15665,  2006,  1037,  5627,  3586,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [77]:
# Forward pass: unpack the encoding into keyword arguments and show the raw
# logits (one row per sentence, two scores per row).
# NOTE(review): the result carries a grad_fn, so gradients are being tracked;
# for pure inference consider running this under torch.no_grad().
outputs = model(**model_inputs)
outputs.logits

tensor([[-2.4571,  2.5661],
        [-1.4630,  1.4730]], grad_fn=<AddmmBackward0>)