# Processing the data

In [3]:
import torch
from rich import print
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
checkpoint = 'bert-base-uncased'

In [5]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [6]:
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
sequences = [
    'Trust yourself. You know more than you think you do.',
    'The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.'
]

In [8]:
# Tokenize both sequences as a single batch. With padding=True the shorter
# sequence is padded (zeros in input_ids / attention_mask in the printed batch
# below) and return_tensors='pt' yields PyTorch tensors.
batch = tokenizer(sequences, truncation=True, padding=True, return_tensors='pt')

In [9]:
# One label per sequence; the batch is later passed as model(**batch) and its
# `.loss` attribute is read, so labels need to be part of the batch.
batch['labels'] = torch.tensor([1, 1])

In [10]:
batch

{'input_ids': tensor([[  101,  3404,  4426,  1012,  2017,  2113,  2062,  2084,  2017,  2228,
          2017,  2079,  1012,   102,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [  101,  1996,  2711,  1010,  2022,  2009, 10170,  2030,  3203,  1010,
          2040,  2038,  2025,  5165,  1999,  1037,  2204,  3117,  1010,  2442,
          2022,  2046,  3917,  8231,  5236,  1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1]]), 'labels': tensor([1, 1])}

In [11]:
# Use PyTorch's AdamW: the `transformers.AdamW` class is deprecated in favour
# of `torch.optim.AdamW`. Its default hyperparameters (e.g. weight decay) are
# not identical to the old transformers implementation, so the exact loss
# value printed further down may differ slightly.
optimizer = torch.optim.AdamW(model.parameters())



In [12]:
loss = model(**batch).loss

In [13]:
loss.backward()

In [14]:
optimizer.step()
# Clear the gradients once they have been applied; otherwise re-running the
# forward/backward cells would accumulate new gradients on top of these.
optimizer.zero_grad()

In [15]:
model(**batch).loss

tensor(0.5568, grad_fn=<NllLossBackward0>)

## Loading a dataset from the hub

In [16]:
from datasets import load_dataset

In [17]:
raw_datasets = load_dataset('glue', 'mrpc')

In [18]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [19]:
raw_train_dataset = raw_datasets["train"]

In [20]:
raw_train_dataset[0]

{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
 'label': 1,
 'idx': 0}

In [21]:
raw_train_dataset.features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),
 'idx': Value(dtype='int32', id=None)}

In [22]:
raw_datasets["train"][15]

{'sentence1': 'Rudder was most recently senior vice president for the Developer & Platform Evangelism Business .',
 'sentence2': 'Senior Vice President Eric Rudder , formerly head of the Developer and Platform Evangelism unit , will lead the new entity .',
 'label': 0,
 'idx': 16}

In [23]:
raw_datasets["validation"][87]

{'sentence1': 'However , EPA officials would not confirm the 20 percent figure .',
 'sentence2': 'Only in the past few weeks have officials settled on the 20 percent figure .',
 'label': 0,
 'idx': 812}

## Preprocessing a dataset

In [24]:
from transformers import AutoTokenizer

In [25]:
checkpoint = 'bert-base-uncased'

In [26]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [27]:
tokenized_sentences_1 = tokenizer(raw_datasets["train"]["sentence1"])
tokenized_sentences_2 = tokenizer(raw_datasets["train"]["sentence2"])

In [28]:
len(tokenized_sentences_1.input_ids), len(tokenized_sentences_2.input_ids)

(3668, 3668)

In [29]:
inputs = tokenizer("If you are going through hell, keep going", "You just can't beat the person who never gives up")

In [30]:
print(inputs)

Does the tokenizer accept three sentences?

In [31]:
sent1 = "If you are going through hell, keep going"
sent2 = "You just can't beat the person who never gives up"
sent3 = "Be kind whenver possible. It is always possible"
inputs = tokenizer(sent1, sent2, sent3)

In [32]:
inputs

{'input_ids': [101, 2065, 2017, 2024, 2183, 2083, 3109, 1010, 2562, 2183, 102, 2017, 2074, 2064, 1005, 1056, 3786, 1996, 2711, 2040, 2196, 3957, 2039, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [101, 2022, 2785, 2043, 6299, 2825, 1012, 2009, 2003, 2467, 2825, 102]}

In [33]:
tokenizer.decode(inputs.labels)

'[CLS] be kind whenver possible. it is always possible [SEP]'

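Here `sent3` is not appended to `input_ids`: the tokenizer's third positional argument is `text_target`, so `sent3` is tokenized separately and returned under the `labels` key.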

How about 4 sentences?

In [34]:
sent1 = "If you are going through hell, keep going"
sent2 = "You just can't beat the person who never gives up"
sent3 = "Be kind whenver possible. It is always possible"
sent4 = "Wherever you are - be all there"

In [35]:
inputs = tokenizer(sent1, sent2, sent3, sent4)

In [36]:
inputs

{'input_ids': [101, 2065, 2017, 2024, 2183, 2083, 3109, 1010, 2562, 2183, 102, 2017, 2074, 2064, 1005, 1056, 3786, 1996, 2711, 2040, 2196, 3957, 2039, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [101, 2022, 2785, 2043, 6299, 2825, 1012, 2009, 2003, 2467, 2825, 102, 11210, 2017, 2024, 1011, 2022, 2035, 2045, 102]}

In [37]:
tokenizer.decode(inputs.labels)

'[CLS] be kind whenver possible. it is always possible [SEP] wherever you are - be all there [SEP]'

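Here `sent3` and `sent4` are tokenized together as a pair and returned under the `labels` key; `input_ids` still contains only `sent1` and `sent2`.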

In [38]:
raw_datasets['train'][15]

{'sentence1': 'Rudder was most recently senior vice president for the Developer & Platform Evangelism Business .',
 'sentence2': 'Senior Vice President Eric Rudder , formerly head of the Developer and Platform Evangelism unit , will lead the new entity .',
 'label': 0,
 'idx': 16}

In [39]:
sentence1 = raw_datasets['train'][15]['sentence1']; sentence1

'Rudder was most recently senior vice president for the Developer & Platform Evangelism Business .'

In [40]:
sentence2 = raw_datasets['train'][15]['sentence2']; sentence2

'Senior Vice President Eric Rudder , formerly head of the Developer and Platform Evangelism unit , will lead the new entity .'

In [41]:
inputs1 = tokenizer(sentence1); inputs1

{'input_ids': [101, 24049, 2001, 2087, 3728, 3026, 3580, 2343, 2005, 1996, 9722, 1004, 4132, 9340, 12439, 2964, 2449, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [42]:
tokenizer.decode(inputs1.input_ids)

'[CLS] rudder was most recently senior vice president for the developer & platform evangelism business. [SEP]'

In [43]:
inputs2 = tokenizer(sentence2); inputs2

{'input_ids': [101, 3026, 3580, 2343, 4388, 24049, 1010, 3839, 2132, 1997, 1996, 9722, 1998, 4132, 9340, 12439, 2964, 3131, 1010, 2097, 2599, 1996, 2047, 9178, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [44]:
tokenizer.decode(inputs2.input_ids)

'[CLS] senior vice president eric rudder, formerly head of the developer and platform evangelism unit, will lead the new entity. [SEP]'

In [45]:
inputs = tokenizer(sentence1, sentence2); inputs

{'input_ids': [101, 24049, 2001, 2087, 3728, 3026, 3580, 2343, 2005, 1996, 9722, 1004, 4132, 9340, 12439, 2964, 2449, 1012, 102, 3026, 3580, 2343, 4388, 24049, 1010, 3839, 2132, 1997, 1996, 9722, 1998, 4132, 9340, 12439, 2964, 3131, 1010, 2097, 2599, 1996, 2047, 9178, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [46]:
tokenizer.decode(inputs.input_ids)

'[CLS] rudder was most recently senior vice president for the developer & platform evangelism business. [SEP] senior vice president eric rudder, formerly head of the developer and platform evangelism unit, will lead the new entity. [SEP]'

As we can see, there is a `[SEP]` token between the two sentences, and the `token_type_ids` are 0 for the first sentence and 1 for the second.

In [47]:
# Tokenize every (sentence1, sentence2) pair of the training split in one call.
# NOTE(review): padding=True here presumably pads every pair to the longest one
# in the whole split; the map-based tokenize_function below omits padding.
tokenized_text = tokenizer(raw_datasets["train"]["sentence1"], raw_datasets["train"]["sentence2"], truncation=True, padding=True)

In [48]:
tokenized_text.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [49]:
len(raw_datasets["train"]["sentence1"])

3668

In [50]:
len(raw_datasets["train"]["sentence2"])

3668

In [51]:
def tokenize_function(example):
    """Tokenize the ``sentence1`` / ``sentence2`` fields of ``example`` as a pair.

    Truncation is enabled; no padding is applied here.

    Args:
        example: Mapping that provides ``"sentence1"`` and ``"sentence2"``.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the pair.
    """
    first, second = example["sentence1"], example["sentence2"]
    return tokenizer(first, second, truncation=True)

In [53]:
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True); tokenized_datasets

Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3668/3668 [00:01<00:00, 1906.20 examples/s]
Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 408/408 [00:00<00:00, 2347.31 examples/s]
Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1725/1725 [00:00<00:00, 2930.74 examples/s]


DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

## Dynamic Padding

In [58]:
from transformers import DataCollatorWithPadding

In [59]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [60]:
samples = tokenized_datasets["train"][:8]

In [62]:
# Keep only the fields handed to the collator; `idx` and the raw text columns
# are dropped.
samples = {
    key: samples[key]
    for key in samples
    if key not in ("idx", "sentence1", "sentence2")
}

In [63]:
[len(x) for x in samples["input_ids"]]

[50, 59, 47, 67, 59, 50, 62, 32]

In [64]:
batch = data_collator(samples)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [65]:
{k: v.shape for k, v in batch.items()}

{'input_ids': torch.Size([8, 67]),
 'token_type_ids': torch.Size([8, 67]),
 'attention_mask': torch.Size([8, 67]),
 'labels': torch.Size([8])}