# Processing the data

In [1]:
import torch
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Identifier of the pretrained checkpoint; the tokenizer and the model are both
# loaded from this same name.
checkpoint = "bert-base-uncased"

In [3]:
# Load the tokenizer that matches `checkpoint` (this may download its files).
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading (…)okenizer_config.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 28.0/28.0 [00:00<00:00, 2.80kB/s]
Downloading (…)lve/main/config.json: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 570/570 [00:00<00:00, 142kB/s]
Downloading (…)solve/main/vocab.txt: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 232k/232k [00:00<00:00, 1.69MB/s]
Downloading (…)/main/tokenizer.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 466k/466k [00:00<00:00, 3.39MB/s]


In [4]:
# Load the pretrained weights into a sequence-classification model.
# The classification head ('classifier.weight', 'classifier.bias') is not part
# of the checkpoint and is newly initialized, so the model has to be trained on
# a downstream task before its predictions are meaningful.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

Downloading model.safetensors: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 440M/440M [00:17<00:00, 25.4MB/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Two example sentences that serve as a tiny training batch.
sequences = [
    "Trust yourself. You know more than you think you do.",
    "The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.",
]

In [6]:
# Tokenize both sentences in a single call: the shorter one is padded so the
# two rows have the same length, over-long inputs are truncated, and the
# result is returned as PyTorch tensors.
batch = tokenizer(
    sequences,
    padding=True,
    truncation=True,
    return_tensors="pt",
)

In [8]:
# Attach one target class per sentence under the 'labels' key so the model
# output includes a loss.
batch["labels"] = torch.tensor([1, 1])

In [9]:
# Display the batch: input_ids, token_type_ids, attention_mask and labels.
batch

{'input_ids': tensor([[  101,  3404,  4426,  1012,  2017,  2113,  2062,  2084,  2017,  2228,
          2017,  2079,  1012,   102,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [  101,  1996,  2711,  1010,  2022,  2009, 10170,  2030,  3203,  1010,
          2040,  2038,  2025,  5165,  1999,  1037,  2204,  3117,  1010,  2442,
          2022,  2046,  3917,  8231,  5236,  1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1]]), 'labels': tensor([1, 1])}

In [10]:
# Use PyTorch's own AdamW. The `AdamW` class shipped with transformers is
# deprecated (and no longer available in recent releases); torch.optim.AdamW
# is its recommended replacement.
# NOTE(review): default hyperparameters (e.g. weight decay, eps) may differ
# from the transformers implementation — pass them explicitly if exact parity
# with earlier runs matters.
optimizer = torch.optim.AdamW(model.parameters())



In [12]:
# Forward pass on the labelled batch; the training loss is read from the
# model output.
outputs = model(**batch)
loss = outputs.loss

In [14]:
# Backpropagate: compute the gradients of the loss for the optimizer step.
loss.backward()

In [15]:
# Apply one parameter update using the gradients computed by backward().
optimizer.step()
# Reset the gradients afterwards: PyTorch accumulates them across backward()
# calls, so re-running the training cells would otherwise mix stale gradients
# into the next update.
optimizer.zero_grad()

In [16]:
# Recompute the loss on the same batch after the update step to see its new value.
model(**batch).loss

tensor(0.0957, grad_fn=<NllLossBackward0>)

## Loading a dataset from the hub

In [19]:
from datasets import load_dataset

In [20]:
# Fetch the 'mrpc' configuration of the 'glue' dataset.
raw_datasets = load_dataset(path="glue", name="mrpc")

Downloading builder script: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 28.8k/28.8k [00:00<00:00, 8.96MB/s]
Downloading metadata: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 28.7k/28.7k [00:00<00:00, 7.54MB/s]
Downloading readme: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 27.9k/27.9k [00:00<00:00, 6.76MB/s]
Downloading data files:   0%|                                                                                                                                                         | 0/3 [00:00<?, ?it/s]
Downloading data: 6.22kB [00:00, 1.08MB/s]
Downloading data files:  33%|████████████████████████████████████████████████▎                                                           

In [21]:
# Show the available splits with their columns and row counts.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [22]:
# Keep a handle on the training split for the inspections that follow.
raw_train_dataset = raw_datasets["train"]

In [26]:
# Indexing a split with an integer returns one example as a dict of column -> value.
raw_train_dataset[0]

{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
 'label': 1,
 'idx': 0}

In [24]:
# Column types of the split. 'label' is a ClassLabel, so the integer labels map
# to names by position: 0 -> 'not_equivalent', 1 -> 'equivalent'.
raw_train_dataset.features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),
 'idx': Value(dtype='int32', id=None)}

In [34]:
# Look at the training example at position 15. The dataset's own 'idx' column
# is not the positional index (this example has idx 16).
raw_datasets["train"][15]

{'sentence1': 'Rudder was most recently senior vice president for the Developer & Platform Evangelism Business .',
 'sentence2': 'Senior Vice President Eric Rudder , formerly head of the Developer and Platform Evangelism unit , will lead the new entity .',
 'label': 0,
 'idx': 16}

In [35]:
# Look at the validation example at position 87 (its 'idx' column is 812).
raw_datasets["validation"][87]

{'sentence1': 'However , EPA officials would not confirm the 20 percent figure .',
 'sentence2': 'Only in the past few weeks have officials settled on the 20 percent figure .',
 'label': 0,
 'idx': 812}