<a href="https://colab.research.google.com/github/aleexarias/deeplearning/blob/main/chapter3/section2_pt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Processing the data (PyTorch)

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [1]:
!pip install datasets evaluate transformers[sentencepiece]

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m5.

In [2]:
import torch
# AdamW comes from PyTorch: the copy that used to live in `transformers` is
# deprecated and no longer importable from recent releases. The PyTorch
# defaults are not identical to the old class (e.g. weight decay), which is
# fine for this one-step demonstration.
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Same as before: load tokenizer + model and tokenize a small padded batch.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "This course is amazing!",
]
batch = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")

# This is new: attach one label per sequence so the model output carries a loss.
batch["labels"] = torch.tensor([1, 1])

# One optimisation step: forward pass, backpropagation, parameter update.
optimizer = AdamW(model.parameters())
loss = model(**batch).loss
loss.backward()
optimizer.step()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
from datasets import load_dataset

# Fetch the "mrpc" configuration of the "glue" dataset; the trailing bare
# expression displays the resulting DatasetDict and its splits.
raw_datasets = load_dataset(path="glue", name="mrpc")
raw_datasets

README.md:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/649k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/75.7k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/308k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3668 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/408 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [22]:
# Keep a handle on the training split and print its first ten examples,
# one dictionary per line.
raw_train_dataset = raw_datasets["train"]
for row_index in range(10):
    print(raw_train_dataset[row_index])

{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .', 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .', 'label': 1, 'idx': 0}
{'sentence1': "Yucaipa owned Dominick 's before selling the chain to Safeway in 1998 for $ 2.5 billion .", 'sentence2': "Yucaipa bought Dominick 's in 1995 for $ 693 million and sold it to Safeway for $ 1.8 billion in 1998 .", 'label': 0, 'idx': 1}
{'sentence1': 'They had published an advertisement on the Internet on June 10 , offering the cargo for sale , he added .', 'sentence2': "On June 10 , the ship 's owners had published an advertisement on the Internet , offering the explosives for sale .", 'label': 1, 'idx': 2}
{'sentence1': 'Around 0335 GMT , Tab shares were up 19 cents , or 4.4 % , at A $ 4.56 , having earlier set a record high of A $ 4.57 .', 'sentence2': 'Tab shares jumped 20 cents , or 4.6 % , to set a rec

In [24]:
# Keep a handle on the validation split and display the example stored at
# index 87 (shown as a dictionary of its columns).
raw_validation_dataset = raw_datasets["validation"]
raw_validation_dataset[87]

{'sentence1': 'However , EPA officials would not confirm the 20 percent figure .',
 'sentence2': 'Only in the past few weeks have officials settled on the 20 percent figure .',
 'label': 0,
 'idx': 812}

In [5]:
# Display the type of every column in the training split, including the
# names attached to the integer `label` column.
raw_train_dataset.features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),
 'idx': Value(dtype='int32', id=None)}

In [6]:
from transformers import AutoTokenizer

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Tokenize the two sentence columns of the training split independently:
# first every "sentence1", then every "sentence2".
tokenized_sentences_1, tokenized_sentences_2 = (
    tokenizer(raw_datasets["train"][column])
    for column in ("sentence1", "sentence2")
)

In [29]:
# Tokenize two sentence pairs in one call: the i-th entry of `text` is paired
# with the i-th entry of `text_pair`, the shorter pair is padded, and the
# result is returned as PyTorch tensors.
inputs = tokenizer(
    text=["This is the first very long sentence.", "This is the second one."],
    text_pair=["testing one", "testing two"],
    padding=True,
    return_tensors="pt",
)
inputs

{'input_ids': tensor([[ 101, 2023, 2003, 1996, 2034, 2200, 2146, 6251, 1012,  102, 5604, 2028,
          102],
        [ 101, 2023, 2003, 1996, 2117, 2028, 1012,  102, 5604, 2048,  102,    0,
            0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1],
        [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])}

In [31]:
# Encode training example 15 three ways -- first sentence alone, second
# sentence alone, then both together as a pair -- and print each encoding.
pair = raw_datasets["train"][15]
for tokenizer_args in (
    (pair["sentence1"],),
    (pair["sentence2"],),
    (pair["sentence1"], pair["sentence2"]),
):
    print(tokenizer(*tokenizer_args))

{'input_ids': [101, 24049, 2001, 2087, 3728, 3026, 3580, 2343, 2005, 1996, 9722, 1004, 4132, 9340, 12439, 2964, 2449, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
{'input_ids': [101, 3026, 3580, 2343, 4388, 24049, 1010, 3839, 2132, 1997, 1996, 9722, 1998, 4132, 9340, 12439, 2964, 3131, 1010, 2097, 2599, 1996, 2047, 9178, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
{'input_ids': [101, 24049, 2001, 2087, 3728, 3026, 3580, 2343, 2005, 1996, 9722, 1004, 4132, 9340, 12439, 2964, 2449, 1012, 102, 3026, 3580, 2343, 4388, 24049, 1010, 3839, 2132, 1997, 1996, 9722, 1998, 4132, 9340, 12439, 2964, 3131, 1010, 2097, 2599, 1996, 2047, 9178, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [8]:
# Encode one short sentence pair right here rather than reusing `inputs`:
# in notebook order `inputs` is a padded 2-D batch tensor, while
# convert_ids_to_tokens needs a flat sequence of ids. Keeping the encoding
# local makes this cell work on a fresh "Restart & Run All".
pair_encoding = tokenizer("This is the first sentence.", "This is the second one.")
tokenizer.convert_ids_to_tokens(pair_encoding["input_ids"])

['[CLS]',
 'this',
 'is',
 'the',
 'first',
 'sentence',
 '.',
 '[SEP]',
 'this',
 'is',
 'the',
 'second',
 'one',
 '.',
 '[SEP]']

In [61]:
# Tokenizing the whole training split in a single call works, but it returns
# a plain dictionary (keys input_ids, attention_mask and token_type_ids, with
# lists of lists as values) and it only works if the entire tokenized
# dataset fits in RAM.
tokenized_dataset = tokenizer(
    *(raw_datasets["train"][column] for column in ("sentence1", "sentence2")),
    padding=True,
    truncation=True,
)

In [62]:
# Fixed-size padding is sometimes preferable, because a model may need to
# receive batches that all share the same shape.
def tokenize_function(example):
    """Tokenize a sentence pair, truncating and padding it to 128 tokens.

    `example` must provide the "sentence1" and "sentence2" entries; the return
    value is whatever the module-level `tokenizer` produces for that pair.
    """
    fixed_length_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 128,
    }
    return tokenizer(
        example["sentence1"], example["sentence2"], **fixed_length_options
    )

In [63]:
# Apply tokenize_function to every split; the new tokenizer columns are added
# next to the original ones (see the first printed column listing).
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
print(tokenized_datasets.column_names)
# Drop the raw-text and index columns, keeping only the label and the
# tokenizer outputs.
tokenized_datasets = tokenized_datasets.remove_columns(["idx", "sentence1", "sentence2"])
print(tokenized_datasets.column_names)
# Rename "label" to "labels", the key name used when a batch is passed to the model.
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
print(tokenized_datasets.column_names)
# Same example before and after switching the output format to PyTorch tensors.
print(tokenized_datasets["train"][0])
tokenized_datasets = tokenized_datasets.with_format("torch")
print(tokenized_datasets["train"][0])

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

{'train': ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'], 'validation': ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'], 'test': ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask']}
{'train': ['label', 'input_ids', 'token_type_ids', 'attention_mask'], 'validation': ['label', 'input_ids', 'token_type_ids', 'attention_mask'], 'test': ['label', 'input_ids', 'token_type_ids', 'attention_mask']}
{'train': ['labels', 'input_ids', 'token_type_ids', 'attention_mask'], 'validation': ['labels', 'input_ids', 'token_type_ids', 'attention_mask'], 'test': ['labels', 'input_ids', 'token_type_ids', 'attention_mask']}
{'labels': 1, 'input_ids': [101, 2572, 3217, 5831, 5496, 2010, 2567, 1010, 3183, 2002, 2170, 1000, 1996, 7409, 1000, 1010, 1997, 9969, 4487, 23809, 3436, 2010, 3350, 1012, 102, 7727, 2000, 2032, 2004, 2069, 1000, 1996, 7409, 1000, 1010, 2572, 3217, 5831, 5496, 2

In [65]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    tokenized_datasets["train"],
    batch_size=16,
    shuffle=True,
)

# Look at the first four batches: every one has the full fixed width, which
# shows how wasteful it is to pad all samples to the maximum size.
for step, batch in enumerate(train_dataloader):
    input_ids = batch["input_ids"]
    print(input_ids.shape)
    print(input_ids)
    if step >= 3:
        break

torch.Size([16, 128])
tensor([[  101, 10134,  2360,  ...,     0,     0,     0],
        [  101,  2216,  2920,  ...,     0,     0,     0],
        [  101,  5912,  2018,  ...,     0,     0,     0],
        ...,
        [  101, 22767, 15723,  ...,     0,     0,     0],
        [  101,  1000,  1996,  ...,     0,     0,     0],
        [  101,  3648,  6316,  ...,     0,     0,     0]])
torch.Size([16, 128])
tensor([[ 101, 1037, 2261,  ...,    0,    0,    0],
        [ 101, 1000, 2057,  ...,    0,    0,    0],
        [ 101, 3892, 1010,  ...,    0,    0,    0],
        ...,
        [ 101, 5564, 1010,  ...,    0,    0,    0],
        [ 101, 2088, 9006,  ...,    0,    0,    0],
        [ 101, 2572, 2094,  ...,    0,    0,    0]])
torch.Size([16, 128])
tensor([[  101,  8418, 20924,  ...,     0,     0,     0],
        [  101,  1996, 17615,  ...,     0,     0,     0],
        [  101,  1996,  2171,  ...,     0,     0,     0],
        ...,
        [  101,  1996,  1055,  ...,     0,     0,     0],
 

In [66]:
# Padding every sample to the maximum length is inefficient. It is better to
# pad while building a batch, because then we only pad to the longest sample
# of that batch rather than to the maximum length of the entire dataset.
def tokenize_function(example):
    """Tokenize a sentence pair with truncation and without any padding.

    `example` must provide the "sentence1" and "sentence2" entries; the return
    value is whatever the module-level `tokenizer` produces for that pair.
    """
    first, second = example["sentence1"], example["sentence2"]
    return tokenizer(first, second, truncation=True)

In [67]:
# Tokenize every split with the padding-free tokenize_function.
# batched=True passes several examples to the function in each call instead
# of a single one. The bare expression displays the resulting DatasetDict.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_datasets

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

In [69]:
# Rebuild `tokenized_datasets` from the raw data so that it holds every
# original column plus the tokenizer outputs before later cells drop or
# rename columns.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

# Dynamic Padding

In [46]:
from transformers import DataCollatorWithPadding

# The collator needs the tokenizer to pad with; it pads the samples it is
# given to a common length when a batch is assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [47]:
# Take the first eight training examples, leave out the raw-text and index
# columns, and display the token count of each remaining example.
samples = {
    column: values
    for column, values in tokenized_datasets["train"][:8].items()
    if column not in ("idx", "sentence1", "sentence2")
}
list(map(len, samples["input_ids"]))

[50, 59, 47, 67, 59, 50, 62, 32]

In [48]:
# Collate the samples into one padded batch and display the shape of every
# tensor it contains.
batch = data_collator(samples)
{name: tensor.shape for name, tensor in batch.items()}

{'input_ids': torch.Size([8, 67]),
 'token_type_ids': torch.Size([8, 67]),
 'attention_mask': torch.Size([8, 67]),
 'labels': torch.Size([8])}

In [70]:
# A more realistic preparation: drop the columns that are not fed to the
# model, rename "label" to "labels", and return PyTorch tensors.
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(["idx", "sentence1", "sentence2"])
    .rename_column("label", "labels")
    .with_format("torch")
)
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

In [71]:
from transformers import DataCollatorWithPadding
from torch.utils.data import DataLoader

# The collator pads each batch as it is built, so the batch width varies.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    batch_size=16,
    shuffle=True,
    collate_fn=data_collator,
)

# Print the input_ids shape of the first seven batches.
for step, batch in enumerate(train_dataloader):
    print(batch["input_ids"].shape)
    if step >= 6:
        break

torch.Size([16, 87])
torch.Size([16, 73])
torch.Size([16, 74])
torch.Size([16, 77])
torch.Size([16, 80])
torch.Size([16, 79])
torch.Size([16, 81])
