# Full Training

In [2]:
# !pip install evaluate
# !pip install datasets
# !pip install transformers

In [3]:
import torch
import evaluate
from tqdm.auto import tqdm
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, AdamW, get_scheduler

  from pandas.core.computation.check import NUMEXPR_INSTALLED


## Select Device

In [4]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu"); device

device(type='cuda')

## Load Dataset

In [5]:
datasets = load_dataset("glue", "sst2")

In [6]:
datasets

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [7]:
num_labels = len(set(datasets['train']['label'])); num_labels

2

## Tokenization

In [8]:
checkpoint = 'bert-base-uncased'

In [9]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [10]:
def tokenize(sample):
    # we have to only tokenize the `sentence` here
    return tokenizer(sample['sentence'], truncation=True)

In [11]:
tokenized_datasets = datasets.map(tokenize, batched=True)

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

In [12]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1821
    })
})

In [13]:
tokenized_datasets = tokenized_datasets.remove_columns(['sentence', 'idx'])
tokenized_datasets = tokenized_datasets.rename_columns({'label': 'labels'})
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1821
    })
})

## DataLoader

In [14]:
data_collator = DataCollatorWithPadding(tokenizer)

In [15]:
train_ds = tokenized_datasets['train']
valid_ds = tokenized_datasets['validation']

In [16]:
train_dl = DataLoader(train_ds, batch_size = 32, collate_fn = data_collator)
valid_dl = DataLoader(valid_ds, batch_size = 64, collate_fn = data_collator)

In [17]:
for batch in train_dl:
    break
{k: v.shape for k, v in batch.items()}

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'labels': torch.Size([32]),
 'input_ids': torch.Size([32, 34]),
 'token_type_ids': torch.Size([32, 34]),
 'attention_mask': torch.Size([32, 34])}

## Scheduler

In [19]:
num_epochs = 3

In [20]:
num_training_steps = num_epochs * len(train_dl)

In [26]:
lr_scheduler = get_scheduler('linear',
                            optimizer,
                            num_warmup_steps = 0, 
                            num_training_steps= num_training_steps)

In [27]:
num_training_steps

6315

## Train

In [30]:
checkpoint = 'bert-base-uncased'

In [36]:
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels = num_labels).to(device)

optimizer = AdamW(model.parameters(), lr=5e-5)

progress = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dl:
        batch = { k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress.update(1)
    print(f'Training Loss: {loss}')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/6315 [00:00<?, ?it/s]

Training Loss: 0.19032524526119232
Training Loss: 0.05125846341252327
Training Loss: 0.05452456325292587


## Evaluation

In [37]:
metrics = evaluate.load("glue", "sst2")

In [38]:
model.eval()
for batch in valid_dl:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
    
    predictions = torch.argmax(outputs.logits, dim=-1)
    metrics.add_batch(predictions=predictions, references = batch['labels'])
    
metrics.compute()

{'accuracy': 0.8990825688073395}