In [1]:
import torch
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the 'mrpc' configuration of the 'glue' benchmark; the result is a
# DatasetDict with train / validation / test splits of sentence pairs.
dataset = load_dataset("glue", 'mrpc')

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [4]:
dataset['train'].features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),
 'idx': Value(dtype='int32', id=None)}

In [9]:
from transformers import AutoTokenizer, DataCollatorWithPadding

In [10]:
# Checkpoint name shared by the tokenizer and by the classification models
# loaded later in the notebook, so both always use the same vocabulary.
check_point = 'bert-base-cased'
# AutoTokenizer selects the tokenizer class that matches the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(check_point)

In [7]:
def tokenize(examples):
    """Tokenize MRPC sentence pairs for the Trainer-based section.

    Args:
        examples: A mapping with 'sentence1' and 'sentence2' entries (a single
            example or a batch of examples, as passed by `Dataset.map`).

    Returns:
        The tokenizer output for the pairs, truncated to at most 128 tokens
        and padded.

    NOTE(review): `padding=True` pads at tokenization time; the notebook also
    uses DataCollatorWithPadding later, which makes this padding redundant --
    confirm whether it is still wanted.
    """
    first_sentences = examples['sentence1']
    second_sentences = examples['sentence2']
    return tokenizer(
        first_sentences,
        second_sentences,
        padding=True,
        truncation=True,
        max_length=128,
    )

In [8]:
# Run `tokenize` over every split; batched=True hands the function a batch of
# examples per call. The result keeps the original columns and adds the
# tokenizer outputs (input_ids, token_type_ids, attention_mask).
tokenize_datasets = dataset.map(tokenize, batched=True)

Map: 100%|██████████| 408/408 [00:00<00:00, 8500.15 examples/s]


In [9]:
print(tokenize_datasets.column_names)

{'train': ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'], 'validation': ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'], 'test': ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask']}


In [16]:
# Keep only what the model consumes: drop the raw text and index columns,
# rename the target column to 'labels', and return torch tensors on access.
tokenize_datasets = (
    tokenize_datasets
    .remove_columns(['idx', 'sentence1', 'sentence2'])
    .rename_column('label', 'labels')
    .with_format('torch')
)

In [58]:
tokenize_datasets['train']

NameError: name 'tokenize_datasets' is not defined

In [18]:
small_train_dataset = tokenize_datasets['train'].select(range(100))

In [19]:
small_train_dataset

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 100
})

In [10]:
from torch.utils.data import DataLoader

In [21]:
# Training loader for the padded/truncated dataset.
# `tokenize` pads inside `map`, i.e. per map batch, so rows coming from
# different map batches may have different lengths. Once shuffled into the
# same mini-batch they cannot be stacked by the default collate function, so
# a padding collator is supplied to pad each mini-batch to a common length.
train_dataloader = DataLoader(
    tokenize_datasets['train'],
    batch_size=16,
    shuffle=True,
    collate_fn=DataCollatorWithPadding(tokenizer),
)

In [22]:
train_dataloader

<torch.utils.data.dataloader.DataLoader at 0x1e3a8897550>

# Training loop with the Trainer API

In [18]:
# Collator that pads the examples of a batch when the batch is assembled;
# it is handed to the Trainer below as its `data_collator`.
data_collator = DataCollatorWithPadding(tokenizer)

In [19]:
data_collator

DataCollatorWithPadding(tokenizer=BertTokenizerFast(name_or_path='bert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}, padding=True, max_length=None, pad_to_multiple_of=None, return_tensors='pt')

In [7]:
from transformers import AutoModelForSequenceClassification

In [11]:
# Load the pretrained encoder with a sequence-classification head.
# num_labels=2 matches the two MRPC classes (not_equivalent / equivalent).
# The classifier weights are newly initialized, so the model must be
# fine-tuned before its predictions are meaningful.
model = AutoModelForSequenceClassification.from_pretrained(check_point, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
from transformers import TrainingArguments
# Hyperparameters for the first Trainer run (no evaluation during training).
# Checkpoints and logs are written under the 'test-train' directory.
training_args = TrainingArguments(
    output_dir='test-train',
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
)

In [16]:
from transformers import Trainer

In [20]:
# Wire the model, hyperparameters, data splits and padding collator into a
# Trainer, then fine-tune. The value of `trainer.train()` (a TrainOutput) is
# left as the last expression so the notebook displays it.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenize_datasets['train'],
    eval_dataset=tokenize_datasets['validation'],
    tokenizer=tokenizer,
)
trainer.train()

Step,Training Loss
500,0.3968
1000,0.0895


TrainOutput(global_step=1150, training_loss=0.2162336196070132, metrics={'train_runtime': 233.5403, 'train_samples_per_second': 78.53, 'train_steps_per_second': 4.924, 'total_flos': 1045856167280400.0, 'train_loss': 0.2162336196070132, 'epoch': 5.0})

In [21]:
pred = trainer.predict(tokenize_datasets['validation'])

In [24]:
print(pred.predictions.shape, pred.label_ids.shape)

(408, 2) (408,)


In [28]:
import numpy as np
from datasets import load_metric
# GLUE/MRPC metric; for this task it reports accuracy and F1.
# NOTE(review): `load_metric` emits a deprecation-style warning in the output
# below -- consider migrating once the replacement package is available.
metric = load_metric("glue", "mrpc")
# pred.predictions holds one row of class scores per example (shape (408, 2));
# the arg-max over the last axis is the predicted class id.
preds = np.argmax(pred.predictions, axis=-1)
# Last expression: the metric dict is displayed as the cell output.
metric.compute(predictions=preds, references=pred.label_ids)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'accuracy': 0.8480392156862745, 'f1': 0.8912280701754387}

In [29]:
metric = load_metric("glue", "mrpc")


def compute_metrics(eval_preds):
    """Turn raw evaluation outputs into the GLUE/MRPC metric dict.

    Args:
        eval_preds: A pair ``(logits, labels)`` as unpacked below; logits are
            per-class scores along the last axis.

    Returns:
        Whatever ``metric.compute`` returns for the arg-max predictions
        compared against the labels.
    """
    class_scores, gold_labels = eval_preds
    predicted_classes = np.argmax(class_scores, axis=-1)
    return metric.compute(predictions=predicted_classes, references=gold_labels)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [ ]:
# Same hyperparameters as the first run, plus an evaluation pass at the end
# of every epoch so that `compute_metrics` is reported during training.
training_args = TrainingArguments(
    output_dir='test-train',
    evaluation_strategy='epoch',
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
)

In [ ]:
# Trainer configured like the first one, with `compute_metrics` attached so
# each evaluation reports the GLUE/MRPC scores. Training is not started here.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenize_datasets['train'],
    eval_dataset=tokenize_datasets['validation'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Custom training loop

In [5]:
def tokenize_function(example):
    """Tokenize sentence pairs without padding.

    Padding is deliberately left out here; in this section it is applied per
    mini-batch by the data collator given to the DataLoaders.

    Args:
        example: A mapping with 'sentence1' and 'sentence2' entries (one
            example or a batch of examples).

    Returns:
        The tokenizer output for the pair(s), truncated where necessary.
    """
    first_sentence = example['sentence1']
    second_sentence = example['sentence2']
    return tokenizer(first_sentence, second_sentence, truncation=True)

In [12]:
# Tokenize every split with the unpadded `tokenize_function`; sequences keep
# their own lengths and are padded later, when batches are collated.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 408/408 [00:00<00:00, 5745.58 examples/s]


In [13]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

In [14]:
# Drop the columns the model cannot consume and rename the target column to
# 'labels', the keyword the model's forward pass expects.
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(['idx', 'sentence1', 'sentence2'])
    .rename_column('label', 'labels')
)
# In-place switch: items are returned as torch tensors from now on.
tokenized_datasets.set_format('torch')

In [15]:
data_collator = DataCollatorWithPadding(tokenizer)

In [16]:
from torch.utils.data import DataLoader

In [17]:
train_dataloader = DataLoader(tokenized_datasets['train'], batch_size=8, shuffle=True, collate_fn=data_collator)

In [18]:
# Validation loader. Shuffling is disabled: it has no effect on the aggregated
# metrics and keeping dataset order makes predictions traceable to examples.
eval_dataloader = DataLoader(tokenized_datasets['validation'], batch_size=8, shuffle=False, collate_fn=data_collator)

In [29]:
# Sanity check: show the tensor shapes of the first collated batch only.
for batch in train_dataloader:
    shapes = {name: tensor.shape for name, tensor in batch.items()}
    print(shapes, end='\n')
    break

{'labels': torch.Size([8]), 'input_ids': torch.Size([8, 62]), 'token_type_ids': torch.Size([8, 62]), 'attention_mask': torch.Size([8, 62])}


In [34]:
# Rebind `model` to a freshly loaded checkpoint so the custom loop does not
# continue from whatever `model` was trained on in earlier cells.
model = AutoModelForSequenceClassification.from_pretrained(check_point, num_labels=2)

In [19]:
# Use PyTorch's AdamW: the `transformers.AdamW` class is deprecated in favor
# of `torch.optim.AdamW`. `eps` and `weight_decay` are passed explicitly with
# the values the transformers implementation defaulted to, so the optimizer
# behaves as it did before the switch.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)



In [20]:
from transformers import get_scheduler
# Number of passes over the training set.
num_epoch = 3
# One optimizer step per batch, so total steps = epochs * batches per epoch.
num_training_steps = num_epoch * len(train_dataloader)
# "linear" schedule with no warmup, spanning the whole training run.
lr_schedule = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

In [21]:
import torch

In [22]:
# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Moves the model's parameters to `device`; the returned module is displayed.
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [24]:
# `tqdm.auto` picks the notebook or console progress bar as appropriate.
from tqdm.auto import tqdm

# The bar is advanced manually with `progress.update(1)` in the training
# loops, so only the total is needed -- there is no iterable to wrap.
progress = tqdm(total=num_training_steps)

  0%|          | 0/1377 [00:00<?, ?it/s]

In [42]:


# Manual fine-tuning loop: one optimizer step and one scheduler step per batch.
model.train()  # enable training-mode behavior such as dropout
for epoch in range(num_epoch):
    for batch in train_dataloader:
        # Move every tensor of the batch to the device the model lives on.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        # The batch contains 'labels', so the model output carries the loss.
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_schedule.step()
        # Clear gradients so they do not accumulate into the next step.
        optimizer.zero_grad()
        progress.update(1)


 33%|███▎      | 459/1377 [10:27<20:54,  1.37s/it]

  0%|          | 1/1377 [00:00<07:23,  3.11it/s][A
  0%|          | 3/1377 [00:00<03:27,  6.63it/s][A
  0%|          | 5/1377 [00:00<02:36,  8.74it/s][A
  1%|          | 7/1377 [00:00<02:19,  9.80it/s][A
  1%|          | 9/1377 [00:00<02:08, 10.63it/s][A
  1%|          | 11/1377 [00:01<02:08, 10.63it/s][A
  1%|          | 13/1377 [00:01<02:04, 10.93it/s][A
  1%|          | 15/1377 [00:01<02:01, 11.19it/s][A
  1%|          | 17/1377 [00:01<01:57, 11.58it/s][A
  1%|▏         | 19/1377 [00:01<02:00, 11.29it/s][A
  2%|▏         | 21/1377 [00:02<01:58, 11.42it/s][A
  2%|▏         | 23/1377 [00:02<01:55, 11.76it/s][A
  2%|▏         | 25/1377 [00:02<01:57, 11.53it/s][A
  2%|▏         | 27/1377 [00:02<01:56, 11.62it/s][A
  2%|▏         | 29/1377 [00:02<01:55, 11.63it/s][A
  2%|▏         | 31/1377 [00:02<01:56, 11.58it/s][A
  2%|▏         | 33/1377 [00:03<01:56, 11.53it/s][A
  3%|▎         | 35/1377 [00:03<01:56, 11.55it/s][

In [45]:
from datasets import load_metric
# Evaluate on the validation split, accumulating predictions batch by batch.
metric = load_metric('glue', 'mrpc')
model.eval()  # inference-mode behavior (e.g. dropout disabled)
for batch in eval_dataloader:
    batch = {name: tensor.to(device) for name, tensor in batch.items()}
    # No gradients are needed for the forward pass during evaluation.
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = logits.argmax(dim=-1)  # predicted class id per example
    metric.add_batch(predictions=predictions, references=batch['labels'])

# Last expression: the aggregated metric dict is displayed.
metric.compute()

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'accuracy': 0.8602941176470589, 'f1': 0.9028960817717206}

# With Accelerate

In [25]:
from accelerate import Accelerator

accelerator = Accelerator()

# Let Accelerate wrap everything that takes part in a training step.
# The scheduler is prepared too: its total step count was computed from the
# un-sharded dataloader, and the prepared scheduler keeps it in step with the
# prepared optimizer when training is spread over several processes.
model, optimizer, train_dataloader, lr_schedule = accelerator.prepare(
    model, optimizer, train_dataloader, lr_schedule
)

model.train()
for epoch in range(num_epoch):
    for batch in train_dataloader:
        # Batches from a prepared dataloader are already on the right device.
        outputs = model(**batch)
        loss = outputs.loss
        # Use the accelerator's backward instead of loss.backward().
        accelerator.backward(loss)

        optimizer.step()
        lr_schedule.step()
        optimizer.zero_grad()
        progress.update(1)

100%|█████████▉| 1376/1377 [02:18<00:00, 10.75it/s]

In [26]:
from datasets import load_metric
metric = load_metric('glue', 'mrpc')
model.eval()

# Prepared dataloaders place batches on the accelerator's device.
eval_dataloader = accelerator.prepare(eval_dataloader)
for batch in eval_dataloader:
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    # gather_for_metrics collects results from all processes and drops the
    # samples duplicated to fill the last distributed batch, which a plain
    # gather would count twice in the metric.
    predictions, references = accelerator.gather_for_metrics(
        (predictions, batch['labels'])
    )
    metric.add_batch(predictions=predictions, references=references)

metric.compute()

  metric = load_metric('glue', 'mrpc')
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'accuracy': 0.8333333333333334, 'f1': 0.8862876254180602}

In [30]:
predictions

tensor([0, 1, 1, 1, 1, 1, 1, 1], device='cuda:0')

In [55]:
# Loader over the test split (no shuffling, same padding collator).
test_dataloader = DataLoader(tokenized_datasets['test'], batch_size=8, collate_fn=data_collator)
# The former `test_dataloader = test_dataloader.remove` line was dropped:
# DataLoader has no `remove` attribute, so it raised AttributeError.
test_data = accelerator.prepare(test_dataloader)
# Peek at the first collated batch only.
for batch in test_data:
    print(batch)
    break

{'labels': tensor([1, 1, 1, 0, 0, 1, 0, 1], device='cuda:0'), 'input_ids': tensor([[  101,  7054,  1658,  2924,   112,   188,  2705,  3389,  2575,   117,
          2639, 25991,   117,  1105,  3230,  6492,   117,  1103,  2705,  2798,
          2575,   117,  1209,  2592,  2626,  1106,  1828,  1573,   119,   102,
          9493,  2534, 20961,  4124,  2639, 25991,  1105,  1990,  2534,  7748,
          4124,  3230,  6492,  1209,  2592,  1106,  1573,   119,   102,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0],
        [  101,  1109,  1362,   112,   188,  1160,  2026, 12365, 11877,  1163,
          1147,   158,   119,   156,   119,  3813,  5799,  1167,  1190, 10035,
          1314,  2370,  1112,   170,  1523,  2247,  3813, 27221,  2416,  1167,
          1104,  1126,  2380,  1171,  9936,  1190,  2637,   119,   102, 21057,
          3813,  1

In [134]:
dataset['test'][0]

{'sentence1': "PCCW 's chief operating officer , Mike Butcher , and Alex Arena , the chief financial officer , will report directly to Mr So .",
 'sentence2': 'Current Chief Operating Officer Mike Butcher and Group Chief Financial Officer Alex Arena will report to So .',
 'label': 1,
 'idx': 0}

In [135]:
# Tokenize one hand-written sentence pair with the same function used for the
# dataset, to try the fine-tuned model on custom input.
sn = tokenize_function({
    'sentence1':"hi my name is amqa and my",
    'sentence2':"I am amqa"
})

In [136]:
x = sn

In [137]:
# Convert each tokenizer output field into a torch tensor, in place.
for field in ('input_ids', 'token_type_ids', 'attention_mask'):
    x[field] = torch.tensor(x[field])

In [138]:
# Send the inputs to whichever device the model's parameters are on instead of
# a hard-coded 'cuda:0', so the cell also works on CPU-only machines.
target_device = next(model.parameters()).device
x = {key: value.to(target_device) for key, value in x.items()}

In [139]:
# Forward pass on the single example; unsqueeze(0) adds the batch dimension.
# Arguments are passed by keyword so the call does not rely on their order.
with torch.no_grad():
    out = model(
        input_ids=x['input_ids'].unsqueeze(0),
        attention_mask=x['attention_mask'].unsqueeze(0),
        token_type_ids=x['token_type_ids'].unsqueeze(0),
    )

In [140]:
torch.argmax(out.logits)

tensor(1, device='cuda:0')

In [126]:
print(x['input_ids'].unsqueeze(0).shape)
print(x['attention_mask'].shape)
print(x['token_type_ids'].shape)

torch.Size([1, 16])
torch.Size([16])
torch.Size([16])
