In [None]:
# Install the Hugging Face Hub client, then log in interactively
# (the CLI prompts for an access token).
!pip install huggingface-hub
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) y
Token is valid (permission: write

In [None]:
# Install the libraries imported by the training cells in this notebook.
!pip install datasets transformers torch




In [None]:


from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
import torch
from torch.utils.data import DataLoader

%env CUDA_LAUNCH_BLOCKING=1

snli = load_dataset("snli")

snli = snli.filter(lambda example: 0 <= example['label'] <= 2)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def encode_data(batch):
    """Tokenize the premise/hypothesis pairs of a batch with the module-level tokenizer.

    Args:
        batch: Mapping with 'premise' and 'hypothesis' entries.

    Returns:
        The tokenizer's output for the paired texts, padded to and truncated
        at max_length=128.
    """
    premises = batch['premise']
    hypotheses = batch['hypothesis']
    return tokenizer(
        premises,
        hypotheses,
        padding='max_length',
        truncation=True,
        max_length=128,
    )

# Tokenize every split in batches, using four worker processes.
snli = snli.map(encode_data, batched=True, num_proc=4)

# Return only the columns the model consumes, as torch tensors.
snli.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

train_subset, val_subset, test_subset = snli['train'], snli['validation'], snli['test']

# Settings shared by all three loaders; only the training loader shuffles.
loader_options = dict(batch_size=16, num_workers=2, pin_memory=True)
train_loader = DataLoader(train_subset, shuffle=True, **loader_options)
val_loader = DataLoader(val_subset, **loader_options)
test_loader = DataLoader(test_subset, **loader_options)

# Initialize the model with a 3-class classification head and move it to the
# GPU when one is available.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

# Use PyTorch's own AdamW: the `transformers.AdamW` class is deprecated in
# favour of it. eps and weight_decay are spelled out so the hyperparameters
# match the old class's defaults (1e-6 and 0.0) instead of silently picking up
# torch's defaults (1e-8 and 0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

from torch.cuda.amp import GradScaler, autocast

# `torch.cuda.amp.GradScaler()` is deprecated; `torch.amp.GradScaler('cuda')`
# is its replacement. Loss scaling is only enabled when training on CUDA.
scaler = torch.amp.GradScaler('cuda', enabled=(device == 'cuda'))

# Fine-tune for 10 epochs with mixed precision and print the mean per-batch
# training loss after every epoch.
use_amp = device == 'cuda'  # autocast is only switched on when running on GPU

model.train()
for epoch in range(10):
    total_train_loss = 0
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}

        optimizer.zero_grad()

        # torch.autocast replaces the deprecated torch.cuda.amp.autocast().
        with torch.autocast(device_type='cuda', enabled=use_amp):
            outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['label'])
            loss = outputs.loss

        # Scale the loss before backward, then let the scaler drive the
        # optimizer step and update its scale factor.
        scaler.scale(loss).backward()

        scaler.step(optimizer)
        scaler.update()

        total_train_loss += loss.item()

    avg_train_loss = total_train_loss / len(train_loader)
    print(f"Epoch {epoch + 1} - Training Loss: {avg_train_loss:.4f}")

def evaluate(loader):
    """Compute the average loss and accuracy of the module-level model on a loader.

    Puts the module-level ``model`` into eval mode (it is left in eval mode
    afterwards) and runs it without gradient tracking on ``device``.

    Args:
        loader: Iterable of batches; each batch is a mapping with
            'input_ids', 'attention_mask' and 'label' tensors.

    Returns:
        Tuple ``(avg_loss, accuracy)``: the per-example average loss and the
        fraction of examples whose argmax prediction equals the label.

    Raises:
        ValueError: If the loader yields no examples.
    """
    model.eval()
    correct = 0
    total = 0
    total_eval_loss = 0.0

    with torch.no_grad():
        for batch in loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['label'])

            # outputs.loss is expected to be the batch mean (the default
            # reduction of the Hugging Face classification heads), so weight
            # it by the batch size; otherwise a short final batch would count
            # as much as a full one.
            batch_size = batch['label'].size(0)
            total_eval_loss += outputs.loss.item() * batch_size

            predictions = outputs.logits.argmax(dim=1)
            correct += (predictions == batch['label']).sum().item()
            total += batch_size

    if total == 0:
        raise ValueError("evaluate() received a loader that yielded no examples")

    avg_eval_loss = total_eval_loss / total
    accuracy = correct / total
    return avg_eval_loss, accuracy

# Score the fine-tuned model on the validation split, then on the test split.
(val_loss, val_accuracy) = evaluate(loader=val_loader)
print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy * 100:.2f}%")

(test_loss, test_accuracy) = evaluate(loader=test_loader)
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy * 100:.2f}%")


env: CUDA_LAUNCH_BLOCKING=1


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/16.0k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/412k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/413k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/19.6M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/550152 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/550152 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Map (num_proc=4):   0%|          | 0/9824 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/9842 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/549367 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = GradScaler()
  with autocast():


Epoch 1 - Training Loss: 0.5096
Epoch 2 - Training Loss: 0.3291
Epoch 3 - Training Loss: 0.2267
Epoch 4 - Training Loss: 0.1596
Epoch 5 - Training Loss: 0.1231
Epoch 6 - Training Loss: 0.0973
Epoch 7 - Training Loss: 0.0822
Epoch 8 - Training Loss: 0.0681
Epoch 9 - Training Loss: 0.0570
Epoch 10 - Training Loss: 0.0511
Validation Loss: 0.5988, Validation Accuracy: 86.56%
Test Loss: 0.5958, Test Accuracy: 86.69%


In [None]:
from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
import torch
from torch.utils.data import DataLoader

In [None]:
from transformers import RobertaTokenizer
from transformers import RobertaForSequenceClassification
from torch.cuda.amp import GradScaler, autocast



In [None]:
%env CUDA_LAUNCH_BLOCKING=1


snli = load_dataset("snli")

snli = snli.filter(lambda example: 0 <= example['label'] <= 2)

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

def encode_data(batch):
    """Tokenize the premise/hypothesis pairs of a batch with the module-level tokenizer.

    Args:
        batch: Mapping with 'premise' and 'hypothesis' entries.

    Returns:
        The tokenizer's output for the paired texts, padded to and truncated
        at max_length=128.
    """
    premises = batch['premise']
    hypotheses = batch['hypothesis']
    return tokenizer(
        premises,
        hypotheses,
        padding='max_length',
        truncation=True,
        max_length=128,
    )

# Tokenize every split in batches, using four worker processes.
snli = snli.map(encode_data, batched=True, num_proc=4)

# Return only the columns the model consumes, as torch tensors.
snli.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

train_subset, val_subset, test_subset = snli['train'], snli['validation'], snli['test']

# Settings shared by all three loaders; only the training loader shuffles.
loader_options = dict(batch_size=16, num_workers=2, pin_memory=True)
train_loader = DataLoader(train_subset, shuffle=True, **loader_options)
val_loader = DataLoader(val_subset, **loader_options)
test_loader = DataLoader(test_subset, **loader_options)

# Load RoBERTa with a 3-class classification head and move it to the GPU when
# one is available.
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=3)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

# Use PyTorch's own AdamW: the `transformers.AdamW` class is deprecated in
# favour of it. eps=1e-6 is spelled out to match the old class's default
# (torch.optim.AdamW would otherwise use 1e-8).
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.01)
epochs = 10
total_steps = len(train_loader) * epochs
# Decay the learning rate linearly from its initial value to 0, one scheduler
# step per training batch.
scheduler = torch.optim.lr_scheduler.LinearLR(optimizer, start_factor=1.0, end_factor=0.0, total_iters=total_steps)

# `torch.cuda.amp.GradScaler()` is deprecated; `torch.amp.GradScaler('cuda')`
# is its replacement. Loss scaling is only enabled when training on CUDA.
scaler = torch.amp.GradScaler('cuda', enabled=(device == 'cuda'))

# Fine-tune for `epochs` epochs with mixed precision and a per-batch LR
# schedule, printing the mean per-batch training loss after every epoch.
use_amp = device == 'cuda'  # autocast is only switched on when running on GPU

model.train()
for epoch in range(epochs):
    total_train_loss = 0
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}

        optimizer.zero_grad()

        # torch.autocast replaces the deprecated torch.cuda.amp.autocast().
        with torch.autocast(device_type='cuda', enabled=use_amp):
            outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['label'])
            loss = outputs.loss

        scaler.scale(loss).backward()

        scale_before = scaler.get_scale()
        scaler.step(optimizer)
        scaler.update()

        # GradScaler skips optimizer.step() when it finds inf/NaN gradients and
        # then lowers its scale. Only advance the LR schedule when the
        # optimizer really stepped, so the schedule stays aligned with the
        # updates and PyTorch's "scheduler before optimizer" warning is avoided.
        if scaler.get_scale() >= scale_before:
            scheduler.step()
        total_train_loss += loss.item()

    avg_train_loss = total_train_loss / len(train_loader)
    print(f"Epoch {epoch + 1} - Training Loss: {avg_train_loss:.4f}")

def evaluate(loader):
    """Compute the average loss and accuracy of the module-level model on a loader.

    Puts the module-level ``model`` into eval mode (it is left in eval mode
    afterwards) and runs it without gradient tracking on ``device``.

    Args:
        loader: Iterable of batches; each batch is a mapping with
            'input_ids', 'attention_mask' and 'label' tensors.

    Returns:
        Tuple ``(avg_loss, accuracy)``: the per-example average loss and the
        fraction of examples whose argmax prediction equals the label.

    Raises:
        ValueError: If the loader yields no examples.
    """
    model.eval()
    correct = 0
    total = 0
    total_eval_loss = 0.0

    with torch.no_grad():
        for batch in loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['label'])

            # outputs.loss is expected to be the batch mean (the default
            # reduction of the Hugging Face classification heads), so weight
            # it by the batch size; otherwise a short final batch would count
            # as much as a full one.
            batch_size = batch['label'].size(0)
            total_eval_loss += outputs.loss.item() * batch_size

            predictions = outputs.logits.argmax(dim=1)
            correct += (predictions == batch['label']).sum().item()
            total += batch_size

    if total == 0:
        raise ValueError("evaluate() received a loader that yielded no examples")

    avg_eval_loss = total_eval_loss / total
    accuracy = correct / total
    return avg_eval_loss, accuracy

# Score the fine-tuned model on the validation split, then on the test split.
(val_loss, val_accuracy) = evaluate(loader=val_loader)
print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy * 100:.2f}%")

(test_loss, test_accuracy) = evaluate(loader=test_loader)
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy * 100:.2f}%")


env: CUDA_LAUNCH_BLOCKING=1


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = GradScaler()
  with autocast():


Epoch 1 - Training Loss: 0.4499
Epoch 2 - Training Loss: 0.3191
Epoch 3 - Training Loss: 0.2560
Epoch 4 - Training Loss: 0.2063
Epoch 5 - Training Loss: 0.1653
Epoch 6 - Training Loss: 0.1351
Epoch 7 - Training Loss: 0.1120
Epoch 8 - Training Loss: 0.0949
Epoch 9 - Training Loss: 0.0816
Epoch 10 - Training Loss: 0.0718
Validation Loss: 0.4706, Validation Accuracy: 89.82%
Test Loss: 0.4634, Test Accuracy: 89.67%


In [None]:
from transformers import DebertaTokenizer, DebertaForSequenceClassification


In [None]:

def _label_in_range(example):
    """Return True when the example's label is one of the class ids 0, 1 or 2."""
    return 0 <= example['label'] <= 2


# Load SNLI and drop every row whose label falls outside 0..2.
snli = load_dataset("snli").filter(_label_in_range)

# Tokenizer matching the 'microsoft/deberta-base' checkpoint.
tokenizer = DebertaTokenizer.from_pretrained('microsoft/deberta-base')

def encode_data(batch):
    """Tokenize the premise/hypothesis pairs of a batch with the module-level tokenizer.

    Args:
        batch: Mapping with 'premise' and 'hypothesis' entries.

    Returns:
        The tokenizer's output for the paired texts, padded to and truncated
        at max_length=128.
    """
    premises = batch['premise']
    hypotheses = batch['hypothesis']
    return tokenizer(
        premises,
        hypotheses,
        padding='max_length',
        truncation=True,
        max_length=128,
    )

# Tokenize every split in batches, using four worker processes.
snli = snli.map(encode_data, batched=True, num_proc=4)

# Return only the columns the model consumes, as torch tensors.
snli.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

train_subset, val_subset, test_subset = snli['train'], snli['validation'], snli['test']

# Settings shared by all three loaders; only the training loader shuffles.
loader_options = dict(batch_size=16, num_workers=2, pin_memory=True)
train_loader = DataLoader(train_subset, shuffle=True, **loader_options)
val_loader = DataLoader(val_subset, **loader_options)
test_loader = DataLoader(test_subset, **loader_options)

# Load DeBERTa with a 3-class classification head and move it to the GPU when
# one is available.
model = DebertaForSequenceClassification.from_pretrained('microsoft/deberta-base', num_labels=3)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

# Use PyTorch's own AdamW: the `transformers.AdamW` class is deprecated in
# favour of it. eps=1e-6 is spelled out to match the old class's default
# (torch.optim.AdamW would otherwise use 1e-8).
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.01)
epochs = 10
total_steps = len(train_loader) * epochs
# Decay the learning rate linearly from its initial value to 0, one scheduler
# step per training batch.
scheduler = torch.optim.lr_scheduler.LinearLR(optimizer, start_factor=1.0, end_factor=0.0, total_iters=total_steps)

# `torch.cuda.amp.GradScaler()` is deprecated; `torch.amp.GradScaler('cuda')`
# is its replacement. Loss scaling is only enabled when training on CUDA.
scaler = torch.amp.GradScaler('cuda', enabled=(device == 'cuda'))

# Fine-tune for `epochs` epochs with mixed precision and a per-batch LR
# schedule, printing the mean per-batch training loss after every epoch.
use_amp = device == 'cuda'  # autocast is only switched on when running on GPU

model.train()
for epoch in range(epochs):
    total_train_loss = 0
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}

        optimizer.zero_grad()

        # torch.autocast replaces the deprecated torch.cuda.amp.autocast().
        with torch.autocast(device_type='cuda', enabled=use_amp):
            outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['label'])
            loss = outputs.loss

        scaler.scale(loss).backward()

        scale_before = scaler.get_scale()
        scaler.step(optimizer)
        scaler.update()

        # GradScaler skips optimizer.step() when it finds inf/NaN gradients and
        # then lowers its scale. Only advance the LR schedule when the
        # optimizer really stepped, so the schedule stays aligned with the
        # updates and PyTorch's "scheduler before optimizer" warning is avoided.
        if scaler.get_scale() >= scale_before:
            scheduler.step()
        total_train_loss += loss.item()

    avg_train_loss = total_train_loss / len(train_loader)
    print(f"Epoch {epoch + 1} - Training Loss: {avg_train_loss:.4f}")

def evaluate(loader):
    """Compute the average loss and accuracy of the module-level model on a loader.

    Puts the module-level ``model`` into eval mode (it is left in eval mode
    afterwards) and runs it without gradient tracking on ``device``.

    Args:
        loader: Iterable of batches; each batch is a mapping with
            'input_ids', 'attention_mask' and 'label' tensors.

    Returns:
        Tuple ``(avg_loss, accuracy)``: the per-example average loss and the
        fraction of examples whose argmax prediction equals the label.

    Raises:
        ValueError: If the loader yields no examples.
    """
    model.eval()
    correct = 0
    total = 0
    total_eval_loss = 0.0

    with torch.no_grad():
        for batch in loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['label'])

            # outputs.loss is expected to be the batch mean (the default
            # reduction of the Hugging Face classification heads), so weight
            # it by the batch size; otherwise a short final batch would count
            # as much as a full one.
            batch_size = batch['label'].size(0)
            total_eval_loss += outputs.loss.item() * batch_size

            predictions = outputs.logits.argmax(dim=1)
            correct += (predictions == batch['label']).sum().item()
            total += batch_size

    if total == 0:
        raise ValueError("evaluate() received a loader that yielded no examples")

    avg_eval_loss = total_eval_loss / total
    accuracy = correct / total
    return avg_eval_loss, accuracy

# Score the fine-tuned model on the validation split, then on the test split.
(val_loss, val_accuracy) = evaluate(loader=val_loader)
print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy * 100:.2f}%")

(test_loss, test_accuracy) = evaluate(loader=test_loader)
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy * 100:.2f}%")


Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = GradScaler()
  with autocast():


Epoch 1 - Training Loss: 0.8882
Epoch 2 - Training Loss: 0.4025
Epoch 3 - Training Loss: 0.2848
Epoch 4 - Training Loss: 0.2110
Epoch 5 - Training Loss: 0.1586
Epoch 6 - Training Loss: 0.1205
Epoch 7 - Training Loss: 0.0974
Epoch 8 - Training Loss: 0.0803
Epoch 9 - Training Loss: 0.0668
Epoch 10 - Training Loss: 0.0552
Validation Loss: 0.5174, Validation Accuracy: 88.82%
Test Loss: 0.5481, Test Accuracy: 88.14%
