## Imports

In [None]:
import datasets
import pandas as pd

from datasets import load_dataset

## Dataset

In [None]:
# Load the 'cola' configuration of the 'glue' dataset; the returned object is
# indexed by split name in the cells that follow.
cola_dataset = load_dataset('glue', 'cola')

In [None]:
# Display the loaded dataset object to see which splits it contains.
cola_dataset

In [None]:
# Bind each split of the loaded dataset to its own name.
train_dataset, val_dataset, test_dataset = (
    cola_dataset['train'],
    cola_dataset['validation'],
    cola_dataset['test'],
)

In [None]:
# Number of examples per split, as a (train, validation, test) tuple.
tuple(len(split) for split in (train_dataset, val_dataset, test_dataset))

In [None]:
# Inspect the first example of the training split.
train_dataset[0]

In [None]:
# Inspect the first example of the validation split.
val_dataset[0]

In [None]:
# Inspect the first example of the test split.
# NOTE(review): GLUE test splits are usually distributed without gold labels —
# confirm what `label` holds here before relying on it.
test_dataset[0]

In [None]:
# Show the feature schema of the training split; its 'label' feature is used
# below to translate class names into integer ids.
train_dataset.features

In [None]:
# Resolve the integer id of the 'acceptable' class once, so the predicate only
# compares integers instead of repeating the name lookup for every example.
acceptable_id = train_dataset.features['label'].str2int('acceptable')
# Show the first five training examples labelled 'acceptable'.
train_dataset.filter(lambda example: example['label'] == acceptable_id)[:5]

In [None]:
# Resolve the integer id of the 'unacceptable' class once, so the predicate
# only compares integers instead of repeating the name lookup for every example.
unacceptable_id = train_dataset.features['label'].str2int('unacceptable')
# Show the first five training examples labelled 'unacceptable'.
train_dataset.filter(lambda example: example['label'] == unacceptable_id)[:5]

## Tokenizing

In [None]:
from transformers import AutoTokenizer

In [None]:
# Instantiate the tokenizer that belongs to the named pretrained checkpoint;
# `encode` below reads this module-level `tokenizer`.
tokenizer = AutoTokenizer.from_pretrained("google/bert_uncased_L-2_H-128_A-2")

In [None]:
# Re-bind the three splits from the loaded dataset. This repeats the bindings
# made in the Dataset section; presumably it lets the tokenizing cells be
# re-run from an untokenized `train_dataset` — confirm before removing.
train_dataset, val_dataset, test_dataset = (
    cola_dataset['train'],
    cola_dataset['validation'],
    cola_dataset['test'],
)

In [None]:
# Display the tokenizer object to inspect how it is configured.
tokenizer

In [None]:
# Print the raw first training sentence, then show the tokenizer's output for
# that same sentence so the two can be compared.
print(train_dataset[0]['sentence'])
tokenizer(train_dataset[0]['sentence'])

In [None]:
# Round trip: tokenize the first training sentence and decode its `input_ids`
# back into text to see what the tokenizer added or changed.
tokenizer.decode(tokenizer(train_dataset[0]['sentence'])['input_ids'])

In [None]:
def encode(examples, *, max_length=512):
    """Tokenize the ``"sentence"`` field of ``examples`` to a fixed length.

    Args:
        examples: Mapping with a ``"sentence"`` entry. Presumably a batch
            (a list of strings), since this function is mapped with
            ``batched=True`` — confirm against the caller.
        max_length: Length every sequence is truncated or padded to.
            Defaults to 512. Keyword-only, so it can be overridden with
            ``Dataset.map(encode, batched=True, fn_kwargs={"max_length": n})``
            without affecting existing positional callers.

    Returns:
        The output of the module-level ``tokenizer`` for these arguments.
    """
    return tokenizer(
        examples["sentence"],
        truncation=True,
        # Pad every sequence up to `max_length` so all rows share one length.
        padding="max_length",
        max_length=max_length,
    )

In [None]:
# Apply `encode` to the training split in batches and keep the result as the
# new `train_dataset`.
# NOTE(review): in the visible cells only the training split is encoded;
# `val_dataset` and `test_dataset` stay untokenized.
train_dataset = train_dataset.map(encode, batched=True)

## Formatting

In [None]:
import torch

In [None]:
# Switch the training split's output format to 'torch' for the three listed
# columns, which are the ones read from each batch further down.
# NOTE(review): columns not listed (e.g. 'sentence') are presumably no longer
# returned on access — confirm against the `datasets` documentation.
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

## Data Loader

In [None]:
# Wrap the formatted training split in a DataLoader that yields batches of 32.
# No `shuffle` or `collate_fn` is passed, so the DataLoader defaults apply.
dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=32)

In [None]:
# Pull a single batch from a fresh iterator to inspect its contents.
next(iter(dataloader))

In [None]:
# Walk the whole loader and print, per batch, the shapes of the three columns
# in the order input_ids, attention_mask, label.
for batch in dataloader:
    print(*(batch[column].shape for column in ('input_ids', 'attention_mask', 'label')))

In [None]:
from onnxruntime import get_all_providers

In [None]:
# Display whatever onnxruntime's `get_all_providers()` returns.
# NOTE(review): presumably the names of all known execution providers, not
# only those usable on this machine — confirm against the onnxruntime docs.
get_all_providers()