## Imports

In [1]:
import datasets
import pandas as pd
import random
from transformers import AutoTokenizer
from collections import Counter
from torch.utils.data import DataLoader

## Config

In [2]:
PATH = "glue"
NAME = "cola"
MODEL = "google/bert_uncased_L-2_H-128_A-2"
TOKENIZER_MAX_LENGTH = 512
BATCH_SIZE = 4

## Dataset

In [3]:
dataset = datasets.load_dataset(path=PATH, name=NAME)
dataset

Found cached dataset glue (C:/Users/nak142/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 8551
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1043
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1063
    })
})

In [4]:
for type in ['train', 'validation', 'test']:
    print(f'{type} set size: {len(dataset[type])}')

train_dset, valid_dset, test_dset = dataset['train'], dataset['validation'], dataset['test']

train set size: 8551
validation set size: 1043
test set size: 1063


In [5]:
for _ in range(5):
    idx = random.randint(0, len(train_dset))
    label_int = train_dset[idx]['label']
    print(f"Index: {train_dset[idx]['idx']}")
    print(f"Sentence: {train_dset[idx]['sentence']}")
    print(f"Label: {train_dset.features['label'].names[label_int]}")
    

Index: 1227
Sentence: I yawned.
Label: acceptable
Index: 1021
Sentence: What does who admire?
Label: unacceptable
Index: 6899
Sentence: Captain Wentworth recovered the property for Mrs Smith.
Label: acceptable
Index: 7805
Sentence: She liked Moya's football.
Label: unacceptable
Index: 2042
Sentence: I shipped the package halfway to the Antarctic.
Label: acceptable


In [6]:
print(f"Labels: {train_dset.features['label']}")
for dset in ['train', 'validation', 'test']: 
    counter = Counter(dataset[dset]['label'])
    print(f'{dset} set: {counter}')

Labels: ClassLabel(names=['unacceptable', 'acceptable'], id=None)
train set: Counter({1: 6023, 0: 2528})
validation set: Counter({1: 721, 0: 322})
test set: Counter({-1: 1063})


## Tokenizer

In [7]:
tokenizer = AutoTokenizer.from_pretrained(MODEL)
sentence = train_dset[0]['sentence']
# tokenize method returns the tokens as a list
tok_sentence = tokenizer.tokenize(
    text=sentence,
    truncation=True,
    max_length=TOKENIZER_MAX_LENGTH,
    padding="max_length")
# directly calling the tokenizer returns a dictionary with keys 'input_ids' and 'attention_mask'
tok_sentence_enc = tokenizer(
    text=sentence)
dec_tok_sentence = tokenizer.decode(tok_sentence_enc['input_ids'])
print(f"Original sentence: {sentence}")
print(f"Tokenized sentence: {tok_sentence}")
print(f"Tokenized sentence encoded: {tok_sentence_enc['input_ids']}")
print(f"Decoded sentence: {dec_tok_sentence}")

Original sentence: Our friends won't buy this analysis, let alone the next one we propose.
Tokenized sentence: ['our', 'friends', 'won', "'", 't', 'buy', 'this', 'analysis', ',', 'let', 'alone', 'the', 'next', 'one', 'we', 'propose', '.', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD

## Formatting

In [8]:
train_dset = train_dset.map(
    lambda x: tokenizer(
        x['sentence'], 
        truncation=True,
        max_length=TOKENIZER_MAX_LENGTH,
        padding="max_length"),
    batched=True)
train_dset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
print(train_dset[0])

  0%|          | 0/9 [00:00<?, ?ba/s]

{'label': tensor(1), 'input_ids': tensor([  101,  2256,  2814,  2180,  1005,  1056,  4965,  2023,  4106,  1010,
         2292,  2894,  1996,  2279,  2028,  2057, 16599,  1012,   102,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,  

In [9]:
train_dloader = DataLoader(
    train_dset,
    batch_size=BATCH_SIZE
)
next(iter(train_dloader))

{'label': tensor([1, 1, 1, 1]),
 'input_ids': tensor([[ 101, 2256, 2814,  ...,    0,    0,    0],
         [ 101, 2028, 2062,  ...,    0,    0,    0],
         [ 101, 2028, 2062,  ...,    0,    0,    0],
         [ 101, 1996, 2062,  ...,    0,    0,    0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]])}

In [10]:
print("input_ids, attention_mask, label")
for batch in train_dloader:
    print(batch['input_ids'].shape, batch['attention_mask'].shape, batch['label'].shape, )

input_ids, attention_mask, label
torch.Size([4, 512]) torch.Size([4, 512]) torch.Size([4])
torch.Size([4, 512]) torch.Size([4, 512]) torch.Size([4])
torch.Size([4, 512]) torch.Size([4, 512]) torch.Size([4])
torch.Size([4, 512]) torch.Size([4, 512]) torch.Size([4])
torch.Size([4, 512]) torch.Size([4, 512]) torch.Size([4])
torch.Size([4, 512]) torch.Size([4, 512]) torch.Size([4])
torch.Size([4, 512]) torch.Size([4, 512]) torch.Size([4])
torch.Size([4, 512]) torch.Size([4, 512]) torch.Size([4])
torch.Size([4, 512]) torch.Size([4, 512]) torch.Size([4])
torch.Size([4, 512]) torch.Size([4, 512]) torch.Size([4])
torch.Size([4, 512]) torch.Size([4, 512]) torch.Size([4])
torch.Size([4, 512]) torch.Size([4, 512]) torch.Size([4])
torch.Size([4, 512]) torch.Size([4, 512]) torch.Size([4])
torch.Size([4, 512]) torch.Size([4, 512]) torch.Size([4])
torch.Size([4, 512]) torch.Size([4, 512]) torch.Size([4])
torch.Size([4, 512]) torch.Size([4, 512]) torch.Size([4])
torch.Size([4, 512]) torch.Size([4, 512