## Intro to Hugging Face

### Load the model

In [1]:
# Load a pretrained model (e.g. "xlm-roberta-base") as a bare encoder, without the pooling layer
from transformers import AutoModel
encoder = AutoModel.from_pretrained("xlm-roberta-base", add_pooling_layer=False)

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

In [2]:
# List the name of every parameter registered on the encoder
[param_name for param_name, _param in encoder.named_parameters()]

['embeddings.word_embeddings.weight',
 'embeddings.position_embeddings.weight',
 'embeddings.token_type_embeddings.weight',
 'embeddings.LayerNorm.weight',
 'embeddings.LayerNorm.bias',
 'encoder.layer.0.attention.self.query.weight',
 'encoder.layer.0.attention.self.query.bias',
 'encoder.layer.0.attention.self.key.weight',
 'encoder.layer.0.attention.self.key.bias',
 'encoder.layer.0.attention.self.value.weight',
 'encoder.layer.0.attention.self.value.bias',
 'encoder.layer.0.attention.output.dense.weight',
 'encoder.layer.0.attention.output.dense.bias',
 'encoder.layer.0.attention.output.LayerNorm.weight',
 'encoder.layer.0.attention.output.LayerNorm.bias',
 'encoder.layer.0.intermediate.dense.weight',
 'encoder.layer.0.intermediate.dense.bias',
 'encoder.layer.0.output.dense.weight',
 'encoder.layer.0.output.dense.bias',
 'encoder.layer.0.output.LayerNorm.weight',
 'encoder.layer.0.output.LayerNorm.bias',
 'encoder.layer.1.attention.self.query.weight',
 'encoder.layer.1.attention.se

### Load and preprocess the data (*small* map-style datasets)

In [3]:
# Load the "en" configuration of the stsb_multi_mt dataset
from datasets import load_dataset
stsb = load_dataset("stsb_multi_mt", name="en")

README.md: 0.00B [00:00, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


train-00000-of-00001.parquet:   0%|          | 0.00/470k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/108k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/142k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5749 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1379 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [4]:
# Show the available splits with their columns and row counts
stsb

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'similarity_score'],
        num_rows: 5749
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'similarity_score'],
        num_rows: 1379
    })
    dev: Dataset({
        features: ['sentence1', 'sentence2', 'similarity_score'],
        num_rows: 1500
    })
})

In [5]:
# Preprocess a dataset: start by loading the tokenizer for the same checkpoint as the encoder
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [6]:
# Encode the first training pair as a single two-sentence input, padded/truncated to 32 tokens.
# The row is indexed first so only one example is read, rather than indexing a whole column
# and then picking element 0 from it.
first_pair = stsb["train"][0]
train_examples = tokenizer(
    first_pair["sentence1"],
    first_pair["sentence2"],
    truncation=True,
    padding="max_length",
    max_length=32,
    return_tensors="pt",
)
print(train_examples)

{'input_ids': tensor([[    0,    62, 47880,    83, 35971,  5773,     5,     2,     2,   893,
          1831, 47880,    83, 35971,  5773,     5,     2,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]])}


In [7]:
# Number of token positions in the encoded pair (second axis of the input_ids tensor)
train_examples["input_ids"].shape[1]

32

In [8]:
# Forward pass: the encoding holds exactly input_ids and attention_mask,
# so it can be unpacked straight into keyword arguments.
output = encoder(**train_examples)
print(output)

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 0.2114,  0.2636,  0.1051,  ..., -0.2435,  0.2229, -0.1088],
         [-0.0390,  0.0091,  0.0044,  ..., -0.0226,  0.0616,  0.2126],
         [-0.0664,  0.1254, -0.0792,  ...,  0.1323,  0.0104, -0.0032],
         ...,
         [ 0.2524,  0.3314, -0.1276,  ..., -0.7320, -0.0146,  0.0453],
         [ 0.2524,  0.3314, -0.1276,  ..., -0.7320, -0.0146,  0.0453],
         [ 0.2524,  0.3314, -0.1276,  ..., -0.7320, -0.0146,  0.0453]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=None, hidden_states=None, past_key_values=None, attentions=None, cross_attentions=None)


In [9]:
# Output representations: the last hidden state is the first field of the model output
output.last_hidden_state.shape

torch.Size([1, 32, 768])

In [None]:
# Pooled CLS token
# Left disabled: the encoder was loaded with add_pooling_layer=False, so its pooler_output is None.
# output[1].shape

In [14]:
# Do we want to pad to max length all the time?
def tokenize_function(examples, padding="max_length", max_length=256):
    """Tokenize a batch of sentence pairs with the notebook's `tokenizer`.

    Args:
        examples: Mapping with "sentence1" and "sentence2" entries, as handed
            over by `stsb.map(..., batched=True)`.
        padding: Padding strategy forwarded to the tokenizer. The default
            "max_length" pads every sequence to `max_length`.
        max_length: Length limit forwarded to the tokenizer; used for
            truncation and, with the default strategy, for padding.

    Returns:
        The tokenizer's output for the sentence pairs.
    """
    return tokenizer(
        examples["sentence1"],
        examples["sentence2"],
        truncation=True,
        padding=padding,
        max_length=max_length,
    )

In [15]:
# Tokenize every split in batches and drop the raw sentence columns
tokenized_datasets = stsb.map(
    tokenize_function,
    batched=True,
    remove_columns=["sentence1", "sentence2"],
)

Map:   0%|          | 0/5749 [00:00<?, ? examples/s]

Map:   0%|          | 0/1379 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [16]:
# The sentence columns are gone; input_ids and attention_mask were added to each split
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['similarity_score', 'input_ids', 'attention_mask'],
        num_rows: 5749
    })
    test: Dataset({
        features: ['similarity_score', 'input_ids', 'attention_mask'],
        num_rows: 1379
    })
    dev: Dataset({
        features: ['similarity_score', 'input_ids', 'attention_mask'],
        num_rows: 1500
    })
})

In [17]:
# Have the train split return its columns as torch tensors (only this split is reformatted here)
tokenized_datasets["train"].set_format(type="pt")

In [18]:
from torch.utils.data import DataLoader
# The tokenized split is handed to the DataLoader directly;
# no custom torch.utils.data.Dataset has to be defined.
train_dataloader = DataLoader(
    dataset=tokenized_datasets["train"],
    batch_size=32,
    shuffle=True,
)

In [19]:
# Pull a single batch from the loader for inspection
batch = next(iter(train_dataloader))

In [20]:
# With padding="max_length", every batch is as wide as max_length (256)
batch["input_ids"].shape

# ==> Proceed for the dev and test splits in similar fashion

torch.Size([32, 256])

### Dynamic padding

In [21]:
from transformers import DataCollatorWithPadding
# Padding is applied when a batch is assembled instead of at tokenization time
collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)

In [22]:
def tokenize_function(examples, padding=False, max_length=256):
    """Tokenize a batch of sentence pairs without padding them by default.

    This redefines the notebook's earlier `tokenize_function`; padding is left
    to the data collator at batch time.

    Args:
        examples: Mapping with "sentence1" and "sentence2" entries, as handed
            over by `stsb.map(..., batched=True)`.
        padding: Padding strategy forwarded to the tokenizer. The default
            False leaves the sequences unpadded.
        max_length: Truncation limit forwarded to the tokenizer.

    Returns:
        The tokenizer's output for the sentence pairs.
    """
    return tokenizer(
        examples["sentence1"],
        examples["sentence2"],
        truncation=True,
        padding=padding,
        max_length=max_length,
    )

In [23]:
# Re-tokenize every split with the unpadded tokenize_function and drop the raw sentence columns
tokenized_datasets = stsb.map(
    tokenize_function,
    batched=True,
    remove_columns=["sentence1", "sentence2"],
)

Map:   0%|          | 0/5749 [00:00<?, ? examples/s]

Map:   0%|          | 0/1379 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [24]:
# Examples are now stored unpadded, as plain lists of their own length
tokenized_datasets["train"][0]

{'similarity_score': 5.0,
 'input_ids': [0,
  62,
  47880,
  83,
  35971,
  5773,
  5,
  2,
  2,
  893,
  1831,
  47880,
  83,
  35971,
  5773,
  5,
  2],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [25]:
# The collator pads each batch as it is assembled;
# still no custom torch.utils.data.Dataset has to be defined.
train_dataloader = DataLoader(
    dataset=tokenized_datasets["train"],
    batch_size=32,
    shuffle=True,
    collate_fn=collator,
)

In [26]:
# Pull a single collated batch from the loader for inspection
batch = next(iter(train_dataloader))

In [27]:
# The batch width is no longer fixed at 256; it depends on the sampled batch
batch["input_ids"].shape

torch.Size([32, 47])