In [1]:
from transformers import AutoTokenizer

# Hub identifier of the pretrained checkpoint; reused below when the model is loaded.
checkpoint = "distilbert-base-uncased"

# Tokenizer that belongs to the checkpoint named above.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=checkpoint)

In [2]:
from datasets import load_dataset

# Load the "SetFit/emotion" dataset; the bare name on the last line
# makes the notebook display its splits and features.
emotions = load_dataset(path="SetFit/emotion")
emotions

Repo card metadata block was not found. Setting CardData to empty.


DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 2000
    })
})

In [3]:
def tokenize_text(batch):
    """Tokenize the ``"text"`` entry of ``batch`` with the notebook's tokenizer.

    Padding and truncation are both switched on and the length is capped
    at 512. Whatever the tokenizer returns is passed straight back to the
    caller (``Dataset.map`` in this notebook).
    """
    tokenizer_options = {
        "padding": True,
        "truncation": True,
        "max_length": 512,
    }
    texts = batch["text"]
    return tokenizer(texts, **tokenizer_options)

In [4]:
# Run tokenize_text over every split in batched mode. With batch_size=None
# each split is passed to the function as one batch.
emotions_encoded = emotions.map(
    function=tokenize_text,
    batched=True,
    batch_size=None,
)
emotions_encoded

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_text', 'input_ids', 'attention_mask'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label', 'label_text', 'input_ids', 'attention_mask'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label', 'label_text', 'input_ids', 'attention_mask'],
        num_rows: 2000
    })
})

In [5]:
import torch
from transformers import AutoModel

# Use CUDA when PyTorch reports it as available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the pretrained weights for the checkpoint, then move the model to that device.
model = AutoModel.from_pretrained(checkpoint)
model = model.to(device)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [6]:
# Show the model configuration followed by the device the model lives on,
# one per line.
print(model.config, model.device, sep="\n")

DistilBertConfig {
  "_attn_implementation_autoset": true,
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.48.2",
  "vocab_size": 30522
}

cuda:0


In [7]:
# Encode one sample sentence as PyTorch tensors and report the shape of its token ids.
text = "this is a test"
inputs = tokenizer(
    text,
    return_tensors="pt",
)
print(f"Inputs tensor shape: {inputs['input_ids'].shape}")

Inputs tensor shape: torch.Size([1, 6])


In [8]:
# Move each encoded tensor onto `device`, then run the model on them
# with gradient tracking disabled.
inputs = dict((name, tensor.to(device)) for name, tensor in inputs.items())
with torch.no_grad():
    outputs = model(**inputs)
print(outputs)

BaseModelOutput(last_hidden_state=tensor([[[-0.1565, -0.1862,  0.0528,  ..., -0.1188,  0.0662,  0.5470],
         [-0.3575, -0.6484, -0.0618,  ..., -0.3040,  0.3508,  0.5221],
         [-0.2772, -0.4459,  0.1818,  ..., -0.0948, -0.0076,  0.9958],
         [-0.2841, -0.3917,  0.3753,  ..., -0.2151, -0.1173,  1.0526],
         [ 0.2661, -0.5094, -0.3180,  ..., -0.4203,  0.0144, -0.2149],
         [ 0.9441,  0.0112, -0.4714,  ...,  0.1439, -0.7288, -0.1619]]],
       device='cuda:0'), hidden_states=None, attentions=None)


In [9]:
# Shape of the last hidden state returned by the forward pass above.
print(f"Outputs tensor shape: {outputs.last_hidden_state.shape}")

Outputs tensor shape: torch.Size([1, 6, 768])


In [10]:
# Index 0 along the second axis selects the first token position of every sequence.
print(f"Last hidden state tensor for the [CLS] token: {outputs.last_hidden_state[:, 0].shape}")

Last hidden state tensor for the [CLS] token: torch.Size([1, 768])


In [11]:
def extract_hidden_states(batch):
    """Return the first-token hidden state for every sequence in ``batch``.

    Only the entries of ``batch`` whose key is listed in
    ``tokenizer.model_input_names`` are forwarded to the model; each of
    them is moved to ``device`` first. The forward pass runs with gradient
    tracking disabled.

    Returns a dict with a single key, ``"hidden_state"``, holding the
    slice ``last_hidden_state[:, 0]`` (the [CLS] position) copied back to
    the CPU and converted to a NumPy array.
    """
    accepted_names = tokenizer.model_input_names

    # Keep only what the model accepts, placed on the target device.
    model_inputs = {}
    for name, tensor in batch.items():
        if name in accepted_names:
            model_inputs[name] = tensor.to(device)

    with torch.no_grad():
        encoder_output = model(**model_inputs)
        cls_vectors = encoder_output.last_hidden_state[:, 0]

    # Back to host memory as NumPy so the result can be stored as a column.
    return {"hidden_state": cls_vectors.cpu().numpy()}

In [12]:
# Have the encoded dataset hand back the model inputs and the label as
# PyTorch tensors, which is what extract_hidden_states calls `.to(device)` on.
emotions_encoded.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

In [13]:
# Apply extract_hidden_states to every split in batched mode; the result
# gains a "hidden_state" column.
emotions_hidden = emotions_encoded.map(
    function=extract_hidden_states,
    batched=True,
)

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [14]:
# List the columns of the train split after the hidden-state extraction.
emotions_hidden["train"].column_names

['text', 'label', 'label_text', 'input_ids', 'attention_mask', 'hidden_state']

In [15]:
# Display the extracted hidden states stored for the test split.
emotions_hidden["test"]["hidden_state"]

tensor([[-0.0216,  0.1965,  0.0041,  ..., -0.0646,  0.4319,  0.5599],
        [-0.0165,  0.3204, -0.0225,  ..., -0.1050,  0.4260,  0.4670],
        [-0.1114,  0.0211, -0.2773,  ..., -0.0317,  0.2478,  0.2088],
        ...,
        [ 0.1405,  0.0942, -0.0726,  ...,  0.0361,  0.3638,  0.1618],
        [-0.0996,  0.2276,  0.0766,  ..., -0.0227,  0.3045,  0.5173],
        [ 0.0641,  0.1125, -0.0785,  ...,  0.0009,  0.3229,  0.4590]])

In [16]:
import numpy as np

# Feature matrices: the "hidden_state" column of the train and validation splits.
X_train, X_valid = (
    np.array(emotions_hidden[split]["hidden_state"])
    for split in ("train", "validation")
)
# Target vectors: the "label" column of the same two splits.
y_train, y_valid = (
    np.array(emotions_hidden[split]["label"])
    for split in ("train", "validation")
)
X_train.shape, X_valid.shape

((16000, 768), (2000, 768))

# Build the label-id -> label-name lookup from row-wise (label, label_text)
# pairs of the train split. Pairing the values row by row guarantees each
# id is matched with the name stored next to it, instead of relying on two
# independent `unique()` calls happening to return their values in the
# same order.
_label_pairs = set(zip(emotions["train"]["label"], emotions["train"]["label_text"]))

# Sorting the (id, name) tuples orders the mapping by label id.
id2str = dict(sorted(_label_pairs))

# Every id must map to exactly one name; otherwise the lookup is ambiguous.
assert len(id2str) == len(_label_pairs), "a label id maps to more than one label_text"

id2str

import matplotlib.pyplot as plt

# One hexbin density panel per label, each drawn with its own colormap.
# NOTE(review): `df_emb` is not defined in the cells visible here; it is
# assumed to be a DataFrame with "X", "Y" and "label" columns created
# elsewhere — confirm it exists before this cell runs on a fresh kernel.
fig, axes = plt.subplots(2, 3, figsize=(7,5))
axes = axes.flatten()
cmaps = ["Greys", "Blues", "Oranges", "Reds", "Purples", "Greens"]
labels = id2str.values()

# Filter on the actual label id from id2str rather than the loop position,
# so each panel's title always matches the rows plotted in it even if the
# ids are not exactly 0..N-1 in order.
for ax, (label_id, label), cmap in zip(axes, id2str.items(), cmaps):
    df_emb_sub = df_emb.query(f"label == {label_id}")
    ax.hexbin(
        df_emb_sub["X"],
        df_emb_sub["Y"],
        cmap=cmap,
        gridsize=20,
        linewidths=(0,)
    )
    ax.set_title(label)
    # Hide the ticks on both axes.
    ax.set_xticks([])
    ax.set_yticks([])

plt.tight_layout()
plt.show()