## Load dataset

In [None]:
from datasets import list_datasets, load_dataset
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Count the datasets returned by list_datasets().
all_datasets = list_datasets()
print(len(all_datasets))

# Load the "emotion" dataset and show its overall structure.
# Only the last expression of a cell is displayed automatically, so the
# intermediate inspections are printed explicitly.
emotions = load_dataset("emotion")
print(emotions)

# Inspect the training split: first record, then the feature schema.
train_ds = emotions["train"]
print(train_ds[0])
train_ds.features

In [None]:
# Switch the dataset's output format to pandas, then materialise the whole
# training split as a DataFrame.
emotions.set_format(type="pandas")
df = emotions["train"][:]


def label_to_string(row):
    """Return the string name of the integer class label `row`."""
    label_feature = emotions["train"].features["label"]
    return label_feature.int2str(row)


# Add a human-readable label column next to the integer labels.
df["label_name"] = df["label"].map(label_to_string)

## EDA

In [None]:
# Class imbalance: number of examples per label, smallest class first.
label_counts = df["label_name"].value_counts(ascending=True)
label_counts.plot.barh()

In [None]:
# Length of messages (in whitespace-separated words) per class: most
# architectures cap their input at 512 tokens, so check how long tweets are.
# `column` is given explicitly; without it, boxplot() would also draw the
# integer `label` column grouped by label name.
(
    df
    .assign(word_per_tweet=lambda frame: frame["text"].str.split().str.len())
    .boxplot(column="word_per_tweet", by="label_name")
)


In [None]:
# Revert the pandas output format set earlier with set_format(type="pandas").
emotions.reset_format()

## Tokenizer

Option 1 - Character-level encoding

- Pro: deal with misspelling and OOV words
- Con: no prior on words so need to be learnt by model (sparser space)

In [3]:
# Character-level tokenization: every character of the string is one token.
text = "a test for tokenization"
tokenized_text = [char for char in text]

# Vocabulary: the unique characters in sorted order, each mapped to its rank.
vocabulary = sorted(set(tokenized_text))
token2idx = dict(zip(vocabulary, range(len(vocabulary))))

In [6]:
# Map every character token to its integer id (None for an unknown token).
input_ids = [token2idx.get(token) for token in tokenized_text]

In [None]:
import torch
import torch.nn.functional as F

In [None]:
# One-hot encode the character ids with PyTorch: one row per token, one column
# per vocabulary entry.
# as_tensor() leaves an existing tensor as-is, so re-running this cell is safe.
input_ids = torch.as_tensor(input_ids)
one_hot_encoding = F.one_hot(input_ids, num_classes=len(token2idx))

# TensorFlow equivalent, kept as a comment because `tf` is not imported in the
# cells shown here (running it would raise NameError and overwrite the result):
#   one_hot_encoding = tf.one_hot(input_ids, depth=len(token2idx))

Option 2 - Word-level encoding
- Pro: add some prior knowledge
- Usually limit vocab size to ~10k words and encode other words with UNK

Option 3 - Subword tokenization
- Idea: split rare words and misspellings but keep whole frequent words
- Pro: combines the advantages of character-level and word-level tokenization
- Common algos: WordPiece, Byte-Pair Encoding (BPE)

In [None]:
from transformers import AutoTokenizer

# Hub checkpoint id of the pretrained model whose tokenizer is loaded.
# (Fixed spelling: "distilbert-base-uncased", not "distillbert-based-uncased".)
# NOTE: `model` holds the checkpoint name here; a later cell rebinds it to the
# loaded network.
model = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model)

In [None]:
def tokenize(batch):
    """Tokenize the "text" field of `batch` with the notebook's tokenizer.

    Padding and truncation are both enabled, so the encoded sequences of one
    batch can be stacked together.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, truncation=True, padding=True)
    return encoded

- attention_mask = 0 for padding token, i.e. should not consider padding as a feature
- padding is set batch-wise, i.e. every sequence is padded to min(longest sequence in batch, max_input_for_model); longer sequences are truncated to the model maximum

In [None]:
# batched=True -> tokenize() receives batches of examples, so padding is
#                 computed per batch instead of per individual example
# batch_size=None -> the whole dataset is encoded at once, as a single batch
emotions_encoded = emotions.map(tokenize, batched=True, batch_size=None)

How to use DistilBERT
1. Option 1 - Use as a feature extractor and train a classifier on it (frozen embeddings)
2. Option 2 - Fine-tune both model and head together

In [None]:
from transformers import AutoModel
# Hub checkpoint id of the pretrained model.
# (Fixed spelling: "distilbert-base-uncased", not "distillbert-based-uncased".)
model_ckpt = "distilbert-base-uncased"
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load the pretrained weights and move the model to the chosen device.
model = AutoModel.from_pretrained(model_ckpt).to(device)

In [None]:
# Encode one sample sentence as PyTorch tensors.
text = "this is a test"
encoded = tokenizer(text, return_tensors="pt")

# Move every input tensor to the same device as the model.
inputs = {name: tensor.to(device) for name, tensor in encoded.items()}

# Forward pass with gradient tracking disabled (inference only).
with torch.no_grad():
    outputs = model(**inputs)

print(outputs)

- the output contains one 768-dimensional dense vector for each token in the input sequence (not for each vocabulary entry)
- For text classification, we can use only the [CLS] token, marking the beginning of each sequence