In [1]:
import torch
from datasets import list_datasets, load_dataset, Dataset
from transformers import AutoModel, AutoModelForSequenceClassification, AutoTokenizer
from transformers import Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score

In [2]:
# Collect the dataset identifiers returned by `datasets.list_datasets`.
# NOTE(review): newer `datasets` releases appear to deprecate/remove this
# helper — confirm against the installed version.
dataset_list = list_datasets()

In [3]:
# How many dataset identifiers were returned.
len(dataset_list)

7646

In [4]:
# Peek at the first five identifiers.
dataset_list[:5]

['acronym_identification',
 'ade_corpus_v2',
 'adversarial_qa',
 'aeslc',
 'afrikaans_ner_corpus']

In [5]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

### Load dataset

In [6]:
# Load the 'emotion' dataset; its repr (shown in a later cell) lists
# train / validation / test splits with 'text' and 'label' columns.
emotion = load_dataset('emotion')

Using custom data configuration default
Reusing dataset emotion (/home/andreas/.cache/huggingface/datasets/emotion/default/0.0.0/348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705)


  0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
# Display the splits, their columns and row counts.
emotion

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

### Tokenize

In [8]:
# Checkpoint name shared by the tokenizer and by both model loads below.
model_ckpt = "distilbert-base-uncased"

In [9]:
# Tokenizer loaded from the same checkpoint name as the model.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [10]:
def tokenize(batch):
    """Tokenize the ``'text'`` entries of ``batch`` with the notebook-level ``tokenizer``.

    Padding and truncation are both enabled. The tokenizer's output is
    returned unchanged so it can be used directly as a ``map`` function.
    """
    return tokenizer(batch['text'], truncation=True, padding=True)

In [11]:
# `tokenize` pads to the longest sequence *within the batch it receives*.
# batch_size=None hands each split to `tokenize` as one batch, so every row of
# a split ends up with the same padded length. With a fixed batch size such as
# 512, different chunks get different lengths, and later steps that re-batch
# the rows with another batch size would see ragged inputs that cannot be
# stacked into a single tensor.
emotion_encoded = emotion.map(tokenize, batched=True, batch_size=None)



  0%|          | 0/32 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

In [12]:
# Inspect the encoded splits; the repr shows the added
# 'input_ids' and 'attention_mask' columns.
emotion_encoded

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 2000
    })
})

### Model

In [None]:
# Base model (no task head) used by `extract_hidden_states` below.
# NOTE(review): the name `model` is rebound to a sequence-classification model
# in the fine-tuning section, so `extract_hidden_states` only works as intended
# while this binding is current.
model = AutoModel.from_pretrained(model_ckpt)

In [None]:
def extract_hidden_states(batch):
    """Return the position-0 hidden state of every sequence in ``batch``.

    Intended as a ``map(..., batched=True)`` function. ``batch`` must provide
    the tokenizer's model inputs as torch tensors (the notebook arranges this
    with ``set_format("torch", ...)``).

    Args:
        batch: mapping of column name -> tensor. Only the keys listed in
            ``tokenizer.model_input_names`` are forwarded to the model.

    Returns:
        ``{"hidden_state": ndarray}`` holding ``last_hidden_state[:, 0]``
        copied to the CPU as a NumPy array.
    """
    # Send the inputs to whatever device the model currently lives on, so the
    # function works the same whether the model is on the CPU or on a GPU.
    target_device = model.device
    inputs = {
        name: tensor.to(target_device)
        for name, tensor in batch.items()
        if name in tokenizer.model_input_names
    }

    # Inference only: no gradients are needed.
    with torch.no_grad():
        last_hidden_state = model(**inputs).last_hidden_state

    # Index 0 along dim 1 is the first token position (the [CLS] token for
    # BERT-style tokenizers — presumably the case here; confirm).
    return {"hidden_state": last_hidden_state[:, 0].cpu().numpy()}

In [None]:
# Switch the listed columns to torch output format. This is applied to
# `emotion_encoded` itself (no reassignment), so later cells see the change.
emotion_encoded.set_format("torch", columns=["input_ids", "attention_mask", "label"])

In [None]:
# Slow: runs the model over every row of every split.
# `map` returns a new object instead of modifying its receiver, so the result
# must be bound to a name, otherwise the computed hidden states are discarded.
emotion_hidden = emotion_encoded.map(extract_hidden_states, batched=True, batch_size=2048)
emotion_hidden

### Fine-tuning Transformers

In [None]:
# Number of output classes requested for the classification model.
# NOTE(review): presumably the number of distinct labels in the emotion
# dataset — confirm against the dataset's label feature.
num_labels = 6

# Load the checkpoint as a sequence-classification model and place it on `device`.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt, num_labels=num_labels
).to(device)

In [None]:
def compute_metrics(pred):
    """Compute accuracy and weighted F1 for a prediction object.

    Args:
        pred: object exposing ``label_ids`` (true labels) and ``predictions``
            (an array whose arg-max along axis 1 gives the predicted class).

    Returns:
        dict with keys ``"accuracy"`` and ``"f1"``.
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(1)

    return {
        "accuracy": accuracy_score(y_true, y_hat),
        "f1": f1_score(y_true, y_hat, average="weighted"),
    }

In [None]:
# Name used as the training output directory.
model_name = f"{model_ckpt}-finetuned-emotion"

# Per-device batch size, used for both training and evaluation.
batch_size = 64

# Number of full batches in the training split; used as the logging interval.
logging_steps = len(emotion_encoded["train"]) // batch_size

In [None]:
# Show the name that is passed as `output_dir` to TrainingArguments below.
print(model_name)

In [None]:
training_args = TrainingArguments(
    # Where results go, and whether they are pushed to the Hub.
    output_dir=model_name,
    push_to_hub=True,
    # Optimisation settings.
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluation and logging cadence / verbosity.
    evaluation_strategy="epoch",
    logging_steps=logging_steps,
    log_level="error",
    disable_tqdm=False,
)

In [None]:
# Display the full training configuration.
training_args

In [None]:
# Wire the model, configuration, data splits and metric function together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=emotion_encoded["train"],
    eval_dataset=emotion_encoded["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Run training as configured in `training_args` (2 epochs).
trainer.train()

In [None]:
# Predict on the validation split; the result's `.metrics` and `.predictions`
# are inspected in the following cells.
preds_output = trainer.predict(emotion_encoded['validation'])

In [None]:
# Metrics reported for the validation predictions.
preds_output.metrics

In [None]:
# Predicted class index: arg-max of the prediction array along axis 1.
y_pred = preds_output.predictions.argmax(axis=1)

In [None]:
# Display the predicted class indices for the validation split.
y_pred

In [None]:
# Predict on the test split.
# NOTE: this rebinds `preds_output`, replacing the validation results above.
preds_output = trainer.predict(emotion_encoded['test'])

In [None]:
# Metrics reported for the test predictions.
preds_output.metrics

### Save and share the model

In [None]:
# Push the trainer's results to the Hub with the given commit message.
# NOTE(review): presumably requires Hub credentials to be configured
# beforehand — no login step is visible in this notebook; confirm.
trainer.push_to_hub(commit_message="Training Completed!")