In [1]:
from datasets import load_dataset

In [6]:
emotions = load_dataset("/home/guhangsong/Data/transformers/emotion", ignore_verifications=True)

Using custom data configuration default


Downloading and preparing dataset emotion/default (download: 1.97 MiB, generated: 2.07 MiB, post-processed: Unknown size, total: 4.05 MiB) to /home/guhangsong/.cache/huggingface/datasets/emotion/default/0.0.0/bc8a3c7dca14fe7ca0113ecd1687794c89a701f3687d9e412eec77748b02a679...


Generating train split:   0%|          | 0/16000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Dataset emotion downloaded and prepared to /home/guhangsong/.cache/huggingface/datasets/emotion/default/0.0.0/bc8a3c7dca14fe7ca0113ecd1687794c89a701f3687d9e412eec77748b02a679. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
emotions['train']

Dataset({
    features: ['text', 'label'],
    num_rows: 16000
})

In [8]:
emotions

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [13]:
from transformers import AutoTokenizer

model_ckpt = "/home/guhangsong/Data/transformers/distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [14]:
text = "Tokenizing text is a core task of NLP."

In [15]:
encoded_text = tokenizer(text)
print(encoded_text)

{'input_ids': [101, 19204, 6026, 3793, 2003, 1037, 4563, 4708, 1997, 17953, 2361, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [16]:
tokens = tokenizer.convert_ids_to_tokens(encoded_text.input_ids)
print(tokens)

['[CLS]', 'token', '##izing', 'text', 'is', 'a', 'core', 'task', 'of', 'nl', '##p', '.', '[SEP]']


In [17]:
print(tokenizer.convert_tokens_to_string(tokens))

[CLS] tokenizing text is a core task of nlp. [SEP]


In [18]:
tokenizer.vocab_size

30522

In [19]:
tokenizer.model_max_length

1000000000000000019884624838656

In [20]:
tokenizer.model_input_names

['input_ids', 'attention_mask']

In [21]:
def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True)

In [22]:
print(tokenize(emotions["train"][:2]))

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'input_ids': [[101, 1045, 2134, 2102, 2514, 26608, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 1045, 2064, 2175, 2013, 3110, 2061, 20625, 2000, 2061, 9636, 17772, 2074, 2013, 2108, 2105, 2619, 2040, 14977, 1998, 2003, 8300, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}


In [23]:
emotions_encoded = emotions.map(tokenize, batched=True, batch_size=None)



  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [24]:
emotions_encoded["train"].column_names

['text', 'label', 'input_ids', 'attention_mask']

In [25]:
from transformers import AutoModelForSequenceClassification
import torch

device = torch.device("cuda")
num_labels = 6
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels = num_labels).to(device)

Some weights of the model checkpoint at /home/guhangsong/Data/transformers/distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at /home/guhangsong/Data/transformers/distilbert-base-uncased and are newly

In [27]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}

In [35]:
from transformers import Trainer, TrainingArguments

batch_size = 64
logging_steps = len(emotions_encoded["train"])
model_name = f"{model_ckpt}-finetuned-emotion"
training_args = TrainingArguments(output_dir=model_name,
                                num_train_epochs=2,
                                learning_rate=2e-5,
                                per_device_train_batch_size=batch_size,
                                per_device_eval_batch_size=batch_size,
                                weight_decay=0.01,
                                evaluation_strategy="epoch",
                                disable_tqdm=False,
                                report_to="none",
                                logging_steps=500,
                                log_level="error")

In [36]:
trainer = Trainer(model=model, args=training_args,
                compute_metrics=compute_metrics,
                train_dataset=emotions_encoded["train"],
                eval_dataset=emotions_encoded["validation"],
                tokenizer=tokenizer)

trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.167576,0.93,0.930135
2,0.140400,0.164656,0.931,0.931337


TrainOutput(global_step=500, training_loss=0.14040017700195312, metrics={'train_runtime': 59.7284, 'train_samples_per_second': 535.758, 'train_steps_per_second': 8.371, 'total_flos': 720342861696000.0, 'train_loss': 0.14040017700195312, 'epoch': 2.0})

In [34]:
len(emotions_encoded["train"])

16000