In [1]:
%pip install datasets transformers onnx onnxruntime 

Successfully installed aiohttp-3.8.1 aiosignal-1.2.0 async-timeout-4.0.2 asynctest-0.13.0 datasets-1.18.3 frozenlist-1.3.0 fsspec-2022.2.0 huggingface-hub-0.4.0 multidict-6.0.2 onnx-1.11.0 onnxruntime-1.10.0 pyyaml-6.0 sacremoses-0.0.47 tokenizers-0.11.6 transformers-4.17.0 xxhash-3.0.0 yarl-1.7.2


We use the small distilled BERT model from Microsoft as our pre-trained model, which we fine-tune on the emotion classification task. 
See https://huggingface.co/microsoft/xtremedistil-l6-h384-uncased for details. The dataset adjustment was inspired by https://colab.research.google.com/drive/1aue7x525rKy6yYLqqt-5Ll96qjQvpqS7#scrollTo=Dcw8-k4lO5Yk 

# Data

## load

In [2]:
from transformers import AutoTokenizer

# Pre-trained checkpoint that is fine-tuned below; its tokenizer is loaded
# from the same checkpoint name.
model_name = "microsoft/xtremedistil-l6-h384-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading:   0%|          | 0.00/526 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

In [3]:
from datasets import load_dataset
# Load the "raw" configuration of the go_emotions dataset.
ds = load_dataset(path="go_emotions", name="raw")

Downloading:   0%|          | 0.00/2.02k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.67k [00:00<?, ?B/s]

Downloading and preparing dataset go_emotions/raw (download: 40.76 MiB, generated: 52.78 MiB, post-processed: Unknown size, total: 93.54 MiB) to /root/.cache/huggingface/datasets/go_emotions/raw/0.0.0/2637cfdd4e64d30249c3ed2150fa2b9d279766bfcd6a809b9f085c61a90d776d...


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/14.2M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/14.2M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/14.4M [00:00<?, ?B/s]

  0%|          | 0/3 [00:00<?, ?it/s]

0 examples [00:00, ? examples/s]

Dataset go_emotions downloaded and prepared to /root/.cache/huggingface/datasets/go_emotions/raw/0.0.0/2637cfdd4e64d30249c3ed2150fa2b9d279766bfcd6a809b9f085c61a90d776d. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [4]:
# Only rows whose index is below this limit are kept.
MAX_EXAMPLES = 10000


def _is_within_limit(example, idx):
    """Return True when the row index is below MAX_EXAMPLES."""
    return idx < MAX_EXAMPLES


ds = ds.filter(_is_within_limit, with_indices=True)

  0%|          | 0/212 [00:00<?, ?ba/s]

## Data Statistics

In [5]:
# Number of training rows left after filtering.
ds["train"].num_rows

10000

In [6]:
# Label vocabulary. The position of a name in this list is the class index
# used when building the label vectors and when decoding predictions.
emotions = [
    "admiration", "amusement", "anger", "annoyance", "approval", "caring",
    "confusion", "curiosity", "desire", "disappointment", "disapproval",
    "disgust", "embarrassment", "excitement", "fear", "gratitude", "grief",
    "joy", "love", "nervousness", "optimism", "pride", "realization",
    "relief", "remorse", "sadness", "surprise", "neutral",
]

In [7]:
# Count how many of the sampled rows are marked with each emotion.
sample = ds["train"][:10000]
label_totals = {emotion: sum(sample[emotion]) for emotion in emotions}
for emotion, total in label_totals.items():
    print(f"{emotion}: {total}")

admiration: 766
amusement: 396
anger: 368
annoyance: 616
approval: 859
caring: 263
confusion: 394
curiosity: 493
desire: 178
disappointment: 407
disapproval: 541
disgust: 232
embarrassment: 99
excitement: 301
fear: 153
gratitude: 565
grief: 29
joy: 376
love: 387
nervousness: 85
optimism: 422
pride: 51
realization: 426
relief: 83
remorse: 141
sadness: 337
surprise: 259
neutral: 2633


## Preprocess

In [8]:
def _collect_labels(example):
    """Gather the per-emotion columns into one "labels" list ordered like `emotions`."""
    return {"labels": [example[name] for name in emotions]}


ds_new = ds.map(_collect_labels)

0ex [00:00, ?ex/s]

In [9]:

def tokenize_function(examples):
    """Tokenize the "text" field of a batch with the notebook-level `tokenizer`.

    Every sequence is truncated and padded to a fixed length of 64 tokens.
    """
    encoded = tokenizer(
        examples["text"],
        truncation=True,
        max_length=64,
        padding="max_length",
    )
    return encoded

# After tokenization every original column except "labels" is dropped.
cols = list(ds_new["train"].column_names)
cols.remove("labels")
ds_enc = ds_new.map(
    tokenize_function,
    remove_columns=cols,
    batched=True,
)
ds_enc

  0%|          | 0/10 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 10000
    })
})

In [10]:
import torch
def _labels_to_float(example):
    """Return the example's label tensor cast to float32 under a temporary key."""
    return {"float_labels": example["labels"].to(torch.float32)}


# Have the dataset return torch tensors, then replace the "labels" column with
# its float32 version (presumably what the multi-label setup below expects;
# TODO confirm).
ds_enc.set_format("torch")
ds_float = ds_enc.map(_labels_to_float, remove_columns=["labels"])
ds_enc = ds_float.rename_column("float_labels", "labels")

0ex [00:00, ?ex/s]

In [11]:
# Sanity check: dtype of the label tensor of the first training example.
first_example = ds_enc["train"][0]
first_example["labels"].dtype

torch.float32

# Model

The dataset is now fully processed; next we define the model and the training parameters.

In [12]:
import torch
# Use the first CUDA device when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
print(device)

cuda:0


In [13]:
from transformers import AutoModelForSequenceClassification
# Sequence-classification head with one output per emotion, configured for
# multi-label classification and placed on the selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    problem_type="multi_label_classification",
    num_labels=len(emotions),
).to(device)

Downloading:   0%|          | 0.00/86.7M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/xtremedistil-l6-h384-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
from transformers import Trainer, TrainingArguments

# Training configuration: 4 epochs, batch size 128 per device, no evaluation.
training_args = TrainingArguments(
    output_dir="test_trainer",
    num_train_epochs=4,
    per_device_train_batch_size=128,
    learning_rate=3e-05,
    evaluation_strategy="no",
)

# The trainer only receives the training split.
trainer = Trainer(model=model, args=training_args, train_dataset=ds_enc["train"])

In [15]:
# Run the fine-tuning configured above; the returned TrainOutput is shown as
# the cell result.
trainer.train()

***** Running training *****
  Num examples = 10000
  Num Epochs = 4
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 316


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=316, training_loss=0.36887263044526303, metrics={'train_runtime': 191.7811, 'train_samples_per_second': 208.571, 'train_steps_per_second': 1.648, 'total_flos': 165982801920000.0, 'train_loss': 0.36887263044526303, 'epoch': 4.0})

In [18]:
import transformers

def parse_result(results, labels=None):
    """Translate raw pipeline predictions into (class name, score) pairs.

    Args:
        results: Iterable of dicts, each with a "label" of the form
            "LABEL_<index>" and a "score".
        labels: Optional sequence of class names indexed by <index>.
            Defaults to the notebook-level `emotions` list.

    Returns:
        A list of (class name, score) tuples in the same order as `results`.

    Raises:
        AssertionError: If a label does not start with "LABEL_".
    """
    if labels is None:
        labels = emotions
    prefix = "LABEL_"
    new_results = []
    for result in results:
        label = result["label"]
        assert label.startswith(prefix), f"unexpected label format: {label!r}"
        # The text after the prefix is the class index.
        class_index = int(label[len(prefix):])
        new_results.append((labels[class_index], result["score"]))
    return new_results

# Run the pipeline on whatever device the model currently lives on instead of
# hard-coding GPU 0, so this cell also works on a CPU-only runtime
# (-1 selects the CPU, a non-negative value selects that CUDA device).
pipeline_device = model.device.index if model.device.type == "cuda" else -1
pipeline = transformers.pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device=pipeline_device,
)

In [19]:
# Show the first raw training example: its text, metadata and one column per emotion.
ds["train"][0]

{'admiration': 0,
 'amusement': 0,
 'anger': 0,
 'annoyance': 0,
 'approval': 0,
 'author': 'Brdd9',
 'caring': 0,
 'confusion': 0,
 'created_utc': 1548381056.0,
 'curiosity': 0,
 'desire': 0,
 'disappointment': 0,
 'disapproval': 0,
 'disgust': 0,
 'embarrassment': 0,
 'example_very_unclear': False,
 'excitement': 0,
 'fear': 0,
 'gratitude': 0,
 'grief': 0,
 'id': 'eew5j0j',
 'joy': 0,
 'link_id': 't3_ajis4z',
 'love': 0,
 'nervousness': 0,
 'neutral': 0,
 'optimism': 0,
 'parent_id': 't1_eew18eq',
 'pride': 0,
 'rater_id': 1,
 'realization': 0,
 'relief': 0,
 'remorse': 0,
 'sadness': 1,
 'subreddit': 'nrl',
 'surprise': 0,
 'text': 'That game hurt.'}

In [21]:
# Classify the text of the first training example with the fine-tuned model.
results = pipeline("That game hurt.")

# Replace the "LABEL_<index>" identifiers with emotion names.
parse_result(results)

[('neutral', 0.2507284879684448)]

# Export PyTorch model to ONNX format for serving with ONNX Runtime Web 

In [None]:
import transformers
import transformers.convert_graph_to_onnx as onnx_convert
from pathlib import Path

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Reload tokenizer and model from the published hub checkpoint; this rebinds
# the notebook-level `tokenizer` and `model` names.
hub_checkpoint = "bergum/xtremedistil-l6-h384-go-emotion"
tokenizer = AutoTokenizer.from_pretrained(hub_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(hub_checkpoint)

Downloading:   0%|          | 0.00/365 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.88k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/86.7M [00:00<?, ?B/s]

In [None]:
# Rebuild the classification pipeline around the reloaded model and tokenizer.
pipeline = transformers.pipeline(
    task="text-classification",
    tokenizer=tokenizer,
    model=model,
)

In [None]:
# Smoke-test the reloaded model; the prediction is reported as a raw
# "LABEL_<index>" identifier together with its score.
pipeline("I love you so much")

[{'label': 'LABEL_18', 'score': 0.6735345721244812}]

In [None]:
# Export the PyTorch pipeline to a single ONNX file (opset 11, no external
# data format).
onnx_path = Path("extreme-go-emotion.onnx")
onnx_convert.convert_pytorch(
    pipeline,
    opset=11,
    output=onnx_path,
    use_external_format=False,
)

Using framework PyTorch: 1.10.0+cu111
Found input input_ids with shape: {0: 'batch', 1: 'sequence'}
Found input token_type_ids with shape: {0: 'batch', 1: 'sequence'}
Found input attention_mask with shape: {0: 'batch', 1: 'sequence'}
Found output output_0 with shape: {0: 'batch'}
Ensuring inputs are in correct order
position_ids is not present in the generated input list.
Generated inputs order: ['input_ids', 'attention_mask', 'token_type_ids']




In [None]:
from onnxruntime.quantization import quantize_dynamic, QuantType
# Dynamically quantize the exported model, storing weights as unsigned 8-bit integers.
fp32_model_path = "extreme-go-emotion.onnx"
int8_model_path = "extreme-go-emotion-int8.onnx"
quantize_dynamic(fp32_model_path, int8_model_path, weight_type=QuantType.QUInt8)

In [None]:
from google.colab import files

In [None]:
# Download the quantized ONNX model from the Colab runtime through the browser.
files.download("extreme-go-emotion-int8.onnx")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Make sure the model is on the CPU before the upload cells below.
model = model.to("cpu")

In [None]:
# Install git-lfs in the runtime (presumably needed by the push_to_hub calls
# below; TODO confirm).
!apt-get install git-lfs

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following NEW packages will be installed:
  git-lfs
0 upgraded, 1 newly installed, 0 to remove and 37 not upgraded.
Need to get 2,129 kB of archives.
After this operation, 7,662 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 git-lfs amd64 2.3.4-1 [2,129 kB]
Fetched 2,129 kB in 1s (2,034 kB/s)
Selecting previously unselected package git-lfs.
(Reading database ... 155222 files and directories currently installed.)
Preparing to unpack .../git-lfs_2.3.4-1_amd64.deb ...
Unpacking git-lfs (2.3.4-1) ...
Setting up git-lfs (2.3.4-1) ...
Processing triggers for man-db (2.8.3-2ubuntu0.1) ...


In [None]:
# Ask for the Hugging Face access token at run time so that no credential is
# ever stored in (or committed with) the notebook.
from getpass import getpass

token = getpass("Hugging Face access token: ")

In [None]:
# Upload the model to the "xtremedistil-l6-h384-go-emotion" hub repository,
# authenticating with `token`.
model.push_to_hub("xtremedistil-l6-h384-go-emotion", use_auth_token=token)

In [None]:
# Upload the tokenizer to the same hub repository, authenticating with `token`.
tokenizer.push_to_hub("xtremedistil-l6-h384-go-emotion", use_auth_token=token)