In [65]:
%pip install datasets transformers onnx onnxruntime 



[link text](https://)We use the small distilled BERT model from Microsoft as our pre-trained model which we fine-tune on the emotion classification task. 
See https://huggingface.co/microsoft/xtremedistil-l6-h256-uncased for details. Inspiration for dataset adjustment from https://colab.research.google.com/drive/1aue7x525rKy6yYLqqt-5Ll96qjQvpqS7#scrollTo=Dcw8-k4lO5Yk 

# Data

## load

In [66]:
model_name = 'microsoft/xtremedistil-l6-h384-uncased'
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/microsoft/xtremedistil-l6-h384-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/31d6577412393ebb07c02de876b2d1397fcae2d85cb053b588145f6869ab1a15.44cd178af39e607af310bc4cc48a944f5e5f746b372c161b32511f0fd585789b
Model config BertConfig {
  "_name_or_path": "microsoft/xtremedistil-l6-h384-uncased",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.17.0",
  "typ

In [67]:
from datasets import load_dataset
ds = load_dataset("go_emotions", "raw")

Reusing dataset go_emotions (/root/.cache/huggingface/datasets/go_emotions/raw/0.0.0/2637cfdd4e64d30249c3ed2150fa2b9d279766bfcd6a809b9f085c61a90d776d)


  0%|          | 0/1 [00:00<?, ?it/s]

In [68]:
# ds = ds.filter(lambda example, idx: idx<10000, with_indices=True)
ds = ds.shuffle()

## Data Statistics

In [69]:
len(ds["train"])

211225

In [70]:
emotions = [
 'admiration',
 'amusement',
 'anger',
 'annoyance',
 'approval',
 'caring',
 'confusion',
 'curiosity',
 'desire',
 'disappointment',
 'disapproval',
 'disgust',
 'embarrassment',
 'excitement',
 'fear',
 'gratitude',
 'grief',
 'joy',
 'love',
 'nervousness',
 'optimism',
 'pride',
 'realization',
 'relief',
 'remorse',
 'sadness',
 'surprise',
 'neutral']

In [71]:
sample = ds["train"][:10000]
for emotion in emotions:
  print(f"{emotion}: {sum(sample[emotion])}")

admiration: 826
amusement: 472
anger: 420
annoyance: 607
approval: 881
caring: 298
confusion: 356
curiosity: 481
desire: 173
disappointment: 380
disapproval: 516
disgust: 242
embarrassment: 102
excitement: 259
fear: 163
gratitude: 539
grief: 36
joy: 338
love: 357
nervousness: 77
optimism: 409
pride: 41
realization: 432
relief: 80
remorse: 108
sadness: 306
surprise: 242
neutral: 2667


## Preprocess

In [72]:
ds_new = ds.map(lambda x : {"labels": [x[c] for c in emotions]})

0ex [00:00, ?ex/s]

In [73]:

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=64)

cols = ds_new["train"].column_names
cols.remove("labels")
ds_enc = ds_new.map(tokenize_function, batched=True, remove_columns=cols)
ds_enc

  0%|          | 0/212 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 211225
    })
})

In [74]:
import torch
ds_enc.set_format("torch")
ds_enc = (ds_enc
          .map(lambda x : {"float_labels": x["labels"].to(torch.float32)}, remove_columns=["labels"])
          .rename_column("float_labels", "labels"))

0ex [00:00, ?ex/s]

In [75]:
ds_enc["train"][0]["labels"].dtype

torch.float32

# Model

So finally done processing the dataset - now define the model and training parameters

In [76]:
import torch
device = "cuda:0" if torch.cuda.is_available() else "cpu"
print(device)

cuda:0


In [77]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(emotions), problem_type="multi_label_classification")
model = model.to(device)

loading configuration file https://huggingface.co/microsoft/xtremedistil-l6-h384-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/31d6577412393ebb07c02de876b2d1397fcae2d85cb053b588145f6869ab1a15.44cd178af39e607af310bc4cc48a944f5e5f746b372c161b32511f0fd585789b
Model config BertConfig {
  "_name_or_path": "microsoft/xtremedistil-l6-h384-uncased",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 384,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
   

In [78]:

ds_test = ds_enc.filter(lambda example, idx: idx<200, with_indices=True)
ds_train = ds_enc.filter(lambda example, idx: idx>=200, with_indices=True)

  0%|          | 0/212 [00:00<?, ?ba/s]

  0%|          | 0/212 [00:00<?, ?ba/s]

In [79]:
from transformers import TrainingArguments
training_args = TrainingArguments("test_trainer",
                                  per_device_train_batch_size=128, 
                                  num_train_epochs=4,learning_rate=3e-05,
                                  evaluation_strategy="steps",
                                  eval_steps=10,
                                  per_device_eval_batch_size=100)
from transformers import Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds_train['train'],
    eval_dataset=ds_test['train']
)
trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 211025
  Num Epochs = 4
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 6596


Step,Training Loss,Validation Loss
10,No log,0.657687
20,No log,0.622443
30,No log,0.591214
40,No log,0.561614
50,No log,0.531685
60,No log,0.500293
70,No log,0.467664
80,No log,0.4346
90,No log,0.402062
100,No log,0.371009


***** Running Evaluation *****
  Num examples = 200
  Batch size = 100
***** Running Evaluation *****
  Num examples = 200
  Batch size = 100
***** Running Evaluation *****
  Num examples = 200
  Batch size = 100
***** Running Evaluation *****
  Num examples = 200
  Batch size = 100
***** Running Evaluation *****
  Num examples = 200
  Batch size = 100
***** Running Evaluation *****
  Num examples = 200
  Batch size = 100
***** Running Evaluation *****
  Num examples = 200
  Batch size = 100
***** Running Evaluation *****
  Num examples = 200
  Batch size = 100
***** Running Evaluation *****
  Num examples = 200
  Batch size = 100
***** Running Evaluation *****
  Num examples = 200
  Batch size = 100
***** Running Evaluation *****
  Num examples = 200
  Batch size = 100
***** Running Evaluation *****
  Num examples = 200
  Batch size = 100
***** Running Evaluation *****
  Num examples = 200
  Batch size = 100
***** Running Evaluation *****
  Num examples = 200
  Batch size = 100
***** 

KeyboardInterrupt: ignored

In [53]:
from transformers import TrainingArguments
training_args = TrainingArguments("test_trainer",
                                  per_device_train_batch_size=128, 
                                  num_train_epochs=4,learning_rate=1e-05,
                                  evaluation_strategy="steps",
                                  eval_steps=10,
                                  per_device_eval_batch_size=100)
from transformers import Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds_train['train'],
    eval_dataset=ds_test['train']
)
trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 9500
  Num Epochs = 4
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 300


Step,Training Loss,Validation Loss
10,No log,0.218594
20,No log,0.214476
30,No log,0.21074
40,No log,0.207362
50,No log,0.204314
60,No log,0.201595
70,No log,0.199126
80,No log,0.196906
90,No log,0.194901
100,No log,0.193101


***** Running Evaluation *****
  Num examples = 500
  Batch size = 100
***** Running Evaluation *****
  Num examples = 500
  Batch size = 100
***** Running Evaluation *****
  Num examples = 500
  Batch size = 100
***** Running Evaluation *****
  Num examples = 500
  Batch size = 100
***** Running Evaluation *****
  Num examples = 500
  Batch size = 100
***** Running Evaluation *****
  Num examples = 500
  Batch size = 100
***** Running Evaluation *****
  Num examples = 500
  Batch size = 100
***** Running Evaluation *****
  Num examples = 500
  Batch size = 100
***** Running Evaluation *****
  Num examples = 500
  Batch size = 100
***** Running Evaluation *****
  Num examples = 500
  Batch size = 100
***** Running Evaluation *****
  Num examples = 500
  Batch size = 100
***** Running Evaluation *****
  Num examples = 500
  Batch size = 100
***** Running Evaluation *****
  Num examples = 500
  Batch size = 100
***** Running Evaluation *****
  Num examples = 500
  Batch size = 100
***** 

TrainOutput(global_step=300, training_loss=0.19336901346842447, metrics={'train_runtime': 209.5387, 'train_samples_per_second': 181.351, 'train_steps_per_second': 1.432, 'total_flos': 157683661824000.0, 'train_loss': 0.19336901346842447, 'epoch': 4.0})

In [54]:
import transformers
def parse_result(results):
  new_results = []
  for result in results:
    label = result["label"]
    assert label.startswith("LABEL_")
    label = label[len("LABEL_"):]
    new_results.append((emotions[int(label)], result["score"]))
  return new_results

pipeline = transformers.pipeline("text-classification",model=model,tokenizer=tokenizer, device=0)

In [55]:
results = pipeline("this is great job")
parse_result(results)

results = pipeline("That game hurt.")
parse_result(results)

[('neutral', 0.2380610555410385)]

# Export PyTorch model to ONNX format for serving with ONNX Runtime Web 

In [None]:
import transformers
import transformers.convert_graph_to_onnx as onnx_convert
from pathlib import Path

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bergum/xtremedistil-l6-h384-go-emotion")
model = AutoModelForSequenceClassification.from_pretrained("bergum/xtremedistil-l6-h384-go-emotion")

Downloading:   0%|          | 0.00/365 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.88k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/86.7M [00:00<?, ?B/s]

In [None]:
pipeline = transformers.pipeline("text-classification",model=model,tokenizer=tokenizer)

In [None]:
pipeline("I love you so much")

[{'label': 'LABEL_18', 'score': 0.6735345721244812}]

In [None]:
onnx_convert.convert_pytorch(pipeline, opset=11, output=Path("extreme-go-emotion.onnx"), use_external_format=False)

Using framework PyTorch: 1.10.0+cu111
Found input input_ids with shape: {0: 'batch', 1: 'sequence'}
Found input token_type_ids with shape: {0: 'batch', 1: 'sequence'}
Found input attention_mask with shape: {0: 'batch', 1: 'sequence'}
Found output output_0 with shape: {0: 'batch'}
Ensuring inputs are in correct order
position_ids is not present in the generated input list.
Generated inputs order: ['input_ids', 'attention_mask', 'token_type_ids']




In [None]:
from onnxruntime.quantization import quantize_dynamic, QuantType
quantize_dynamic("extreme-go-emotion.onnx", "extreme-go-emotion-int8.onnx", 
                 weight_type=QuantType.QUInt8)

In [None]:
from google.colab import files

In [None]:
files.download("extreme-go-emotion-int8.onnx")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
model = model.to("cpu")

In [None]:
!apt-get install git-lfs

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following NEW packages will be installed:
  git-lfs
0 upgraded, 1 newly installed, 0 to remove and 37 not upgraded.
Need to get 2,129 kB of archives.
After this operation, 7,662 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 git-lfs amd64 2.3.4-1 [2,129 kB]
Fetched 2,129 kB in 1s (2,034 kB/s)
Selecting previously unselected package git-lfs.
(Reading database ... 155222 files and directories currently installed.)
Preparing to unpack .../git-lfs_2.3.4-1_amd64.deb ...
Unpacking git-lfs (2.3.4-1) ...
Setting up git-lfs (2.3.4-1) ...
Processing triggers for man-db (2.8.3-2ubuntu0.1) ...


In [None]:
token="token"

In [None]:
model.push_to_hub("xtremedistil-l6-h384-go-emotion", use_auth_token=token)

In [None]:
tokenizer.push_to_hub("xtremedistil-l6-h384-go-emotion", use_auth_token=token)