In [None]:
%pip install datasets transformers onnx onnxruntime 

We use a small distilled BERT model from Microsoft as our pre-trained model, which we fine-tune on the emotion classification task. 
See https://huggingface.co/microsoft/xtremedistil-l6-h384-uncased for details. The dataset adjustment was inspired by https://colab.research.google.com/drive/1aue7x525rKy6yYLqqt-5Ll96qjQvpqS7#scrollTo=Dcw8-k4lO5Yk 

## Data Preprocessing

In [2]:
from transformers import AutoTokenizer

# Checkpoint used both for the tokenizer here and for the classifier loaded later.
model_name = 'microsoft/xtremedistil-l6-h384-uncased'

# Tokenizer matching the pre-trained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading:   0%|          | 0.00/526 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

In [44]:
from datasets import load_dataset
# Load the "raw" configuration of GoEmotions (one 0/1 column per emotion).
ds = load_dataset("go_emotions", name="raw")

Reusing dataset go_emotions (/root/.cache/huggingface/datasets/go_emotions/raw/0.0.0/2637cfdd4e64d30249c3ed2150fa2b9d279766bfcd6a809b9f085c61a90d776d)


  0%|          | 0/1 [00:00<?, ?it/s]

In [45]:
def _is_in_first_rows(example, idx):
    """Return True for rows whose position is among the first 10000."""
    return idx < 10000


# Keep only the first 10000 rows of each split.
ds = ds.filter(_is_in_first_rows, with_indices=True)

  0%|          | 0/212 [00:00<?, ?ba/s]

In [46]:
# Number of training rows left after filtering.
ds["train"].num_rows

10000

In [47]:
# Emotion column names of the raw dataset. The position of a name in this
# list is the index of that emotion in each example's `labels` vector and in
# the model's `LABEL_<n>` outputs, so the order must not change.
emotions = [
    'admiration', 'amusement', 'anger', 'annoyance',
    'approval', 'caring', 'confusion', 'curiosity',
    'desire', 'disappointment', 'disapproval', 'disgust',
    'embarrassment', 'excitement', 'fear', 'gratitude',
    'grief', 'joy', 'love', 'nervousness',
    'optimism', 'pride', 'realization', 'relief',
    'remorse', 'sadness', 'surprise', 'neutral',
]

In [49]:
# Peek at the first five training rows (returned as a dict of column -> values).
ds["train"][:5]

{'admiration': [0, 0, 0, 0, 0],
 'amusement': [0, 0, 0, 0, 0],
 'anger': [0, 0, 0, 0, 0],
 'annoyance': [0, 0, 0, 0, 0],
 'approval': [0, 0, 0, 0, 0],
 'author': ['Brdd9',
  'TheGreen888',
  'Labalool',
  'MrsRobertshaw',
  'American_Fascist713'],
 'caring': [0, 0, 0, 0, 0],
 'confusion': [0, 0, 0, 0, 0],
 'created_utc': [1548381056.0,
  1548084224.0,
  1546427776.0,
  1547965056.0,
  1546668544.0],
 'curiosity': [0, 0, 0, 0, 0],
 'desire': [0, 0, 0, 0, 0],
 'disappointment': [0, 0, 0, 0, 0],
 'disapproval': [0, 0, 0, 0, 0],
 'disgust': [0, 0, 0, 0, 0],
 'embarrassment': [0, 0, 0, 0, 0],
 'example_very_unclear': [False, False, False, False, False],
 'excitement': [0, 0, 0, 0, 0],
 'fear': [0, 0, 0, 0, 0],
 'gratitude': [0, 0, 0, 0, 0],
 'grief': [0, 0, 0, 0, 0],
 'id': ['eew5j0j', 'eemcysk', 'ed2mah1', 'eeibobj', 'eda6yn6'],
 'joy': [0, 0, 0, 0, 0],
 'link_id': ['t3_ajis4z', 't3_ai4q37', 't3_abru74', 't3_ahulml', 't3_ackt2f'],
 'love': [0, 0, 0, 1, 0],
 'nervousness': [0, 0, 0, 0, 0],


In [None]:
def _collect_labels(example):
    """Gather the per-emotion columns into one `labels` list, ordered as `emotions`."""
    return {"labels": [example[name] for name in emotions]}


# Add a multi-hot `labels` column to every example.
ds_new = ds.map(_collect_labels)

In [53]:

def tokenize_function(examples):
    """Tokenize the `text` field of a batch with the notebook's `tokenizer`.

    Every sequence is padded or truncated to exactly 64 tokens.
    """
    texts = examples["text"]
    encoded = tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=64,
    )
    return encoded

# Drop every original column except `labels`; the tokenizer output replaces them.
cols = ds_new["train"].column_names
cols.remove("labels")

ds_enc = ds_new.map(
    tokenize_function,
    batched=True,
    remove_columns=cols,
)
ds_enc

  0%|          | 0/10 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 10000
    })
})

In [61]:
import torch
# Have the dataset return torch tensors from here on.
ds_enc.set_format("torch")


def _labels_as_float(example):
    """Return the example's labels cast to float32 under a temporary column name."""
    return {"float_labels": example["labels"].to(torch.float32)}


# Replace the integer `labels` column with its float32 version.
ds_enc = ds_enc.map(_labels_as_float, remove_columns=["labels"])
ds_enc = ds_enc.rename_column("float_labels", "labels")

0ex [00:00, ?ex/s]

In [66]:
# Sanity check: the labels of the first training example should now be float32.
ds_enc["train"][0]["labels"].dtype

torch.float32

# Model

With the dataset processed, we now define the model and the training parameters.

In [67]:
import torch
# Train on the first GPU when CUDA is available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
print(device)

cuda:0


In [68]:
from transformers import AutoModelForSequenceClassification
# One output per emotion; the multi-label problem type allows several emotions per text.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(emotions),
    problem_type="multi_label_classification",
)
# Move the weights to the selected device.
model = model.to(device)

loading configuration file https://huggingface.co/microsoft/xtremedistil-l6-h384-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/31d6577412393ebb07c02de876b2d1397fcae2d85cb053b588145f6869ab1a15.44cd178af39e607af310bc4cc48a944f5e5f746b372c161b32511f0fd585789b
Model config BertConfig {
  "_name_or_path": "microsoft/xtremedistil-l6-h384-uncased",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 384,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
   

In [69]:
from transformers import Trainer, TrainingArguments

# Checkpoints are written to ./test_trainer. No evaluation runs during training
# because only a train split is passed to the Trainer.
training_args = TrainingArguments(
    output_dir="test_trainer",
    per_device_train_batch_size=128,
    num_train_epochs=4,
    learning_rate=3e-05,
    evaluation_strategy="no",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds_enc['train'],
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [70]:
# Fine-tune the model with the arguments configured above.
trainer.train()

***** Running training *****
  Num examples = 10000
  Num Epochs = 4
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 316


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=316, training_loss=0.3759038659590709, metrics={'train_runtime': 156.9364, 'train_samples_per_second': 254.88, 'train_steps_per_second': 2.014, 'total_flos': 165982801920000.0, 'train_loss': 0.3759038659590709, 'epoch': 4.0})

In [77]:
import torch
import transformers

# Use the first GPU when CUDA is available and the CPU (-1) otherwise, so this
# cell follows the same fallback as the `device` selection used for training
# instead of failing on machines without a GPU.
pipeline_device = 0 if torch.cuda.is_available() else -1

# Wrap the fine-tuned model and its tokenizer in an inference pipeline.
pipeline = transformers.pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device=pipeline_device,
)

In [97]:
# Classify a sample sentence with the fine-tuned model.
results = pipeline("this is great job")

def parse_result(results, labels=None):
    """Translate pipeline predictions from generic ``LABEL_<n>`` ids to label names.

    Args:
        results: Iterable of dicts, each with a ``"label"`` string of the form
            ``"LABEL_<n>"`` and a ``"score"`` value.
        labels: Optional sequence of label names indexed by ``<n>``. When
            omitted, the notebook-level ``emotions`` list is used.

    Returns:
        A list of ``(label_name, score)`` tuples, in the order of ``results``.

    Raises:
        ValueError: If a label does not start with ``LABEL_`` or its suffix is
            not an integer.
        IndexError: If ``<n>`` does not address an entry of ``labels``.
    """
    if labels is None:
        labels = emotions

    prefix = "LABEL_"
    parsed = []
    for result in results:
        label = result["label"]
        # Validate explicitly: an `assert` disappears under `python -O` and
        # carries no message.
        if not label.startswith(prefix):
            raise ValueError(f"unexpected label {label!r}; expected '{prefix}<n>'")
        index = int(label[len(prefix):])
        # Reject negative values as well, which would otherwise silently wrap
        # around to a label at the end of the list.
        if not 0 <= index < len(labels):
            raise IndexError(f"label index {index} out of range for {len(labels)} labels")
        parsed.append((labels[index], result["score"]))
    return parsed

# Show the prediction with the emotion name in place of the generic label id.
parse_result(results)



[('neutral', 0.2732345461845398)]

# Export PyTorch model to ONNX format for serving with ONNX Runtime Web 

In [None]:
import transformers
import transformers.convert_graph_to_onnx as onnx_convert
from pathlib import Path

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
# NOTE(review): this rebinds `tokenizer` and `model` to the checkpoint already
# published on the Hub, so every later cell that uses these names (including
# the push_to_hub cells) operates on the downloaded weights rather than on the
# model trained earlier in this notebook. Confirm that this is intended.
hub_checkpoint = "bergum/xtremedistil-l6-h384-go-emotion"
tokenizer = AutoTokenizer.from_pretrained(hub_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(hub_checkpoint)

Downloading:   0%|          | 0.00/365 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.88k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/86.7M [00:00<?, ?B/s]

In [None]:
# Build the inference pipeline that is exported to ONNX further down.
pipeline = transformers.pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
)

In [None]:
# Quick smoke test of the pipeline before exporting it.
pipeline("I love you so much")

[{'label': 'LABEL_18', 'score': 0.6735345721244812}]

In [None]:
# Export the pipeline's PyTorch model to a single ONNX file (opset 11, no external data).
onnx_path = Path("extreme-go-emotion.onnx")
onnx_convert.convert_pytorch(
    pipeline,
    opset=11,
    output=onnx_path,
    use_external_format=False,
)

Using framework PyTorch: 1.10.0+cu111
Found input input_ids with shape: {0: 'batch', 1: 'sequence'}
Found input token_type_ids with shape: {0: 'batch', 1: 'sequence'}
Found input attention_mask with shape: {0: 'batch', 1: 'sequence'}
Found output output_0 with shape: {0: 'batch'}
Ensuring inputs are in correct order
position_ids is not present in the generated input list.
Generated inputs order: ['input_ids', 'attention_mask', 'token_type_ids']




In [None]:
from onnxruntime.quantization import quantize_dynamic, QuantType
# Write a dynamically quantized copy of the exported model with unsigned 8-bit weights.
fp32_model_file = "extreme-go-emotion.onnx"
int8_model_file = "extreme-go-emotion-int8.onnx"
quantize_dynamic(
    fp32_model_file,
    int8_model_file,
    weight_type=QuantType.QUInt8,
)

In [None]:
from google.colab import files

In [None]:
# Download the quantized model from the Colab runtime to the local machine.
files.download("extreme-go-emotion-int8.onnx")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Make sure the model weights live on the CPU.
model = model.cpu()

In [None]:
!apt-get install git-lfs

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following NEW packages will be installed:
  git-lfs
0 upgraded, 1 newly installed, 0 to remove and 37 not upgraded.
Need to get 2,129 kB of archives.
After this operation, 7,662 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 git-lfs amd64 2.3.4-1 [2,129 kB]
Fetched 2,129 kB in 1s (2,034 kB/s)
Selecting previously unselected package git-lfs.
(Reading database ... 155222 files and directories currently installed.)
Preparing to unpack .../git-lfs_2.3.4-1_amd64.deb ...
Unpacking git-lfs (2.3.4-1) ...
Setting up git-lfs (2.3.4-1) ...
Processing triggers for man-db (2.8.3-2ubuntu0.1) ...


In [None]:
import os
from getpass import getpass

# Never store an access token in the notebook itself: read it from the
# HF_TOKEN environment variable, or ask for it without echoing the input.
token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")

In [None]:
# Upload the model weights and config to the Hub repository.
model.push_to_hub(
    "xtremedistil-l6-h384-go-emotion",
    use_auth_token=token,
)

In [None]:
# Upload the tokenizer files to the same Hub repository name as the model.
tokenizer.push_to_hub(
    "xtremedistil-l6-h384-go-emotion",
    use_auth_token=token,
)