# Setup

In [1]:
!pip install -q transformers
!pip install -q datasets
!nvidia-smi -L  

[K     |████████████████████████████████| 3.5 MB 5.5 MB/s 
[K     |████████████████████████████████| 67 kB 3.5 MB/s 
[K     |████████████████████████████████| 6.8 MB 31.2 MB/s 
[K     |████████████████████████████████| 596 kB 43.0 MB/s 
[K     |████████████████████████████████| 895 kB 41.9 MB/s 
[K     |████████████████████████████████| 311 kB 5.4 MB/s 
[K     |████████████████████████████████| 243 kB 44.6 MB/s 
[K     |████████████████████████████████| 1.1 MB 41.8 MB/s 
[K     |████████████████████████████████| 133 kB 50.5 MB/s 
[K     |████████████████████████████████| 94 kB 2.9 MB/s 
[K     |████████████████████████████████| 144 kB 51.2 MB/s 
[K     |████████████████████████████████| 271 kB 29.6 MB/s 
[?25hGPU 0: Tesla K80 (UUID: GPU-d02a2109-da78-97b6-a90e-acb342f53aee)


In [2]:
import numpy as np
import random

from datasets import load_dataset
from datasets import load_metric
from transformers import pipeline
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments, RobertaForSequenceClassification, AutoModelForSequenceClassification

from sklearn.calibration import calibration_curve
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import matplotlib.pyplot as plt

# Dateset Retrieval and Processing

In [3]:
dataset = load_dataset("tweet_eval", "emoji")

dataset
dataset['test'][0]

Downloading:   0%|          | 0.00/2.37k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

Downloading and preparing dataset tweet_eval/emoji (download: 7.28 MiB, generated: 8.06 MiB, post-processed: Unknown size, total: 15.34 MiB) to /root/.cache/huggingface/datasets/tweet_eval/emoji/1.1.0/12aee5282b8784f3e95459466db4cdf45c6bf49719c25cdb0743d71ed0410343...


  0%|          | 0/6 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/1.47M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/30.4k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.71M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/33.7k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/159k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.57k [00:00<?, ?B/s]

  0%|          | 0/6 [00:00<?, ?it/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset tweet_eval downloaded and prepared to /root/.cache/huggingface/datasets/tweet_eval/emoji/1.1.0/12aee5282b8784f3e95459466db4cdf45c6bf49719c25cdb0743d71ed0410343. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

{'label': 2, 'text': 'en Pelham Parkway'}

In [4]:
################## YOUR CODE HERE ##################
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained("vinai/bertweet-base", num_labels=20, problem_type="single_label_classification")

model
####################################################

Downloading:   0%|          | 0.00/558 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/824k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.03M [00:00<?, ?B/s]

emoji is not installed, thus not converting emoticons or emojis into text. Please install emoji: pip3 install emoji
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading:   0%|          | 0.00/517M [00:00<?, ?B/s]

Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: 

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(64001, 768, padding_idx=1)
      (position_embeddings): Embedding(130, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

In [5]:
# Tokenize dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

  0%|          | 0/45 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

# Training

Leverage pre-trained base model as described in https://aclanthology.org/2020.emnlp-demos.2.pdf, which we fine-tune using Emoji data

In [None]:
################## YOUR CODE HERE ##################
from sklearn.metrics import f1_score, classification_report

# Setup training
training_args = TrainingArguments("test_trainer", 
                  num_train_epochs=3,
                  per_device_train_batch_size=8,
                  per_device_eval_batch_size=8,
                  evaluation_strategy="epoch",
                  learning_rate=5e-05,
                  )

metric = load_metric('accuracy')

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    compute_metrics=compute_metrics,
)

trainer.train()
####################################################

Downloading:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

The following columns in the training set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text.
***** Running training *****
  Num examples = 45000
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 16875


Epoch,Training Loss,Validation Loss,Accuracy
1,1.8746,2.342075,0.2964
2,1.5915,2.296325,0.319
3,1.3045,2.354966,0.3238


Saving model checkpoint to test_trainer/checkpoint-500
Configuration saved in test_trainer/checkpoint-500/config.json
Model weights saved in test_trainer/checkpoint-500/pytorch_model.bin
Saving model checkpoint to test_trainer/checkpoint-1000
Configuration saved in test_trainer/checkpoint-1000/config.json
Model weights saved in test_trainer/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to test_trainer/checkpoint-1500
Configuration saved in test_trainer/checkpoint-1500/config.json
Model weights saved in test_trainer/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to test_trainer/checkpoint-2000
Configuration saved in test_trainer/checkpoint-2000/config.json
Model weights saved in test_trainer/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to test_trainer/checkpoint-2500
Configuration saved in test_trainer/checkpoint-2500/config.json
Model weights saved in test_trainer/checkpoint-2500/pytorch_model.bin
Saving model checkpoint to test_trainer/checkpoint-30

TrainOutput(global_step=16875, training_loss=1.6451295283564815, metrics={'train_runtime': 2533.9571, 'train_samples_per_second': 53.276, 'train_steps_per_second': 6.66, 'total_flos': 8881433256960000.0, 'train_loss': 1.6451295283564815, 'epoch': 3.0})

In [None]:
trainer.evaluate(tokenized_datasets['test'])

# Save Model

In [None]:
model.save_pretrained("model")

Configuration saved in model/config.json
Model weights saved in model/pytorch_model.bin


In [None]:
from google.colab import files
files.download('/content/model') 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>