# SentiRoBERTa

---

In this notebook we will use NLTK's twitter dataset and along  the `distilroberta-base` model checkpoint to train `sentiroberta` for sentiment classification.

- Requirements
  - transformers
  - datasets (hugging-face)
  - pytorch
  - numpy
  - NLTK
 
  
- The `datasets` library abstracts many of the pre-processing steps, so we get right into building our model as fast as possible.  

> ***TIP:*** *If you are new to programming or new to NLP, I still suggest doing the manual steps - including re-inventing the wheel. Start with the `text-cleaning -> tokenization -> extracting-features` and finally modeling because all of these steps affect the final result. Consequently, there are infinite blends and specific techniques in an NLP pipeline; choosing the most suitable requires knowing what not to do!*

In [1]:
from rich.jupyter import print

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import datasets

In [2]:
from nltk.corpus import twitter_samples

def twitter_dataset(
    ratio=1.0,
    size=5000,
    format='dataset',
    labels_dtype='list',
    columns=['texts', 'labels'],
):
    """Twitter Dataset
    :param format: a type of data format; `dataset` or `mapped`.
    :param labels_dtype: labels data type; `np` or `list`.
    """
    A, B = columns
    pos = twitter_samples.strings('positive_tweets.json')[:size]
    neg = twitter_samples.strings('negative_tweets.json')[:size]
    k = int(ratio * size)
    x_tweets = pos[:k] + neg[:k]  # Train text splits.
    y_tweets = pos[k:] + neg[k:]  # Test text splits.
    x_labels = [1] * len(pos[:k]) + [0] * len(neg[:k])
    y_labels = [1] * len(pos[k:]) + [0] * len(neg[k:])
    if format in 'mapped':
        tweets = x_tweets + y_tweets
        labels = x_labels + y_labels
        if labels_dtype == 'np':
            labels = np.array(labels)[None, :].T
        return {A: tweets, B: labels}
    dataset = {}
    if labels_dtype == 'np':
        x_labels = np.array(x_labels)[None, :].T
        y_labels = np.array(y_labels)[None, :].T
    dataset['x'] = {A: x_tweets, B: x_labels}
    dataset['y'] = {A: y_tweets, B: y_labels}
    return dataset

In [3]:
dataset = datasets.Dataset.from_dict(
    mapping=twitter_dataset(format='mapped', columns=['text', 'labels']),
    split=['train', 'test'],
)
dataset_split = dataset.train_test_split(0.1, shuffle=True, seed=1234)
print(dataset_split)

In [4]:
from transformers import RobertaTokenizerFast

tokenizer = RobertaTokenizerFast.from_pretrained('distilroberta-base')

In [5]:
max_seqlen = 64
encoded_dataset = dataset_split.map(
    lambda field: tokenizer(
        field['text'], padding=True, truncation=True, max_length=max_seqlen,
    ), batched=True,
)

HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [6]:
columns = list(encoded_dataset['train'][0].keys())
if 'text' in columns:
    columns.pop(columns.index('text'))

encoded_dataset.set_format(type='torch', columns=columns)
print('input_ids: {}, label_class: {}\n* {}\n\t - {}'.format(
    encoded_dataset['train']['input_ids'][0].shape,
    encoded_dataset['train']['labels'].shape,
    list(encoded_dataset.column_names),
    columns,
))

In [9]:
from torch.utils.data import DataLoader

test_dl = DataLoader(encoded_dataset['test'], batch_size=8)
input_batch = next(iter(test_dl))
print(input_batch)

In [10]:
from transformers import RobertaForSequenceClassification

model = RobertaForSequenceClassification.from_pretrained(
    'distilroberta-base', return_dict=True
)
model = model.train()
print(model)

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.weight'

In [11]:
labels = input_batch['labels']
inputs = input_batch['input_ids']
attention_mask = input_batch['attention_mask']
outputs = model(inputs, attention_mask=attention_mask)
loss = F.cross_entropy(outputs.logits, labels)
loss.backward()
print(loss)

In [12]:
glue_task = "accuracy"
metric = datasets.load_metric(glue_task)
print(metric)

In [17]:
from transformers import TrainingArguments, Trainer

def model_init():
    return RobertaForSequenceClassification.from_pretrained(
        'distilroberta-base', return_dict=True,
    )

def compute_metrics(eval_pred):
    preds, labels = eval_pred
    preds = preds.argmax(axis=-1)
    return metric.compute(predictions=preds, references=labels)

# Test function before training session!
print(compute_metrics((outputs.logits, labels)))

In [15]:
del model  # Delete the current initialized model we used to test the DL

In [19]:
args = TrainingArguments(
    output_dir='SentiRoBERTa',
    overwrite_output_dir=True,
    eval_accumulation_steps=True,
    eval_steps=500,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    disable_tqdm=False,
    fp16=True,  # set scaled floating point.
)

trainer = Trainer(
    args=args,
    tokenizer=tokenizer,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['test'],
    model_init=model_init,
    compute_metrics=compute_metrics,
)

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.weight'

**hyperparameter search docs**

- Args

  - **compute_objective** (:obj:`Callable[[Dict[str, float]], float]`, `optional`):
  
    - A function computing the objective to minimize or maximize from the metrics returned by the :obj:`evaluate` method. Will default to :func:`~transformers.trainer_utils.default_compute_objective`.
 
  - **n_trials** (:obj:`int`, `optional`, defaults to 100):
  
    - The number of trial runs to test.
    
  - **direction** (:obj:`str`, `optional`, defaults to :obj:`"minimize"`):
 
    - Whether to optimize greater or lower objects. Can be :obj:`"minimize"` or :obj:`"maximize"`, you should pick :obj:`"minimize"` when optimizing the validation loss, :obj:`"maximize"` when optimizing one or several metrics.

In [20]:
best_run = trainer.hyperparameter_search(
    direction='maximize',  # We want to optimize our accuracy metric (classification)
    backend='optuna',  # Check transformers docs for other supported backends.
    n_trials=10,
)

[32m[I 2021-01-20 00:45:33,024][0m A new study created in memory with name: no-name-f344b652-bbc3-4e1b-ba3a-738b78b3e87a[0m
Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassifica

Step,Training Loss
500,0.3325
1000,0.0087
1500,0.0033
2000,0.0129


[32m[I 2021-01-20 00:48:43,947][0m Trial 0 finished with value: 284.1903 and parameters: {'learning_rate': 2.015492944525841e-06, 'num_train_epochs': 2, 'seed': 5, 'per_device_train_batch_size': 8}. Best is trial 0 with value: 284.1903.[0m
Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSe

Step,Training Loss
500,0.0216
1000,0.0014


[32m[I 2021-01-20 00:52:50,941][0m Trial 1 finished with value: 279.0453 and parameters: {'learning_rate': 6.716647183348325e-05, 'num_train_epochs': 4, 'seed': 26, 'per_device_train_batch_size': 32}. Best is trial 0 with value: 284.1903.[0m
Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertFor

Step,Training Loss
500,0.0537
1000,0.0138
1500,0.0099
2000,0.0001


[32m[I 2021-01-20 00:56:04,454][0m Trial 2 finished with value: 278.921 and parameters: {'learning_rate': 2.4252538182599025e-05, 'num_train_epochs': 2, 'seed': 29, 'per_device_train_batch_size': 8}. Best is trial 0 with value: 284.1903.[0m
Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForS

Step,Training Loss


[32m[I 2021-01-20 00:57:07,112][0m Trial 3 finished with value: 241.8805 and parameters: {'learning_rate': 1.659903578537249e-05, 'num_train_epochs': 1, 'seed': 33, 'per_device_train_batch_size': 64}. Best is trial 0 with value: 284.1903.[0m
Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertFor

Step,Training Loss
500,0.047
1000,0.0021


[32m[I 2021-01-20 00:59:46,511][0m Trial 4 finished with value: 262.1658 and parameters: {'learning_rate': 1.9593029202139164e-05, 'num_train_epochs': 2, 'seed': 28, 'per_device_train_batch_size': 16}. Best is trial 0 with value: 284.1903.[0m
Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertFo

Step,Training Loss
500,0.0492
1000,0.0119
1500,0.0122
2000,0.0141
2500,0.0205
3000,0.0
3500,0.018
4000,0.0
4500,0.0002
5000,0.0018


[32m[I 2021-01-20 01:12:36,909][0m Trial 5 finished with value: 276.2296 and parameters: {'learning_rate': 3.566022862064323e-05, 'num_train_epochs': 5, 'seed': 3, 'per_device_train_batch_size': 4}. Best is trial 0 with value: 284.1903.[0m
Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSe

Step,Training Loss
500,0.0336


[32m[I 2021-01-20 01:16:15,444][0m Trial 6 finished with value: 278.5786 and parameters: {'learning_rate': 1.8779524995389454e-05, 'num_train_epochs': 4, 'seed': 3, 'per_device_train_batch_size': 64}. Best is trial 0 with value: 284.1903.[0m
Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertFor

Step,Training Loss
500,0.055
1000,0.0418
1500,0.0265
2000,0.0273
2500,0.0047
3000,0.0
3500,0.0124
4000,0.0052
4500,0.0104
5000,0.0051


[32m[I 2021-01-20 01:26:06,308][0m Trial 7 finished with value: 281.5795 and parameters: {'learning_rate': 9.007294721770037e-05, 'num_train_epochs': 4, 'seed': 32, 'per_device_train_batch_size': 4}. Best is trial 0 with value: 284.1903.[0m
Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForS

Step,Training Loss
500,0.2452


[32m[I 2021-01-20 01:29:16,766][0m Trial 8 finished with value: 279.3541 and parameters: {'learning_rate': 1.901440320007455e-06, 'num_train_epochs': 3, 'seed': 26, 'per_device_train_batch_size': 32}. Best is trial 0 with value: 284.1903.[0m
Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertFor

Step,Training Loss
500,0.0314
1000,0.0187


[32m[I 2021-01-20 01:31:00,154][0m Trial 9 finished with value: 266.868 and parameters: {'learning_rate': 9.38030078656833e-05, 'num_train_epochs': 1, 'seed': 13, 'per_device_train_batch_size': 8}. Best is trial 0 with value: 284.1903.[0m


In [21]:
print(best_run)  # Our final best run!

### Training with optimized Hyperparameters

The `trainer.train()` method will take care of everything for us! But to give you an idea of what is going on in the background see the code below.

```python
# out optimized optuna parameters
optuna_lr = 2.015492944525841e-06
optuna_epochs = 2

dataloader = Dataloader(...)  # train and eval loaders
optimizer = AdamW(model.parameters(), lr=optuna_lr)

for epoch in range(optuna_epochs):
    for batch in dataloader:
        optim.zero_grad()
        input = batch["input_ids"].to(device)
        attn_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        output = model(input, attention_mask=attn_mask, labels=labels)
        loss = output[0]
        loss.backward()
        optim.step()
...
```

In [22]:
# Lets use the optimized hyperparameters for training our sentiment-model.
for n, v in best_run.hyperparameters.items():
    setattr(trainer.args, n, v)  # override the previous training arguments.

trainer.train()  # Train the model!

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.weight'

Step,Training Loss
500,0.3319
1000,0.0084
1500,0.0032
2000,0.0127


TrainOutput(global_step=2250, training_loss=0.07920986966954337, metrics={'train_runtime': 182.4037, 'train_samples_per_second': 12.335, 'total_flos': 567613011456000, 'epoch': 2.0})

In [23]:
# Wow that was fast!
print(trainer.evaluate())

In [26]:
!mkdir sen

In [27]:
# Save the model and tokenizer.
trainer.save_model('sentiroberta/pt')
trainer.tokenizer.save_pretrained('sentiroberta/pt')

('sentiroberta/pt/tokenizer_config.json',
 'sentiroberta/pt/special_tokens_map.json',
 'sentiroberta/pt/vocab.json',
 'sentiroberta/pt/merges.txt',
 'sentiroberta/pt/added_tokens.json')

In [28]:
# Verify and load the saved model and tokenizer
model = RobertaForSequenceClassification.from_pretrained('sentiroberta/pt/')
tokenizer = RobertaTokenizerFast.from_pretrained('sentiroberta/pt/')

In [29]:
from transformers import pipeline
# We will use the existing sentiment pipeline to test our model with actual text,
model.config.id2label.update({0: 'NEGATIVE', 1: 'POSITIVE'}) 
# Simply pass the Model and Tokenizer to the pipe.
sentiment = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)

In [30]:
print(sentiment("I hate my life :("))
print(sentiment("I love my life :)"))

In [40]:
# input-ids and attention-mask from the test-dataloader from earlier.
with torch.no_grad():
    output = model(inputs, attention_mask=attention_mask)
    preds = output[0].cpu()

id2label = model.config.id2label
scores = np.exp(preds.numpy()) / np.exp(preds.numpy()).sum(-1, keepdims=True)
gold_results = [
    {
        "truth": id2label[label.item()],
        "predicted": id2label[item.argmax()],
        "?": "🔥" if id2label[label.item()] == id2label[item.argmax()] else "👀",
        "score": item.max().item()
    } for item, label in zip(scores, labels)
]
print(gold_results)  # Our trained model predicted all correctly!

In [39]:
print(F.cross_entropy(output.logits, labels))

In [62]:
# Can our model detect the difference between similar texts? Indeed 🤗
print(sentiment('fuck! life is not worth giving up!'))
print(sentiment('fuck! life is not worth it, give up.'))  # tricky, yet correctly predicts its negative!

In [52]:
print(sentiment("On a mission to solve NLP, one commit at a time 🤗."))