### Creating a RoBERTa Model for Sequence Classification

Download the required libraries and setup the GPU.

In [1]:
!pip install torch
!pip install transformers

import tensorflow as tf
import torch

# If there's a GPU available...
if torch.cuda.is_available():    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

Collecting transformers
  Downloading transformers-4.9.2-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 5.3 MB/s 
[?25hCollecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 44.2 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 47.9 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 35.6 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully uninsta

Get the data and unzip it. Using [this notebook](https://github.com/avyavkumar/machine-learning-notebooks/blob/main/bert/bert_fine_tuning_seq_classfication.ipynb) for reference.

In [2]:
!pip install wget

import wget
import os

url = 'https://nyu-mll.github.io/CoLA/cola_public_1.1.zip'
filename = wget.download(url)

print('Downloaded the file to {}'.format(filename))

!unzip cola_public_1.1.zip

Collecting wget
  Downloading wget-3.2.zip (10 kB)
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9672 sha256=6375cac92b2ec8a8ae71cb59de948bd16e8781955feebce74200ec7312937bdc
  Stored in directory: /root/.cache/pip/wheels/a1/b6/7c/0e63e34eb06634181c63adacca38b79ff8f35c37e3c13e3c02
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2
Downloaded the file to cola_public_1.1.zip
Archive:  cola_public_1.1.zip
   creating: cola_public/
  inflating: cola_public/README      
   creating: cola_public/tokenized/
  inflating: cola_public/tokenized/in_domain_dev.tsv  
  inflating: cola_public/tokenized/in_domain_train.tsv  
  inflating: cola_public/tokenized/out_of_domain_dev.tsv  
   creating: cola_public/raw/
  inflating: cola_public/raw/in_domain_dev.tsv  
  inflating: cola_public/raw/in_domain_train.tsv  
  inflating: cola_public/raw/o

Create the datasets using `pandas`

In [3]:
import pandas as pd

df = pd.read_csv('cola_public/raw/in_domain_train.tsv', sep='\t', header=None, names = ['sentence_source', 'label', 'misc.', 'sentence'])
df.sample(10)

Unnamed: 0,sentence_source,label,misc.,sentence
6822,m_02,1,,Alison drove the car.
6916,m_02,1,,The computer is playing six simultaneous games...
7619,sks13,1,,There were several doctors available.
5740,c_13,1,,Juliet says that Romeo lies to his parents a lot.
1548,r-67,1,,Tom knows it and Dick knows it and Harry knows...
1247,r-67,0,*,Bill made Sarah's gal to me of $40.
4184,ks08,1,,John is in the room.
6882,m_02,1,,Bill went to London on Monday.
920,bc01,0,*,"Sally will stand near Mag, but he won't Holly."
6841,m_02,0,*,The manager presented the foreman a gold watch.


In [4]:
labels = df.label.values
sentences = df.sentence.values

Use the Roberta tokenizer to create the necessary data.

In [5]:
from transformers import RobertaTokenizer
import torch

tokenizer = RobertaTokenizer.from_pretrained("roberta-base", do_lower_case = True)

# Tokenize all of the sentences and map the tokens to their word IDs.
input_ids = []
attention_masks = []

for sentence in sentences:
  encoded_dict = tokenizer.encode_plus(sentence,
                                       add_special_tokens = True,
                                       max_length = 64,
                                       truncation=True,
                                       pad_to_max_length = True,
                                       return_attention_mask = True,
                                       return_tensors = 'pt',)
  
  input_ids.append(encoded_dict['input_ids'])
  attention_masks.append(encoded_dict['attention_mask'])

input_ids = torch.cat(input_ids)
attention_masks = torch.cat(attention_masks)
labels = torch.tensor(labels)

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]



Create an `iterable` object for training.

In [9]:
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler

dataset = TensorDataset(input_ids, attention_masks, labels)

# create the split for training and test data
len_train = int(len(labels)*0.95)
len_test = len(labels) - len_train

train_set, val_set = torch.utils.data.random_split(dataset, [len_train, len_test])

print('{:>5,} training samples'.format(len_train))
print('{:>5,} validation samples'.format(len_test))

train_dataloader = DataLoader(train_set, batch_size = 32, sampler = RandomSampler(train_set))
validation_dataloader = DataLoader(val_set, batch_size = 32, sampler = SequentialSampler(val_set))

8,123 training samples
  428 validation samples


Get the model from the `Transformers` library.

In [10]:
from transformers import RobertaForSequenceClassification

model = RobertaForSequenceClassification.from_pretrained('roberta-base',
                                                         num_labels = 2,
                                                         output_attentions = False,
                                                         output_hidden_states = False,
)

model.cuda()                                          

Downloading:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

Define the optimizer and the hyper parameters.

In [16]:
from transformers import get_linear_schedule_with_warmup, AdamW

epochs = 2

optimizer = AdamW(model.parameters(),
                  lr = 2e-5,
                  eps = 1e-8
)

total_steps = len(train_dataloader)*epochs

scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

Start the training for the model.

In [25]:
import numpy as np

epochs = 4

def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

for epoch in range(epochs):

  print('======== Epoch {:} / {:} ========'.format(epoch + 1, epochs))  

  # Reset the total loss for this epoch.
  total_train_loss = 0
  model.train()

  for step, batch in enumerate(train_dataloader):

    # Unpack this training batch from our dataloader. 
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_labels = batch[2].to(device)

    # Always clear any previously calculated gradients before performing a backward pass.
    model.zero_grad()

    # Perform a forward pass
    result = model(b_input_ids, 
                    token_type_ids=None, 
                    attention_mask=b_input_mask, 
                    labels=b_labels,
                    return_dict=True)
    
    loss = result.loss
    logits = result.logits

    # Accumulate the training loss over all of the batches 
    total_train_loss += loss.item()

    # Perform a backward pass to calculate the gradients.
    loss.backward()

    # Clip the norm of the gradients to 1.0.
    # This is to help prevent the "exploding gradients" problem.
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

    # Update parameters and take a step using the computed gradient.
    # The optimizer dictates the "update rule"--how the parameters are modified based on their gradients, the learning rate, etc.
    optimizer.step()

    # Update the learning rate.
    scheduler.step()
  
  # Calculate the average loss over all of the batches.
  avg_train_loss = total_train_loss / len(train_dataloader)            
  
  print("Average training loss: {0:.2f}".format(avg_train_loss))

  # Put the model in evaluation mode
  model.eval()

  # Tracking variables 
  total_eval_accuracy = 0
  total_eval_loss = 0
  nb_eval_steps = 0

  # Evaluate data for one epoch
  for batch in validation_dataloader:
      
    # Unpack this training batch from our dataloader
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_labels = batch[2].to(device)
    
    # Tell pytorch not to bother with constructing the compute graph during
    # the forward pass, since this is only needed for backprop (training).
    with torch.no_grad():
      # Forward pass, calculate logit predictions.
      # token_type_ids is the same as the "segment ids", which 
      # differentiates sentence 1 and 2 in 2-sentence tasks.
      result = model(b_input_ids,
                     token_type_ids=None,
                     attention_mask=b_input_mask,
                     labels=b_labels,
                     return_dict=True)

    # Get the loss and "logits" output by the model. The "logits" are the 
    # output values prior to applying an activation function like the 
    # softmax.
    loss = result.loss
    logits = result.logits
        
    # Accumulate the validation loss.
    total_eval_loss += loss.item()

    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    # Calculate the accuracy for this batch of test sentences, and
    # accumulate it over all batches.
    total_eval_accuracy += flat_accuracy(logits, label_ids)
  
  # Report the final accuracy for this validation run.
  avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
  print("Accuracy: {0:.2f}".format(avg_val_accuracy))

  # Calculate the average loss over all of the batches.
  avg_val_loss = total_eval_loss / len(validation_dataloader)
  
  print("Validation Loss: {0:.2f}".format(avg_val_loss))
  print('') 

Average training loss: 0.27
Accuracy: 0.83
Validation Loss: 0.48

Average training loss: 0.26
Accuracy: 0.83
Validation Loss: 0.48

Average training loss: 0.27
Accuracy: 0.83
Validation Loss: 0.48

Average training loss: 0.27
Accuracy: 0.83
Validation Loss: 0.48

