In [247]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [248]:
import sys
# Add the project folder on Drive to the module search path.
sys.path.append('/content/drive/MyDrive/classification')

In [249]:
import os
# Work from the project folder so the relative "data/..." paths used below resolve.
os.chdir('/content/drive/MyDrive/classification')
# Show the folder contents as the cell output.
os.listdir('/content/drive/MyDrive/classification')

['data',
 'results_sentiment',
 'results_letters',
 'my_awesome_model',
 'logs',
 'baseline',
 'results-sent-0.0025',
 'results-sent-0.1%']

In [250]:
!nvidia-smi

Mon Jan  2 14:22:30 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   65C    P0    28W /  70W |  10690MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Text Classification with BERT in PyTorch

In [251]:
! pip3 install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [252]:
import transformers

## 1. Data Pre-processing

### 1.1 Configure device (GPU if available, otherwise CPU)

In [253]:
import torch

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Display the selected device as the cell output.
device

device(type='cuda')

### 1.2 Read dataset file

In [254]:
import pandas as pd
import numpy as np

# Sentiment dataset (alternative; uncomment these two lines and comment out the letters paths to switch)
# train_filename = "data/sentiment/classification/classification_sentiment_train.jsonl"
# test_filename = "data/sentiment/classification/classification_sentiment_eval.jsonl"

# Letters dataset (paths are relative to the working directory)
train_filename = "data/letters/classification/classifier_data_train.json"
test_filename = "data/letters/classification/classifier_data_eval.json"

# lines=True: the files are parsed as JSON Lines (one JSON record per line),
# even though the letters files carry a plain .json extension.
df_train = pd.read_json(train_filename, lines=True)
df_test = pd.read_json(test_filename, lines=True)

Take a look at the train data and test data.

In [255]:
df_train.describe()

Unnamed: 0,author,year,lang,text,file
count,39077,39077,39077,39077,39077
unique,7,106,8,39043,10411
top,Virginia Woolf,unknown,en,Med hjertelige hilsener tegner jeg mig Deres h...,joyce/json/letter_1016.json
freq,15211,5990,20150,3,39


In [256]:
df_test.describe()

Unnamed: 0,author,year,lang,text,file
count,4881,4881,4881,4881,4881
unique,7,103,6,4881,4504
top,Virginia Woolf,unknown,en,"Ich war aber der Einzige, dem der Christmann s...",joyce/json/letter_1016.json
freq,1901,740,2517,1,5


### 1.3 Get the values of DataFrame
Here we set the fraction of the annotated training data to use.

In [257]:
# Fraction of the annotated training data to keep (0.5 = 50% of the rows).
TRAIN_FRACTION = 0.5

# Sample into a NEW frame instead of overwriting `df_train`, so that re-running
# this cell does not shrink the training data a second time.
# random_state is fixed so the same rows are selected on every run.
df_train_sampled = df_train.sample(frac=TRAIN_FRACTION, random_state=1)

# Sentiment dataset
# train_labels = df_train_sampled['sentiment'].tolist()
# train_text = df_train_sampled['text'].tolist()

# test_labels = df_test['sentiment'].tolist()
# test_text = df_test['text'].tolist()

# Letters dataset: the author is the class label, the letter text is the input.
train_labels = df_train_sampled['author'].tolist()
train_text = df_train_sampled['text'].tolist()

test_labels = df_test['author'].tolist()
test_text = df_test['text'].tolist()

# Every text must have exactly one label.
assert len(train_labels) == len(train_text)
assert len(test_labels) == len(test_text)

print("len(train_text) = {}, len(test_text) = {}".format(len(train_text), len(test_text)))

len(train_text) = 19538, len(test_text) = 4881


### 1.4 Get values indices

In [258]:
def format_label_value(label_list):
    """
    It is a function to map each label to an integer class index.

    Indices are assigned in sorted order of the distinct labels, so the mapping
    is deterministic: two lists containing the same set of labels (e.g. the
    train and test labels) always receive the same indices, on every run.
    Building the mapping from raw set iteration order gives no such guarantee.

    @param label_list: <List> labels; they must be mutually sortable (e.g. all strings)
    @return: <List> integer indices, one per input label, in the input order
    """
    label_dict = {label: i for i, label in enumerate(sorted(set(label_list)))}
    # e.g. label_dict = {'negative': 0, 'positive': 1}
    return [label_dict[label] for label in label_list]

In [259]:
# Encode train and test labels with ONE shared label -> index mapping.
# Encoding each split with its own call builds two independent mappings, which
# can assign different indices to the same author and corrupt test accuracy.
n_train_labels = len(train_labels)
all_label_ids = format_label_value(train_labels + test_labels)

# Split the jointly encoded list back into the two original splits.
train_labels = all_label_ids[:n_train_labels]
test_labels = all_label_ids[n_train_labels:]

## 2. BERT Tokenization & Input Formatting

### 2.1 BERT Tokenization

In [260]:
from transformers import BertTokenizer

# Load the BERT tokenizer.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

Loading BERT tokenizer...


**Let's apply the tokenizer to one sentence just to see the output.**

In [261]:
# Tokenize the first training sentence once and reuse the result below.
sample_text = train_text[0]
sample_tokens = tokenizer.tokenize(sample_text)
sample_token_ids = tokenizer.convert_tokens_to_ids(sample_tokens)

# The raw sentence and its length in characters.
print('Original: ', sample_text)
print("len(Original) = ", len(sample_text))
print("\n")

# The sentence split into tokens.
print('Tokenized: ', sample_tokens)
print("len(Tokenized) = ", len(sample_tokens))
print("\n")

# The tokens mapped to their vocabulary IDs.
print('Token IDs: ', sample_token_ids)
print("len(Token IDs) = ", len(sample_token_ids))
print("\n")

Original:  And if ever you had time, and a half sheet of paper, and would write upon it that it was your picture and that you gave it me, my debt of gratitude to you would mount, if possible, higher. But I’m not going to bother you any more. This is only to thank you for sending it, and to assert the affection which, though suppressed, is always alive in the heart of yours gratefully
len(Original) =  375


Tokenized:  ['And', 'if', 'ever', 'you', 'had', 'time', ',', 'and', 'a', 'half', 'sheet', 'of', 'paper', ',', 'and', 'would', 'write', 'upon', 'it', 'that', 'it', 'was', 'your', 'picture', 'and', 'that', 'you', 'gave', 'it', 'me', ',', 'my', 'debt', 'of', 'gratitude', 'to', 'you', 'would', 'mount', ',', 'if', 'possible', ',', 'higher', '.', 'But', 'I', '’', 'm', 'not', 'going', 'to', 'bother', 'you', 'any', 'more', '.', 'This', 'is', 'only', 'to', 'thank', 'you', 'for', 'sending', 'it', ',', 'and', 'to', 'assert', 'the', 'affection', 'which', ',', 'though', 'suppressed', ',', 'is', '

### 2.2 Input Formatting for BERT

**BERT requires the special tokens `[CLS]` and `[SEP]` to be added to every input**

The `tokenizer.encode` function combines multiple steps for us:
1. Split the sentence into tokens.
2. Add the special `[CLS]` and `[SEP]` tokens.
3. Map the tokens to their IDs.

Oddly, this function can perform truncating for us, but doesn't handle padding. 

**Encoding for text in training dataset**

In [262]:
import logging

# Set logger to avoid warning `token indices sequence length is longer than the specified maximum sequence length for this model (1017 > 512)`
logging.getLogger("transformers.tokenization_utils").setLevel(logging.ERROR)


def text_to_id(tokenizer, text_list, max_length=64):
    """
    It is a function to transform text to id.
    # `encode` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.

    @param tokenizer: object exposing `encode(text, add_special_tokens, max_length, truncation)`
    @param text_list: <List> texts to encode
    @param max_length: <Integer> maximum number of ids per text, special tokens included;
                       longer texts are truncated. Defaults to 64, as used in ganbert.
    @return: <List> one list of token ids per input text (not padded)
    """
    return [
        tokenizer.encode(item, add_special_tokens=True, max_length=max_length, truncation=True)
        for item in text_list
    ]

In [263]:
# Encode every train / test text into a list of token IDs (one list per text).
train_text_ids = text_to_id(tokenizer, train_text)
test_text_ids = text_to_id(tokenizer, test_text)


# Print sentence 0, now as a list of IDs.
print('Original: {}\n'.format(train_text[0]))
print('Token IDs: {}\n'.format(train_text_ids[0]))
# Number of encoded texts in each split.
print("len(train_text_ids) = {}\n".format(len(train_text_ids)))
print("len(test_text_ids) = {}".format(len(test_text_ids)))

Original: And if ever you had time, and a half sheet of paper, and would write upon it that it was your picture and that you gave it me, my debt of gratitude to you would mount, if possible, higher. But I’m not going to bother you any more. This is only to thank you for sending it, and to assert the affection which, though suppressed, is always alive in the heart of yours gratefully

Token IDs: [101, 1262, 1191, 1518, 1128, 1125, 1159, 117, 1105, 170, 1544, 6837, 1104, 2526, 117, 1105, 1156, 3593, 1852, 1122, 1115, 1122, 1108, 1240, 3439, 1105, 1115, 1128, 1522, 1122, 1143, 117, 1139, 6695, 1104, 17234, 1106, 1128, 1156, 11885, 117, 1191, 1936, 117, 2299, 119, 1252, 146, 787, 182, 1136, 1280, 1106, 8255, 1128, 1251, 1167, 119, 1188, 1110, 1178, 1106, 6243, 102]

len(train_text_ids) = 19538

len(test_text_ids) = 4881


**Calculate the minimum and maximum lengths of `train_text_ids` and `test_text_ids`**

In [264]:
# Length (number of token IDs) of every encoded text, computed once per split.
train_lengths = [len(sen) for sen in train_text_ids]
test_lengths = [len(sen) for sen in test_text_ids]

print('Train: max sentence length: ', max(train_lengths))
print('Train: Min sentence length: ', min(train_lengths))
print('Test: max sentence length: ', max(test_lengths))
print('Test: Min sentence length: ', min(test_lengths))

Train: max sentence length:  64
Train: Min sentence length:  11
Test: max sentence length:  64
Test: Min sentence length:  13


From above results, we could find that the length of each sentence in `train_text_ids` is not the same, so we need to pad or truncate the text ids.

### 2.3 Padding & Truncating

In [265]:
def padding_truncating(input_ids_list, max_length):
    """
    Bring every id sequence to exactly `max_length` entries.

    Sequences shorter than `max_length` are right-padded with 0; sequences of
    that length or longer are cut down to their first `max_length` ids.
    @param input_ids_list: <List> text_ids, one list of ids per text
    @param max_length: <Integer> the target length of every sequence
    @return: <List> new list holding the padded / truncated sequences
    """
    def _fit(ids):
        missing = max_length - len(ids)
        if missing > 0:
            # Too short: append the missing number of zeros.
            return ids + [0] * missing
        # Long enough: keep only the leading max_length ids.
        return ids[:max_length]

    return [_fit(ids) for ids in input_ids_list]

**Padding or truncating the `train_text_ids` and `test_text_ids`**

In [266]:
# Pad / truncate both splits to 64 ids, the same limit applied when encoding in `text_to_id`.
train_padding_list = padding_truncating(train_text_ids, max_length=64)
test_padding_list = padding_truncating(test_text_ids, max_length=64)

### 2.4 Attention Masks

The attention mask simply makes it explicit which tokens are actual words versus which are padding. In the `BERT` vocabulary, ID 0 is reserved for the `[PAD]` token (note `padding_idx=0` in the model's word embeddings), so **if a token ID is 0, it's a padded one, and otherwise it's a real token**.

In [267]:
def get_attention_masks(pad_input_ids_list):
    """
    Build one attention mask per padded id sequence.

    Each mask has the same length as its sequence:
    - 1 where the token ID is > 0 (a real token),
    - 0 otherwise (padding).
    """
    return [
        [1 if token_id > 0 else 0 for token_id in padded_ids]
        for padded_ids in pad_input_ids_list
    ]

In [268]:
# One attention mask per padded sequence, for both splits.
train_attention_masks = get_attention_masks(train_padding_list)
test_attention_masks = get_attention_masks(test_padding_list)

# Texts, labels, masks and padded ids must stay aligned one-to-one within each split.
assert len(train_text) == len(train_labels) == len(train_attention_masks) == len(train_padding_list)
assert len(test_text) == len(test_labels) == len(test_attention_masks) == len(test_padding_list)

### 2.5 Split train dataset into train_dataset and validation_dataset

In [269]:
from sklearn.model_selection import train_test_split

# Use 90% for training and 10% for validation.
# The three lists are split together, so ids, labels and masks stay aligned.
# NOTE: the train_* names are overwritten with the 90% part, so re-running this
# cell splits the already-reduced training data again.
train_padding_list, validation_padding_list, train_labels, validation_labels, train_attention_masks, validation_attention_masks = train_test_split(train_padding_list, train_labels, train_attention_masks, random_state=1, test_size=0.1)

In [270]:
# After the split, labels, masks and padded ids must still have matching lengths in every subset.
assert len(train_labels) == len(train_attention_masks) == len(train_padding_list)
assert len(validation_labels) == len(validation_attention_masks) == len(validation_padding_list)
assert len(test_labels) == len(test_attention_masks) == len(test_padding_list)

In [271]:
# Report the number of examples in the train / validation / test subsets.
print("len(train_labels) = {}\nlen(validation_labels) = {}\nlen(test_labels) = {}".format(len(train_labels), len(validation_labels), len(test_labels)))

len(train_labels) = 17584
len(validation_labels) = 1954
len(test_labels) = 4881


### 2.6 Convert to Dataset

#### 2.6.1 Convert all the `List` objects to tensor

In [272]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# Convert all inputs and labels into torch tensors, the required datatype for our model.
# Padded token ids: one row per text.
train_inputs = torch.tensor(train_padding_list)
validation_inputs = torch.tensor(validation_padding_list)
test_inputs = torch.tensor(test_padding_list)

# Integer class indices.
# NOTE: train_labels / validation_labels / test_labels are rebound from lists to tensors here.
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
test_labels = torch.tensor(test_labels)

# Attention masks (1 = real token, 0 = padding), aligned with the inputs.
train_masks = torch.tensor(train_attention_masks)
validation_masks = torch.tensor(validation_attention_masks)
test_masks = torch.tensor(test_attention_masks)

#### 2.6.2 Form the Dataset with torch.tensor

In [273]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# The DataLoader needs to know our batch size, so we specify it here.
# For fine-tuning BERT on a specific task, the authors recommend a batch size of 16 or 32;
# this notebook uses 64.

batch_size = 64

# Create the DataLoader for our training set.
# Training batches are drawn in random order.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for our validation set.
# Evaluation does not benefit from shuffling: iterate in a fixed order (like the
# test set below) so validation runs are deterministic.
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

# Create the DataLoader for our test set.
test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

## 3. Train BERT Text Classification Model

### 3.1 BertForSequenceClassification

**Load `BertForSequenceClassification` from `transformers`**

In [274]:
from transformers import BertForSequenceClassification, AdamW, BertConfig
import torch

# Load BertForSequenceClassification, the pretrained BERT model
model = BertForSequenceClassification.from_pretrained(
    "bert-base-cased",  # Pretrained checkpoint with a cased vocab (same name as the tokenizer loaded above).
     num_labels = 7,      # The number of output labels -- 7 here, one per unique author
                    # reported by df_train.describe(). Change it if the dataset changes.
     output_attentions = False, # Whether the model returns attentions weights.
     output_hidden_states = False, # Whether the model returns all hidden-states.
)

# Move the model to the selected device (the GPU when available).
model.to(device)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

### 3.2 Optimizer & Learning Rate Scheduler

Now that we have our model loaded we need to grab the training hyperparameters from within the stored model.

For the purposes of fine-tuning, the authors recommend choosing from the following values:
- Batch size: 16, 32  (This notebook uses 64 when creating the DataLoaders).
- Learning rate (Adam): 5e-5, 3e-5, 2e-5  (We use 5e-5).
- Number of epochs: 2, 3, 4  (We use 3).

The epsilon parameter `eps = 1e-8` is "a very small number to prevent any division by zero in the implementation" (from [here](https://machinelearningmastery.com/adam-optimization-algorithm-for-deep-learning/)).

You can find the creation of the AdamW optimizer in `run_glue.py` [here](https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L109).

In [275]:
# Note: AdamW is a class from the huggingface library (as opposed to pytorch) 
# The 'W' refers to the decoupled weight-decay variant of Adam.
# NOTE(review): newer `transformers` releases are reported to deprecate this class in
# favour of torch.optim.AdamW -- confirm against the installed version.
optimizer = AdamW(model.parameters(),
                  lr = 5e-5, # args.learning_rate - default is 5e-5; this run uses that value
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )





In [276]:
from transformers import get_linear_schedule_with_warmup

# Number of training epochs
epochs = 3

# Total number of training steps is number of batches * number of epochs.
# The scheduler is stepped once per batch in the training loop, so this is its full horizon.
total_steps = len(train_dataloader) * epochs
print("total_steps = {}".format(total_steps))

# Create the learning rate scheduler.
# With no warmup steps, the schedule starts at the optimizer's learning rate.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

total_steps = 825


### 3.3 Train

In [277]:
import numpy as np

# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    """
    Fraction of rows whose highest-scoring class equals the true label.

    @param preds: 2-D array of per-class scores, one row per example
    @param labels: numpy array of true class indices (flattened before comparing)
    @return: number of matches divided by the number of labels
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    expected_classes = labels.flatten()
    n_correct = np.sum(predicted_classes == expected_classes)
    return n_correct / len(expected_classes)

In [278]:
import time
import datetime

def format_time(elapsed):
    '''
    Render a duration given in seconds as an h:mm:ss string.

    The duration is rounded to the nearest whole second before it is formatted
    through `datetime.timedelta`.
    '''
    return str(datetime.timedelta(seconds=int(round(elapsed))))

In [None]:
import random
from tqdm import tqdm

# Set the seed value all over the place to make this reproducible.
seed_val = 1

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)

# Store the average loss after each epoch so we can plot them.
loss_values = []

# For each epoch: one full pass over the training set, then one validation run.
for epoch_i in range(epochs):

    ##########################################
    #               Training                 #
    ##########################################

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_loss = 0

    # Put the model into training mode. The call to `train` only changes the
    # *mode* (e.g. dropout is active); it does not perform any training.
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(tqdm(train_dataloader)):

        # Progress update every 10 batches.
        if step % 10 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # `batch` contains three pytorch tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Clear the gradients left over from the previous step.
        model.zero_grad()

        # Forward pass. Because `labels` is provided, the first element of the
        # output is the loss.
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)
        loss = outputs[0]

        # Accumulate the loss as a plain Python number so we can average it
        # at the end of the epoch.
        total_loss += loss.item()

        # Backward pass to compute the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0 to help prevent the
        # "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update the parameters, then advance the learning-rate schedule
        # (one scheduler step per batch, matching `total_steps`).
        optimizer.step()
        scheduler.step()

    # Calculate the average loss over the training batches.
    avg_train_loss = total_loss / len(train_dataloader)

    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))

    ##########################################
    #               Validation               #
    ##########################################
    # After each training epoch, measure performance on the validation set.

    print("")
    print("Running Validation...")

    t0 = time.time()

    # Put the model in evaluation mode--the dropout layers behave differently during evaluation.
    model.eval()

    # Running count of correct predictions and of examples seen.
    # Accuracy is weighted by batch size: a plain mean of per-batch accuracies
    # would over-weight the (usually smaller) final batch.
    eval_correct, nb_eval_examples = 0, 0

    for batch in validation_dataloader:

        # Add batch to device and unpack it.
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        # No gradients are needed for evaluation; this saves memory and time.
        with torch.no_grad():
            # No `labels` are passed, so the first output element is the logits.
            # token_type_ids is the same as the "segment ids", which
            # differentiates sentence 1 and 2 in 2-sentence tasks.
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)

        # The "logits" are the output values prior to applying an activation
        # function like the softmax.
        logits = outputs[0]

        # Move logits and labels to CPU as numpy arrays.
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Convert this batch's accuracy back into a number of correct
        # predictions before accumulating.
        batch_examples = len(label_ids)
        eval_correct += flat_accuracy(logits, label_ids) * batch_examples
        nb_eval_examples += batch_examples

    # Report the accuracy over all validation examples.
    print("  Accuracy: {0:.2f}".format(eval_correct / nb_eval_examples))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))

print("")
print("Training complete!")


Training...


  4%|▎         | 10/275 [00:06<02:39,  1.67it/s]

  Batch    10  of    275.    Elapsed: 0:00:06.


  7%|▋         | 20/275 [00:12<02:34,  1.65it/s]

  Batch    20  of    275.    Elapsed: 0:00:12.


 11%|█         | 30/275 [00:18<02:28,  1.65it/s]

  Batch    30  of    275.    Elapsed: 0:00:18.


 15%|█▍        | 40/275 [00:24<02:23,  1.64it/s]

  Batch    40  of    275.    Elapsed: 0:00:24.


 18%|█▊        | 50/275 [00:30<02:18,  1.62it/s]

  Batch    50  of    275.    Elapsed: 0:00:30.


 22%|██▏       | 60/275 [00:36<02:17,  1.57it/s]

  Batch    60  of    275.    Elapsed: 0:00:37.


 25%|██▌       | 70/275 [00:42<02:07,  1.60it/s]

  Batch    70  of    275.    Elapsed: 0:00:43.


 29%|██▉       | 80/275 [00:49<02:02,  1.59it/s]

  Batch    80  of    275.    Elapsed: 0:00:49.


 33%|███▎      | 90/275 [00:55<01:56,  1.59it/s]

  Batch    90  of    275.    Elapsed: 0:00:55.


 36%|███▋      | 100/275 [01:01<01:50,  1.59it/s]

  Batch   100  of    275.    Elapsed: 0:01:02.


 40%|████      | 110/275 [01:08<01:42,  1.61it/s]

  Batch   110  of    275.    Elapsed: 0:01:08.


 44%|████▎     | 120/275 [01:14<01:35,  1.62it/s]

  Batch   120  of    275.    Elapsed: 0:01:14.


 47%|████▋     | 130/275 [01:20<01:29,  1.62it/s]

  Batch   130  of    275.    Elapsed: 0:01:20.


 51%|█████     | 140/275 [01:26<01:23,  1.62it/s]

  Batch   140  of    275.    Elapsed: 0:01:27.


 55%|█████▍    | 150/275 [01:32<01:16,  1.63it/s]

  Batch   150  of    275.    Elapsed: 0:01:33.


 58%|█████▊    | 160/275 [01:38<01:10,  1.63it/s]

  Batch   160  of    275.    Elapsed: 0:01:39.


 62%|██████▏   | 170/275 [01:44<01:04,  1.62it/s]

  Batch   170  of    275.    Elapsed: 0:01:45.


 65%|██████▌   | 180/275 [01:51<00:58,  1.62it/s]

  Batch   180  of    275.    Elapsed: 0:01:51.


 69%|██████▉   | 190/275 [01:57<00:52,  1.62it/s]

  Batch   190  of    275.    Elapsed: 0:01:57.


 73%|███████▎  | 200/275 [02:03<00:46,  1.62it/s]

  Batch   200  of    275.    Elapsed: 0:02:03.


 76%|███████▋  | 210/275 [02:09<00:40,  1.61it/s]

  Batch   210  of    275.    Elapsed: 0:02:10.


 80%|████████  | 220/275 [02:15<00:34,  1.61it/s]

  Batch   220  of    275.    Elapsed: 0:02:16.


 84%|████████▎ | 230/275 [02:22<00:27,  1.61it/s]

  Batch   230  of    275.    Elapsed: 0:02:22.


 87%|████████▋ | 240/275 [02:28<00:21,  1.61it/s]

  Batch   240  of    275.    Elapsed: 0:02:28.


 91%|█████████ | 250/275 [02:34<00:15,  1.61it/s]

  Batch   250  of    275.    Elapsed: 0:02:35.


 95%|█████████▍| 260/275 [02:40<00:09,  1.62it/s]

  Batch   260  of    275.    Elapsed: 0:02:41.


 98%|█████████▊| 270/275 [02:46<00:03,  1.62it/s]

  Batch   270  of    275.    Elapsed: 0:02:47.


100%|██████████| 275/275 [02:49<00:00,  1.62it/s]



  Average training loss: 0.44
  Training epcoh took: 0:02:50

Running Validation...
  Accuracy: 0.89
  Validation took: 0:00:06

Training...


  4%|▎         | 10/275 [00:06<02:43,  1.62it/s]

  Batch    10  of    275.    Elapsed: 0:00:06.


  7%|▋         | 20/275 [00:12<02:38,  1.61it/s]

  Batch    20  of    275.    Elapsed: 0:00:12.


 11%|█         | 30/275 [00:18<02:32,  1.61it/s]

  Batch    30  of    275.    Elapsed: 0:00:19.


 15%|█▍        | 40/275 [00:24<02:25,  1.61it/s]

  Batch    40  of    275.    Elapsed: 0:00:25.


 18%|█▊        | 50/275 [00:30<02:19,  1.61it/s]

  Batch    50  of    275.    Elapsed: 0:00:31.


 22%|██▏       | 60/275 [00:37<02:13,  1.61it/s]

  Batch    60  of    275.    Elapsed: 0:00:37.


 25%|██▌       | 70/275 [00:43<02:06,  1.62it/s]

  Batch    70  of    275.    Elapsed: 0:00:43.


 29%|██▉       | 80/275 [00:49<02:00,  1.62it/s]

  Batch    80  of    275.    Elapsed: 0:00:50.


 33%|███▎      | 90/275 [00:55<01:54,  1.61it/s]

  Batch    90  of    275.    Elapsed: 0:00:56.


 36%|███▋      | 100/275 [01:01<01:48,  1.62it/s]

  Batch   100  of    275.    Elapsed: 0:01:02.


 40%|████      | 110/275 [01:08<01:41,  1.62it/s]

  Batch   110  of    275.    Elapsed: 0:01:08.


 44%|████▎     | 120/275 [01:14<01:35,  1.62it/s]

  Batch   120  of    275.    Elapsed: 0:01:14.


 47%|████▋     | 130/275 [01:20<01:29,  1.62it/s]

  Batch   130  of    275.    Elapsed: 0:01:20.


 51%|█████     | 140/275 [01:26<01:23,  1.62it/s]

  Batch   140  of    275.    Elapsed: 0:01:27.


 55%|█████▍    | 150/275 [01:32<01:17,  1.61it/s]

  Batch   150  of    275.    Elapsed: 0:01:33.


 58%|█████▊    | 160/275 [01:39<01:11,  1.61it/s]

  Batch   160  of    275.    Elapsed: 0:01:39.


 62%|██████▏   | 170/275 [01:45<01:05,  1.61it/s]

  Batch   170  of    275.    Elapsed: 0:01:45.


 65%|██████▌   | 180/275 [01:51<00:58,  1.61it/s]

  Batch   180  of    275.    Elapsed: 0:01:51.


 69%|██████▉   | 190/275 [01:57<00:52,  1.61it/s]

  Batch   190  of    275.    Elapsed: 0:01:58.


 73%|███████▎  | 200/275 [02:03<00:46,  1.61it/s]

  Batch   200  of    275.    Elapsed: 0:02:04.


 76%|███████▋  | 210/275 [02:10<00:40,  1.62it/s]

  Batch   210  of    275.    Elapsed: 0:02:10.


 80%|████████  | 220/275 [02:16<00:34,  1.62it/s]

  Batch   220  of    275.    Elapsed: 0:02:16.


 84%|████████▎ | 230/275 [02:22<00:27,  1.61it/s]

  Batch   230  of    275.    Elapsed: 0:02:22.


 87%|████████▋ | 240/275 [02:28<00:21,  1.61it/s]

  Batch   240  of    275.    Elapsed: 0:02:29.


 91%|█████████ | 250/275 [02:34<00:15,  1.62it/s]

  Batch   250  of    275.    Elapsed: 0:02:35.


 95%|█████████▍| 260/275 [02:40<00:09,  1.62it/s]

  Batch   260  of    275.    Elapsed: 0:02:41.


 98%|█████████▊| 270/275 [02:47<00:03,  1.62it/s]

  Batch   270  of    275.    Elapsed: 0:02:47.


100%|██████████| 275/275 [02:50<00:00,  1.62it/s]



  Average training loss: 0.22
  Training epcoh took: 0:02:50

Running Validation...
  Accuracy: 0.92
  Validation took: 0:00:06

Training...


  4%|▎         | 10/275 [00:06<02:44,  1.61it/s]

  Batch    10  of    275.    Elapsed: 0:00:06.


  7%|▋         | 20/275 [00:12<02:37,  1.62it/s]

  Batch    20  of    275.    Elapsed: 0:00:12.


 11%|█         | 30/275 [00:18<02:32,  1.61it/s]

  Batch    30  of    275.    Elapsed: 0:00:19.


 15%|█▍        | 40/275 [00:24<02:25,  1.61it/s]

  Batch    40  of    275.    Elapsed: 0:00:25.


 18%|█▊        | 50/275 [00:31<02:19,  1.61it/s]

  Batch    50  of    275.    Elapsed: 0:00:31.


 22%|██▏       | 60/275 [00:37<02:13,  1.61it/s]

  Batch    60  of    275.    Elapsed: 0:00:37.


 25%|██▌       | 70/275 [00:43<02:07,  1.61it/s]

  Batch    70  of    275.    Elapsed: 0:00:43.


 29%|██▉       | 80/275 [00:49<02:01,  1.61it/s]

  Batch    80  of    275.    Elapsed: 0:00:50.


 33%|███▎      | 90/275 [00:55<01:54,  1.62it/s]

  Batch    90  of    275.    Elapsed: 0:00:56.


 36%|███▋      | 100/275 [01:02<01:48,  1.62it/s]

  Batch   100  of    275.    Elapsed: 0:01:02.


 40%|████      | 110/275 [01:08<01:41,  1.62it/s]

  Batch   110  of    275.    Elapsed: 0:01:08.


 44%|████▎     | 120/275 [01:14<01:35,  1.62it/s]

  Batch   120  of    275.    Elapsed: 0:01:14.


 47%|████▋     | 130/275 [01:20<01:29,  1.62it/s]

  Batch   130  of    275.    Elapsed: 0:01:21.


 51%|█████     | 140/275 [01:26<01:23,  1.62it/s]

  Batch   140  of    275.    Elapsed: 0:01:27.


 55%|█████▍    | 150/275 [01:32<01:17,  1.62it/s]

  Batch   150  of    275.    Elapsed: 0:01:33.


 58%|█████▊    | 160/275 [01:39<01:11,  1.61it/s]

  Batch   160  of    275.    Elapsed: 0:01:39.


 62%|██████▏   | 170/275 [01:45<01:05,  1.61it/s]

  Batch   170  of    275.    Elapsed: 0:01:45.


 65%|██████▌   | 180/275 [01:51<00:58,  1.61it/s]

  Batch   180  of    275.    Elapsed: 0:01:52.


 69%|██████▉   | 190/275 [01:57<00:52,  1.61it/s]

  Batch   190  of    275.    Elapsed: 0:01:58.


 73%|███████▎  | 200/275 [02:03<00:46,  1.61it/s]

  Batch   200  of    275.    Elapsed: 0:02:04.


 76%|███████▋  | 210/275 [02:10<00:40,  1.62it/s]

  Batch   210  of    275.    Elapsed: 0:02:10.


 80%|████████  | 220/275 [02:16<00:34,  1.61it/s]

  Batch   220  of    275.    Elapsed: 0:02:16.


 84%|████████▎ | 230/275 [02:22<00:27,  1.62it/s]

  Batch   230  of    275.    Elapsed: 0:02:23.


 87%|████████▋ | 240/275 [02:28<00:21,  1.61it/s]

  Batch   240  of    275.    Elapsed: 0:02:29.


 91%|█████████ | 250/275 [02:34<00:15,  1.62it/s]

  Batch   250  of    275.    Elapsed: 0:02:35.


 95%|█████████▍| 260/275 [02:41<00:09,  1.61it/s]

  Batch   260  of    275.    Elapsed: 0:02:41.


 96%|█████████▌| 263/275 [02:42<00:07,  1.62it/s]

### 3.4 Plot

**Plot the average loss in training**

In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(12, 8))
plt.title("Average training loss")

# Plot the learning curve.
# One point per COMPLETED epoch: the x-axis is derived from `loss_values`
# rather than from `epochs`, so the plot still works (instead of raising a
# shape-mismatch error) when training was interrupted before the last epoch.
x_axis = np.arange(1, len(loss_values) + 1)
plt.plot(x_axis, loss_values, 'b-o')

plt.xlabel("Epoch")
plt.ylabel("Loss")

plt.show()

### 3.5 Evaluation

In [None]:
# Prediction on test set

print('Predicting labels for {:,} test sentences...'.format(len(test_inputs)))

# Put model in evaluation mode
model.eval()

# Tracking variables: per-batch arrays of predicted and true class indices,
# kept so the predictions can be inspected after this cell has run.
predictions , true_labels = [], []

# Running count of correctly classified test sentences.
correct = 0

for batch in tqdm(test_dataloader):

    # Add batch to device
    batch = tuple(t.to(device) for t in batch)

    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch

    # Telling the model not to compute or store gradients, saving memory and
    # speeding up prediction
    with torch.no_grad():
        # Forward pass; no labels are passed, so the first output element is the logits.
        outputs = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask)

    # Raw per-class scores (logits), not probabilities.
    logits = outputs[0]

    # The predicted label is the class with the highest score.
    pred = torch.argmax(logits, 1)

    # Store predictions and true labels on the CPU as numpy arrays.
    predictions.append(pred.cpu().numpy())
    true_labels.append(b_labels.cpu().numpy())

    # Count the matches between predicted and true labels in this batch.
    correct += (pred == b_labels).sum().item()


print('DONE.')
print("Total correct = ", correct)
print("Test accuracy = {0:.2f}".format(correct / len(test_inputs)))