# Import dependencies

In [1]:
import torch
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler

import numpy as np
import pandas as pd

import os
import time
import datetime
import re
import random

from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from transformers import get_linear_schedule_with_warmup

In [2]:
if torch.cuda.is_available():
    device = torch.device('cuda')

    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device('cpu')

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce RTX 2060 SUPER


In [3]:
train_df = pd.read_csv('./data/semeval_train_data.csv', encoding='ISO-8859-1')

print(f"Number of entries in dataset: {train_df.shape[0]}")

train_df.sample(5)

Number of entries in dataset: 2000


Unnamed: 0,Review,Polarity
1191,Good luck getting a table.,2
1157,The food was actually aweful.,0
177,Admittedly some nights inside the restaurant w...,0
571,Very good wine choices.,1
821,less wait time for me!,2


# Clean and process dataset

In [4]:
def clean_review(text):
    # Remove non-ASCII chars
    re_nonascii = r'[^\x00-\x7F]+'
    text = re.sub(re_nonascii, '', text)

    # Remove URLs
    re_url = r'https?://[A-Za-z0-9./]+'
    text = re.sub(re_url, '', text)

    # Remove punctuation
    re_puncuation = r'[\.\,\!\?\:\;\-\=\(\)\[\]\"\'\%\*\#\@]'
    text = re.sub(re_puncuation, " ", text)

    return text

In [5]:
train_df['cleaned_text'] = list(map(lambda x: clean_review(x), train_df['Review']))

train_df.sample(10)

Unnamed: 0,Review,Polarity,cleaned_text
202,The exotic food is beautifully presented and i...,1,The exotic food is beautifully presented and i...
558,A guaranteeed delight!,1,A guaranteeed delight
1704,I went here for a family birthday dinner.,2,I went here for a family birthday dinner
1032,Their bagels are fine; but they are a little o...,0,Their bagels are fine but they are a little o...
249,I asked the chef what he recommends and he ask...,2,I asked the chef what he recommends and he ask...
1596,I will NEVER return.,0,I will NEVER return
127,The hostess is rude to the point of being offe...,0,The hostess is rude to the point of being offe...
1134,Don't miss Bloom's on your next trip to Manhat...,1,Don t miss Bloom s on your next trip to Manhat...
1557,A fairly late entry into the haute barnyard sw...,2,A fairly late entry into the haute barnyard sw...
1768,Too bad I had paid an extra $2 for the stone b...,0,Too bad I had paid an extra $2 for the stone b...


In [6]:
reviews = train_df['cleaned_text'].values
labels = train_df['Polarity'].values

# Fine-tuning BERT

## Tokenization

In [7]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [8]:
# Testing for a simple input

review_sample = reviews[0]

print('Original review:')
print('\t', review_sample)

print('Tokenized review')
print('\t', tokenizer.tokenize(review_sample))

print('Token IDs for review')
print('\t', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(review_sample)))

Original review:
	 Judging from previous posts this used to be a good place  but not any longer 
Tokenized review
	 ['judging', 'from', 'previous', 'posts', 'this', 'used', 'to', 'be', 'a', 'good', 'place', 'but', 'not', 'any', 'longer']
Token IDs for review
	 [13325, 2013, 3025, 8466, 2023, 2109, 2000, 2022, 1037, 2204, 2173, 2021, 2025, 2151, 2936]


In [9]:
max_len = 0

for review in reviews:
    input_ids = tokenizer.encode(review, add_special_tokens=True)
    max_len = max(max_len, len(input_ids))

print('Max review length: ', max_len)

# Will round max_len to the nearest multiple of 10 just to be sure
max_len = max_len + int(round(max_len, -1) - max_len) % 10

print('Max review length after rounding up: ', max_len)

Max review length:  71
Max review length after rounding up:  80


In [10]:
input_ids = []
attention_masks = []

for review in reviews:
    encoded_dict = tokenizer.encode_plus(
                        review,                    # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = max_len,           # Pad & truncate all sentences.
                        padding='max_length',
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                        truncation = True,
                   )
    
    # Add the encoded sentence to the list.    
    input_ids.append(encoded_dict['input_ids'])
    
    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])

# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

# Print sentence 0, now as a list of IDs.
print('Original: ', reviews[1])
print('Token IDs:', input_ids[1])

Original:  We  there were four of us  arrived at noon   the place was empty   and the staff acted like we were imposing on them and they were very rude 
Token IDs: tensor([  101,  2057,  2045,  2020,  2176,  1997,  2149,  3369,  2012, 11501,
         1996,  2173,  2001,  4064,  1998,  1996,  3095,  6051,  2066,  2057,
         2020, 16625,  2006,  2068,  1998,  2027,  2020,  2200, 12726,   102,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0])


## Data split - 80% training, 20% validation

In [11]:
training_data_pct = 0.8

In [12]:
dataset = TensorDataset(input_ids, attention_masks, labels)

In [13]:
train_size = int(training_data_pct * len(dataset))
validation_size = len(dataset) - train_size

train_dataset, validation_dataset = random_split(dataset, [train_size, validation_size])

print(f"Training samples count: {train_size}")
print(f"Validation samples count: {validation_size}")

Training samples count: 1600
Validation samples count: 400


In [14]:
# Check distribution of samples

t_pos_cnt = 0
t_neg_cnt = 0
t_neutr_cnt = 0

for _, _, lbl in train_dataset:
    if lbl == 1:
        t_pos_cnt += 1
    if lbl == 0:
        t_neg_cnt += 1
    if lbl == 1:
        t_neutr_cnt += 1

v_pos_cnt = 0
v_neg_cnt = 0
v_neutr_cnt = 0

for _, _, lbl in validation_dataset:
    if lbl == 1:
        v_pos_cnt += 1
    if lbl == 0:
        v_neg_cnt += 1
    if lbl == 1:
        v_neutr_cnt += 1

print(f"Training size of positive reviews: {t_pos_cnt}")
print(f"Validation size of positive reviews: {v_pos_cnt}")
print(f"Positive samples ratio train to validation: {t_pos_cnt / v_pos_cnt}\n")

print(f"Training size of negative reviews: {t_neg_cnt}")
print(f"Validation size of negative reviews: {v_neg_cnt}")
print(f"Negative samples ratio train to validation: {t_pos_cnt / v_pos_cnt}\n")

print(f"Training size of neutral reviews: {t_neutr_cnt}")
print(f"Validation size of neutral reviews: {v_neutr_cnt}")
print(f"Neutral samples ratio train to validation: {t_pos_cnt / v_pos_cnt}\n")



Training size of positive reviews: 860
Validation size of positive reviews: 204
Positive samples ratio train to validation: 4.215686274509804

Training size of negative reviews: 441
Validation size of negative reviews: 120
Negative samples ratio train to validation: 4.215686274509804

Training size of neutral reviews: 860
Validation size of neutral reviews: 204
Neutral samples ratio train to validation: 4.215686274509804



## Create iterators

In [15]:
BATCH_SIZE = 32

train_dataloader = DataLoader(
    train_dataset,
    sampler = RandomSampler(train_dataset),
    batch_size = BATCH_SIZE
)

validation_dataloader = DataLoader(
    validation_dataset,
    sampler = SequentialSampler(validation_dataset),
    batch_size = BATCH_SIZE
)

## Tuning

In [16]:
# Load the model

BERT_PRETRAINED_MODEL = 'bert-base-uncased'

model = BertForSequenceClassification.from_pretrained(
    BERT_PRETRAINED_MODEL, 
    num_labels = 3, 
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

model.cuda()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [17]:
# Checking parameters
# We will check the embedding layer and the output layer

params = list(model.named_parameters())

print('Embedding layer')

for p in params[0:5]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\nOutput layer')

for p in params[-4:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))


Embedding layer
bert.embeddings.word_embeddings.weight                  (30522, 768)
bert.embeddings.position_embeddings.weight                (512, 768)
bert.embeddings.token_type_embeddings.weight                (2, 768)
bert.embeddings.LayerNorm.weight                              (768,)
bert.embeddings.LayerNorm.bias                                (768,)

Output layer
bert.pooler.dense.weight                                  (768, 768)
bert.pooler.dense.bias                                        (768,)
classifier.weight                                           (3, 768)
classifier.bias                                                 (3,)


In [18]:
LEARNING_RATE = 5e-5 # default value from args.learning_rate
EPS_ERR = 1e-8 # default value from args.eps_err

EPOCHS = 4

In [19]:
optimizer = AdamW(
    model.parameters(),
    lr = LEARNING_RATE,
    eps = EPS_ERR
)

In [20]:
total_steps = len(train_dataloader) * EPOCHS

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps = 0,
    num_training_steps = total_steps
)

In [21]:
# Accuracy function
def compute_accuracy(predictions, labels):
    predictions_flattened = np.argmax(predictions, axis=1).flatten()
    labels_flattened = labels.flatten()

    return np.sum(predictions_flattened == labels_flattened) / len(labels_flattened)

In [22]:
SEED = 42

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

In [23]:
training_statistics = [] 
total_t0 = time.time()

for epoch in range(0, EPOCHS):
    print(f"=== EPOCH {epoch} / {EPOCHS} ===")

    t0 = time.time()
    total_train_loss = 0

    model.train()

    for step, batch in enumerate(train_dataloader):
        
        if step % 25 == 0 and not step == 0:
            elapsed = str(datetime.timedelta(seconds = int(round((time.time() - t0)))))

            print(f"\tBatch {step} of {len(train_dataloader)}. Elapsed: {elapsed}.")
        
        batch_input_ids = batch[0].to(device)
        batch_input_mask = batch[1].to(device)
        batch_labels = batch[2].to(device)

        # Clear previously calculated gradients before the backward pass
        model.zero_grad()

        # Forward pass
        loss, logits = model(
            batch_input_ids, 
            token_type_ids = None, 
            attention_mask = batch_input_mask, 
            labels = batch_labels,
            return_dict = False
        )

        total_train_loss += loss.item()

        # Backward pass
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

        scheduler.step()
    
    avg_train_loss = total_train_loss / len(train_dataloader)
    training_time = str(datetime.timedelta(seconds = int(round((time.time() - t0)))))

    print(f"\t Average training loss: {avg_train_loss}")
    print(f"\t Training epoch time: {training_time}")

    # Validation

    t0 = time.time()

    model.eval()

    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0

    for batch in validation_dataloader:
        batch_input_ids = batch[0].to(device)
        batch_input_mask = batch[1].to(device)
        batch_labels = batch[2].to(device)

        with torch.no_grad():
            loss, logits = model(
                batch_input_ids, 
                token_type_ids = None, 
                attention_mask = batch_input_mask, 
                labels = batch_labels,
                return_dict = False
            )

        total_eval_loss += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = batch_labels.cpu().numpy()

        total_eval_accuracy += compute_accuracy(logits, label_ids)

        nb_eval_steps += 1
    
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    
    # Measure how long the validation run took.
    validation_time = str(datetime.timedelta(seconds = int(round((time.time() - t0)))))
    
    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch.
    training_statistics.append(
        {
            'epoch': epoch + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print(" === Training complete ===")

print(f"Total training took {str(datetime.timedelta(seconds= int(round((time.time() - total_t0)))))} (hh:mm:ss).")

=== EPOCH 0 / 4 ===
	Batch 25 of 50. Elapsed: 0:00:10.
	 Average training loss: 0.8068425112962723
	 Training epoch time: 0:00:17
  Accuracy: 0.78
  Validation Loss: 0.60
  Validation took: 0:00:01
=== EPOCH 1 / 4 ===
	Batch 25 of 50. Elapsed: 0:00:07.
	 Average training loss: 0.41965128272771834
	 Training epoch time: 0:00:15
  Accuracy: 0.81
  Validation Loss: 0.54
  Validation took: 0:00:01
=== EPOCH 2 / 4 ===
	Batch 25 of 50. Elapsed: 0:00:07.
	 Average training loss: 0.2024732592701912
	 Training epoch time: 0:00:14
  Accuracy: 0.81
  Validation Loss: 0.62
  Validation took: 0:00:01
=== EPOCH 3 / 4 ===
	Batch 25 of 50. Elapsed: 0:00:07.
	 Average training loss: 0.10443506196141243
	 Training epoch time: 0:00:15
  Accuracy: 0.81
  Validation Loss: 0.62
  Validation took: 0:00:01
 === Training complete ===
Total training took 0:01:06 (hh:mm:ss).


## Preview training evolution

In [24]:
pd.set_option('precision', 2)

df_statistics = pd.DataFrame(data = training_statistics)
df_statistics = df_statistics.set_index('epoch')

df_statistics

Unnamed: 0_level_0,Training Loss,Valid. Loss,Valid. Accur.,Training Time,Validation Time
epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.81,0.6,0.78,0:00:17,0:00:01
2,0.42,0.54,0.81,0:00:15,0:00:01
3,0.2,0.62,0.81,0:00:14,0:00:01
4,0.1,0.62,0.81,0:00:15,0:00:01


## Saving the model 

In [25]:
output_dir = './bert_fine_tuned'

if not os.path.exists(output_dir):
    os.makedirs(output_dir)

print(f"Saving fine-tuned BERT to {output_dir}")

model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training

model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

torch.save(model.state_dict(), os.path.join(output_dir, 'model.pth'))

Saving fine-tuned BERT to ./bert_fine_tuned
