In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import re

import torch

from sklearn.model_selection import train_test_split

## A. Load the data and set up the GPU

#### 1.1 Load the Train data

In [None]:
# Complaint tweets are the positive class: every row gets label 1.
data_complaint = pd.read_csv(
    "/content/drive/MyDrive/Data/BERT Sentiment Analysis/complaint1700.csv"
).assign(label=1)
data_complaint.head(2)

Unnamed: 0,id,airline,tweet,label
0,80938,United,@united I'm having issues. Yesterday I rebooke...,1
1,10959,United,@united kinda feel like the $6.99 you charge f...,1


In [None]:
# Non-complaint tweets are the negative class: every row gets label 0.
data_noncomplaint = pd.read_csv(
    "/content/drive/MyDrive/Data/BERT Sentiment Analysis/noncomplaint1700.csv"
).assign(label=0)
data_noncomplaint.head(2)

Unnamed: 0,id,airline,tweet,label
0,404,United,@brianfadem @united The best summertime soap o...,0
1,706,SouthWest,@aresef @united yes the change fees are cheape...,0


In [None]:
# Stack both classes into a single frame with a fresh 0..N-1 index and leave
# out the airline column.
data = (
    pd.concat([data_complaint, data_noncomplaint], axis=0)
    .reset_index(drop=True)
    .drop(columns='airline')
)
data.sample(5)

Unnamed: 0,id,tweet,label
908,43165,Homeward bound! Only 11 hrs 2 go. Only complai...,1
500,6567,"@united my day?my whole""holiday""is ruined cuz ...",1
1885,19991,@JetBlue Ismelda at HPN was amazing!! What a ...,0
3392,171764,@paultowntwo @omarsuleiman504 @christocarbone ...,0
28,13915,@united my poor friend is getting sass from yo...,1


#### 1.2 Load the Test data

In [None]:
# Only the tweet id and the tweet text are kept from the test file.
test_data = pd.read_csv(
    "/content/drive/MyDrive/Data/BERT Sentiment Analysis/test_data.csv"
).loc[:, ['id', 'tweet']]
test_data.sample(5)

Unnamed: 0,id,tweet
4205,160159,"@DeltaAssist if flight mssd d/t wx, and nxt fl..."
2599,99470,"@Delta @DeltaAssist Mike McEntire, red coat a..."
1447,56239,Still stuck at LAX. Navigation issues on this ...
3538,133746,"@AmericanAir flight 3321 late due to weather, ..."
2563,98002,@hipcop @AmericanAir You've been on hold that ...


#### 1.3 Randomly split the data into train and validation sets

In [None]:
# Inputs are the tweet strings, targets are the 0/1 labels assigned at load time.
X, y = data['tweet'].values, data['label'].values

# Hold out 10% of the labelled tweets for validation; the fixed random_state
# keeps the split the same on every run.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=2020,
)

### 2 Set up the GPU

In [None]:
# Pick the device all tensors and the model are moved to: CPU unless CUDA is usable.
if not torch.cuda.is_available():
  print("No GPU is available")
  device = torch.device("cpu")
else:
  device = torch.device("cuda")
  print(f"There are {torch.cuda.device_count()} GPUs available")
  print(f"GPU available : {torch.cuda.get_device_name(0)}")

There are 1 GPUs available
GPU available : Tesla K80


# B. Fine tuning BERT

#### 1. Install the huggingface library

In [None]:
# Install the Hugging Face `transformers` package into the notebook runtime.
# NOTE(review): the version is not pinned; the recorded run of this cell
# resolved to transformers 4.10.0 — consider pinning a version so that re-runs
# install the same release.
!pip install transformers

Collecting transformers
  Downloading transformers-4.10.0-py3-none-any.whl (2.8 MB)
[K     |████████████████████████████████| 2.8 MB 10.6 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 41.9 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 47.4 MB/s 
[?25hCollecting huggingface-hub>=0.0.12
  Downloading huggingface_hub-0.0.16-py3-none-any.whl (50 kB)
[K     |████████████████████████████████| 50 kB 7.0 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 35.9 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 

#### 2. Clean up the text

In [None]:
def text_preprocessing(text):
    """Clean a raw tweet before it is tokenized.

    Steps, in order:
      1. Replace every '@name' mention with a single space. A mention ends at
         the next whitespace character or at the end of the string, so a
         mention that is the last token of the tweet is removed as well.
      2. Replace the HTML entity '&amp;' with '&'.
      3. Collapse every run of whitespace into one space and strip leading and
         trailing whitespace.

    Args:
        text (str): the raw tweet.

    Returns:
        str: the cleaned tweet.
    """
    # Remove '@name' (terminated by whitespace OR end of string)
    text = re.sub(r'@\S*(?:\s|$)', ' ', text)

    # Replace '&amp;' with '&'
    text = re.sub(r'&amp;', '&', text)

    # Collapse repeated whitespace and trim both ends
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [None]:
# Show one tweet before and after cleaning.
print("Original text", X[0], sep="\n")
print("\nProcessed text", text_preprocessing(X[0]), sep="\n")

Original text
@united I'm having issues. Yesterday I rebooked for 24 hours after I was supposed to fly, now I can't log on &amp; check in. Can you help?

Processed text
I'm having issues. Yesterday I rebooked for 24 hours after I was supposed to fly, now I can't log on & check in. Can you help?


#### 3. BERT Tokenizer

In [None]:
from transformers import BertTokenizer

# Tokenizer for the 'bert-base-uncased' checkpoint, with lower-casing enabled.
tokenizer = BertTokenizer.from_pretrained(
    "bert-base-uncased",
    do_lower_case=True,
)

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
def preprocessing_for_bert(data, max_len=None):
  """Clean and tokenize a collection of tweets into fixed-length BERT inputs.

  Every text is passed through `text_preprocessing` and then through
  `tokenizer.encode_plus`, which
    1) tokenizes the sentence,
    2) adds the [CLS] and [SEP] tokens,
    3) truncates/pads the sentence to the requested length,
    4) maps tokens to their IDs,
    5) creates the attention mask,
    6) returns a dictionary of outputs.

  Args:
    data: iterable of raw tweet strings.
    max_len: number of token ids per row. When None (the default) the
      module-level `MAX_LEN` is used, which matches the previous behaviour.

  Returns:
    (input_ids, attention_masks): two tensors built from the per-tweet lists
    of ids and masks, one row per tweet.
  """
  if max_len is None:
    # Looked up at call time, so MAX_LEN only has to exist before the first call.
    max_len = MAX_LEN

  input_ids = []
  attention_masks = []

  for sent in data:
    # Truncation is requested explicitly: padding alone only lengthens short
    # rows, and a row longer than `max_len` would make the lists ragged so
    # that torch.tensor() below could not stack them.
    encoded_sent = tokenizer.encode_plus(text=text_preprocessing(sent),
                                         add_special_tokens=True,
                                         max_length=max_len,
                                         padding='max_length',
                                         truncation=True,
                                         return_attention_mask=True)

    input_ids.append(encoded_sent.get('input_ids'))
    attention_masks.append(encoded_sent.get('attention_mask'))

  # Convert the lists to tensors
  input_ids = torch.tensor(input_ids)
  attention_masks = torch.tensor(attention_masks)

  return input_ids, attention_masks

#### 3.1 Get the maximum sentence length

In [None]:
# Pool the labelled tweets with the unlabelled test tweets
all_tweets = np.concatenate([data['tweet'].values, test_data['tweet'].values])

# Token-id sequence (special tokens included) for every tweet.
# NOTE(review): lengths are measured on the raw tweets, not on the output of
# text_preprocessing() that preprocessing_for_bert() feeds to the tokenizer.
encoded_tweets = [tokenizer.encode(tweet, add_special_tokens=True) for tweet in all_tweets]

# Longest sequence seen across both sets
max_len = max(map(len, encoded_tweets))
print("Maximum sentence length : ", max_len)

Maximum sentence length :  68


#### 3.2 Tokenize the data

In [None]:
# Fixed number of token ids per tweet, read by preprocessing_for_bert().
MAX_LEN = 64

# Print a sample transformation: ids of the first tweet as a flat list
token_ids = list(
    preprocessing_for_bert([X[0]])[0]
    .squeeze()
    .numpy()
)
print("Original text : ", X[0])
print("Token IDs : ", token_ids)

Original text :  @united I'm having issues. Yesterday I rebooked for 24 hours after I was supposed to fly, now I can't log on &amp; check in. Can you help?
Token IDs :  [101, 1045, 1005, 1049, 2383, 3314, 1012, 7483, 1045, 2128, 8654, 2098, 2005, 2484, 2847, 2044, 1045, 2001, 4011, 2000, 4875, 1010, 2085, 1045, 2064, 1005, 1056, 8833, 2006, 1004, 4638, 1999, 1012, 2064, 2017, 2393, 1029, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [None]:
# Tokenize both splits (training first, then validation) into ids and masks.
(train_inputs, train_masks), (val_inputs, val_masks) = (
    preprocessing_for_bert(split) for split in (X_train, X_val)
)

#### 3.3 Create PyTorch DataLoader

In [None]:
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler

# Labels of both splits as tensors
train_labels, val_labels = torch.tensor(y_train), torch.tensor(y_val)

# Mini-batch size shared by both loaders (16 or 32 is the recommended range)
batch_size = 32

# Training loader: (ids, mask, label) triples drawn in random order
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    train_data,
    batch_size=batch_size,
    sampler=train_sampler,
)

# Validation loader: same triples, read sequentially
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(
    val_data,
    batch_size=batch_size,
    sampler=val_sampler,
)

## 4. Train the model

#### 4.1 Create a custom BERT classifier

In [None]:
%%time

import torch.nn as nn
from transformers import BertModel


class BertClassifier(nn.Module):
  """Pretrained BERT encoder with a small feed-forward head on top.

  The head is Linear -> ReLU -> Linear and emits one logit per class (2 classes).

  Args:
    freeze_bert: when True, the encoder's parameters are excluded from gradient
      updates so that only the head is trained.
  """

  def __init__(self, freeze_bert=False):
    super().__init__()

    # Head sizes: input width (must match what the encoder outputs per token),
    # hidden units, number of output classes.
    encoder_dim, hidden_units, n_classes = 768, 50, 2

    # Pretrained encoder
    self.bert = BertModel.from_pretrained('bert-base-uncased')

    # Classification head with a single hidden layer
    self.classifier = nn.Sequential(
        nn.Linear(encoder_dim, hidden_units),
        nn.ReLU(),
        nn.Linear(hidden_units, n_classes),
    )

    if freeze_bert:
      # Turn off gradients for every encoder parameter
      self.bert.requires_grad_(False)

  def forward(self, input_ids, attention_mask):
    """Return the class logits for a batch of token ids and attention masks."""
    encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)

    # First element of the encoder output, at sequence position 0 (the [CLS] token)
    cls_vector = encoder_out[0][:, 0, :]

    return self.classifier(cls_vector)
    

CPU times: user 18.9 ms, sys: 0 ns, total: 18.9 ms
Wall time: 19.3 ms


#### 4.2 Optimizer and Learning rate scheduler

In [None]:
# AdamW comes from PyTorch: the implementation that `transformers` used to
# export is deprecated and is not available in newer releases of that package.
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

def initialize_model(epochs=2):
  """Build the classifier together with its optimizer and LR scheduler.

  Reads the module-level `device` and `train_dataloader`; both must exist
  before this function is called.

  Args:
    epochs: number of epochs the model will be trained for. It only determines
      the total number of scheduler steps (batches per epoch * epochs).

  Returns:
    (bert_classifier, optimizer, scheduler)
  """
  # Instantiate the BERT classifier (encoder is fine-tuned, not frozen)
  bert_classifier = BertClassifier(freeze_bert=False)

  # Tell pytorch to run the model on the selected device
  bert_classifier.to(device)

  # Create the optimizer. weight_decay=0.0 is passed explicitly so that no
  # weight decay is applied, as with the optimizer this code used before
  # (torch's own default for AdamW is non-zero).
  optimizer = AdamW(bert_classifier.parameters(), lr=5e-5, eps=1e-8, weight_decay=0.0)

  # Total number of training steps
  total_steps = len(train_dataloader) * epochs

  # Linear decay of the learning rate over all steps, without warm-up
  scheduler = get_linear_schedule_with_warmup(optimizer,
                                              num_warmup_steps=0,
                                              num_training_steps=total_steps)

  return bert_classifier, optimizer, scheduler

#### 4.3 Run the Training loop

The following steps are performed for every training batch:

1. Unpack the data from the DataLoader and load it to the GPU
2. Zero out the gradients calculated in the previous pass
3. Perform a forward pass to compute the logits and loss
4. Perform a backward pass to compute the gradients --> loss.backward()
5. Clip the norm of the gradients to 1.0 to prevent exploding gradients
6. Update the model's parameters  --> optimizer.step()
7. Update the learning rate --> scheduler.step()

Evaluation

1. Unpack the validation data and load on to the GPU
2. Forward pass
3. Compute the loss and accuracy

In [None]:
import random
import time

# Specify the loss function: cross-entropy between the classifier's logits and
# the integer labels. Both train() and evaluate() read this module-level object.
loss_fn = nn.CrossEntropyLoss()

def set_seed(seed_value=42):
  """Seed Python's `random`, NumPy and PyTorch (CPU and all CUDA devices)."""
  seeders = (
      random.seed,
      np.random.seed,
      torch.manual_seed,
      torch.cuda.manual_seed_all,
  )
  for seed_with in seeders:
    seed_with(seed_value)


def train(model, train_dataloader, val_dataloader=None, epochs=2, evaluation=False):
  """Train `model` for `epochs` passes over `train_dataloader`.

  Relies on module-level objects that must exist before the call: `optimizer`,
  `scheduler`, `loss_fn`, `device` and, when `evaluation` is set, `evaluate`.

  For every batch: move the tensors to `device`, zero the gradients, run the
  forward pass, compute the loss, back-propagate, clip the gradient norm to
  1.0, then step the optimizer and the scheduler. A progress row is printed
  every 20 batches and after the last batch of an epoch.

  Args:
    model: module called as `model(input_ids, attention_mask)`, returning logits.
    train_dataloader: yields (input_ids, attention_mask, labels) batches.
    val_dataloader: validation batches in the same layout; required when
      `evaluation` is true.
    epochs: number of passes over the training data.
    evaluation: when true, run `evaluate` after every epoch and print the
      validation loss and accuracy.

  Raises:
    ValueError: if `evaluation` is true but no `val_dataloader` was given.
  """
  # Fail before any training work is done instead of at the end of epoch 1.
  if evaluation and val_dataloader is None:
    raise ValueError("evaluation=True requires a val_dataloader")

  print("Starting the training step...")

  for epoch_i in range(epochs):

    # Print the header of the summary
    print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")
    print("-"*70)

    # Measure the time for each epoch and for each reporting window
    t0_epoch, t0_batch = time.time(), time.time()

    # Reset the tracking variables at the start of each epoch
    total_loss, batch_loss, batch_counts = 0, 0, 0

    # Put the model into the training mode (evaluate() switches it to eval mode)
    model.train()

    # Train for each batch of data
    #=============================
    for step, batch in enumerate(train_dataloader):

      batch_counts += 1

      # Load the data to the device
      b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

      # Zero out previous gradients
      model.zero_grad()

      # Perform a forward-pass to get logits
      logits = model(b_input_ids, b_attn_mask)

      # Compute the loss and accumulate the loss value
      loss = loss_fn(logits, b_labels)
      batch_loss += loss.item()
      total_loss += loss.item()

      # Perform a backward pass to calculate the gradients
      loss.backward()

      # Clip the norm of the gradients to 1.0 to prevent exploding gradients
      torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

      # Update the parameters and the learning rate
      optimizer.step()
      scheduler.step()

      # Print the loss values for every 20 batches and for the final batch
      if (step % 20 == 0 and step != 0) or (step == len(train_dataloader) - 1):
        time_elapsed = time.time() - t0_batch

        # Mean loss over the batches seen since the previous progress row
        print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")

        # Reset batch tracking variables
        batch_loss, batch_counts = 0, 0
        t0_batch = time.time()

    # Calculate the average loss over the entire training data
    avg_train_loss = total_loss / len(train_dataloader)

    print("-"*70)

    # Evaluation
    # ==========
    if evaluation:
      # After the completion of each training epoch, measure the model's
      # performance on the validation set.
      val_loss, val_accuracy = evaluate(model, val_dataloader)

      # Print the epoch summary: mean training loss plus validation metrics
      time_elapsed = time.time() - t0_epoch

      print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^9.2f} | {time_elapsed:^9.2f}")
      print("-"*70)
      print("\n")

  # Printed once, after the last epoch has finished.
  print("Training complete!")


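# NOTE: superseded draft of evaluate(), kept for reference only; the active
# definition is the uncommented evaluate() further down in this cell.
# def evaluate(model, val_dataloader):

#   # Put the model into the evaluation mode. Dropout layers are disabled during the testing time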
#   model.eval()

#   # Initiate the tracking variables
#   val_accuracy = []
#   val_loss = []

#   # For each batch of validation data
#   for batch in val_dataloader:

#     # Load the data to GPU
#     b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

#     # Compute the logits
#     with torch.no_grad():
#       logits = model(b_input_ids, b_attn_mask)

#     # Compute loss
#     loss = loss_fn(logits, b_labels)
#     val_loss.append(loss)

#     # Get the predictions
#     preds = torch.argmax(logits, dim=1).flatten()

#     # Calculate the accuracy
#     accuracy = (preds == b_labels).cpu().numpy().mean() * 100
#     val_accuracy.append(accuracy)

#   # Compute the average accuracy and loss over the entire validation set
#   val_loss = np.mean(val_loss)
#   val_accuracy = np.mean(val_accuracy)

#   return val_loss, val_accuracy


def evaluate(model, val_dataloader):
    """Measure the model's loss and accuracy on a validation set.

    Reads the module-level `device` and `loss_fn`. Both metrics are weighted
    by the number of examples in each batch, so a smaller final batch does not
    count as much as a full one. The loss weighting assumes that `loss_fn`
    returns the mean loss of the batch it is given.

    Args:
        model: module called as `model(input_ids, attention_mask)`, returning
            one row of logits per example.
        val_dataloader: yields (input_ids, attention_mask, labels) batches.

    Returns:
        (val_loss, val_accuracy): mean loss per example and accuracy in
        percent. Both are NaN when the loader yields no examples.
    """
    # Put the model into the evaluation mode. The dropout layers are disabled
    # during the test time.
    model.eval()

    # Running totals over the whole validation set
    loss_sum = 0.0
    n_correct = 0
    n_examples = 0

    # For each batch in our validation set...
    for batch in val_dataloader:
        # Load batch to the device
        b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

        # Compute logits and loss without tracking gradients
        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)
            loss = loss_fn(logits, b_labels)

        batch_size = b_labels.size(0)

        # Turn the batch mean back into a sum so batches of different size
        # contribute proportionally.
        loss_sum += loss.item() * batch_size

        # Count the correct predictions
        preds = torch.argmax(logits, dim=1).flatten()
        n_correct += (preds == b_labels).sum().item()
        n_examples += batch_size

    # Nothing was evaluated: there is no meaningful average to report.
    if n_examples == 0:
        return float('nan'), float('nan')

    # Average over every validation example
    val_loss = loss_sum / n_examples
    val_accuracy = 100.0 * n_correct / n_examples

    return val_loss, val_accuracy

#### 4.4 Now the most interesting part!.....start the training

In [None]:
set_seed(42)
bert_classifier, optimizer, scheduler = initialize_model(epochs=2)
train(bert_classifier, train_dataloader, val_dataloader, epochs=2, evaluation=True)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Starting the training step...
 Epoch  |  Batch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
----------------------------------------------------------------------
   1    |   20    |   0.637324   |     -      |     -     |   13.97  
   1    |   40    |   0.556735   |     -      |     -     |   13.15  
   1    |   60    |   0.459118   |     -      |     -     |   13.20  
   1    |   80    |   0.482745   |     -      |     -     |   13.24  
   1    |   95    |   0.484808   |     -      |     -     |   9.74   
----------------------------------------------------------------------
   1    |    -    |   0.527374   |  0.397946  |   82.56   |   65.65  
----------------------------------------------------------------------


Training complete!
 Epoch  |  Batch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
----------------------------------------------------------------------
   2    |   20    |   0.296510   |     -      |     -     |   13.89  
   2    |   40    |   0.295367   | 