In [1]:
# Import all the required libraries
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, BertConfig, get_linear_schedule_with_warmup
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import numpy as np
import time
import datetime
import random

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Choose Dataset
# StereoSet
# ToxicBias
dataset = 'ToxicBias'

#Choose encoder model
# bert-base-uncased

model_name = 'bert-base-uncased'

# Define Parameters
learning_rate = 2e-5
epochs = 4

In [3]:
df_train = pd.read_csv(f'Dataset/{dataset}/train.csv')
df_val = pd.read_csv(f'Dataset/{dataset}/val.csv')
df_test = pd.read_csv(f'Dataset/{dataset}/test.csv')
print('Training size: ', df_train.size)
print('Validation size: ', df_val.size)
print('Test size: ', df_test.size)

Training size:  25962
Validation size:  2592
Test size:  3900


In [4]:
# List of sentences and labels
sentences_train = df_train.Sentence.values
labels_train = df_train.labels.values
sentences_val = df_val.Sentence.values
labels_val = df_val.labels.values

In [5]:
# Import tokenizer
tokenizer = BertTokenizer.from_pretrained(model_name, do_lower_case=True)

In [6]:
# Print the original sentence.
print(' Original: ', sentences_train[0])

# Print the sentence split into tokens.
print('Tokenized: ', tokenizer.tokenize(sentences_train[0]))

# Print the sentence mapped to token ids.
print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentences_train[0])))

 Original:  We are still having their water put at risk in the Dakotas for some white mans profit. When white people complained about the same issue the natives were just not important enough to care about their water supply. Some of you white folks refuse to acknowledge how much you benefit from the rape robbery torture and murder done in your benefit. Many of you are quite proud in your self righteous ignorance. Or maybe you just dont think those lives matter. Or they both may be true.
Tokenized:  ['we', 'are', 'still', 'having', 'their', 'water', 'put', 'at', 'risk', 'in', 'the', 'dakota', '##s', 'for', 'some', 'white', 'mans', 'profit', '.', 'when', 'white', 'people', 'complained', 'about', 'the', 'same', 'issue', 'the', 'natives', 'were', 'just', 'not', 'important', 'enough', 'to', 'care', 'about', 'their', 'water', 'supply', '.', 'some', 'of', 'you', 'white', 'folks', 'refuse', 'to', 'acknowledge', 'how', 'much', 'you', 'benefit', 'from', 'the', 'rape', 'robbery', 'torture', 'and

In [7]:
# Function that encode every sentence, add padding and return the input ids and attention mask
def encode_sentences(sentences):
    input_ids = []
    attention_mask = []

    # For every sentence
    for sent in sentences:
        encoded_dict = tokenizer.encode_plus(
            sent,
            add_special_tokens=True,
            max_length=64,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        #Add the encoded sentence to list
        input_ids.append(encoded_dict['input_ids'])

        #Add attention mask
        attention_mask.append(encoded_dict['attention_mask'])
    return input_ids, attention_mask

In [8]:
# Get input ids and attention mask
train_input_ids, train_attention_mask = encode_sentences(sentences_train)
val_input_ids, val_attention_mask = encode_sentences(sentences_val)

#Convert the list into tensors
train_input_ids = torch.cat(train_input_ids, dim=0)
train_attention_mask = torch.cat(train_attention_mask, dim=0)
labels_train = torch.tensor(labels_train)
val_input_ids = torch.cat(val_input_ids, dim=0)
val_attention_mask = torch.cat(val_attention_mask, dim=0)
labels_val = torch.tensor(labels_val)

#Print sentence 0 
print('Original: ', sentences_train[0])
print('Token ID: ', train_input_ids[0])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.




Original:  We are still having their water put at risk in the Dakotas for some white mans profit. When white people complained about the same issue the natives were just not important enough to care about their water supply. Some of you white folks refuse to acknowledge how much you benefit from the rape robbery torture and murder done in your benefit. Many of you are quite proud in your self righteous ignorance. Or maybe you just dont think those lives matter. Or they both may be true.
Token ID:  tensor([  101,  2057,  2024,  2145,  2383,  2037,  2300,  2404,  2012,  3891,
         1999,  1996,  7734,  2015,  2005,  2070,  2317, 16042,  5618,  1012,
         2043,  2317,  2111, 10865,  2055,  1996,  2168,  3277,  1996, 12493,
         2020,  2074,  2025,  2590,  2438,  2000,  2729,  2055,  2037,  2300,
         4425,  1012,  2070,  1997,  2017,  2317, 12455, 10214,  2000, 13399,
         2129,  2172,  2017,  5770,  2013,  1996,  9040, 13742,  8639,  1998,
         4028,  2589,  1999, 

In [9]:
# Create a TensorDataset
train_dataset = TensorDataset(train_input_ids, train_attention_mask, labels_train)
val_dataset = TensorDataset(val_input_ids, val_attention_mask, labels_val)

In [10]:
# Define batch size
batch_size = 32

# Create a DataLoader
train_dataloader = DataLoader(
            train_dataset,
            sampler=RandomSampler(train_dataset),
            batch_size=batch_size
        )

validation_dataloader = DataLoader(
            val_dataset,
            sampler=SequentialSampler(val_dataset),
            batch_size=batch_size
        )

In [11]:
# Load the model
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False
)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [12]:
# Define optimizer
optimizer = AdamW(model.parameters(),
                  lr=learning_rate,
                  eps=1e-8
                  )

# Total number of training steps is [number of batches] x [number of epochs]. 
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)



In [13]:
# Function to calculate accuracy
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [14]:
# Function to format time
def format_time(elapsed):
    elapsed_rounded = int(round((elapsed)))
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [15]:
# Set the seed value all over the place to make this reproducible.
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [16]:
# To store training and validation loss
training_stats = []

# Measure the total training time for the whole run.
total_t0 = time.time()

In [17]:
# For each epoch
for epoch in range(epochs):
    # TRAINING
    print('======== Epoch {:} / {:} ========'.format(epoch + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_train_loss = 0

    # Put the model into training mode
    model.train()

    # For each batch of training data
    for step, batch in enumerate(train_dataloader):
        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)

            # Report progress.
            print('Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Unpack this training batch from dataloader.
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Clear any previously calculated gradients before performing a backward pass.
        model.zero_grad()

        # Perform a forward pass
        loss, logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels, return_dict=False)

        # Accumulate the training loss over all of the batches
        total_train_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient.
        optimizer.step()

        # Update the learning rate.
        scheduler.step()
    
    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)

    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print('Average training loss: {0:.2f}'.format(avg_train_loss))
    print('Training epoch took: {:}'.format(training_time))

    # VALIDATION
    print('Running Validation...')
    t0 = time.time()

    # Put the model in evaluation mode
    model.eval()

    # Tracking variables
    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        with torch.no_grad():
            (loss, logits) = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels, return_dict=False)

        total_eval_loss += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        total_eval_accuracy += flat_accuracy(logits, label_ids)
    
    # Average validation accuracy
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print('Accuracy: {0:.2f}'.format(avg_val_accuracy))

    # Average loss over all the batches
    avg_val_loss = total_eval_loss / len(validation_dataloader)

    # Measure how long the validation run took
    validation_time = format_time(time.time() - t0)

    print('Validation Loss: {0:.2f}'.format(avg_val_loss))
    print('Validation took: {:}'.format(validation_time))

    # Record all statistics from this epoch
    training_stats.append(
        {
            'epoch': epoch + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print('Training complete!')
print('Total training took {:} (h:mm:ss)'.format(format_time(time.time() - total_t0)))
      


Training...
Batch    40 of   136. Elapsed: 0:00:04.
Batch    80 of   136. Elapsed: 0:00:07.
Batch   120 of   136. Elapsed: 0:00:11.
Average training loss: 0.48
Training epoch took: 0:00:12
Running Validation...
Accuracy: 0.81
Validation Loss: 0.44
Validation took: 0:00:00
Training...
Batch    40 of   136. Elapsed: 0:00:03.
Batch    80 of   136. Elapsed: 0:00:07.
Batch   120 of   136. Elapsed: 0:00:10.
Average training loss: 0.40
Training epoch took: 0:00:11
Running Validation...
Accuracy: 0.82
Validation Loss: 0.43
Validation took: 0:00:00
Training...
Batch    40 of   136. Elapsed: 0:00:03.
Batch    80 of   136. Elapsed: 0:00:07.
Batch   120 of   136. Elapsed: 0:00:10.
Average training loss: 0.31
Training epoch took: 0:00:11
Running Validation...
Accuracy: 0.84
Validation Loss: 0.45
Validation took: 0:00:00
Training...
Batch    40 of   136. Elapsed: 0:00:03.
Batch    80 of   136. Elapsed: 0:00:07.
Batch   120 of   136. Elapsed: 0:00:10.
Average training loss: 0.23
Training epoch took: 

In [18]:
# Display floats with two decimal places.
pd.set_option('display.precision', 2)

# Create a DataFrame from our training statistics.
df_stats = pd.DataFrame(data=training_stats)

# Use the 'epoch' as the row index.
df_stats = df_stats.set_index('epoch')

# A hack to force the column headers to wrap.
#df = df.style.set_table_styles([dict(selector="th",props=[('max-width', '70px')])])

# Display the table.
df_stats

Unnamed: 0_level_0,Training Loss,Valid. Loss,Valid. Accur.,Training Time,Validation Time
epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.48,0.44,0.81,0:00:12,0:00:00
2,0.4,0.43,0.82,0:00:11,0:00:00
3,0.31,0.45,0.84,0:00:11,0:00:00
4,0.23,0.47,0.81,0:00:11,0:00:00
