In [37]:
# Import all the required libraries
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_linear_schedule_with_warmup
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import numpy as np
from sklearn.metrics import classification_report
import time
import datetime
import random

In [38]:
# Choose Dataset
# StereoSet
# ToxicBias
dataset = 'StereoSet'
dataset_type = 'Stereotype'

# BERT large    ----> bert-large-uncased
# ALBERT large  ----> albert-xxlarge-v2
# RoBERTa large ----> roberta-large
model_name = 'roberta-large'

# Define Parameters
learning_rate = 2e-5
epochs = 4
batch_size = 32

In [39]:
df_train = pd.read_csv(f'Dataset/{dataset}/train.csv')
df_val = pd.read_csv(f'Dataset/{dataset}/val.csv')
df_test = pd.read_csv(f'Dataset/{dataset}/test.csv')
print('Training shape: ', df_train.shape)
print('Validation shape: ', df_val.shape)
print('Test shape: ', df_test.shape)

Training shape:  (6113, 2)
Validation shape:  (680, 2)
Test shape:  (1699, 2)


In [40]:
# List of sentences and labels
sentences_train = df_train.Sentence.values
labels_train = df_train.labels.values
sentences_val = df_val.Sentence.values
labels_val = df_val.labels.values

print(len(df_train['labels']))

print(f"{dataset_type} instances in training data: ", df_train["labels"].value_counts())
print(f"{dataset_type} instances in validation data: ", df_val["labels"].value_counts())

6113
Stereotype instances in training data:  labels
0    3098
1    3015
Name: count, dtype: int64
Stereotype instances in validation data:  labels
0    340
1    340
Name: count, dtype: int64


In [41]:
# Import tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)

In [42]:
# Print the original sentence.
print(' Original: ', sentences_train[0])

# Print the sentence split into tokens.
print('Tokenized: ', tokenizer.tokenize(sentences_train[0]))

# Print the sentence mapped to token ids.
print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentences_train[0])))

 Original:  The schoolboy has some young qualities about him. Young qualities are usually producing charming people.
Tokenized:  ['the', 'school', '##boy', 'has', 'some', 'young', 'qualities', 'about', 'him', '.', 'young', 'qualities', 'are', 'usually', 'producing', 'charming', 'people', '.']
Token IDs:  [1996, 2082, 11097, 2038, 2070, 2402, 11647, 2055, 2032, 1012, 2402, 11647, 2024, 2788, 5155, 11951, 2111, 1012]


In [43]:
# Function that encode every sentence, add padding and return the input ids and attention mask
def encode_sentences(sentences):
    input_ids = []
    attention_mask = []

    # For every sentence
    for sent in sentences:
        encoded_dict = tokenizer.encode_plus(
            sent,
            add_special_tokens=True,
            max_length=64,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        #Add the encoded sentence to list
        input_ids.append(encoded_dict['input_ids'])

        #Add attention mask
        attention_mask.append(encoded_dict['attention_mask'])
    return input_ids, attention_mask

In [44]:
# Get input ids and attention mask
train_input_ids, train_attention_mask = encode_sentences(sentences_train)
val_input_ids, val_attention_mask = encode_sentences(sentences_val)

#Convert the list into tensors
train_input_ids = torch.cat(train_input_ids, dim=0)
train_attention_mask = torch.cat(train_attention_mask, dim=0)
labels_train = torch.tensor(labels_train)
val_input_ids = torch.cat(val_input_ids, dim=0)
val_attention_mask = torch.cat(val_attention_mask, dim=0)
labels_val = torch.tensor(labels_val)

#Print sentence 0 
print('Original: ', sentences_train[0])
print('Token ID: ', train_input_ids[0])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.




Original:  The schoolboy has some young qualities about him. Young qualities are usually producing charming people.
Token ID:  tensor([  101,  1996,  2082, 11097,  2038,  2070,  2402, 11647,  2055,  2032,
         1012,  2402, 11647,  2024,  2788,  5155, 11951,  2111,  1012,   102,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0])


In [45]:
# Create a TensorDataset
train_dataset = TensorDataset(train_input_ids, train_attention_mask, labels_train)
val_dataset = TensorDataset(val_input_ids, val_attention_mask, labels_val)

In [46]:
# Create a DataLoader
train_dataloader = DataLoader(
            train_dataset,
            sampler=RandomSampler(train_dataset),
            batch_size=batch_size
        )

validation_dataloader = DataLoader(
            val_dataset,
            sampler=SequentialSampler(val_dataset),
            batch_size=batch_size
        )

In [47]:
# Load the model
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False
)

device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1

In [48]:
# Define optimizer
optimizer = AdamW(model.parameters(),
                  lr=learning_rate,
                  eps=1e-8
                  )

# Total number of training steps is [number of batches] x [number of epochs]. 
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)



In [49]:
# Function to calculate accuracy
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [50]:
# Function to format time
def format_time(elapsed):
    elapsed_rounded = int(round((elapsed)))
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [51]:
# Set the seed value all over the place to make this reproducible.
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [52]:
# To store training and validation loss
training_stats = []

# Measure the total training time for the whole run.
total_t0 = time.time()

# For each epoch
for epoch in range(epochs):
    # TRAINING
    print('======== Epoch {:} / {:} ========'.format(epoch + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_train_loss = 0

    # Put the model into training mode
    model.train()

    # For each batch of training data
    for step, batch in enumerate(train_dataloader):
        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)

            # Report progress.
            print('Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Unpack this training batch from dataloader.
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Clear any previously calculated gradients before performing a backward pass.
        model.zero_grad()

        # Perform a forward pass
        loss, logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels, return_dict=False)

        # Accumulate the training loss over all of the batches
        total_train_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient.
        optimizer.step()

        # Update the learning rate.
        scheduler.step()
    
    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)

    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print('Average training loss: {0:.2f}'.format(avg_train_loss))
    print('Training epoch took: {:}'.format(training_time))

    # VALIDATION
    print('Running Validation...')
    t0 = time.time()

    # Put the model in evaluation mode
    model.eval()

    # Tracking variables
    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0

    # Predictions list and true labels list
    (predictions, true_labels) = ([], [])

    # Evaluate data for one epoch
    for batch in validation_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        with torch.no_grad():
            (loss, logits) = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels, return_dict=False)

        total_eval_loss += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        predictions.append(logits)
        true_labels.append(label_ids)

        total_eval_accuracy += flat_accuracy(logits, label_ids)
    
    # Average validation accuracy
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print('Accuracy: {0:.2f}'.format(avg_val_accuracy))

    # Average loss over all the batches
    avg_val_loss = total_eval_loss / len(validation_dataloader)

    # Measure how long the validation run took
    validation_time = format_time(time.time() - t0)

    print('Validation Loss: {0:.2f}'.format(avg_val_loss))
    print('Validation took: {:}'.format(validation_time))

    # get the predictions and true labels in form of list
    (pred_list, true_list) = ([], [])
    for i in range(len(true_labels)):
        pred_labels_i = np.argmax(predictions[i], axis=1).flatten()
        pred_list.extend(list(pred_labels_i))
        true_list.extend(list(true_labels[i]))

    # Print classification report
    f1_score = classification_report(true_list, pred_list, output_dict=True)
    print(f1_score['weighted avg']['f1-score'])
    print(pd.DataFrame(f1_score).transpose())

    # Record all statistics from this epoch
    training_stats.append(
        {
            'epoch': epoch + 1,
            'f1-score': f1_score['weighted avg']['f1-score'],
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )
    model.save_pretrained(f'Models/{model_name}/{dataset}/{epoch}.pt')

print('Training complete!')
print('Total training took {:} (h:mm:ss)'.format(format_time(time.time() - total_t0)))
      


Training...
Batch    40 of   192. Elapsed: 0:00:10.
Batch    80 of   192. Elapsed: 0:00:21.
Batch   120 of   192. Elapsed: 0:00:31.
Batch   160 of   192. Elapsed: 0:00:42.
Average training loss: 0.57
Training epoch took: 0:00:50
Running Validation...
Accuracy: 0.81
Validation Loss: 0.44
Validation took: 0:00:02
0.8027346963517177
              precision  recall  f1-score  support
0                  0.82    0.77      0.80    340.0
1                  0.78    0.84      0.81    340.0
accuracy           0.80    0.80      0.80      0.8
macro avg          0.80    0.80      0.80    680.0
weighted avg       0.80    0.80      0.80    680.0
Training...
Batch    40 of   192. Elapsed: 0:00:10.
Batch    80 of   192. Elapsed: 0:00:21.
Batch   120 of   192. Elapsed: 0:00:31.
Batch   160 of   192. Elapsed: 0:00:42.
Average training loss: 0.33
Training epoch took: 0:00:50
Running Validation...
Accuracy: 0.85
Validation Loss: 0.40
Validation took: 0:00:02
0.8528788154148385
              precision  recal

In [53]:
# Display floats with two decimal places.
pd.set_option('display.precision', 2)

# Create a DataFrame from our training statistics.
df_stats = pd.DataFrame(data=training_stats)

# Use the 'epoch' as the row index.
df_stats = df_stats.set_index('epoch')

# A hack to force the column headers to wrap.
#df = df.style.set_table_styles([dict(selector="th",props=[('max-width', '70px')])])

# Display the table.
df_stats.to_csv(f'Models/{model_name}/{dataset}/stats.csv')
df_stats

Unnamed: 0_level_0,f1-score,Training Loss,Valid. Loss,Valid. Accur.,Training Time,Validation Time
epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0.8,0.57,0.44,0.81,0:00:50,0:00:02
2,0.85,0.33,0.4,0.85,0:00:50,0:00:02
3,0.89,0.15,0.37,0.88,0:00:50,0:00:02
4,0.89,0.07,0.41,0.88,0:00:50,0:00:02


In [16]:
# Run the model on Test data
import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

In [29]:
# define variables
test_model_name = 'roberta-large'
test_dataset = 'StereoSet'
test_model_number = 3


In [30]:
# Import model
model_path = f'Models/{test_model_name}/{test_dataset}/{test_model_number}.pt'
test_model = AutoModelForSequenceClassification.from_pretrained(model_path, output_attentions=True)

# Import tokenizer
tokenizer = AutoTokenizer.from_pretrained(test_model_name, do_lower_case=True)

# Load test data

df_test = pd.read_csv(f'Dataset/{test_dataset}/test.csv')

# Store predicted and true labels and sentences
(test_sentences, test_predicted_labels, test_true_labels) = ([], [], [])

for i in range(len(df_test)):
    sentence = df_test['Sentence'][i]
    label = df_test['labels'][i]
    encoded_dict = tokenizer.encode_plus(
        sentence,
        add_special_tokens=True,
        max_length=64,
        pad_to_max_length=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    input_ids = encoded_dict['input_ids']
    attention_mask = encoded_dict['attention_mask']

    with torch.no_grad():
        output = test_model(input_ids, token_type_ids=None, attention_mask=attention_mask, return_dict=True)
        logits = output.logits
        
    logits = logits.detach().cpu().numpy()
    pred_label = np.argmax(logits, axis=1).flatten()
    test_sentences.append(sentence)
    test_predicted_labels.append(pred_label)
    test_true_labels.append(label)
    if(i%100 == 0):
        print('Completed ', i)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Completed  0
Completed  100
Completed  200
Completed  300
Completed  400
Completed  500
Completed  600
Completed  700
Completed  800
Completed  900
Completed  1000
Completed  1100
Completed  1200
Completed  1300
Completed  1400
Completed  1500
Completed  1600


In [32]:
# get classification report
f1_score = classification_report(test_true_labels, test_predicted_labels)
print(f1_score)

# get confusion matrix
conf_matrix = confusion_matrix(test_true_labels, test_predicted_labels)
print(conf_matrix)

              precision    recall  f1-score   support

           0       0.87      0.88      0.88       808
           1       0.89      0.88      0.89       891

    accuracy                           0.88      1699
   macro avg       0.88      0.88      0.88      1699
weighted avg       0.88      0.88      0.88      1699

[[713  95]
 [104 787]]
