In [9]:
# Import all the required libraries
import pandas as pd
from transformers import BertTokenizer
import torch
from torch.utils.data import TensorDataset

In [10]:
# Choose Dataset
# StereoSet
# ToxicBias
dataset = 'StereoSet'

#Choose encoder model
# bert-base-uncased

model = 'bert-base-uncased'

In [13]:
df_train = pd.read_csv(f'Dataset/{dataset}/train.csv')
df_val = pd.read_csv(f'Dataset/{dataset}/val.csv')
df_test = pd.read_csv(f'Dataset/{dataset}/test.csv')
print('Training size: ', df_train.size)
print('Validation size: ', df_val.size)
print('Test size: ', df_test.size)

Training size:  12226
Validation size:  1360
Test size:  3398


In [14]:
# List of sentences and labels
sentences_train = df_train.Sentence.values
labels_train = df_train.labels.values
sentences_val = df_val.Sentence.values
labels_val = df_val.labels.values

In [15]:
# Import tokenizer
tokenizer = BertTokenizer.from_pretrained(model, do_lower_case=True)

In [16]:
# Print the original sentence.
print(' Original: ', sentences_train[0])

# Print the sentence split into tokens.
print('Tokenized: ', tokenizer.tokenize(sentences_train[0]))

# Print the sentence mapped to token ids.
print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentences_train[0])))

 Original:  The schoolboy has some young qualities about him. Young qualities are usually producing charming people.
Tokenized:  ['the', 'school', '##boy', 'has', 'some', 'young', 'qualities', 'about', 'him', '.', 'young', 'qualities', 'are', 'usually', 'producing', 'charming', 'people', '.']
Token IDs:  [1996, 2082, 11097, 2038, 2070, 2402, 11647, 2055, 2032, 1012, 2402, 11647, 2024, 2788, 5155, 11951, 2111, 1012]


In [17]:
# Function that encode every sentence, add padding and return the input ids and attention mask
def encode_sentences(sentences):
    input_ids = []
    attention_mask = []

    # For every sentence
    for sent in sentences:
        encoded_dict = tokenizer.encode_plus(
            sent,
            add_special_tokens=True,
            max_length=64,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        #Add the encoded sentence to list
        input_ids.append(encoded_dict['input_ids'])

        #Add attention mask
        attention_mask.append(encoded_dict['attention_mask'])
    return input_ids, attention_mask

In [18]:
# Get input ids and attention mask
train_input_ids, train_attention_mask = encode_sentences(sentences_train)
val_input_ids, val_attention_mask = encode_sentences(sentences_val)

#Convert the list into tensors
train_input_ids = torch.cat(train_input_ids, dim=0)
train_attention_mask = torch.cat(train_attention_mask, dim=0)
labels_train = torch.tensor(labels_train)
val_input_ids = torch.cat(val_input_ids, dim=0)
val_attention_mask = torch.cat(val_attention_mask, dim=0)
labels_val = torch.tensor(labels_val)

#Print sentence 0 
print('Original: ', sentences_train[0])
print('Token ID: ', train_input_ids[0])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Original:  The schoolboy has some young qualities about him. Young qualities are usually producing charming people.
Token ID:  tensor([  101,  1996,  2082, 11097,  2038,  2070,  2402, 11647,  2055,  2032,
         1012,  2402, 11647,  2024,  2788,  5155, 11951,  2111,  1012,   102,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0])
