In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertConfig, BertForTokenClassification

In [6]:
# Load the annotated training data. Later cells read its `sentence` and
# `word_labels` columns. The bare `data.shape` displays (rows, columns).
data = pd.read_csv('./final_dataset/train.csv')
data.shape

(4036, 2)

In [7]:
# Tag vocabulary for the token classifier. List position defines the integer
# class id, so the order must not change once a model has been trained.
# NOTE(review): the `i-` variants presumably mark continuation tokens of a
# multi-word span -- confirm against the annotation scheme.
temp_labels = [
    'O',
    'Generic she',
    'Generic he',
    'Behavioural Stereotypes',
    'i-Behavioural Stereotypes',
    'Societal Stereotypes',
    'i-Societal Stereotypes',
    'Explicit Marking of Sex',
    'i-Explicit Marking of Sex',
]

# Forward (label -> id) and inverse (id -> label) lookups.
label2id = dict(zip(temp_labels, range(len(temp_labels))))
id2label = dict(enumerate(temp_labels))
label2id

{'O': 0,
 'Generic she': 1,
 'Generic he': 2,
 'Behavioural Stereotypes': 3,
 'i-Behavioural Stereotypes': 4,
 'Societal Stereotypes': 5,
 'i-Societal Stereotypes': 6,
 'Explicit Marking of Sex': 7,
 'i-Explicit Marking of Sex': 8}

In [16]:
# Show the inverse mapping (integer class id -> label string).
print(id2label)

{0: 'O', 1: 'Generic she', 2: 'Generic he', 3: 'Behavioural Stereotypes', 4: 'i-Behavioural Stereotypes', 5: 'Societal Stereotypes', 6: 'i-Societal Stereotypes', 7: 'Explicit Marking of Sex', 8: 'i-Explicit Marking of Sex'}


In [20]:
class dataset(Dataset):
    """Token-classification dataset built from sentences and per-word labels.

    Each item is a dict of three ``torch.long`` tensors of length ``max_len``:

    * ``ids``     -- token ids from ``tokenizer.convert_tokens_to_ids``
    * ``mask``    -- 1 for every token that is not ``[PAD]``, 0 for padding
    * ``targets`` -- label ids looked up in the notebook-level ``label2id``

    Relies on the notebook-level ``tokenize_and_preserve_labels`` helper and
    the ``label2id`` mapping being defined before items are requested.

    Args:
        dataframe: frame exposing ``sentence`` and ``word_labels`` columns.
            Rows are looked up with ``column[index]``, so the index must match
            the integers the sampler produces (callers reset the index).
        tokenizer: object providing ``tokenize`` and ``convert_tokens_to_ids``.
        max_len: length every example is truncated or padded to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        """Return the ``ids`` / ``mask`` / ``targets`` tensors for row ``index``."""
        # step 1: tokenize (and adapt corresponding labels)
        sentence = self.data.sentence[index]
        word_labels = self.data.word_labels[index]
        tokenized_sentence, labels = tokenize_and_preserve_labels(sentence, word_labels, self.tokenizer)

        # step 2: add special tokens and give each of them the outside label.
        # The [SEP] label must be *appended*: `labels.insert(-1, "O")` would
        # insert before the last element, shifting the final word's label onto
        # [SEP] and labelling the final real token "O".
        tokenized_sentence = ["[CLS]"] + tokenized_sentence + ["[SEP]"]
        labels = ["O"] + labels + ["O"]

        # step 3: truncating/padding to exactly `max_len` positions
        maxlen = self.max_len

        if len(tokenized_sentence) > maxlen:
            # truncate from the right.
            # NOTE: an over-long sentence therefore loses its trailing [SEP].
            tokenized_sentence = tokenized_sentence[:maxlen]
            labels = labels[:maxlen]
        else:
            # pad tokens with [PAD] and labels with the outside label
            tokenized_sentence = tokenized_sentence + ['[PAD]'] * (maxlen - len(tokenized_sentence))
            labels = labels + ["O"] * (maxlen - len(labels))

        # step 4: obtain the attention mask (0 only on padding positions)
        attn_mask = [1 if tok != '[PAD]' else 0 for tok in tokenized_sentence]

        # step 5: convert tokens and labels to integer ids
        ids = self.tokenizer.convert_tokens_to_ids(tokenized_sentence)
        label_ids = [label2id[label] for label in labels]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(attn_mask, dtype=torch.long),
            'targets': torch.tensor(label_ids, dtype=torch.long),
        }

    def __len__(self):
        """Number of rows in the wrapped dataframe (captured at construction)."""
        return self.len

In [8]:
# Hyper-parameters and tokenizer shared by the rest of the notebook.
MAX_LEN = 128  # sequence length each example is truncated/padded to by `dataset`
TRAIN_BATCH_SIZE = 4  # batch size of the training DataLoader
VALID_BATCH_SIZE = 2  # batch size of the testing DataLoader
EPOCHS = 5  # presumably the number of passes over the training set -- not used in the cells shown here
LEARNING_RATE = 1e-05  # presumably the optimizer learning rate -- the optimizer is created elsewhere
MAX_GRAD_NORM = 10  # max_norm passed to clip_grad_norm_ inside train()

# Uncased BERT vocabulary; from_pretrained may download files on first use.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [9]:
# Sanity check: tokenize one sentence and map the resulting tokens to ids.
# Pieces prefixed with '##' in the printed output continue the previous token.
t_t = tokenizer.tokenize("You're self-motivated and decisive, but willing to make changes with minimal grumbling when the client demands it.")
t_id = tokenizer.convert_tokens_to_ids(t_t)
print(t_t)
print(t_id)

['you', "'", 're', 'self', '-', 'motivated', 'and', 'decisive', ',', 'but', 'willing', 'to', 'make', 'changes', 'with', 'minimal', 'gr', '##umb', '##ling', 'when', 'the', 'client', 'demands', 'it', '.']
[2017, 1005, 2128, 2969, 1011, 12774, 1998, 13079, 1010, 2021, 5627, 2000, 2191, 3431, 2007, 10124, 24665, 25438, 2989, 2043, 1996, 7396, 7670, 2009, 1012]


In [10]:
def tokenize_and_preserve_labels(sentence, text_labels, tokenizer):
    """
    Tokenize a sentence word by word and repeat each word's label per piece.

    The sentence is stripped and split on whitespace; ``text_labels`` is split
    on commas. Words and labels are paired positionally, and pairing stops at
    the shorter of the two sequences, so surplus entries (for example the
    empty string produced by a trailing comma) are ignored. Every sub-token
    the tokenizer produces for a word receives that word's label.

    Args:
        sentence: whitespace-separated text.
        text_labels: comma-separated labels, one per word.
        tokenizer: object exposing ``tokenize(word)`` returning the word's
            sub-tokens.

    Returns:
        A ``(tokens, labels)`` pair of equal-length lists.
    """
    words = sentence.strip().split()
    word_labels = text_labels.split(",")

    all_pieces = []
    all_labels = []

    for current_word, current_label in zip(words, word_labels):
        # One word may expand into several pieces (or none at all).
        pieces = tokenizer.tokenize(current_word)
        all_pieces.extend(pieces)
        # Propagate the word-level label to each of its pieces.
        all_labels.extend(current_label for _ in range(len(pieces)))

    return all_pieces, all_labels

In [13]:
# Inspect one row: the raw sentence and its comma-separated per-word labels
# (note the trailing comma in the printed label string).
print(data.sentence[20])
print(data.word_labels[20])

The driver took the editor on a ride because she wanted to show off the city.
O,O,O,O,O,O,O,O,O,Generic she,O,O,O,O,O,O,


In [15]:
# Run the label-preserving tokenizer on row 20 and print the aligned
# token / label lists; the two printed lists have the same length.
t_sentence, t_lables = tokenize_and_preserve_labels(data.sentence[20], data.word_labels[20], tokenizer)
print(t_sentence)
print(t_lables)

['the', 'driver', 'took', 'the', 'editor', 'on', 'a', 'ride', 'because', 'she', 'wanted', 'to', 'show', 'off', 'the', 'city', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'Generic she', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [21]:
# Fraction of rows sampled into the training split; the rest become the test split.
train_size = 0.8
# Fixed random_state makes the sampled split reproducible.
train_dataset = data.sample(frac=train_size,random_state=200)
# The test rows are selected via the sampled rows' ORIGINAL index labels, so
# this must run before train_dataset's index is reset on the next line.
test_dataset = data.drop(train_dataset.index).reset_index(drop=True)
# Reset to a 0..n-1 index because `dataset.__getitem__` looks rows up by label.
train_dataset = train_dataset.reset_index(drop=True)

print("FULL Dataset: {}".format(data.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

# Wrap both splits so each item is tokenized and padded/truncated to MAX_LEN.
training_set = dataset(train_dataset, tokenizer, MAX_LEN)
testing_set = dataset(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (4036, 2)
TRAIN Dataset: (3229, 2)
TEST Dataset: (807, 2)


In [22]:
# Keyword arguments for the two loaders; they differ only in batch size.
# Both shuffle, and num_workers=0 loads batches in the main process.
train_params = dict(
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
    num_workers=0,
)
test_params = dict(
    batch_size=VALID_BATCH_SIZE,
    shuffle=True,
    num_workers=0,
)

# Batch iterators over the training and testing datasets.
training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [32]:
# Peek at one batch to see which keys the loader yields. `DataLoader` has no
# `key` attribute (the earlier `training_loader.key` raised AttributeError);
# the keys live on the dict that each batch is collated into.
next(iter(training_loader)).keys()

In [23]:
def train(epoch):
    """Run one training epoch over ``training_loader`` and print loss/accuracy.

    Relies on notebook-level globals: ``model``, ``optimizer``, ``device``,
    ``training_loader`` and ``MAX_GRAD_NORM``.

    For every batch the model is called with ``input_ids``, ``attention_mask``
    and ``labels``; the first output is used as the loss and the second as the
    logits. Accuracy is computed only on positions whose attention mask is 1
    (this includes the [CLS] and [SEP] positions) and averaged over batches.

    Args:
        epoch: epoch index supplied by the caller; not used inside the function.

    Returns:
        None. The running loss is printed every 100 steps, and the mean loss
        and mean per-batch accuracy are printed at the end of the epoch.
    """
    tr_loss, tr_accuracy = 0, 0
    nb_tr_steps = 0
    # put model in training mode
    model.train()

    for idx, batch in enumerate(training_loader):

        ids = batch['ids'].to(device, dtype=torch.long)
        mask = batch['mask'].to(device, dtype=torch.long)
        targets = batch['targets'].to(device, dtype=torch.long)

        outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
        loss, tr_logits = outputs[0], outputs[1]
        tr_loss += loss.item()

        nb_tr_steps += 1

        if idx % 100 == 0:
            loss_step = tr_loss / nb_tr_steps
            print(f"Training loss per 100 training steps: {loss_step}")

        # compute training accuracy
        flattened_targets = targets.view(-1)  # shape (batch_size * seq_len,)
        active_logits = tr_logits.view(-1, model.num_labels)  # shape (batch_size * seq_len, num_labels)
        flattened_predictions = torch.argmax(active_logits, dim=1)  # shape (batch_size * seq_len,)

        # use the mask to compare predictions with targets only on non-padding
        # positions (includes [CLS] and [SEP] token predictions)
        active_accuracy = mask.view(-1) == 1  # shape (batch_size * seq_len,)
        active_targets = torch.masked_select(flattened_targets, active_accuracy)
        active_predictions = torch.masked_select(flattened_predictions, active_accuracy)

        tr_accuracy += accuracy_score(
            active_targets.cpu().numpy(), active_predictions.cpu().numpy()
        )

        # backward pass
        optimizer.zero_grad()
        loss.backward()

        # gradient clipping -- must run AFTER backward() so it acts on this
        # step's gradients, and BEFORE step() so the clipped values are applied.
        # Clipping ahead of zero_grad()/backward() only touched stale gradients.
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=MAX_GRAD_NORM
        )

        optimizer.step()

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")