In [35]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, random_split
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import copy
import transformers
import numpy as np
import random
from transformers import AdamW, BertTokenizer, get_linear_schedule_with_warmup
from datasets import load_dataset
from model import SentimentClassifierWithMultipleHeads

In [36]:
dataset = load_dataset("sst2")

In [37]:
df_train = dataset['train'].to_pandas()
df_val = dataset['validation'].to_pandas()
df_test = dataset['test'].to_pandas()

In [53]:
train_messages = df_train['sentence'].to_list()
train_labels = df_train['label'].to_list()
val_messages = df_val['sentence'].to_list()
val_labels = df_val['label'].to_list()
test_messages = df_test['sentence'].to_list()
test_labels = df_test['label'].to_list()

In [39]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [40]:
print(' Original: ', train_messages[0])

# Print the sentence split into tokens.
print('Tokenized: ', tokenizer.tokenize(train_messages[0]))

# Print the sentence mapped to token ids.
print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(train_messages[0])))

 Original:  hide new secretions from the parental units 
Tokenized:  ['hide', 'new', 'secret', '##ions', 'from', 'the', 'parental', 'units']
Token IDs:  [5342, 2047, 3595, 8496, 2013, 1996, 18643, 3197]


In [41]:
max_len = 0
all_messages = copy.deepcopy(train_messages)
for sent in val_messages:
    all_messages.append(sent)
for sent in test_messages:
    all_messages.append(sent)
# For every sentence...
for sent in all_messages:

    # Tokenize the text and add `[CLS]` and `[SEP]` tokens.
    input_ids = tokenizer.encode(sent, add_special_tokens=True)

    # Update the maximum sentence length.
    max_len = max(max_len, len(input_ids))

print('Max sentence length: ', max_len)

Max sentence length:  66


In [42]:
def tokenize_sentences(sentences, tokenizer, max_len):
    input_ids = []
    attention_masks = []

    for sent in sentences:
        encoded_dict = tokenizer.encode_plus(
                            sent,                      # Sentence to encode.
                            add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                            max_length = max_len,           # Pad & truncate all sentences.
                            pad_to_max_length = True,
                            return_attention_mask = True,   # Construct attn. masks.
                            return_tensors = 'pt',     # Return pytorch tensors.
                       )

        input_ids.append(encoded_dict['input_ids'])

        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

In [43]:
train_inputs, train_masks = tokenize_sentences(train_messages, tokenizer, max_len)
val_inputs, val_masks = tokenize_sentences(val_messages, tokenizer, max_len)
test_inputs, test_masks = tokenize_sentences(test_messages, tokenizer, max_len)

train_labels = torch.tensor(train_labels)
val_labels = torch.tensor(val_labels)
test_labels = torch.tensor(test_labels)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [44]:
# print all shapes
print("Train Messages: ", len(train_messages))
print("Train Inputs: ", train_inputs.shape)
print("Train Masks: ", train_masks.shape)
print("Train Labels: ", train_labels.shape)
print("Validation Messages: ", len(val_messages))
print("Validation Inputs: ", val_inputs.shape)
print("Validation Masks: ", val_masks.shape)
print("Validation Labels: ", val_labels.shape)
print("Test Messages: ", len(test_messages))
print("Test Inputs: ", test_inputs.shape)
print("Test Masks: ", test_masks.shape)
print("Test Labels: ", test_labels.shape)

Train Messages:  67349
Train Inputs:  torch.Size([67349, 66])
Train Masks:  torch.Size([67349, 66])
Train Labels:  torch.Size([67349])
Validation Messages:  872
Validation Inputs:  torch.Size([872, 66])
Validation Masks:  torch.Size([872, 66])
Validation Labels:  torch.Size([872])
Test Messages:  1821
Test Inputs:  torch.Size([1821, 66])
Test Masks:  torch.Size([1821, 66])
Test Labels:  torch.Size([1821])


In [45]:
train_dataset = TensorDataset(train_inputs, train_masks, train_labels)
val_dataset = TensorDataset(val_inputs, val_masks, val_labels)
test_dataset = TensorDataset(test_inputs, test_masks, test_labels)

In [47]:
batch_size = 32

train_dataloader = DataLoader(
            train_dataset,  # The training samples.
            sampler = RandomSampler(train_dataset), # Select batches randomly
            batch_size = batch_size # Trains with this batch size.
        )

validation_dataloader = DataLoader(
            val_dataset, # The validation samples.
            sampler = SequentialSampler(val_dataset), # Pull out batches sequentially.
            batch_size = batch_size # Evaluate with this batch size.
        )

test_dataloader = DataLoader(
            test_dataset, # The validation samples.
            sampler = SequentialSampler(test_dataset), # Pull out batches sequentially.
            batch_size = batch_size # Evaluate with this batch size.
        )

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [50]:
model = SentimentClassifierWithMultipleHeads('bert-base-uncased', 2)
model = model.to(device)

In [51]:
optimizer = AdamW(model.parameters())



In [None]:
epochs = 4

total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [55]:
def compute_aggregate_and_classwise_metrics(predicted_labels, true_labels):
    classification_metrics = classification_report(true_labels, predicted_labels, output_dict=True)
    return classification_metrics

In [None]:
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
training_stats = []

for epoch_i in range(0, epochs):
    total_train_loss = 0
    model.train()
    for step, batch in enumerate(train_dataloader):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        optimizer.zero_grad()
        outputs = model(b_input_ids, b_input_mask)
        loss = outputs[0]
        total_train_loss += loss.item()
        loss.backward()
        # torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
    avg_train_loss = total_train_loss / len(train_dataloader)
    print("Average train loss: {}".format(avg_train_loss))

    model.eval()
    eval_preds, eval_labels = [], []
    total_eval_loss = 0
    nb_eval_steps = 0
    for batch in validation_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        with torch.no_grad():
            outputs = model(b_input_ids, b_input_mask)
        loss = outputs[0]
        logits = outputs[1]
        total_eval_loss += loss.item()
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        predictions = np.argmax(logits, axis=1).flatten()
        eval_preds.extend(predictions)
        eval_labels.extend(label_ids)
    val_metrics = compute_aggregate_and_classwise_metrics(eval_preds, eval_labels)
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    print("Validation Loss: {}".format(avg_val_loss))

    model.eval()
    test_preds, test_labels = [], []
    total_test_loss = 0
    nb_test_steps = 0
    for batch in test_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        with torch.no_grad():
            outputs = model(b_input_ids, b_input_mask)
        loss = outputs[0]
        logits = outputs[1]
        total_test_loss += loss.item()
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        predictions = np.argmax(logits, axis=1).flatten()
        test_preds.extend(predictions)
        test_labels.extend(label_ids)
    test_metrics = compute_aggregate_and_classwise_metrics(test_preds, test_labels)
    avg_test_loss = total_test_loss / len(test_dataloader)
    print("Test Loss: {}".format(avg_test_loss))
    training_stats.append({'epoch': epoch_i + 1, 'Training Loss': avg_train_loss, 'Test Loss': avg_test_loss, 'Test Metrics': test_metrics})

print("Training complete!")
    

In [None]:
# Test set accuracy
model.eval()
total_test_accuracy = 0
total_test_loss = 0
nb_test_steps = 0

for batch in test_dataloader:
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_labels = batch[2].to(device)
    with torch.no_grad():
        outputs = model(b_input_ids, b_input_mask)
    loss = outputs[0]
    logits = outputs[1]
    total_test_loss += loss.item()
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    total_test_accuracy += flat_accuracy(logits, label_ids)

avg_test_accuracy = total_test_accuracy / len(test_dataloader)
print("Accuracy: {}".format(avg_test_accuracy))
avg_test_loss = total_test_loss / len(test_dataloader)
print("Test Loss: {}".format(avg_test_loss))


In [None]:
# print training stats
print("Training stats:")
for stat in training_stats:
    print(stat)