# BERT Model Evaluation

This notebook evaluates fine-tuned BERT models on individual token classes (words) to check for class-specific biases, and generates plots that illustrate the results.

In [None]:

from transformers import BertForMaskedLM, BertTokenizer, DataCollatorForLanguageModeling

# Tokenizer for the "bert-base-uncased" checkpoint; the evaluator below also uses it to decode label ids.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# Collator built with mlm_probability=0.15 (presumably the fraction of tokens masked per example — confirm against the transformers docs).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

Load the pre-processed dataset created in the fine-tuning notebook

In [None]:
from datasets import DatasetDict

# load_from_disk is called on the class directly, so there is no need to
# create an empty DatasetDict first only to throw it away.
lm_datasets = DatasetDict.load_from_disk("processed_dataset")
lm_datasets

In [None]:
# Display the first validation example as a quick sanity check of the loaded data.
lm_datasets['validation'][0]

A custom class based on the HuggingFace `Trainer` is used to perform class-specific evaluations for all frequently occurring words. The evaluation results for each of the fine-tuned models are saved as dictionaries.

In [None]:
from torch.utils.data import DataLoader
import torch
import numpy as np
from torch import nn
from transformers import Trainer
from collections import defaultdict

class ClassEvaluator(Trainer):
    """Trainer subclass that reports evaluation loss per masked-token class.

    Relies on the notebook-level ``tokenizer`` to turn label ids into the
    strings used as dictionary keys.
    """

    def eval_by_class(self, num_occurrences=10):
        """Compute the mean evaluation loss for every frequently masked token.

        The loss recorded for a token is the loss of the whole batch it was
        masked in (not a per-token loss), so with a batch size of 1 it is the
        loss of the sequence containing that token.

        Args:
            num_occurrences: Minimum number of times a token must appear as a
                masked label in the evaluation set to be included.

        Returns:
            A ``defaultdict(float)`` mapping each decoded token string to the
            mean of the losses recorded for it.
        """
        model = self._wrap_model(self.model, training=False)
        model.eval()
        loss_by_class = defaultdict(list)
        mean_loss_by_class = defaultdict(float)
        eval_dataloader = self.get_eval_dataloader()
        prediction_loss_only = False

        for inputs in eval_dataloader:
            loss, _, labels = self.prediction_step(model, inputs, prediction_loss_only, ignore_keys=[])
            # Filter out ignored positions (-100) while the labels are still
            # ids. Comparing against -100 after decoding can never match,
            # because decoding turns every label into a string.
            # Indexing with the mask also flattens the batch dimension, so
            # every row of the batch contributes, not just the first one.
            masked_ids = labels[labels != -100]
            batch_loss = float(loss.cpu().numpy())
            # NOTE: each id is decoded on its own, exactly as before, because
            # the plotting cells look results up by the keys this produces
            # (e.g. "u n i t e d").
            for token in tokenizer.batch_decode(masked_ids):
                loss_by_class[token].append(batch_loss)

        # Keep only classes with enough samples for a meaningful average.
        for token, token_losses in loss_by_class.items():
            if len(token_losses) >= num_occurrences:
                mean_loss_by_class[token] = np.mean(token_losses)

        return mean_loss_by_class

In [None]:
model_names = ["weight_decay_0.1", "weight_decay_0.001", "weight_decay_0.01", "weight_decay_0", "dropout_0", "dropout_0.2", "dropout_0.4"]
from transformers import TrainingArguments

# The evaluation settings are the same for every model, so build them once
# instead of on every loop iteration.
eval_args = TrainingArguments(
    per_device_eval_batch_size = 1,
    output_dir = "eval_tmp"
)

for model_name in model_names:

    # Each name is passed to from_pretrained as the location of a fine-tuned model.
    model = BertForMaskedLM.from_pretrained(model_name)

    class_evaluator = ClassEvaluator(
        model = model,
        args = eval_args,
        train_dataset = lm_datasets["train"],
        eval_dataset = lm_datasets["validation"],
        data_collator = data_collator
    )

    # Mean loss per token class, restricted to classes seen at least 25 times.
    class_losses = class_evaluator.eval_by_class(25)

    print(model_name)
    print(class_losses)
    # np.save pickles the dictionary; it is read back later with allow_pickle=True.
    np.save(model_name + "_dict.npy", class_losses)

Words/classes that occur at least 25 times in the validation set and show a large difference in perplexity across weight decay and dropout settings are graphed. These images are used in the final paper.

In [None]:
# Per-class mean losses for the two weight-decay settings being compared.
dict_1 = np.load("weight_decay_0_dict.npy", allow_pickle=True).item()
dict_2 = np.load("weight_decay_0.1_dict.npy", allow_pickle=True).item()

# Only compare classes present in both results. The saved objects are
# defaultdict(float), so indexing a missing key would silently yield 0.0 and
# produce a bogus difference instead of an error.
loss_diffs = sorted(
    (dict_2[key] - dict_1[key], key)
    for key in dict_1
    if key in dict_2
)
print(loss_diffs)

In [None]:
# Per-class mean losses for the two dropout settings being compared.
dict_1 = np.load("dropout_0_dict.npy", allow_pickle=True).item()
dict_2 = np.load("dropout_0.2_dict.npy", allow_pickle=True).item()

# Only compare classes present in both results. The saved objects are
# defaultdict(float), so indexing a missing key would silently yield 0.0 and
# produce a bogus difference instead of an error.
loss_diffs = sorted(
    (dict_2[key] - dict_1[key], key)
    for key in dict_1
    if key in dict_2
)
print(loss_diffs)

In [None]:
# Overall validation perplexity of each fine-tuned model, keyed by model name.
losses = {}

# The evaluation settings are the same for every model, so build them once
# instead of on every loop iteration.
eval_args = TrainingArguments(
    per_device_eval_batch_size = 1,
    output_dir = "eval_tmp"
)

for model_name in model_names:

    model = BertForMaskedLM.from_pretrained(model_name)

    class_evaluator = ClassEvaluator(
        model = model,
        args = eval_args,
        train_dataset = lm_datasets["train"],
        eval_dataset = lm_datasets["validation"],
        data_collator = data_collator
    )
    # Standard Trainer evaluation over the whole validation set.
    eval_results = class_evaluator.evaluate()
    print(model_name)
    print("loss " + str(eval_results))
    # Perplexity is the exponential of the evaluation loss.
    losses[model_name] = np.exp(eval_results["eval_loss"])

np.save("model_perplexities.npy", losses)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

decays = [0, 0.001, 0.01, 0.1]

# Result dictionaries are keyed by the space-separated form of each word.
word1 = "u n i t e d"
word1_title = '"united"'
word2 = "a l o n g"
word2_title = '"along"'

# Per-class results for each weight-decay run, in the same order as `decays`.
results_by_decay = [
    np.load("weight_decay_" + str(decay) + "_dict.npy", allow_pickle=True).item()
    for decay in decays
]
# Perplexity is the exponential of the stored mean loss.
word1_perps = [np.exp(results[word1]) for results in results_by_decay]
word2_perps = [np.exp(results[word2]) for results in results_by_decay]

# Overall perplexities were already exponentiated when they were saved.
total_losses = np.load("model_perplexities.npy", allow_pickle=True).item()
perps = [total_losses["weight_decay_" + str(decay)] for decay in decays]

plt.clf()
# A symlog axis lets the 0 setting share an axis with the log-spaced values.
plt.xscale("symlog", linthresh=0.0015)

plt.xticks(decays)
plt.xlabel("Weight Decay")
plt.ylabel("Perplexity")
for series, series_label in (
    (word1_perps, word1_title),
    (word2_perps, word2_title),
    (perps, "Base Model"),
):
    plt.plot(decays, series, marker='o', label = series_label)

plt.legend()
plt.savefig('class_perp_2_notitle.png')

In [None]:
import matplotlib.pyplot as plt

dropouts = [0, 0.1, 0.2]

# Result dictionaries are keyed by the space-separated form of each word.
word1 = "s c i e n t o l o g y"
word1_title = '"scientology"'
word2 = "s o m e"
word2_title = '"some"'

# No "dropout_0.1" model is evaluated in this notebook. The original cell
# already treated the "weight_decay_0.01" run as the dropout-0.1 data point
# for the overall perplexity, so the same alias is applied to the per-class
# results rather than loading a "dropout_0.1_dict.npy" file that is never
# written. NOTE(review): confirm "weight_decay_0.01" was trained with dropout 0.1.
result_aliases = {"dropout_0.1": "weight_decay_0.01"}


def _result_name(dropout):
    """Return the saved-result name that holds the data for a dropout value."""
    name = "dropout_" + str(dropout)
    return result_aliases.get(name, name)


# Per-class results for each dropout setting, in the same order as `dropouts`.
results_by_dropout = [
    np.load(_result_name(dropout) + "_dict.npy", allow_pickle=True).item()
    for dropout in dropouts
]
# Perplexity is the exponential of the stored mean loss.
word1_perps = [np.exp(results[word1]) for results in results_by_dropout]
word2_perps = [np.exp(results[word2]) for results in results_by_dropout]

# Overall perplexities were already exponentiated when they were saved.
total_losses = np.load("model_perplexities.npy", allow_pickle=True).item()
perps = [total_losses[_result_name(dropout)] for dropout in dropouts]

plt.clf()
plt.xticks(dropouts)
plt.xlabel("Dropout")
plt.ylabel("Perplexity")
plt.plot(dropouts, word1_perps, marker='o', label = word1_title)
plt.plot(dropouts, word2_perps, marker='o', label = word2_title)
plt.plot(dropouts, perps, marker='o', label = "Base Model")
plt.legend()
plt.savefig('class_perp_1_notitles.png')