# Training and Evaluating Focal Loss from Huang et al. 2021

### Huang et al. data preprocessing

In [3]:
# Load the Reuters-21578 ApteMod dataset and build training, validation, and test splits in the same format as Huang et al.

import os
import xml.etree.ElementTree as ET
from collections import Counter
from sklearn.model_selection import train_test_split
import pickle

def read_labels(labels_path):
    """Parse a labels file into a dict mapping doc_id to its list of labels.

    Each useful line has the form ``<doc_id> <label> [<label> ...]``. Any
    ``test/`` or ``training/`` text in the doc id is removed, so the keys can
    be compared with the bare file names found in the split directories.

    Args:
        labels_path: Path to the labels file (read as UTF-8).

    Returns:
        dict: ``{doc_id: [label, ...]}``. Blank lines and lines that carry a
        doc id but no label are skipped instead of raising ``ValueError``.
    """
    doc_to_labels = {}
    with open(labels_path, 'r', encoding='utf-8') as f:
        for line in f:
            # split() without a separator tolerates tabs, repeated spaces and
            # trailing whitespace, so no empty-string labels are produced.
            fields = line.split()
            if len(fields) < 2:
                # Blank line, or a doc id without any label: nothing to record.
                continue
            doc_id = fields[0]
            doc_id = doc_id.replace('test/', '')
            doc_id = doc_id.replace('training/', '')
            doc_to_labels[doc_id] = fields[1:]
    return doc_to_labels

def read_document(file_path):
    """Load one document and return it as whitespace-normalized text.

    The file is decoded as latin-1. Every run of whitespace (newlines
    included) is collapsed to a single space, and leading/trailing
    whitespace is dropped.

    Args:
        file_path: Path of the document to read.

    Returns:
        str: The cleaned document text on a single line.
    """
    with open(file_path, 'r', encoding='latin-1') as handle:
        raw_text = handle.read()
    # str.split() with no separator already breaks on newlines, so splitting
    # and re-joining performs the whole clean-up in one pass.
    words = raw_text.split()
    return ' '.join(words)

# Read in document ids and associated labels

labels_path = os.path.join('reuters-aptemod', 'cats.txt')
labels = read_labels(labels_path)

print(f"Number of documents with labels: {len(labels)}")

# Read in document texts

def _load_labeled_documents(split_dir, doc_to_labels):
    """Read every file in `split_dir` that has an entry in `doc_to_labels`.

    Args:
        split_dir: Directory holding one file per document.
        doc_to_labels: Dict mapping a file name (doc id) to its list of labels.

    Returns:
        list[dict]: One ``{'text': ..., 'labels': ...}`` dict per labelled
        file, in the order returned by ``os.listdir``. Files without a label
        entry are ignored.
    """
    # NOTE(review): os.listdir returns entries in arbitrary order, so the
    # document order (and therefore the seeded train/validation split made
    # from it) may differ between machines. Sorting the listing would make it
    # deterministic, but the split seed would then have to be re-validated.
    return [
        {
            'text': read_document(os.path.join(split_dir, file_name)),
            'labels': doc_to_labels[file_name]
        }
        for file_name in os.listdir(split_dir)
        if file_name in doc_to_labels
    ]

training_path = os.path.join('reuters-aptemod', 'training')
data_train_all = _load_labeled_documents(training_path, labels)

test_path = os.path.join('reuters-aptemod', 'test')
data_test = _load_labeled_documents(test_path, labels)

# Split validation data from training data.
#
# The seed differs from Huang et al.'s: with their seed of 123, the split of
# our 'data_train_all' left the training set without one label,
# 'groundnut-oil'. The discrepancy arises because 'data_train_all' holds its
# documents in a different order than Huang et al. originally had. The exact
# order they used before splitting off validation data could not be
# determined, but this should not be a big issue so long as the training set
# still contains all 90 labels. Results for the various loss functions should
# not vary greatly from Huang et al.'s, since this is just a slightly
# different variation of their original split.
data_train, data_validation = train_test_split(
    data_train_all,
    test_size=1000,
    random_state=100,
)

print(f"Number of training documents {len(data_train)}")
print(f"Number of validation documents {len(data_validation)}")
print(f"Number of testing documents {len(data_test)}")

Number of documents with labels: 10788
Number of training documents 6769
Number of validation documents 1000
Number of testing documents 3019


In [4]:
# Sanity check: the entire dataset should contain exactly 90 unique labels

unique_labels = {
    label
    for doc_labels in labels.values()
    for label in doc_labels
}
print(f"Number of unique labels in cats.txt: {len(unique_labels)}")
print(f"Labels are: {sorted(unique_labels)}")

Number of unique labels in cats.txt: 90
Labels are: ['acq', 'alum', 'barley', 'bop', 'carcass', 'castor-oil', 'cocoa', 'coconut', 'coconut-oil', 'coffee', 'copper', 'copra-cake', 'corn', 'cotton', 'cotton-oil', 'cpi', 'cpu', 'crude', 'dfl', 'dlr', 'dmk', 'earn', 'fuel', 'gas', 'gnp', 'gold', 'grain', 'groundnut', 'groundnut-oil', 'heat', 'hog', 'housing', 'income', 'instal-debt', 'interest', 'ipi', 'iron-steel', 'jet', 'jobs', 'l-cattle', 'lead', 'lei', 'lin-oil', 'livestock', 'lumber', 'meal-feed', 'money-fx', 'money-supply', 'naphtha', 'nat-gas', 'nickel', 'nkr', 'nzdlr', 'oat', 'oilseed', 'orange', 'palladium', 'palm-oil', 'palmkernel', 'pet-chem', 'platinum', 'potato', 'propane', 'rand', 'rape-oil', 'rapeseed', 'reserves', 'retail', 'rice', 'rubber', 'rye', 'ship', 'silver', 'sorghum', 'soy-meal', 'soy-oil', 'soybean', 'strategic-metal', 'sugar', 'sun-meal', 'sun-oil', 'sunseed', 'tea', 'tin', 'trade', 'veg-oil', 'wheat', 'wpi', 'yen', 'zinc']


In [5]:
# Count how often each label occurs in the training split
term2count = Counter(
    label
    for doc in data_train
    for label in doc['labels']
)

# Labels seen fewer than FREQ_CUTOFF times are excluded; 0 keeps every label
FREQ_CUTOFF = 0
term_freq = sorted(
    term for term, count in term2count.items() if count >= FREQ_CUTOFF
)

# Alphabetically sorted reference list of the training labels that pass the cutoff
labels_ref = sorted(
    label
    for label in {lab for doc in data_train for lab in doc['labels']}
    if label in term_freq
)
print(len(term2count), len(labels_ref))

# Per-label training frequency, in labels_ref order, plus the training set size
class_freq = [term2count[label] for label in labels_ref]
train_num = len(data_train)

90 90


### Huang et al. Configuration

In [6]:
from util_loss import ResampleLoss

In [7]:
import sys
import os
import torch
import json
import numpy as np
from torch import nn
from transformers import BertForSequenceClassification, BertTokenizer, AdamW
from tqdm import trange
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, roc_auc_score
from util_loss import ResampleLoss
from torch.utils.data import Dataset, DataLoader

In [8]:
# Initializing model and tokenizer
# One output per entry of labels_ref, set up for multi-label classification
num_labels = len(labels_ref)
model = BertForSequenceClassification.from_pretrained(
    'bert-base-cased',
    problem_type="multi_label_classification",
    num_labels=num_labels,
)

# Tokenizer loaded from the same 'bert-base-cased' checkpoint as the model
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Use CUDA when PyTorch reports it as available, otherwise the CPU, and move the model there
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [10]:
# Defining optimizer
# Unlike our own original experiments, parameters are grouped so that weight
# decay is applied only where wanted: parameters whose name contains one of
# the `no_decay` substrings get no weight decay, every other parameter gets 0.01.
# NOTE(review): AdamW here is the class imported from `transformers`, which
# newer releases deprecate in favour of torch.optim.AdamW — confirm against
# the installed version before upgrading.

no_decay = ['bias', 'LayerNorm.weight']

decay_params, no_decay_params = [], []
for param_name, param in model.named_parameters():
    if any(nd in param_name for nd in no_decay):
        no_decay_params.append(param)
    else:
        decay_params.append(param)

optimizer_grouped_parameters = [
    {'params': decay_params, 'weight_decay': 0.01},
    {'params': no_decay_params, 'weight_decay': 0.0},
]

# The learning rate also differs from our own earlier experiments.
optimizer = AdamW(optimizer_grouped_parameters, lr=1e-4)



In [11]:
# Setting up Focal-Loss Function according to Huang et al. methodology:
# focal term enabled (alpha=0.5, gamma=2), no re-weighting function, and an
# empty logit-regularization config. Class frequencies and the training set
# size come from the training split statistics computed earlier.
loss_func = ResampleLoss(
    class_freq=class_freq,
    train_num=train_num,
    reweight_func=None,
    loss_weight=1.0,
    focal={'focal': True, 'alpha': 0.5, 'gamma': 2},
    logit_reg={},
)

In [12]:
def preprocess_function(docu):
    """Turn one document dict into the tensors fed to the model.

    Args:
        docu: Dict with a 'text' string and a 'labels' collection of label names.

    Returns:
        dict: 'input_ids' and 'attention_mask' as flattened tensors from the
        tokenizer (text truncated/padded to 512 tokens), and 'labels' as a
        float multi-hot tensor with one entry per label in `labels_ref`.
    """
    doc_labels = docu['labels']
    # Multi-hot target: position i is 1 when labels_ref[i] is attached to the document
    multi_hot = [int(name in doc_labels) for name in labels_ref]

    encoded = tokenizer(
        docu['text'],
        max_length=512,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )

    # return_tensors='pt' adds a leading batch dimension; drop it by flattening
    input_ids = encoded['input_ids'].reshape(-1)
    attention_mask = encoded['attention_mask'].reshape(-1)
    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': torch.tensor(multi_hot, dtype=torch.float),
    }

class CustomDataset(Dataset):
    """Dataset over a list of document dicts, preprocessed on access."""

    def __init__(self, documents):
        """Store the documents; nothing is tokenized at construction time."""
        self.documents = documents

    def __len__(self):
        """Return the number of stored documents."""
        document_count = len(self.documents)
        return document_count

    def __getitem__(self, index):
        """Return the preprocessed form of the document at position `index`."""
        document = self.documents[index]
        return preprocess_function(document)

# Create datasets
train_dataset, val_dataset = CustomDataset(data_train), CustomDataset(data_validation)

# Create data loaders: batches of 32; only the training loader shuffles
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=32)
validation_dataloader = DataLoader(val_dataset, shuffle=False, batch_size=32)


### Huang et al. Training Loop

In [13]:
# Create training loop as per Huang et al.
source_dir = './'

# Name components combined into the checkpoint and log file names
prefix, loss_func_name, suffix = 'reuters', 'FL', 'rand100'
model_name = 'bert-base-cased'

# Epoch count utilized by Huang et al.
epochs = 40
# Reference validation F1 that a new epoch is compared against
best_f1_for_epoch = 0
# Consecutive epochs that did not beat the reference; drives the early stop
epochs_without_improvement = 0

# Output directories for checkpoints and logs, created if they don't already exist
model_dir, log_dir = (os.path.join(source_dir, sub) for sub in ('models', 'logs'))
os.makedirs(log_dir, exist_ok=True)
os.makedirs(model_dir, exist_ok=True)

# Fine-tune for up to `epochs` epochs. Each epoch: one training pass, one
# validation pass, metric computation at threshold 0.5, a threshold sweep,
# conditional checkpointing, a JSON log of the metrics, and an early-stop check.
for epoch in trange(epochs, desc='Epoch'): # trange (tqdm) renders the epoch progress bar
    model.train()
    training_loss = 0
    training_steps = 0
    
    for batch in train_dataloader:
        batch = {key: value.to(device) for key, value in batch.items()} # Move every tensor in the batch to `device`
        b_input_ids = batch['input_ids']
        b_input_mask = batch['attention_mask']
        b_labels = batch['labels']
        optimizer.zero_grad() # Clear gradients from the prior batch so they do not accumulate
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask) # Forward pass; no labels are passed, so the loss is computed separately below
        logits = outputs[0]
        # Loss from `loss_func` (the ResampleLoss instance configured with focal=True
        # earlier in the notebook). Logits and labels are both reshaped to
        # (-1, num_labels), and labels are cast to the dtype of the logits.
        loss = loss_func(logits.view(-1,num_labels),b_labels.type_as(logits).view(-1,num_labels))
        loss.backward() # Compute gradients
        optimizer.step() # Update weights
        training_loss += loss.item() # Accumulate the per-batch training loss
        training_steps += 1 # Count batches, used to average the loss
        
    print("Train loss: {}".format(training_loss/training_steps))
    
    # Validation section
    model.eval()
    val_loss = 0
    val_steps = 0
    true_labels,pred_labels = [],[]
    
    for batch in validation_dataloader:
        batch = {key: value.to(device) for key, value in batch.items()}
        b_input_ids = batch['input_ids']
        b_input_mask = batch['attention_mask']
        b_labels = batch['labels']
        with torch.no_grad():
            outs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
            b_logit_pred = outs[0]
            pred_label = torch.sigmoid(b_logit_pred) # Sigmoid turns logits into per-label probabilities
            loss = loss_func(b_logit_pred.view(-1,num_labels),b_labels.type_as(b_logit_pred).view(-1,num_labels))
            val_loss += loss.item()
            val_steps += 1
            
            # Move results to CPU numpy arrays. The numpy copy of the logits is
            # not used again within this loop.
            b_logit_pred = b_logit_pred.detach().cpu().numpy()
            pred_label = pred_label.to('cpu').numpy()
            b_labels = b_labels.to('cpu').numpy()
            
        true_labels.append(b_labels)
        pred_labels.append(pred_label)
        
    print("Validation loss: {}".format(val_loss/val_steps))
    
    # Flatten the per-batch arrays into one list with one array per document.
    true_labels = [item for sublist in true_labels for item in sublist]
    pred_labels = [item for sublist in pred_labels for item in sublist]
    
    # Micro-averaged F1 / precision / recall on the validation set at the default threshold
    threshold = 0.5
    true_bools = [tl==1 for tl in true_labels] # Ground-truth labels as booleans
    pred_bools = [pl>threshold for pl in pred_labels] # Predicted labels: probability above the threshold
    val_f1_accuracy = f1_score(true_bools,pred_bools,average='micro')
    val_precision_accuracy = precision_score(true_bools, pred_bools,average='micro')
    val_recall_accuracy = recall_score(true_bools, pred_bools,average='micro')
    print('F1 Validation Accuracy: ', val_f1_accuracy)
    print('Precision Validation Accuracy: ', val_precision_accuracy)
    print('Recall Validation Accuracy: ', val_recall_accuracy)
    
    # Micro-averaged ROC AUC, computed from the sigmoid probabilities rather than
    # from thresholded predictions, so it does not depend on `threshold`.
    val_auc_score = roc_auc_score(true_bools, pred_labels, average='micro')
    print('AUC Validation: ', val_auc_score)
    
    # Threshold sweep: 21 thresholds from 0.40 to 0.60 in steps of 0.01 around
    # `best_med_th`. Micro F1 / precision / recall are recorded for each one,
    # and the threshold with the highest F1 is reported.
    best_med_th = 0.5
    micro_thresholds = (np.array(range(-10,11))/100)+best_med_th
    f1_results, prec_results, recall_results = [], [], []
    for th in micro_thresholds:
        pred_bools = [pl>th for pl in pred_labels]
        test_f1_accuracy = f1_score(true_bools,pred_bools,average='micro')
        test_precision_accuracy = precision_score(true_bools, pred_bools,average='micro')
        test_recall_accuracy = recall_score(true_bools, pred_bools,average='micro')
        f1_results.append(test_f1_accuracy)
        prec_results.append(test_precision_accuracy)
        recall_results.append(test_recall_accuracy)
    best_f1_idx = np.argmax(f1_results) # Index of the threshold with the highest F1
    
    # NOTE: despite the 'Test' wording, this F1 is measured on the validation set.
    print('Best Threshold: ', micro_thresholds[best_f1_idx])
    print('Test F1 Accuracy: ', f1_results[best_f1_idx])
    
    # Checkpoint when this epoch's best validation F1 exceeds 99.5% of the stored
    # reference. The reference is then overwritten with the new score, so it can
    # drop slightly between checkpoints. Files in model_dir whose names start with
    # this run's prefix are removed first, so only the latest checkpoint is kept.
    if f1_results[best_f1_idx] > (best_f1_for_epoch * 0.995):
        best_f1_for_epoch = f1_results[best_f1_idx]
        epochs_without_improvement = 0
        for fname in os.listdir(model_dir):
            if fname.startswith('_'.join([prefix,model_name,loss_func_name,suffix])):
                os.remove(os.path.join(model_dir, fname))
        torch.save(model.state_dict(), os.path.join(model_dir, '_'.join([prefix,model_name,loss_func_name,suffix,'epoch'])+str(epoch+1)+'para'))
    else:
        epochs_without_improvement += 1    
        
    # Log this epoch's validation metrics (default threshold and full sweep) to a JSON file
    with open(os.path.join(log_dir, '_'.join([prefix,model_name,loss_func_name,suffix,'epoch'])+str(epoch+1)+'.json'),'w') as f:
        d = {}
        d["f1_accuracy_default"] =  val_f1_accuracy
        d["pr_accuracy_default"] =  val_precision_accuracy
        d["rec_accuracy_default"] =  val_recall_accuracy
        d["auc_score_default"] =  val_auc_score
        d["thresholds"] =  list(micro_thresholds)
        d["threshold_f1s"] =  f1_results
        d["threshold_precs"] =  prec_results
        d["threshold_recalls"] =  recall_results
        json.dump(d, f)
    
    # If 5 epochs in a row pass without a checkpoint, consider the model saturated and exit
    if epochs_without_improvement > 4:
        break

Epoch:   0%|                                                                                    | 0/40 [02:50<?, ?it/s]


KeyboardInterrupt: 

### Huang et al. Validation