# Fine-tuning
In this file we fine-tune encoder models, such as BERT and RoBERTa, on the PAN corpus, and then evaluate them on the test set.

In [1]:
import torch
from transformers import BertTokenizer, DataCollatorWithPadding, utils
from transformers import AutoModelForSequenceClassification, BertForSequenceClassification
import matplotlib.pyplot as plt
import numpy as np
import os
import torchmetrics
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from bertviz import model_view
from datasets import Dataset
from bertviz import head_view
import wandb

from utilities import (read_paragraphs,
                       read_ground_truth, 
                       generate_dataset)
utils.logging.set_verbosity_error()  # Suppress standard warnings

In [2]:
# Input folders. Note that the held-out "test" split is read from the
# validation folders of the corpus.
train_directory = './data/train_processed'
train_label_directory = './data/train_label'
test_directory = './data/validation_processed'
test_label_directory = './data/validation_label'

# Checkpoint that provides the tokenizer and the encoder weights.
checkpoint = "finetuned-bert-base-cased/sweep-1-checkpoint-225"

# Cache files holding the extracted embeddings of each split.
train_embedding_file_path = "train_embeddings.npy"
val_embedding_file_path = "val_embeddings.npy"
test_embedding_file_path = "test_embeddings.npy"

# Random Forest hyperparameters, plus the switch that enables the
# hyper-parameter sweep.
opt_n_estimators = 600
opt_random_state = 42
sweep = False

# Only the train cache file is probed; its presence decides whether the
# embeddings are recomputed or loaded from disk.
file_exists = os.path.isfile(train_embedding_file_path)
if file_exists:
    print(f"File '{train_embedding_file_path}' exists.")
else:
    print(f"File '{train_embedding_file_path}' does not exist.")

File 'train_embeddings.npy' exists.


In [3]:

# Documents, keyed by problem id; per the original author's note the
# layout is {'problem-x': [sen 1, sen 2, ...], ...}.
train_data = read_paragraphs(
    train_directory,
    start_id=1,
    end_id=4200,
)
test_data = read_paragraphs(
    test_directory,
    start_id=1,
    end_id=900,
)

# Ground-truth labels; per the original author's note the layout is
# {'problem-x': [1, ...], ...}.
train_labels = read_ground_truth(
    train_label_directory,
    start_id=1,
    end_id=4200,
)
test_labels = read_ground_truth(
    test_label_directory,
    start_id=1,
    end_id=900,
)

tokenizer = BertTokenizer.from_pretrained(checkpoint)

train_dataset = generate_dataset(train_data, train_labels, tokenizer)
test_dataset = generate_dataset(test_data, test_labels, tokenizer)

# 80/20 split of the training corpus with a fixed seed. The split's
# default "test" part becomes "validation" so that the real held-out
# set can take the "test" key.
training_sets = train_dataset.train_test_split(train_size=0.8, seed=42)
training_sets["validation"] = training_sets.pop("test")
training_sets["test"] = test_dataset

def tokenize_function(example):
    """Tokenize the sentence pairs of ``example`` with the module-level tokenizer.

    ``example`` must provide the ``"sentence1"`` and ``"sentence2"``
    entries; they are encoded together as a pair, with truncation enabled.
    """
    first_sentences = example["sentence1"]
    second_sentences = example["sentence2"]
    return tokenizer(first_sentences, second_sentences, truncation=True)


# Tokenize every split in batches; padding is left to the collator.
tokenized_datasets = training_sets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
print(tokenized_datasets)

# Drop the untokenized intermediates.
del training_sets
del train_dataset
del test_dataset

Map:   0%|          | 0/17530 [00:00<?, ? examples/s]

Map:   0%|          | 0/4383 [00:00<?, ? examples/s]

Map:   0%|          | 0/4592 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 17530
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4383
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4592
    })
})


In [4]:
# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# The encoder is only needed when the embeddings still have to be computed.
if not file_exists:
    model = AutoModelForSequenceClassification.from_pretrained(
        checkpoint,
        num_labels=2,
        output_hidden_states=True,  # the hidden states are read when extracting embeddings
    )
    model.to(device)

In [5]:
def get_embeddings(bert_model, dataset, file_name, save=True, batch_size=100):
    """Extract one embedding vector per row of ``dataset``.

    For every example the vector at sequence position 0 of the last entry
    of ``outputs.hidden_states`` is taken (for BERT-style tokenizers that
    position holds the [CLS] token). The model must therefore have been
    created with ``output_hidden_states=True``.

    Relies on the module-level ``data_collator`` and ``device``.

    Args:
        bert_model: Model called as ``bert_model(**batch)`` whose output
            exposes ``hidden_states``.
        dataset: Tokenized dataset exposing ``num_rows`` and slice indexing
            that yields a dict of columns.
        file_name: Destination ``.npy`` path; only used when ``save`` is true.
        save: When true, the embeddings are also written to ``file_name``.
        batch_size: Number of examples per forward pass.

    Returns:
        The concatenated embeddings as a NumPy array with one row per
        example. The array is returned whether or not it was saved.

    Raises:
        ValueError: If ``dataset`` has no rows.
    """
    if dataset.num_rows == 0:
        raise ValueError("Cannot compute embeddings for an empty dataset.")

    # Raw text and index columns are not model inputs, so they are removed
    # before the batch is handed to the collator.
    skipped_columns = ('sentence1', 'sentence2', 'idx')

    # Make sure dropout is disabled so the embeddings are deterministic.
    bert_model.eval()

    embeddings = []
    with torch.no_grad():
        for start in range(0, dataset.num_rows, batch_size):
            features = {
                k: v
                for k, v in dataset[start: start + batch_size].items()
                if k not in skipped_columns
            }
            batch_data = data_collator(features)
            batch_data = {k: v.to(device) for k, v in batch_data.items()}
            outputs = bert_model(**batch_data)
            # Copy the slice so the NumPy array does not keep the full
            # hidden-state tensor alive.
            embeddings.append(outputs.hidden_states[-1][:, 0, :].detach().cpu().numpy().copy())
            del outputs

    concatenated_embeddings = np.concatenate(embeddings, axis=0)
    print(concatenated_embeddings.shape)
    if save:
        np.save(file_name, concatenated_embeddings)
    return concatenated_embeddings

In [6]:
# (split name, embedding cache file, label cache file), in processing order.
_cache_plan = (
    ('test', test_embedding_file_path, 'test_labels.npy'),
    ('validation', val_embedding_file_path, 'val_labels.npy'),
    ('train', train_embedding_file_path, 'train_labels.npy'),
)

if not file_exists:
    # First run: compute the embeddings of every split and cache them
    # together with the matching labels.
    for _split, _embedding_path, _label_path in _cache_plan:
        get_embeddings(model, tokenized_datasets[_split], _embedding_path)
        np.save(_label_path, np.array(tokenized_datasets[_split]['label']))

# Load from disk unconditionally so the arrays below are defined on a first
# run as well; previously they only existed when the cache was already there.
test_embedding = np.load(test_embedding_file_path)
val_embedding = np.load(val_embedding_file_path)
train_embedding = np.load(train_embedding_file_path)

# These rebind `test_labels` and `train_labels` from the ground-truth dicts
# read earlier to flat label arrays aligned with the embeddings.
test_labels = np.load('test_labels.npy')
val_labels = np.load('val_labels.npy')
train_labels = np.load('train_labels.npy')


In [7]:
if sweep:
    # Bayesian hyper-parameter search over the Random Forest settings,
    # maximizing the F1 score logged as 'eval/F1'.
    sweep_config = {
        'method': 'bayes',
        'metric': {'goal': 'maximize', 'name': 'eval/F1'},
        'parameters': {
            'n_estimators': {'values': list(range(100, 700, 100))},
            'random_state': {
                'distribution': 'int_uniform',
                'max': 50,
                'min': 1,
            },
        },
    }

    sweep_id = wandb.sweep(sweep_config, project="multi_author_random_forest_sweep")

    def train(config=None):
        """Fit one Random Forest with the sweep's settings and log its validation F1."""
        with wandb.init(config=config):
            # Inside a sweep run the agent supplies the values via wandb.config.
            run_config = wandb.config
            forest = RandomForestClassifier(
                n_estimators=run_config.n_estimators,
                random_state=run_config.random_state,
            )

            forest.fit(train_embedding, train_labels)  # X is (n_samples, n_features)
            val_predictions = forest.predict(val_embedding)

            val_f1 = f1_score(val_labels, val_predictions)
            wandb.log({'eval/F1': val_f1})

    wandb.agent(sweep_id, train, count=10)

    # Best performance: F1: 0.7884, n_estimators=600, random_state = 42


# Test

In [18]:
def test_model(checkpoint):
    """Score a checkpoint: embed train and test, fit a Random Forest, report test F1.

    The encoder is loaded from ``checkpoint``. Embeddings of the train split
    are used to fit a Random Forest with the module-level ``opt_n_estimators``
    and ``opt_random_state``; the forest is then scored on the test split.
    Nothing is written to disk.

    NOTE(review): ``tokenized_datasets`` was built with the tokenizer of the
    module-level checkpoint, not of the ``checkpoint`` argument. Confirm the
    vocabularies match before comparing other models.

    Args:
        checkpoint: Model name or path accepted by ``from_pretrained``.

    Returns:
        The F1 score on the test split (it is also printed).
    """
    bert_model = AutoModelForSequenceClassification.from_pretrained(
        checkpoint, num_labels=2, output_hidden_states=True
    )
    bert_model.to(device)

    train_split = tokenized_datasets['train']
    test_split = tokenized_datasets['test']

    # The file name is unused because save=False, but each split is paired
    # with its own cache path so enabling saving later cannot overwrite the
    # test cache with train embeddings.
    train_embeddings = get_embeddings(bert_model, train_split, train_embedding_file_path, save=False)

    clf = RandomForestClassifier(n_estimators=opt_n_estimators, random_state=opt_random_state)
    clf.fit(train_embeddings, train_split['label'])  # X is (n_samples, n_features)

    test_embeddings = get_embeddings(bert_model, test_split, test_embedding_file_path, save=False)
    y_pred = clf.predict(test_embeddings)

    f1 = f1_score(test_split['label'], y_pred)
    print(f'{checkpoint}: {f1}')
    return f1


In [19]:
test_model("finetuned-bert-base-cased/sweep-1-checkpoint-225")

(17530, 768)
(4592, 768)
finetuned-bert-base-cased/sweep-1-checkpoint-225: 0.8064575988123955


In [20]:
test_model("bert-base-cased")

(17530, 768)
(4592, 768)
bert-base-cased: 0.7953941541186891
