# PAP (Binary) - Fine-tuning a BERT-based model

## Load and preprocess data

In [4]:
from transformers import BertTokenizer, BertForSequenceClassification, DataCollatorWithPadding, get_scheduler
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
import pandas as pd
from datasets import Dataset, DatasetDict
import evaluate


# Read the train / dev / test CSV splits into a single DatasetDict
datasets_path = '../../../datasets/pap/train-dev-test-split/binary'
split_files = {'train': 'train.csv', 'validation': 'dev.csv', 'test': 'test.csv'}
raw_datasets = DatasetDict({
    split_name: Dataset.from_pandas(pd.read_csv(f'{datasets_path}/{file_name}'))
    for split_name, file_name in split_files.items()
})

# Hyper-parameters and the name of the pre-trained checkpoint to fine-tune
model_parameters = dict(
    learning_rate=3e-5,
    epochs=3,
    model_name='bert-base-uncased',
)
checkpoint = model_parameters['model_name']

# Tokenizer that belongs to the chosen checkpoint
tokenizer = BertTokenizer.from_pretrained(checkpoint)

# Simple tokenize function
def tokenize_function(dataset, truncation=True):
    """Tokenize the ``'text'`` entry of ``dataset`` with the module-level ``tokenizer``.

    Args:
        dataset: Mapping whose ``'text'`` entry holds the raw text(s) to
            tokenize (e.g. the batch handed over by ``map(..., batched=True)``).
        truncation: Forwarded to the tokenizer's ``truncation`` argument.
            Previously this parameter was accepted but ignored, so inputs were
            never truncated; per the Transformers tokenizer docs, ``True``
            truncates sequences to the maximum length the model accepts.

    Returns:
        The tokenizer's output for the given text(s).
    """
    return tokenizer(dataset['text'], truncation=truncation)

# Tokenize every split, drop the columns the model does not consume and
# rename the target column to 'labels'
tokenized_datasets = (
    raw_datasets
    .map(tokenize_function, batched=True)
    .remove_columns(['text', 'original_label'])
    .rename_column('label', 'labels')
)
# Return PyTorch tensors when the datasets are indexed
tokenized_datasets.set_format('torch')

# Collator that pads each batch when it is assembled
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# DEBUG
print('raw datasets:\n', raw_datasets, '\n')
print('tokenized datasets:\n', tokenized_datasets)

# One DataLoader per split; only the training split is shuffled
batch_size = 32
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)
validation_dataloader = DataLoader(
    tokenized_datasets["validation"],
    batch_size=batch_size,
    collate_fn=data_collator,
)
test_dataloader = DataLoader(
    tokenized_datasets["test"],
    batch_size=batch_size,
    collate_fn=data_collator,
)



Map:   0%|          | 0/1728 [00:00<?, ? examples/s][A
Map: 100%|██████████| 1728/1728 [00:00<00:00, 8749.12 examples/s][A

Map: 100%|██████████| 216/216 [00:00<00:00, 8058.01 examples/s]

Map: 100%|██████████| 216/216 [00:00<00:00, 8016.08 examples/s]

raw datasets:
 DatasetDict({
    train: Dataset({
        features: ['text', 'original_label', 'label'],
        num_rows: 1728
    })
    validation: Dataset({
        features: ['text', 'original_label', 'label'],
        num_rows: 216
    })
    test: Dataset({
        features: ['text', 'original_label', 'label'],
        num_rows: 216
    })
}) 

tokenized datasets:
 DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1728
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 216
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 216
    })
})





## Training

In [2]:
# Seed PyTorch's random number generators before anything stochastic happens
# (new classifier-head weights, shuffling of the training batches, dropout) so
# that re-running this cell is repeatable as far as PyTorch allows.
# Falls back to 42 when no 'seed' entry is configured in model_parameters.
torch.manual_seed(model_parameters.get('seed', 42))

# Initialize a pre-trained BERT model with a 2-label classification head
model = BertForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# Initialize optimizer
optimizer = AdamW(model.parameters(), lr=model_parameters['learning_rate'])

# Run on GPU if available
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Initialize scheduler: linear schedule without warm-up over all optimizer steps
num_training_steps = model_parameters['epochs'] * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# Display progress bar (one tick per optimizer step)
progress_bar = tqdm(range(num_training_steps))

# Training loop
model.train()
for epoch in range(model_parameters['epochs']):
    for batch in train_dataloader:
        # Move the batch to the same device as the model
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        # Reset gradients so they do not accumulate into the next step
        optimizer.zero_grad()
        progress_bar.update(1)

# Release the progress bar once training has finished
progress_bar.close()

model.safetensors: 100%|██████████| 440M/440M [01:11<00:00, 6.13MB/s] 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 162/162 [1:52:27<00:00, 40.19s/it] 

## Evaluation

Evaluate on development data:

In [12]:
# Run on GPU if available
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Define evaluation metrics
accuracy = evaluate.load('accuracy')
precision = evaluate.load('precision')
recall = evaluate.load('recall')
f1 = evaluate.load('f1')
roc_auc =  evaluate.load("roc_auc")

# Make sure the model lives on the same device as the batches,
# then set it in evaluation mode
model.to(device)
model.eval()

for batch in validation_dataloader:

    # Move batch data to the specified device (GPU or CPU)
    batch = {k: v.to(device) for k, v in batch.items()}

    # Forward pass without gradient tracking
    with torch.no_grad():
        outputs = model(**batch)

    # Extract logits and predictions
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)

    # Apply softmax to convert logits to probabilities
    probabilities = torch.nn.functional.softmax(logits, dim=-1)

    # Extract probabilities for the positive class (index 1).
    # Tensor.numpy() only works on CPU tensors, so copy the values back to the
    # CPU first; the previous `.to(device).numpy()` failed when running on CUDA.
    positive_probabilities = probabilities[:, 1].cpu().numpy()

    # Update the label-based metrics (accuracy, precision, recall, F1) ...
    for label_metric in (accuracy, precision, recall, f1):
        label_metric.add_batch(predictions=predictions, references=batch['labels'])
    # ... and ROC-AUC, which is fed scores instead of hard predictions
    roc_auc.add_batch(prediction_scores=positive_probabilities, references=batch['labels'])

# Compute metrics for accuracy, precision, recall, F1 and ROC-AUC
validation_eval_dict = {}
validation_eval_dict.update(accuracy.compute())
validation_eval_dict.update(precision.compute(average='macro'))
validation_eval_dict.update(recall.compute(average='macro'))
validation_eval_dict.update(f1.compute(average='macro'))
validation_eval_dict.update(roc_auc.compute(average='macro'))

# Print evaluation metrics
validation_eval_dict

{'accuracy': 0.7546296296296297,
 'precision': 0.7018722633247773,
 'recall': 0.6400293255131965,
 'f1': 0.6527436527436528,
 'roc_auc': 0.7747172182656052}

Evaluate on test data:

In [15]:
# Re-define evaluation metrics so nothing accumulated earlier leaks into the test scores
accuracy = evaluate.load('accuracy')
precision = evaluate.load('precision')
recall = evaluate.load('recall')
f1 = evaluate.load('f1')
roc_auc =  evaluate.load("roc_auc")

# Make sure the model lives on the same device as the batches
# (`device` is defined in an earlier cell), then set it in evaluation mode
model.to(device)
model.eval()

for batch in test_dataloader:

    # Move batch data to the specified device (GPU or CPU)
    batch = {k: v.to(device) for k, v in batch.items()}

    # Forward pass without gradient tracking
    with torch.no_grad():
        outputs = model(**batch)

    # Extract logits and predictions
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)

    # Apply softmax to convert logits to probabilities
    probabilities = torch.nn.functional.softmax(logits, dim=-1)

    # Extract probabilities for the positive class (index 1).
    # Tensor.numpy() only works on CPU tensors, so copy the values back to the
    # CPU first; the previous `.to(device).numpy()` failed when running on CUDA.
    positive_probabilities = probabilities[:, 1].cpu().numpy()

    # Update the label-based metrics (accuracy, precision, recall, F1) ...
    for label_metric in (accuracy, precision, recall, f1):
        label_metric.add_batch(predictions=predictions, references=batch['labels'])
    # ... and ROC-AUC, which is fed scores instead of hard predictions
    roc_auc.add_batch(prediction_scores=positive_probabilities, references=batch['labels'])


# Compute metrics for accuracy, precision, recall, F1 and ROC-AUC
test_eval_dict = {}
test_eval_dict.update(accuracy.compute())
test_eval_dict.update(precision.compute(average='macro'))
test_eval_dict.update(recall.compute(average='macro'))
test_eval_dict.update(f1.compute(average='macro'))
test_eval_dict.update(roc_auc.compute(average='macro'))

# Print evaluation metrics
test_eval_dict

{'accuracy': 0.6898148148148148,
 'precision': 0.5937979094076655,
 'recall': 0.570485965647256,
 'f1': 0.5729335733467111,
 'roc_auc': 0.6820276497695853}