# PAP (Binary) - Fine-tuning a BERT-based model

## Load and preprocess data

In [6]:
from transformers import BertTokenizer, BertForSequenceClassification, DataCollatorWithPadding, get_scheduler
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
import pandas as pd
from datasets import Dataset, DatasetDict
import evaluate


# Directory that holds the pre-split binary PAP CSV files
datasets_path='datasets/pap/train-dev-test-split/binary'

# Read each CSV split into a Dataset; the 'validation' split is stored as dev.csv
raw_datasets = DatasetDict({
    split_name: Dataset.from_pandas(pd.read_csv(f'{datasets_path}/{file_stem}.csv'))
    for split_name, file_stem in (('train', 'train'), ('validation', 'dev'), ('test', 'test'))
})

# Pre-trained WordPiece tokenizer matching the checkpoint that is fine-tuned below
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(dataset, truncation=True):
    """Tokenize the 'text' column of a batch of examples.

    Intended to be passed to ``DatasetDict.map(..., batched=True)``.

    Args:
        dataset: Mapping with a 'text' entry holding the raw strings to tokenize.
        truncation: Forwarded to the tokenizer. With the default ``True`` the
            tokenizer cuts over-long inputs instead of returning sequences
            longer than the model accepts.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the given texts.
    """
    # The flag used to be accepted but silently ignored; forward it so that
    # truncation actually takes effect.
    return tokenizer(dataset['text'], truncation=truncation)

# Tokenize every split in batches, drop the columns the model does not consume,
# and rename the target column to 'labels'
tokenized_datasets = (
    raw_datasets
    .map(tokenize_function, batched=True)
    .remove_columns(['text', 'original_label'])
    .rename_column('label', 'labels')
)

# Return torch tensors when the datasets are indexed
tokenized_datasets.set_format('torch')

# Collator that pads each batch using the tokenizer
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# DEBUG
print('raw datasets:\n', raw_datasets, '\n')
print('tokenized datasets:\n', tokenized_datasets)

# One DataLoader per split; only the training split is shuffled
batch_size = 200
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    batch_size=batch_size,
    collate_fn=data_collator,
    shuffle=True,
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"],
    batch_size=batch_size,
    collate_fn=data_collator,
)
test_dataloader = DataLoader(
    tokenized_datasets["test"],
    batch_size=batch_size,
    collate_fn=data_collator,
)


Map: 100%|██████████| 1728/1728 [00:00<00:00, 12803.51 examples/s]
Map: 100%|██████████| 216/216 [00:00<00:00, 11215.69 examples/s]
Map: 100%|██████████| 216/216 [00:00<00:00, 11002.39 examples/s]


raw datasets:
 DatasetDict({
    train: Dataset({
        features: ['text', 'original_label', 'label'],
        num_rows: 1728
    })
    validation: Dataset({
        features: ['text', 'original_label', 'label'],
        num_rows: 216
    })
    test: Dataset({
        features: ['text', 'original_label', 'label'],
        num_rows: 216
    })
}) 

tokenized datasets:
 DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1728
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 216
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 216
    })
})


## Training

In [7]:
# Initialize parameters
EPOCHS = 3
LEARNING_RATE = 3e-5
SEED = 42

# Seed PyTorch so that the newly initialized classifier head and the order in
# which the training batches are shuffled are repeatable across runs
torch.manual_seed(SEED)

# Initialize a pre-trained BERT model with a 2-label classification head
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Run on GPU if available; place the model before the optimizer is created
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Initialize optimizer
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

# Initialize scheduler: linear schedule over every optimizer step, no warmup
num_training_steps = EPOCHS * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# Training loop; the context manager closes the progress bar when training
# finishes or is interrupted
model.train()
with tqdm(total=num_training_steps) as progress_bar:
    for epoch in range(EPOCHS):
        for batch in train_dataloader:
            # Move the batch tensors to the same device as the model
            batch = {k: v.to(device) for k, v in batch.items()}

            # The batch contains 'labels', so the model output carries the loss
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 27/27 [04:15<00:00,  9.48s/it]
100%|██████████| 27/27 [01:32<00:00,  3.31s/it]

## Evaluation

Evaluate on development data:

In [3]:
# Run on GPU if available
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Make sure the model lives on the device the batches are moved to
# (no-op when it is already there)
model.to(device)

# Define evaluation metrics
metrics = {
    'accuracy': evaluate.load('accuracy'),
    'precision': evaluate.load('precision'),
    'recall': evaluate.load('recall'),
    'f1': evaluate.load('f1'),
}

# Load ROC AUC metric (fed with scores rather than hard predictions)
roc_auc = evaluate.load('roc_auc')

# Set model in evaluation mode
model.eval()

for batch in eval_dataloader:

    # Move batch data to the specified device (GPU or CPU)
    batch = {k: v.to(device) for k, v in batch.items()}

    # Forward pass without gradient tracking
    with torch.no_grad():
        outputs = model(**batch)

    # Extract logits and predictions
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)

    # Apply softmax to convert logits to probabilities
    probabilities = torch.nn.functional.softmax(logits, dim=-1)

    # Extract probabilities for the positive class (column 1). The tensor must
    # be copied to host memory first: .numpy() raises on a CUDA tensor.
    positive_probabilities = probabilities[:, 1].cpu().numpy()

    # Update metrics for accuracy, precision, recall, and F1
    for metric in metrics.values():
        metric.add_batch(predictions=predictions, references=batch['labels'])

    # Update ROC AUC metric
    roc_auc.add_batch(prediction_scores=positive_probabilities, references=batch['labels'])

# Compute metrics for accuracy, precision, recall, and F1
eval_dict = {}
for metric in metrics.values():
    eval_dict.update(metric.compute())

# Compute ROC AUC metric
eval_dict.update(roc_auc.compute())



eval_dict

{'accuracy': 0.7129629629629629,
 'precision': 0.7129629629629629,
 'recall': 1.0,
 'f1': 0.8324324324324325,
 'roc_auc': 0.7647674905739422}

Evaluate on test data:

In [9]:
# Run on GPU if available
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Make sure the model lives on the device the batches are moved to
# (no-op when it is already there)
model.to(device)

# Define evaluation metrics
metrics = {
    'accuracy': evaluate.load('accuracy'),
    'precision': evaluate.load('precision'),
    'recall': evaluate.load('recall'),
    'f1': evaluate.load('f1'),
}

# Load ROC AUC metric (fed with scores rather than hard predictions)
roc_auc = evaluate.load('roc_auc')

# Set model in evaluation mode
model.eval()

for batch in test_dataloader:

    # Move batch data to the specified device (GPU or CPU)
    batch = {k: v.to(device) for k, v in batch.items()}

    # Forward pass without gradient tracking
    with torch.no_grad():
        outputs = model(**batch)

    # Extract logits and predictions
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)

    # Apply softmax to convert logits to probabilities
    probabilities = torch.nn.functional.softmax(logits, dim=-1)

    # Extract probabilities for the positive class (column 1). The tensor must
    # be copied to host memory first: .numpy() raises on a CUDA tensor.
    positive_probabilities = probabilities[:, 1].cpu().numpy()

    # Update metrics for accuracy, precision, recall, and F1
    for metric in metrics.values():
        metric.add_batch(predictions=predictions, references=batch['labels'])

    # Update ROC AUC metric
    roc_auc.add_batch(prediction_scores=positive_probabilities, references=batch['labels'])

# Compute metrics for accuracy, precision, recall, and F1
test_dict = {}
for metric in metrics.values():
    test_dict.update(metric.compute())

# Compute ROC AUC metric
test_dict.update(roc_auc.compute())



test_dict

{'accuracy': 0.7129629629629629,
 'precision': 0.7129629629629629,
 'recall': 1.0,
 'f1': 0.8324324324324325,
 'roc_auc': 0.7093632174277336}