# PAP with Concreteness Ratings - Fine-tuning a BERT-based model

## Load and preprocess data

In [None]:
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import nltk
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModel
from transformers import DataCollatorWithPadding, get_scheduler
from datasets import load_dataset, Dataset, DatasetDict
from tqdm.auto import tqdm
import evaluate

# Read the train / dev / test CSV files of the binary PAP split into DataFrames
datasets_path = '../../../datasets/pap/train-dev-test-split/binary'
train_df, dev_df, test_df = (
    pd.read_csv(f'{datasets_path}/{split_name}.csv')
    for split_name in ('train', 'dev', 'test')
)

### Load and preprocess concreteness ratings

In [None]:
# We got this dataset for concreteness of 40k words (https://pubmed.ncbi.nlm.nih.gov/24142837/) from https://web.stanford.edu/class/linguist278/data/
# Load concreteness ratings
concreteness_df = pd.read_csv('../../../datasets/concreteness/Concreteness_ratings_Brysbaert_et_al_BRM.csv')
concreteness_df.head(2)

# Map every word ('Word' column) to its mean concreteness rating ('Conc.M' column) divided by 5.0.
# The division is done once on the whole column instead of row by row with iterrows(),
# which produces the same mapping without building a dict per row.
# NOTE(review): dividing by 5.0 only caps the scores at 1.0; if the raw ratings start at 1
# (as a 1-5 rating scale would), the scores span 0.2-1.0 rather than 0-1 - confirm against the CSV.
word_to_concreteness_score_map = dict(
    zip(concreteness_df['Word'], (concreteness_df['Conc.M'] / 5.0).tolist())
)

Define helper functions:

In [None]:
def get_concreteness_score(word):
    """
    Look up the normalized concreteness score of a single token.

    The token is first looked up exactly as given; if that fails, its lowercased
    form is tried, so that capitalized tokens (e.g. sentence-initial words) are not
    silently mapped to the default score.
    NOTE(review): this assumes the ratings table stores lowercase word forms - verify
    against the CSV.

    Args:
        word: Token (string) to look up in ``word_to_concreteness_score_map``.

    Returns:
        The mapped score rounded to 3 decimals, or 0.5 when neither the token nor
        its lowercased form is present in the map.
    """
    score = word_to_concreteness_score_map.get(word)
    if score is None:
        # Exact form is unknown: fall back to the lowercased form, then to the neutral default
        score = word_to_concreteness_score_map.get(word.lower(), 0.5)
    return round(score, 3)

def calculate_text_concreteness_sequence(text):
    """
    Build the per-token concreteness sequence of a text.

    The text is split with ``nltk.word_tokenize`` and every token is scored with
    ``get_concreteness_score``. No averaging takes place: the scores are kept in
    token order.

    Args:
        text: The raw text to score.

    Returns:
        A single string holding one score per token, separated by spaces.
    """
    tokens = nltk.word_tokenize(text)
    return " ".join(str(get_concreteness_score(token)) for token in tokens)

Add the sequence concreteness score to the datasets:

In [None]:
# Attach the per-token concreteness score sequence of every text to each split
for split_df in (train_df, dev_df, test_df):
    split_df['concreteness_score_sequence'] = split_df['text'].apply(calculate_text_concreteness_sequence)

In [None]:
# Wrap the three DataFrames (now carrying the concreteness sequences) in one DatasetDict
raw_datasets = DatasetDict({
    split_name: Dataset.from_pandas(frame)
    for split_name, frame in (
        ('train', train_df),
        ('validation', dev_df),
        ('test', test_df),
    )
})

# Hyper-parameters and pre-trained checkpoint used for fine-tuning
model_parameters = dict(
    learning_rate=3e-5,
    epochs=3,
    model_name='bert-base-uncased',
)
checkpoint = model_parameters['model_name']

# Tokenizer matching the chosen checkpoint
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Tokenize function encoding the text and its concreteness sequence as a sentence pair
def tokenize_function(dataset, truncation=True):
    """
    Tokenize a batch of examples as (text, concreteness score sequence) pairs.

    Args:
        dataset: Mapping with a 'text' and a 'concreteness_score_sequence' entry,
            as passed by ``Dataset.map``.
        truncation: Forwarded to the tokenizer's ``truncation`` argument so that
            over-long pairs are cut down instead of being passed through
            at full length.

    Returns:
        The tokenizer output for the pair of columns.
    """
    # The flag was previously accepted but never passed on, so nothing was truncated
    return tokenizer(
        dataset['text'],
        dataset['concreteness_score_sequence'],
        truncation=truncation,
    )

# Tokenize every split in batches
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

# Drop the columns that are not passed to the model, rename the target column
# to 'labels' and have the datasets return torch tensors
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(['text', 'original_label', 'concreteness_score_sequence'])
    .rename_column('label', 'labels')
)
tokenized_datasets.set_format('torch')

# Collator that pads the examples of a batch
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# DEBUG
print('raw datasets:\n', raw_datasets, '\n')
print('tokenized datasets:\n', tokenized_datasets)


# One DataLoader per split; only the training split is shuffled
batch_size = 32
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"],
    batch_size=batch_size,
    collate_fn=data_collator,
)
test_dataloader = DataLoader(
    tokenized_datasets["test"],
    batch_size=batch_size,
    collate_fn=data_collator,
)

## Training Loop

In [None]:
# Pre-trained model with a 2-label sequence-classification head
model = AutoModelForSequenceClassification.from_pretrained(model_parameters['model_name'], num_labels=2)

# AdamW over all model parameters
optimizer = AdamW(model.parameters(), lr=model_parameters['learning_rate'])

# Use the GPU when one is available, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Linear learning-rate schedule without warm-up, spanning every optimizer step of the run
num_training_steps = len(train_dataloader) * model_parameters['epochs']
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# One progress-bar tick per optimizer step
progress_bar = tqdm(range(num_training_steps))

# Fine-tuning
model.train()
for epoch in range(model_parameters['epochs']):
    for batch in train_dataloader:
        # Move the batch tensors to the same device as the model
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        # Forward pass; the model output exposes the loss as `.loss`
        outputs = model(**batch)
        loss = outputs.loss

        # Backward pass, weight update, learning-rate update
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        # Reset the gradients before the next batch
        optimizer.zero_grad()

        progress_bar.update(1)

## Evaluation

Evaluate on development data:

In [None]:
# Run on GPU if available
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Define evaluation metrics
accuracy = evaluate.load('accuracy')
precision = evaluate.load('precision')
recall = evaluate.load('recall')
f1 = evaluate.load('f1')
roc_auc =  evaluate.load("roc_auc")

# Set model in evaluation mode
model.eval()

for batch in eval_dataloader:
    
    # Move batch data to the specified device (GPU or CPU)
    batch = {k: v.to(device) for k, v in batch.items()}
    
    # Forward pass without gradient tracking
    with torch.no_grad():
        outputs = model(**batch)

    # Extract logits and the predicted class (index of the largest logit)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)

    # Apply softmax to convert logits to probabilities
    probabilities = torch.nn.functional.softmax(logits, dim=-1)
    
    # Extract probabilities for the positive class (column 1).
    # Tensor.numpy() only works on CPU tensors, so the scores are copied to the
    # CPU first; the previous `.to(device)` kept them on the GPU and raised there.
    positive_probabilities = probabilities[:, 1].cpu().numpy()

    # Update metrics for accuracy, precision, recall, F1 and ROC-AUC
    accuracy.add_batch(predictions=predictions, references=batch['labels'])
    precision.add_batch(predictions=predictions, references=batch['labels'])
    recall.add_batch(predictions=predictions, references=batch['labels'])
    f1.add_batch(predictions=predictions, references=batch['labels'])
    roc_auc.add_batch(prediction_scores=positive_probabilities, references=batch['labels'])

# Compute metrics for accuracy, precision, recall, F1 and ROC-AUC
validation_eval_dict = {}
validation_eval_dict.update(accuracy.compute())
validation_eval_dict.update(precision.compute(average='macro'))
validation_eval_dict.update(recall.compute(average='macro'))
validation_eval_dict.update(f1.compute(average='macro'))
validation_eval_dict.update(roc_auc.compute(average='macro'))

# Print evaluation metrics
validation_eval_dict

Predictions on test data:

In [None]:
# Re-define evaluation metrics so that nothing accumulated earlier carries over
accuracy = evaluate.load('accuracy')
precision = evaluate.load('precision')
recall = evaluate.load('recall')
f1 = evaluate.load('f1')
roc_auc =  evaluate.load("roc_auc")

# Make sure the model is in evaluation mode even if this cell is run on its own
model.eval()

for batch in test_dataloader:
    
    # Move batch data to the specified device (GPU or CPU)
    batch = {k: v.to(device) for k, v in batch.items()}
    
    # Forward pass without gradient tracking
    with torch.no_grad():
        outputs = model(**batch)

    # Extract logits and the predicted class (index of the largest logit)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)

    # Apply softmax to convert logits to probabilities
    probabilities = torch.nn.functional.softmax(logits, dim=-1)
    
    # Extract probabilities for the positive class (column 1).
    # Tensor.numpy() only works on CPU tensors, so the scores are copied to the
    # CPU first; the previous `.to(device)` kept them on the GPU and raised there.
    positive_probabilities = probabilities[:, 1].cpu().numpy()

    # Update metrics for accuracy, precision, recall, F1 and ROC-AUC
    accuracy.add_batch(predictions=predictions, references=batch['labels'])
    precision.add_batch(predictions=predictions, references=batch['labels'])
    recall.add_batch(predictions=predictions, references=batch['labels'])
    f1.add_batch(predictions=predictions, references=batch['labels'])
    roc_auc.add_batch(prediction_scores=positive_probabilities, references=batch['labels'])

# Compute metrics for accuracy, precision, recall, F1 and ROC-AUC
test_eval_dict = {}
test_eval_dict.update(accuracy.compute())
test_eval_dict.update(precision.compute(average='macro'))
test_eval_dict.update(recall.compute(average='macro'))
test_eval_dict.update(f1.compute(average='macro'))
test_eval_dict.update(roc_auc.compute(average='macro'))

# Print evaluation metrics
test_eval_dict