# PAP (Binary) - Models

## Load and preprocess data

In [1]:
import json
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import nltk
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModel
from transformers import DataCollatorWithPadding, get_scheduler
from datasets import load_dataset, Dataset, DatasetDict
from tqdm.auto import tqdm
import evaluate

# Loads PAP datasets
datasets_path = '../../datasets/pap/train-dev-test-split/binary'
train_df = pd.read_csv(f'{datasets_path}/train.csv')
dev_df = pd.read_csv(f'{datasets_path}/dev.csv')
test_df = pd.read_csv(f'{datasets_path}/test.csv')

  from .autonotebook import tqdm as notebook_tqdm


Loads and preprocess concreteness ratings:

In [2]:
# We got this dataset for concreteness of 40k words (https://pubmed.ncbi.nlm.nih.gov/24142837/) from https://web.stanford.edu/class/linguist278/data/
# Load concreteness ratings
concreteness_df = pd.read_csv('../../datasets/concreteness/Concreteness_ratings_Brysbaert_et_al_BRM.csv')
concreteness_df.head(2)

# Map and normalize conreteness ratings
word_to_concreteness_score_map = dict()
for idx, row in concreteness_df.iterrows():
    row = row.to_dict()
    
    # Normalizing to a scale of 0 to 1
    word_to_concreteness_score_map[row['Word']] = row['Conc.M']/5.0 

Define helper functions to get concreteness scores:

In [3]:

def get_concreteness_score(word):
    """
    Get the concreteness score of a word based on the Concreteness Ratings dataset.
    """
    # If the word is not found in the dataset, return a default score of 0.5
    return round(word_to_concreteness_score_map.get(word, 0.5), 3)

def calculate_text_concreteness_sequence(text):
    """
    Calculate the concreteness score for a given text.
    """
    words = nltk.word_tokenize(text)
    concreteness_scores = [get_concreteness_score(word) for word in words]
    concreteness_scores = " ".join([str(i) for i in concreteness_scores])
    # Take the average concreteness score of all words in the text
    return concreteness_scores

Load PAP datasets:

In [4]:
# Add concreteness scores for the every sequence
train_df['concreteness_score_sequence'] = train_df.text.apply(calculate_text_concreteness_sequence)
dev_df['concreteness_score_sequence'] = dev_df.text.apply(calculate_text_concreteness_sequence)
test_df['concreteness_score_sequence'] = test_df.text.apply(calculate_text_concreteness_sequence)

# Load PAP datasets with Concreteness Scores  
raw_datasets = DatasetDict({
    'train': Dataset.from_pandas(train_df),
    'validation': Dataset.from_pandas(dev_df),
    'test': Dataset.from_pandas(test_df)
})

In [5]:
COLUMNS_TO_KEEP = ['label', 'input_ids', 'token_type_ids', 'attention_mask']

class ModellingExperiments:
    
    def __init__(self, model_name, dataset, batch_size, learning_rate):
        self.model_name = model_name
        self.dataset = dataset
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.cols_to_keep = set(COLUMNS_TO_KEEP)
        self.device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
        self.data_collator = DataCollatorWithPadding(self.tokenizer)

    def init_model(self):
        torch.manual_seed(12)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2, ignore_mismatched_sizes=True)
        self.model.to(self.device)
        
    def tokenize_sentence_with_concreteness_score(self, item):
        # We also tried using the concreteness score for the whole sentence as a feature input
        # To implement that, we changed the source code of transformers library and changed the classification head manually 
        # So that we can accomodate that extra feature, but this method of encoding concreteness score sequence was giving better score
        # Hence, we are using this only in the final experiments. You can check that experiment out in the following notebook:
        # modelling/pap/experiments/FinalModellingWithConcretenessScore[BERT] - PAP
        return self.tokenizer(item['text'], item['concreteness_score_sequence'], truncation=True)
        
    def tokenize_sentence(self, item):
        # Normal tokenization
        return self.tokenizer(item['text'], truncation=True)
        
    def add_strategy_to_tokenizer_function_map(self):
        # Mapping between strategy and the tokenization functions defined above
        # Strategy refers to whether we are using normal tokenization or whether we want to do paired tokenization of 
        # both input sentence and the sequence of concreteness score for that sentence
        self.strategy_to_tokenizer_function_map = dict()
        self.strategy_to_tokenizer_function_map['normal_finetuning'] = self.tokenize_sentence_with_concreteness_score
        self.strategy_to_tokenizer_function_map['concreteness_score_addition'] = self.tokenize_sentence
        
    def prepare_dataset(self, strategy):
        # Here, we wull tokenize the dataset based on the strategy we are planning to use
        self.strategy = strategy
        self.add_strategy_to_tokenizer_function_map()
        self.tokenized_dataset = self.dataset.map(self.strategy_to_tokenizer_function_map[self.strategy], batched=True)
        current_cols = set(list(self.tokenized_dataset['train'].features.keys()))
        self.tokenized_dataset = self.tokenized_dataset.remove_columns(list(current_cols - self.cols_to_keep))
        self.tokenized_dataset = self.tokenized_dataset.rename_column("label", "labels")
        self.tokenized_dataset = self.tokenized_dataset.with_format("torch")

    def prepare_dataloaders(self):
        self.train_dataloader = DataLoader(self.tokenized_dataset['train'], batch_size=self.batch_size, shuffle=True, collate_fn=self.data_collator)
        self.validation_dataloader = DataLoader(self.tokenized_dataset['validation'], batch_size=self.batch_size, collate_fn=self.data_collator)
        self.test_dataloader = DataLoader(self.tokenized_dataset['test'], batch_size=self.batch_size, collate_fn=self.data_collator)

    def setup_optimizer(self, num_epochs):
        # Setting up optimizer and learning rate scheduler
        self.num_epochs = num_epochs
        self.optimizer = AdamW(self.model.parameters(), lr=self.learning_rate)
        self.num_training_steps = self.num_epochs*len(self.train_dataloader)
        self.learning_rate_scheduler = get_scheduler("linear", optimizer=self.optimizer, num_warmup_steps=0, num_training_steps=self.num_training_steps)
    
    def train_model(self):
        # Training the Model
        self.evaluation_results_list = list()
        progress_bar = tqdm(range(self.num_training_steps))
        for epoch in range(self.num_epochs):
            self.model.train()
            for batch in self.train_dataloader:
                batch = {k:v.to(self.device) for k, v in batch.items()}
                outputs = self.model(**batch)
                loss = outputs.loss
                # calculating gradients
                loss.backward()
                # optimizing weights
                self.optimizer.step()
                # updating learning rate
                self.learning_rate_scheduler.step()
                # flushing gradients
                self.optimizer.zero_grad()
                # updating progress bar
                progress_bar.update(1)
            # evaluating per epoch
            self.eval_model(self.validation_dataloader)
            eval_results = dict()
            eval_results['epoch'] = epoch
            for k, v in self.eval_dict.items():
                eval_results["validation_{}".format(k)] = v
            self.eval_model(self.test_dataloader)
            for k, v in self.eval_dict.items():
                eval_results["test_{}".format(k)] = v
            self.evaluation_results_list.append(eval_results)

    def initialize_metrics(self):
        self.metrics = {
            'accuracy': evaluate.load('accuracy'),
            'precision': evaluate.load('precision'),
            'recall': evaluate.load('recall'),
            'f1': evaluate.load('f1'),
            'roc-auc': evaluate.load("roc_auc"),
            #'confusion-matrix': evaluate.load("BucketHeadP65/confusion_matrix"),
            #'roc-curve': evaluate.load("BucketHeadP65/roc_curve"),
        }
                
    def eval_model(self, dataloader):
        
        # Evaluating the model on different dataloaders
        self.initialize_metrics()
        
        self.model.eval()
        
        for batch in dataloader:
            # Move batch data to the specified device (GPU or CPU)
            batch = {k: v.to(self.device) for k, v in batch.items()}
    
            # Forward pass
            with torch.no_grad():
                outputs = self.model(**batch)
            
            # Extract logits and predictions
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)
        
            # Apply softmax to convert logits to probabilities
            probabilities = torch.nn.functional.softmax(logits, dim=-1)
            
            # Extract probabilities for the positive class
            positive_probabilities = probabilities[:, 1].to(self.device).numpy()
        
            # Update metrics for accuracy, precision, recall, F1 and ROC-AUC
            self.metrics['accuracy'].add_batch(predictions=predictions, references=batch['labels'])
            self.metrics['precision'].add_batch(predictions=predictions, references=batch['labels'])
            self.metrics['recall'].add_batch(predictions=predictions, references=batch['labels'])
            self.metrics['f1'].add_batch(predictions=predictions, references=batch['labels'])
            self.metrics['roc-auc'].add_batch(prediction_scores=positive_probabilities, references=batch['labels'])
            #self.metrics['confusion-matrix'].add_batch(predictions=predictions, references=batch['labels'])
            #self.metrics['roc-curve'].add_batch(prediction_scores=positive_probabilities, references=batch['labels'])
        
        # Compute metrics for accuracy, precision, recall, F1 and ROC-AUC
        self.eval_dict = {}
        self.eval_dict.update(self.metrics['accuracy'].compute())
        self.eval_dict.update(self.metrics['precision'].compute(average="macro"))
        self.eval_dict.update(self.metrics['recall'].compute(average="macro"))
        self.eval_dict.update(self.metrics['f1'].compute(average="macro"))
        self.eval_dict.update(self.metrics['roc-auc'].compute(average="macro"))
        #self.eval_dict.update(self.metrics['confusion-matrix'].compute()) 
        #self.eval_dict.update(self.metrics['roc-curve'].compute())    

## Run Experiments

Defining parameters on which we will run the experiments

In [6]:
# Initialize model parameters
model_name_list = ["facebook/bart-base", "microsoft/deberta-base"]
num_epochs = 4
strategies_list = ["normal_finetuning", "concreteness_score_addition"]

Defining static arguments

In [7]:
kw_args = {
    'dataset': raw_datasets,
    'batch_size': 32,
    'learning_rate': 3e-5,
}

### Experiments loop

In [None]:
# Open file to save results
with open('result_dynamic_dict_final.json', 'r') as openfile:
    result_dynamic_dict = json.load(openfile)

result_list = list()
for model_name in model_name_list:
    
    # Setting Model Name
    kw_args["model_name"] = model_name
    
    # Initializing ModellingExperiments Object
    modelling_obj = ModellingExperiments(**kw_args)
    
    for strategy in strategies_list:
        
        # Preparing dataset for a specific strategy
        modelling_obj.prepare_dataset(strategy=strategy)
        
        # Preparing data loaders
        modelling_obj.prepare_dataloaders()

        # Initialize dictionary for storing results
        result_dict = {
            'model_name': model_name,
            'startegy':strategy 
        }
        print("Model Training with the following Configurations: {}".format(result_dict))
        
        unique_key = "#".join(str(i) for i in list(result_dict.values()))
        if not result_dynamic_dict.get(unique_key):
            result_dynamic_dict[unique_key] = dict()
            
            # initializing model
            modelling_obj.init_model()
            
            # For a specic num_epochs variable, we are setting up the optimizers
            modelling_obj.setup_optimizer(num_epochs=num_epochs)
            
            # Now, we are training the model
            modelling_obj.train_model()
            
            # Saving evaluated results
            evaluation_results_list = modelling_obj.evaluation_results_list
            for evaluation_results in evaluation_results_list:
                result_dict.update(evaluation_results)
                result_list.append(result_dict)
                
            # Updating the stored file
            result_dynamic_dict[unique_key] = evaluation_results_list
            
            # Storing the updated result file
            with open('result_dynamic_dict_final.json', 'w', encoding='utf-8') as f:
                    json.dump(result_dynamic_dict, f, ensure_ascii=False, indent=4)
        else:
            print("Model already trained, results are stored already!")

Map: 100%|██████████| 1728/1728 [00:00<00:00, 74722.23 examples/s]
Map: 100%|██████████| 216/216 [00:00<00:00, 50342.84 examples/s]
Map: 100%|██████████| 216/216 [00:00<00:00, 48595.70 examples/s]


Model Training with the following Configurations: {'model_name': 'facebook/bart-base', 'startegy': 'normal_finetuning'}


Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-base and are newly initialized: ['classification_head.out_proj.bias', 'classification_head.dense.weight', 'classification_head.out_proj.weight', 'classification_head.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 0/216 [00:00<?, ?it/s]You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
 14%|█▍        | 31/216 [04:36<26:39,  8.65s/it]

In [None]:
results_df = pd.DataFrame(result_list)
results_df.head()

In [None]:
results_df.to_csv('../../results/FinalResultsPAP.csv', index=False)