# PAP (Binary) - Models

## Load and preprocess data

In [2]:
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import nltk
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModel
from transformers import DataCollatorWithPadding, get_scheduler
from datasets import load_dataset, Dataset, DatasetDict
from tqdm.auto import tqdm
import evaluate

# Loads PAP datasets
datasets_path = '../datasets/pap/train-dev-test-split/binary'
train_df = pd.read_csv(f'{datasets_path}/train.csv')
dev_df = pd.read_csv(f'{datasets_path}/dev.csv')
test_df = pd.read_csv(f'{datasets_path}/test.csv')

  from .autonotebook import tqdm as notebook_tqdm


Loads and preprocess concreteness ratings:

In [3]:
# We got this dataset for concreteness of 40k words (https://pubmed.ncbi.nlm.nih.gov/24142837/) from https://web.stanford.edu/class/linguist278/data/
# Load concreteness ratings
concreteness_df = pd.read_csv('../datasets/concreteness/Concreteness_ratings_Brysbaert_et_al_BRM.csv')
concreteness_df.head(2)

# Map and normalize conreteness ratings
word_to_concreteness_score_map = dict()
for idx, row in concreteness_df.iterrows():
    row = row.to_dict()
    
    # Normalizing to a scale of 0 to 1
    word_to_concreteness_score_map[row['Word']] = row['Conc.M']/5.0 

Define helper functions to get concreteness scores:

In [4]:

def get_concreteness_score(word):
    """
    Get the concreteness score of a word based on the Concreteness Ratings dataset.
    """
    # If the word is not found in the dataset, return a default score of 0.5
    return round(word_to_concreteness_score_map.get(word, 0.5), 3)

def calculate_text_concreteness_sequence(text):
    """
    Calculate the concreteness score for a given text.
    """
    words = nltk.word_tokenize(text)
    concreteness_scores = [get_concreteness_score(word) for word in words]
    concreteness_scores = " ".join([str(i) for i in concreteness_scores])
    # Take the average concreteness score of all words in the text
    return concreteness_scores

Load PAP datasets:

In [5]:
# Add concreteness scores for the every sequence
train_df['concreteness_score_sequence'] = train_df.text.apply(calculate_text_concreteness_sequence)
dev_df['concreteness_score_sequence'] = dev_df.text.apply(calculate_text_concreteness_sequence)
test_df['concreteness_score_sequence'] = test_df.text.apply(calculate_text_concreteness_sequence)

# Load PAP datasets with Concreteness Scores  
raw_datasets = DatasetDict({
    'train': Dataset.from_pandas(train_df),
    'validation': Dataset.from_pandas(dev_df),
    'test': Dataset.from_pandas(test_df)
})

In [33]:
COLUMNS_TO_KEEP = ['label', 'input_ids', 'token_type_ids', 'attention_mask']

class ModellingExperiments:
    
    def __init__(self, model_name, dataset, batch_size, learning_rate):
        self.model_name = model_name
        self.dataset = dataset
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2, ignore_mismatched_sizes=True)
        self.cols_to_keep = set(COLUMNS_TO_KEEP)
        self.device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
        self.data_collator = DataCollatorWithPadding(self.tokenizer)
        self.model.to(self.device)
        
    def tokenize_sentence_with_concreteness_score(self, item):
        # We also tried using the concreteness score for the whole sentence as a feature input
        # To implement that, we changed the source code of transformers library and changed the classification head manually 
        # So that we can accomodate that extra feature, but this method of encoding concreteness score sequence was giving better score
        # Hence, we are using this only in the final experiments. You can check that experiment out in the following notebook:
        # modelling/pap/experiments/FinalModellingWithConcretenessScore[BERT] - PAP
        return self.tokenizer(item['text'], item['concreteness_score_sequence'], truncation=True)
        
    def tokenize_sentence(self, item):
        # Normal tokenization
        return self.tokenizer(item['text'], truncation=True)
        
    def add_strategy_to_tokenizer_function_map(self):
        # Mapping between strategy and the tokenization functions defined above
        # Strategy refers to whether we are using normal tokenization or whether we want to do paired tokenization of 
        # both input sentence and the sequence of concreteness score for that sentence
        self.strategy_to_tokenizer_function_map = dict()
        self.strategy_to_tokenizer_function_map['normal_finetuning'] = self.tokenize_sentence_with_concreteness_score
        self.strategy_to_tokenizer_function_map['concreteness_score_addition'] = self.tokenize_sentence
        
    def prepare_dataset(self, strategy):
        # Here, we wull tokenize the dataset based on the strategy we are planning to use
        self.strategy = strategy
        self.add_strategy_to_tokenizer_function_map()
        self.tokenized_dataset = self.dataset.map(self.strategy_to_tokenizer_function_map[self.strategy], batched=True)
        current_cols = set(list(self.tokenized_dataset['train'].features.keys()))
        self.tokenized_dataset = self.tokenized_dataset.remove_columns(list(current_cols - self.cols_to_keep))
        self.tokenized_dataset = self.tokenized_dataset.rename_column("label", "labels")
        self.tokenized_dataset = self.tokenized_dataset.with_format("torch")

    def prepare_dataloaders(self):
        self.train_dataloader = DataLoader(self.tokenized_dataset['train'], batch_size=self.batch_size, shuffle=True, collate_fn=self.data_collator)
        self.validation_dataloader = DataLoader(self.tokenized_dataset['validation'], batch_size=self.batch_size, collate_fn=self.data_collator)
        self.test_dataloader = DataLoader(self.tokenized_dataset['test'], batch_size=self.batch_size, collate_fn=self.data_collator)

    def setup_optimizer(self, num_epochs):
        # Setting up optimizer and learning rate scheduler
        self.num_epochs = num_epochs
        self.optimizer = AdamW(self.model.parameters(), lr=self.learning_rate)
        self.num_training_steps = self.num_epochs*len(self.train_dataloader)
        self.learning_rate_scheduler = get_scheduler("linear", optimizer=self.optimizer, num_warmup_steps=0, num_training_steps=self.num_training_steps)
    
    def train_model(self):
        # Training the Model
        self.model.train()
        progress_bar = tqdm(range(self.num_training_steps))
        for epoch in range(self.num_epochs):
            for batch in self.train_dataloader:
                batch = {k:v.to(self.device) for k, v in batch.items()}
                outputs = self.model(**batch)
                loss = outputs.loss
                # calculating gradients
                loss.backward()
                # optimizing weights
                self.optimizer.step()
                # updating learning rate
                self.learning_rate_scheduler.step()
                # flushing gradients
                self.optimizer.zero_grad()
                # updating progress bar
                progress_bar.update(1)

    def initialize_metrics(self):
        self.metrics = {
            'accuracy': evaluate.load('accuracy'),
            'precision': evaluate.load('precision'),
            'recall': evaluate.load('recall'),
            'f1': evaluate.load('f1'),
            'roc-auc': evaluate.load("roc_auc"),
        }
                
    def eval_model(self, dataloader):
        # Run on GPU available
        #device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
        
        # Evaluating the model on different dataloaders
        self.initialize_metrics()
        
        self.model.eval()
        
        for batch in dataloader:
            # Move batch data to the specified device (GPU or CPU)
            batch = {k: v.to(self.device) for k, v in batch.items()}
    
            # Forward pass
            with torch.no_grad():
                outputs = self.model(**batch)
            
            # Extract logits and predictions
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)
        
            # Apply softmax to convert logits to probabilities
            probabilities = torch.nn.functional.softmax(logits, dim=-1)
            
            # Extract probabilities for the positive class
            positive_probabilities = probabilities[:, 1].to(self.device).numpy()
        
            # Update metrics for accuracy, precision, recall, F1 and ROC-AUC
            self.metrics['accuracy'].add_batch(predictions=predictions, references=batch['labels'])
            self.metrics['precision'].add_batch(predictions=predictions, references=batch['labels'])
            self.metrics['recall'].add_batch(predictions=predictions, references=batch['labels'])
            self.metrics['f1'].add_batch(predictions=predictions, references=batch['labels'])
            self.metrics['roc-auc'].add_batch(prediction_scores=positive_probabilities, references=batch['labels'])
        
        # Compute metrics for accuracy, precision, recall, F1 and ROC-AUC
        self.eval_dict = {}
        self.eval_dict.update(self.metrics['accuracy'].compute())
        self.eval_dict.update(self.metrics['precision'].compute(average="macro"))
        self.eval_dict.update(self.metrics['recall'].compute(average="macro"))
        self.eval_dict.update(self.metrics['f1'].compute(average="macro"))
        self.eval_dict.update(self.metrics['roc-auc'].compute(average="macro"))     

## Run Experiments

Defining parameters on which we will run the experiments

In [26]:
# Initialize model parameters
model_name_list = ["facebook/bart-base", "microsoft/deberta-base"]
num_epochs_list = [1, 2, 3, 4, 5]
strategies_list = ["normal_finetuning", "concreteness_score_addition"]

Defining static arguments

In [27]:
kw_args = {
    'dataset': raw_datasets,
    'batch_size': 32,
    'learning_rate': 3e-5,
}

### Experiments loop

In [34]:
result_list = list()
for model_name in model_name_list:
    
    # Setting Model Name
    kw_args["model_name"] = model_name
    
    # Initializing ModellingExperiments Object
    modelling_obj = ModellingExperiments(**kw_args)
    
    for strategy in strategies_list:
        
        # Preparing dataset for a specific strategy
        modelling_obj.prepare_dataset(strategy=strategy)
        
        # Preparing data loaders
        modelling_obj.prepare_dataloaders()
            
        # Training loop
        for num_epochs in num_epochs_list:
            # Initializing dictionary for storing results
            result_dict = dict()
            result_dict["model_name"] = model_name
            result_dict["strategy"] = strategy
            #result_dict["train_dataset_type"] = train_dataset_type
            result_dict["num_epochs"] = num_epochs
            print("Model Training with the following Configurations: {}".format(result_dict))
            
            # For a specic num_epochs variable, we are setting up the optimizers
            modelling_obj.setup_optimizer(num_epochs=num_epochs)
            
            # Now, we are training the model
            modelling_obj.train_model()
            
            # Now, we will evaluate the model on validation dataset
            modelling_obj.eval_model(modelling_obj.validation_dataloader)
            
            # Storing results on validation set
            for k, v in modelling_obj.eval_dict.items():
                result_dict["validation_{}".format(k)] = v
            print("Validation Set Results: {}".format(modelling_obj.eval_dict))
            
            # Now, we will evaluate the model on test dataset
            modelling_obj.eval_model(modelling_obj.test_dataloader)
            
            # Storing results on test set
            for k, v in modelling_obj.eval_dict.items():
                result_dict["test_{}".format(k)] = v
            
            print("Test Set Results: {}".format(modelling_obj.eval_dict))
            
            # Storing all the results in the results_list
            result_list.append(result_dict)

Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-base and are newly initialized: ['classification_head.out_proj.bias', 'classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1728/1728 [00:00<00:00, 62064.41 examples/s]
Map: 100%|██████████| 216/216 [00:00<00:00, 37908.27 examples/s]
Map: 100%|██████████| 216/216 [00:00<00:00, 42138.12 examples/s]


Model Training with the following Configurations: {'model_name': 'facebook/bart-base', 'strategy': 'normal_finetuning', 'num_epochs': 1}


  0%|          | 0/54 [00:00<?, ?it/s]You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 54/54 [01:01<00:00,  1.14s/it]
  _warn_prf(average, modifier, msg_start, len(result))


Validation Set Results: {'accuracy': 0.7129629629629629, 'precision': 0.35648148148148145, 'recall': 0.5, 'f1': 0.41621621621621624, 'roc_auc': 0.5528906577293674}


  _warn_prf(average, modifier, msg_start, len(result))


Test Set Results: {'accuracy': 0.7129629629629629, 'precision': 0.35648148148148145, 'recall': 0.5, 'f1': 0.41621621621621624, 'roc_auc': 0.5438835358190197}
Model Training with the following Configurations: {'model_name': 'facebook/bart-base', 'strategy': 'normal_finetuning', 'num_epochs': 2}


100%|██████████| 108/108 [02:03<00:00,  1.15s/it]


Validation Set Results: {'accuracy': 0.7546296296296297, 'precision': 0.716931216931217, 'recall': 0.6159405111018015, 'f1': 0.6249877157925771, 'roc_auc': 0.7924172601591956}
Test Set Results: {'accuracy': 0.6851851851851852, 'precision': 0.5554655870445344, 'recall': 0.5286971093422707, 'f1': 0.514799154334038, 'roc_auc': 0.7250733137829912}
Model Training with the following Configurations: {'model_name': 'facebook/bart-base', 'strategy': 'normal_finetuning', 'num_epochs': 3}


100%|██████████| 162/162 [03:06<00:00,  1.15s/it]


Validation Set Results: {'accuracy': 0.7407407407407407, 'precision': 0.6838905775075987, 'recall': 0.6013824884792627, 'f1': 0.6070175438596491, 'roc_auc': 0.768537913699204}
Test Set Results: {'accuracy': 0.7314814814814815, 'precision': 0.6623655913978495, 'recall': 0.5948889819857561, 'f1': 0.5994884910485934, 'roc_auc': 0.7400502723083369}
Model Training with the following Configurations: {'model_name': 'facebook/bart-base', 'strategy': 'normal_finetuning', 'num_epochs': 4}


100%|██████████| 216/216 [04:27<00:00,  1.24s/it]


Validation Set Results: {'accuracy': 0.6712962962962963, 'precision': 0.5551109768986864, 'recall': 0.538227901131127, 'f1': 0.5348075348075348, 'roc_auc': 0.702869710934227}
Test Set Results: {'accuracy': 0.7546296296296297, 'precision': 0.7018722633247773, 'recall': 0.6400293255131965, 'f1': 0.6527436527436528, 'roc_auc': 0.7422496857980728}
Model Training with the following Configurations: {'model_name': 'facebook/bart-base', 'strategy': 'normal_finetuning', 'num_epochs': 5}


100%|██████████| 270/270 [05:42<00:00,  1.27s/it]


Validation Set Results: {'accuracy': 0.6805555555555556, 'precision': 0.5532915360501567, 'recall': 0.5302681189777964, 'f1': 0.5197061003512616, 'roc_auc': 0.6816087138667783}
Test Set Results: {'accuracy': 0.7083333333333334, 'precision': 0.6221208170360712, 'recall': 0.5882907415165479, 'f1': 0.5929526487391941, 'roc_auc': 0.7135525764558022}


Map: 100%|██████████| 1728/1728 [00:00<00:00, 93412.18 examples/s]
Map: 100%|██████████| 216/216 [00:00<00:00, 62069.72 examples/s]
Map: 100%|██████████| 216/216 [00:00<00:00, 62018.73 examples/s]


Model Training with the following Configurations: {'model_name': 'facebook/bart-base', 'strategy': 'concreteness_score_addition', 'num_epochs': 1}


100%|██████████| 54/54 [00:42<00:00,  1.28it/s]


Validation Set Results: {'accuracy': 0.6990740740740741, 'precision': 0.5988574267262792, 'recall': 0.5625261834939255, 'f1': 0.5614555677026394, 'roc_auc': 0.6852744030163385}
Test Set Results: {'accuracy': 0.7407407407407407, 'precision': 0.6771141336487285, 'recall': 0.6254713028906578, 'f1': 0.6356626506024097, 'roc_auc': 0.7166945957268538}
Model Training with the following Configurations: {'model_name': 'facebook/bart-base', 'strategy': 'concreteness_score_addition', 'num_epochs': 2}


100%|██████████| 108/108 [01:25<00:00,  1.26it/s]


Validation Set Results: {'accuracy': 0.6990740740740741, 'precision': 0.5960767218831735, 'recall': 0.5577084206116464, 'f1': 0.5546674279189266, 'roc_auc': 0.6726539589442815}
Test Set Results: {'accuracy': 0.7222222222222222, 'precision': 0.6451803666469544, 'recall': 0.6028487641390867, 'f1': 0.6096385542168674, 'roc_auc': 0.7129241726015919}
Model Training with the following Configurations: {'model_name': 'facebook/bart-base', 'strategy': 'concreteness_score_addition', 'num_epochs': 3}


100%|██████████| 162/162 [02:07<00:00,  1.27it/s]


Validation Set Results: {'accuracy': 0.6759259259259259, 'precision': 0.5565610859728507, 'recall': 0.5366568914956011, 'f1': 0.53125, 'roc_auc': 0.6653749476330122}
Test Set Results: {'accuracy': 0.6944444444444444, 'precision': 0.5972797161442933, 'recall': 0.5689149560117301, 'f1': 0.5706024096385542, 'roc_auc': 0.699937159614579}
Model Training with the following Configurations: {'model_name': 'facebook/bart-base', 'strategy': 'concreteness_score_addition', 'num_epochs': 4}


100%|██████████| 216/216 [02:47<00:00,  1.29it/s]


Validation Set Results: {'accuracy': 0.6851851851851852, 'precision': 0.5699728260869565, 'recall': 0.5431503979891077, 'f1': 0.5377061563640941, 'roc_auc': 0.669145370758274}
Test Set Results: {'accuracy': 0.7222222222222222, 'precision': 0.6451803666469544, 'recall': 0.6028487641390867, 'f1': 0.6096385542168674, 'roc_auc': 0.708106409719313}
Model Training with the following Configurations: {'model_name': 'facebook/bart-base', 'strategy': 'concreteness_score_addition', 'num_epochs': 5}


100%|██████████| 270/270 [03:28<00:00,  1.29it/s]


Validation Set Results: {'accuracy': 0.6805555555555556, 'precision': 0.558413251961639, 'recall': 0.5350858818600754, 'f1': 0.5272623465600914, 'roc_auc': 0.6581483033095936}
Test Set Results: {'accuracy': 0.7037037037037037, 'precision': 0.6089204912734325, 'recall': 0.5705906996229577, 'f1': 0.5714285714285715, 'roc_auc': 0.6981566820276497}


tokenizer_config.json: 100%|██████████| 52.0/52.0 [00:00<00:00, 245kB/s]
config.json: 100%|██████████| 474/474 [00:00<00:00, 2.54MB/s]
vocab.json: 100%|██████████| 899k/899k [00:00<00:00, 3.95MB/s]
merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 4.69MB/s]
pytorch_model.bin: 100%|██████████| 559M/559M [00:50<00:00, 11.1MB/s] 
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.weight', 'pooler.dense.bias', 'classifier.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1728/1728 [00:00<00:00, 52007.44 examples/s]
Map: 100%|██████████| 216/216 [00:00<00:00, 31259.74 examples/s]
Map: 100%|██████████| 216/216 [00:00<00:00, 24498.25 examples/s]


Model Training with the following Configurations: {'model_name': 'microsoft/deberta-base', 'strategy': 'normal_finetuning', 'num_epochs': 1}


  0%|          | 0/54 [00:00<?, ?it/s]You're using a DebertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 54/54 [01:02<00:00,  1.15s/it]
  _warn_prf(average, modifier, msg_start, len(result))


Validation Set Results: {'accuracy': 0.7129629629629629, 'precision': 0.35648148148148145, 'recall': 0.5, 'f1': 0.41621621621621624, 'roc_auc': 0.4480519480519481}


  _warn_prf(average, modifier, msg_start, len(result))


Test Set Results: {'accuracy': 0.7129629629629629, 'precision': 0.35648148148148145, 'recall': 0.5, 'f1': 0.41621621621621624, 'roc_auc': 0.5093213238374529}
Model Training with the following Configurations: {'model_name': 'microsoft/deberta-base', 'strategy': 'normal_finetuning', 'num_epochs': 2}


100%|██████████| 108/108 [02:15<00:00,  1.25s/it]
  _warn_prf(average, modifier, msg_start, len(result))


Validation Set Results: {'accuracy': 0.7129629629629629, 'precision': 0.35648148148148145, 'recall': 0.5, 'f1': 0.41621621621621624, 'roc_auc': 0.6347926267281105}


  _warn_prf(average, modifier, msg_start, len(result))


Test Set Results: {'accuracy': 0.7129629629629629, 'precision': 0.35648148148148145, 'recall': 0.5, 'f1': 0.41621621621621624, 'roc_auc': 0.6092375366568915}
Model Training with the following Configurations: {'model_name': 'microsoft/deberta-base', 'strategy': 'normal_finetuning', 'num_epochs': 3}


100%|██████████| 162/162 [03:24<00:00,  1.26s/it]
  _warn_prf(average, modifier, msg_start, len(result))


Validation Set Results: {'accuracy': 0.7129629629629629, 'precision': 0.35648148148148145, 'recall': 0.5, 'f1': 0.41621621621621624, 'roc_auc': 0.7310431503979892}


  _warn_prf(average, modifier, msg_start, len(result))


Test Set Results: {'accuracy': 0.7129629629629629, 'precision': 0.35648148148148145, 'recall': 0.5, 'f1': 0.41621621621621624, 'roc_auc': 0.6934436531210725}
Model Training with the following Configurations: {'model_name': 'microsoft/deberta-base', 'strategy': 'normal_finetuning', 'num_epochs': 4}


100%|██████████| 216/216 [04:43<00:00,  1.31s/it]
  _warn_prf(average, modifier, msg_start, len(result))


Validation Set Results: {'accuracy': 0.7129629629629629, 'precision': 0.35648148148148145, 'recall': 0.5, 'f1': 0.41621621621621624, 'roc_auc': 0.7817343946376205}


  _warn_prf(average, modifier, msg_start, len(result))


Test Set Results: {'accuracy': 0.7129629629629629, 'precision': 0.35648148148148145, 'recall': 0.5, 'f1': 0.41621621621621624, 'roc_auc': 0.7372224549643904}
Model Training with the following Configurations: {'model_name': 'microsoft/deberta-base', 'strategy': 'normal_finetuning', 'num_epochs': 5}


100%|██████████| 270/270 [05:52<00:00,  1.31s/it]


Validation Set Results: {'accuracy': 0.7129629629629629, 'precision': 0.64375, 'recall': 0.6348973607038123, 'f1': 0.6385620209435388, 'roc_auc': 0.756284038542103}
Test Set Results: {'accuracy': 0.7314814814814815, 'precision': 0.6636904761904762, 'recall': 0.6382488479262672, 'f1': 0.6463015245623942, 'roc_auc': 0.7663908671973187}


Map: 100%|██████████| 1728/1728 [00:00<00:00, 105140.53 examples/s]
Map: 100%|██████████| 216/216 [00:00<00:00, 39472.36 examples/s]
Map: 100%|██████████| 216/216 [00:00<00:00, 47420.55 examples/s]


Model Training with the following Configurations: {'model_name': 'microsoft/deberta-base', 'strategy': 'concreteness_score_addition', 'num_epochs': 1}


100%|██████████| 54/54 [00:47<00:00,  1.14it/s]


Validation Set Results: {'accuracy': 0.7037037037037037, 'precision': 0.6566332218506131, 'recall': 0.676581483033096, 'f1': 0.661839530332681, 'roc_auc': 0.7253875157100964}
Test Set Results: {'accuracy': 0.6898148148148148, 'precision': 0.6349746144266692, 'recall': 0.6475701717637201, 'f1': 0.6390572390572391, 'roc_auc': 0.7218265605362378}
Model Training with the following Configurations: {'model_name': 'microsoft/deberta-base', 'strategy': 'concreteness_score_addition', 'num_epochs': 2}


100%|██████████| 108/108 [01:33<00:00,  1.16it/s]
  _warn_prf(average, modifier, msg_start, len(result))


Validation Set Results: {'accuracy': 0.7129629629629629, 'precision': 0.35648148148148145, 'recall': 0.5, 'f1': 0.41621621621621624, 'roc_auc': 0.7680142438206954}


  _warn_prf(average, modifier, msg_start, len(result))


Test Set Results: {'accuracy': 0.7129629629629629, 'precision': 0.35648148148148145, 'recall': 0.5, 'f1': 0.41621621621621624, 'roc_auc': 0.7408881441139507}
Model Training with the following Configurations: {'model_name': 'microsoft/deberta-base', 'strategy': 'concreteness_score_addition', 'num_epochs': 3}


100%|██████████| 162/162 [02:16<00:00,  1.19it/s]


Validation Set Results: {'accuracy': 0.7129629629629629, 'precision': 0.6369047619047619, 'recall': 0.6156263091746963, 'f1': 0.6219085262563524, 'roc_auc': 0.7132383745286971}
Test Set Results: {'accuracy': 0.7222222222222222, 'precision': 0.6491048593350384, 'recall': 0.6221198156682027, 'f1': 0.6296296296296295, 'roc_auc': 0.7328236279849184}
Model Training with the following Configurations: {'model_name': 'microsoft/deberta-base', 'strategy': 'concreteness_score_addition', 'num_epochs': 4}


100%|██████████| 216/216 [03:10<00:00,  1.13it/s]


Validation Set Results: {'accuracy': 0.7083333333333334, 'precision': 0.6292962356792144, 'recall': 0.607561793045664, 'f1': 0.6134859544976852, 'roc_auc': 0.7270632593213238}
Test Set Results: {'accuracy': 0.7222222222222222, 'precision': 0.6502976190476191, 'recall': 0.6269375785504818, 'f1': 0.6341050254093732, 'roc_auc': 0.7204650188521157}
Model Training with the following Configurations: {'model_name': 'microsoft/deberta-base', 'strategy': 'concreteness_score_addition', 'num_epochs': 5}


100%|██████████| 270/270 [03:57<00:00,  1.13it/s]


Validation Set Results: {'accuracy': 0.7175925925925926, 'precision': 0.6443235977025541, 'recall': 0.6236908253037285, 'f1': 0.6302096489012377, 'roc_auc': 0.7269585253456221}
Test Set Results: {'accuracy': 0.7314814814814815, 'precision': 0.6636904761904762, 'recall': 0.6382488479262672, 'f1': 0.6463015245623942, 'roc_auc': 0.6936531210724759}


In [35]:
results_df = pd.DataFrame(result_list)
results_df.head()

Unnamed: 0,model_name,strategy,num_epochs,validation_accuracy,validation_precision,validation_recall,validation_f1,validation_roc_auc,test_accuracy,test_precision,test_recall,test_f1,test_roc_auc
0,facebook/bart-base,normal_finetuning,1,0.712963,0.356481,0.5,0.416216,0.552891,0.712963,0.356481,0.5,0.416216,0.543884
1,facebook/bart-base,normal_finetuning,2,0.75463,0.716931,0.615941,0.624988,0.792417,0.685185,0.555466,0.528697,0.514799,0.725073
2,facebook/bart-base,normal_finetuning,3,0.740741,0.683891,0.601382,0.607018,0.768538,0.731481,0.662366,0.594889,0.599488,0.74005
3,facebook/bart-base,normal_finetuning,4,0.671296,0.555111,0.538228,0.534808,0.70287,0.75463,0.701872,0.640029,0.652744,0.74225
4,facebook/bart-base,normal_finetuning,5,0.680556,0.553292,0.530268,0.519706,0.681609,0.708333,0.622121,0.588291,0.592953,0.713553


In [36]:
results_df.to_csv('../../results/FinalResultsPAP.csv', index=False)