# Loading Dataset

In [1]:
import torch

In [2]:
torch.cuda.is_available()

True

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [4]:
!ls ../datasets/adept/train-dev-test-split/

ls: cannot access '../datasets/adept/train-dev-test-split/': No such file or directory


In [5]:
import json
import nltk
import re

In [6]:
adept_data_path = "../../datasets/adept/train-dev-test-split"
split = "train"

In [7]:
train_data = json.load(open('{}/{}.json'.format(adept_data_path, split), 'r'))

In [8]:
df_train = pd.DataFrame(train_data)
df_train.head()

Unnamed: 0,sentence1,sentence2,modifier,noun,label,idx
0,The effect of sleeping is rejuvenation.,The effect of additional sleeping is rejuvenat...,additional,sleeping,3,13484
1,A toothbrush is for fresh breath.,A regular toothbrush is for fresh breath.,regular,toothbrush,2,2620
2,A scene is painted.,A negative scene is painted.,negative,scene,2,3324
3,A bone breaks a tooth.,An alleged bone breaks a tooth.,alleged,bone,2,10610
4,A trip causes a happening.,A fabulous trip causes a happening.,fabulous,trip,2,14917


In [9]:
label_to_class_map = {0:"Impossible", 1:"Less Likely", 2:"Equally Likely", 3:"More Likely", 4:"Necessarily True"}

In [10]:
label_to_class_map.values()

dict_values(['Impossible', 'Less Likely', 'Equally Likely', 'More Likely', 'Necessarily True'])

Preprocessing data to get concreteness scores

In [11]:
df_train['sentence2_preprocessed'] = df_train['sentence2'].map(lambda x: re.sub('[,\.!?]', '', x))
df_train['sentence2_preprocessed'] = df_train['sentence2_preprocessed'].map(lambda x: x.lower())
df_train.head(2)

Unnamed: 0,sentence1,sentence2,modifier,noun,label,idx,sentence2_preprocessed
0,The effect of sleeping is rejuvenation.,The effect of additional sleeping is rejuvenat...,additional,sleeping,3,13484,the effect of additional sleeping is rejuvenation
1,A toothbrush is for fresh breath.,A regular toothbrush is for fresh breath.,regular,toothbrush,2,2620,a regular toothbrush is for fresh breath


In [12]:
# We got this dataset for concreteness of 40k words (https://pubmed.ncbi.nlm.nih.gov/24142837/) from https://web.stanford.edu/class/linguist278/data/
concreteness_df = pd.read_csv('../../datasets/concreteness/Concreteness_ratings_Brysbaert_et_al_BRM.csv')
concreteness_df.head(2)

Unnamed: 0,Word,Bigram,Conc.M,Conc.SD,Unknown,Total,Percent_known,SUBTLEX,Dom_Pos
0,roadsweeper,0,4.85,0.37,1,27,0.96,0,0
1,traindriver,0,4.54,0.71,3,29,0.9,0,0


In [13]:
word_to_concreteness_score_map = dict()
for idx, row in concreteness_df.iterrows():
    row = row.to_dict()
    word_to_concreteness_score_map[row['Word']] = row['Conc.M']/5.0 # Normalizing to a scale of 0 to 1

In [14]:
len(word_to_concreteness_score_map.keys())

39954

Some utility functions to get concreteness scores for the input sentence

In [15]:
def get_concreteness_score(word):
    """
    Get the concreteness score of a word based on the Concreteness Ratings dataset.
    """
    # If the word is not found in the dataset, return a default score of 0.5
    return round(word_to_concreteness_score_map.get(word, 0.5), 3)

In [16]:
def calculate_text_concreteness(text):
    """
    Calculate the concreteness score for a given text.
    """
    words = nltk.word_tokenize(text)
    concreteness_scores = [get_concreteness_score(word) for word in words]
    # Take the average concreteness score of all words in the text
    return sum(concreteness_scores) / len(concreteness_scores)

In [17]:
def calculate_text_concreteness_sequence(text):
    """
    Calculate the concreteness score for a given text.
    """
    words = nltk.word_tokenize(text)
    concreteness_scores = [get_concreteness_score(word) for word in words]
    concreteness_scores = " ".join([str(i) for i in concreteness_scores])
    # Take the average concreteness score of all words in the text
    return concreteness_scores

In [18]:
# Example usage
text = "the laws of the world can't stop him"
concreteness_score = calculate_text_concreteness(text)
print(f"Concreteness Score: {concreteness_score}")

Concreteness Score: 0.5246666666666667


In [19]:
# Example usage
text = "car crash"
concreteness_score = calculate_text_concreteness(text)
print(f"Concreteness Score: {concreteness_score}")

Concreteness Score: 0.868


We are using concreteness score sequence because in the EDA we found that concreteness score can be a usefull factor in distinguishing whether the sentence is plausible or not

In [20]:
df_train['concreteness_score_sequence'] = df_train.sentence2_preprocessed.apply(calculate_text_concreteness_sequence)

In [21]:
df_train.head(2)

Unnamed: 0,sentence1,sentence2,modifier,noun,label,idx,sentence2_preprocessed,concreteness_score_sequence
0,The effect of sleeping is rejuvenation.,The effect of additional sleeping is rejuvenat...,additional,sleeping,3,13484,the effect of additional sleeping is rejuvenation,0.286 0.36 0.334 0.486 0.846 0.318 0.42
1,A toothbrush is for fresh breath.,A regular toothbrush is for fresh breath.,regular,toothbrush,2,2620,a regular toothbrush is for fresh breath,0.292 0.48 1.0 0.318 0.326 0.394 0.872


In [22]:
df_train.shape

(12892, 8)

Loading validation and test dataset and adding concreteness scores sequences for them

In [23]:
df_validation = pd.DataFrame(json.load(open('{}/{}.json'.format(adept_data_path, "val"), 'r')))
df_test = pd.DataFrame(json.load(open('{}/{}.json'.format(adept_data_path, "test"), 'r')))
print(df_validation.shape, df_test.shape)

(1611, 6) (1612, 6)


In [24]:
df_validation['sentence2_preprocessed'] = df_validation['sentence2'].map(lambda x: re.sub('[,\.!?]', '', x))
df_validation['sentence2_preprocessed'] = df_validation['sentence2_preprocessed'].map(lambda x: x.lower())
df_validation['concreteness_score_sequence'] = df_validation.sentence2_preprocessed.apply(calculate_text_concreteness_sequence)

df_test['sentence2_preprocessed'] = df_test['sentence2'].map(lambda x: re.sub('[,\.!?]', '', x))
df_test['sentence2_preprocessed'] = df_test['sentence2_preprocessed'].map(lambda x: x.lower())
df_test['concreteness_score_sequence'] = df_test.sentence2_preprocessed.apply(calculate_text_concreteness_sequence)

### Artificial Dataset Combined

Loading artificially created dataset (using Llama 2 70B model). Checkout the notebook modelling/adept/ArtificialDataCreation - ADEPT.ipynb for more info

In [25]:
df_artificial_train_combined = pd.read_csv('../../datasets/adept/generated-data/artificial_train_combined.csv')

In [26]:
df_artificial_train_combined['sentence2_preprocessed'] = df_artificial_train_combined['sentence2'].map(lambda x: re.sub('[,\.!?]', '', x))
df_artificial_train_combined['sentence2_preprocessed'] = df_artificial_train_combined['sentence2_preprocessed'].map(lambda x: x.lower())
df_artificial_train_combined['concreteness_score_sequence'] = df_artificial_train_combined.sentence2_preprocessed.apply(calculate_text_concreteness_sequence)

In [27]:
df_artificial_train_combined.head(2)

Unnamed: 0,noun,modifier,sentence2,label,sentence1,idx,sentence2_preprocessed,concreteness_score_sequence
0,Dog,Happy,A happy dog wags its tail.,4,,,a happy dog wags its tail,0.292 0.512 0.97 0.5 0.38 0.992
1,Building,Tall,A tall building casts a long shadow.,4,,,a tall building casts a long shadow,0.292 0.672 0.928 0.5 0.292 0.636 0.908


# Fine Tuning Different Transformer Models using different strategies on ADEPT

## Data Preparation

In [28]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import DataCollatorWithPadding, get_scheduler
from datasets import load_dataset, Dataset, DatasetDict
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [29]:
adept_data_path

'../../datasets/adept/train-dev-test-split'

Loading dataset in a format that can be used by huggingface transformers library

In [30]:
adept_dataset = DatasetDict({
    'train': Dataset.from_pandas(df_train),
    'validation': Dataset.from_pandas(df_validation),
    'test': Dataset.from_pandas(df_test),
    'artificial_train_combined': Dataset.from_pandas(df_artificial_train_combined)
})

In [31]:
adept_dataset

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'modifier', 'noun', 'label', 'idx', 'sentence2_preprocessed', 'concreteness_score_sequence'],
        num_rows: 12892
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'modifier', 'noun', 'label', 'idx', 'sentence2_preprocessed', 'concreteness_score_sequence'],
        num_rows: 1611
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'modifier', 'noun', 'label', 'idx', 'sentence2_preprocessed', 'concreteness_score_sequence'],
        num_rows: 1612
    })
    artificial_train_combined: Dataset({
        features: ['noun', 'modifier', 'sentence2', 'label', 'sentence1', 'idx', 'sentence2_preprocessed', 'concreteness_score_sequence'],
        num_rows: 13186
    })
})

In [32]:
adept_dataset['train'].features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'modifier': Value(dtype='string', id=None),
 'noun': Value(dtype='string', id=None),
 'label': Value(dtype='int64', id=None),
 'idx': Value(dtype='int64', id=None),
 'sentence2_preprocessed': Value(dtype='string', id=None),
 'concreteness_score_sequence': Value(dtype='string', id=None)}

In [33]:
adept_dataset['train'][10]

{'sentence1': 'A year is made up of 365 days.',
 'sentence2': 'An outstanding year is made up of 365 days.',
 'modifier': 'outstanding',
 'noun': 'year',
 'label': 2,
 'idx': 2825,
 'sentence2_preprocessed': 'an outstanding year is made up of 365 days',
 'concreteness_score_sequence': '0.292 0.35 0.65 0.318 0.504 0.766 0.334 0.5 0.672'}

These are the best params we got after fine tuning different models and parameter using optuna

In [34]:
from torch.optim.adamw import AdamW
from torch.utils.data import DataLoader
import evaluate

In [35]:
COLUMNS_TO_KEEP = ['label', 'input_ids', 'token_type_ids', 'attention_mask']

In [36]:
class ModellingExperiments:
    def __init__(self, model_name, dataset, batch_size, learning_rate):
        self.model_name = model_name
        self.dataset = dataset
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.cols_to_keep = set(COLUMNS_TO_KEEP)
        self.device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
        self.data_collator = DataCollatorWithPadding(self.tokenizer)
        
    def tokenize_sentence_with_concreteness_score(self, item):
        # We also tried using the concreteness score for the whole sentence as a feature input
        # To implement that, we changed the source code of transformers library and changed the classification head manually 
        # So that we can accomodate that extra feature, but this method of encoding concreteness score sequence was giving better score
        # Hence, we are using this only in the final experiments. You can check that experiment out in the following notebook:
        # modelling/adept/experiments/FinalModellingWithConcretenessScore[DeBERTa] - ADEPT
        return self.tokenizer(item['sentence2'], item['concreteness_score_sequence'], truncation=True)
        
    def tokenize_sentence(self, item):
        # Normal tokenization
        return self.tokenizer(item['sentence2'], truncation=True)
        
    def add_strategy_to_tokenizer_function_map(self):
        # Mapping between strategy and the tokenization functions defined above
        # Strategy refers to whether we are using normal tokenization or whether we want to do paired tokenization of 
        # both input sentence and the sequence of concreteness score for that sentence
        self.strategy_to_tokenizer_function_map = dict()
        self.strategy_to_tokenizer_function_map['normal_finetuning'] = self.tokenize_sentence_with_concreteness_score
        self.strategy_to_tokenizer_function_map['concreteness_score_addition'] = self.tokenize_sentence
        
    def prepare_dataset(self, strategy):
        # Here, we wull tokenize the dataset based on the strategy we are planning to use
        self.strategy = strategy
        self.add_strategy_to_tokenizer_function_map()
        self.tokenized_dataset = self.dataset.map(self.strategy_to_tokenizer_function_map[self.strategy], batched=True)
        current_cols = set(list(self.tokenized_dataset['train'].features.keys()))
        self.tokenized_dataset = self.tokenized_dataset.remove_columns(list(current_cols - self.cols_to_keep))
        self.tokenized_dataset = self.tokenized_dataset.rename_column("label", "labels")
        self.tokenized_dataset = self.tokenized_dataset.with_format("torch")

    def prepare_dataloaders(self, train_dataset_type):
        # Here, we prepare the dataloaders, it also takes an argument named train_dataset_type which specified
        # whether we want to use the original training data or the one combined with out artificially created dataset
        self.train_dataset_type = train_dataset_type
        self.train_dataloader = DataLoader(self.tokenized_dataset[self.train_dataset_type], batch_size=self.batch_size, shuffle=True, collate_fn=self.data_collator)
        self.validation_dataloader = DataLoader(self.tokenized_dataset['validation'], batch_size=self.batch_size, collate_fn=self.data_collator)
        self.test_dataloader = DataLoader(self.tokenized_dataset['test'], batch_size=self.batch_size, collate_fn=self.data_collator)

    def init_model(self):
        torch.manual_seed(4)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5, ignore_mismatched_sizes=True)
        self.model.to(self.device)

    def setup_optimizer(self, num_epochs):
        # Setting up optimizer and learning rate scheduler
        self.num_epochs = num_epochs
        self.optimizer = AdamW(self.model.parameters(), lr=self.learning_rate)
        self.num_training_steps = self.num_epochs*len(self.train_dataloader)
        self.learning_rate_scheduler = get_scheduler("linear", optimizer=self.optimizer, num_warmup_steps=0, num_training_steps=self.num_training_steps)
    
    def train_model(self):
        # Training the Model
        self.evaluation_results_list = list()
        progress_bar = tqdm(range(self.num_training_steps))
        for epoch in range(self.num_epochs):
            self.model.train()
            for batch in self.train_dataloader:
                batch = {k:v.to(self.device) for k, v in batch.items()}
                outputs = self.model(**batch)
                loss = outputs.loss
                # calculating gradients
                loss.backward()
                # optimizing weights
                self.optimizer.step()
                # updating learning rate
                self.learning_rate_scheduler.step()
                # flushing gradients
                self.optimizer.zero_grad()
                # updating progress bar
                progress_bar.update(1)
            # evaluating per epoch
            self.eval_model(self.validation_dataloader)
            eval_results = dict()
            eval_results['epoch'] = epoch + 1
            for k, v in self.eval_dict.items():
                eval_results["validation_{}".format(k)] = v
            self.eval_model(self.test_dataloader)
            for k, v in self.eval_dict.items():
                eval_results["test_{}".format(k)] = v
            self.evaluation_results_list.append(eval_results)

    def initialize_metrics(self):
        # Initializing evaluation metrics
        self.accuracy = evaluate.load('accuracy')
        self.precision = evaluate.load('precision')
        self.recall = evaluate.load('recall')
        self.f1 = evaluate.load('f1')
        self.roc_auc =  evaluate.load("roc_auc", "multiclass")
        self.metrics = [self.accuracy, self.precision, self.recall, self.f1]
                
    def eval_model(self, dataloader):
        # Evaluating the model on different dataloaders
        self.initialize_metrics()
        self.model.eval()
        for batch in dataloader:
            batch = {k: v.to(self.device) for k, v in batch.items()}
            with torch.no_grad():
                outputs = self.model(**batch)
            # Extract logits and predictions
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)
        
            # Apply softmax to convert logits to probabilities
            probabilities = torch.nn.functional.softmax(logits, dim=-1)
            
            # Extract probabilities for the positive class
            positive_probabilities = probabilities
        
            # Update metrics for accuracy, precision, recall, and F1
            for metric in self.metrics:
                metric.add_batch(predictions=predictions, references=batch['labels'])
        
            # Update ROC AUC metric
            self.roc_auc.add_batch(prediction_scores=positive_probabilities, references=batch['labels'])
        
        # # Compute metrics for accuracy, precision, recall, and F1
        self.eval_dict = {}
        self.eval_dict.update(self.accuracy.compute())
        self.eval_dict.update(self.precision.compute(average="macro"))
        self.eval_dict.update(self.recall.compute(average="macro"))
        self.eval_dict.update(self.f1.compute(average="macro"))
        self.eval_dict.update(self.roc_auc.compute(multi_class='ovo', average="macro"))        

# Running Experiments

Defining parameters on which we will run the experiments

In [37]:
model_name_list = ["microsoft/deberta-base", "facebook/bart-base"]
num_epochs = 4
strategies_list = ["concreteness_score_addition", "normal_finetuning"]
train_dataset_type_list = ["train", "artificial_train_combined"]

Defining static arguments

In [38]:
kw_args = dict()
kw_args["dataset"] = adept_dataset
kw_args["batch_size"] = 32
# This learning rate was found when we were doing hyperparameter tuning of different models
# To check out the hyperparameter tuning, look at the following notebook:
# modelling/adept/experiments/FineTuningAndModelSelection - ADEPT.ipynb
kw_args["learning_rate"] = 3.660515504756857e-05

Running experiments in a loop

In [39]:
with open('result_dynamic_dict_final_random_seed_4.json', 'r') as openfile:
    result_dynamic_dict = json.load(openfile)

In [40]:
result_list = list()
for model_name in model_name_list:
    # Setting Model Name
    kw_args["model_name"] = model_name
    # Initializing ModellingExperiments Object
    modelling_obj = ModellingExperiments(**kw_args)
    for strategy in strategies_list:
        # Preparing dataset for a specific strategy
        modelling_obj.prepare_dataset(strategy=strategy)
        for train_dataset_type in train_dataset_type_list:
            # Preparing data loaders
            modelling_obj.prepare_dataloaders(train_dataset_type=train_dataset_type)
            print("*"*50)
            # Initializing dictionary for storing results
            result_dict = dict()
            result_dict["model_name"] = model_name
            result_dict["strategy"] = strategy
            result_dict["train_dataset_type"] = train_dataset_type
            print("Model Training with the following Configurations: {}".format(result_dict))
            unique_key = "#".join(str(i) for i in list(result_dict.values()))
            if not result_dynamic_dict.get(unique_key):
                result_dynamic_dict[unique_key] = dict()
                # initializing model
                modelling_obj.init_model()
                # For a specic num_epochs variable, we are setting up the optimizers
                modelling_obj.setup_optimizer(num_epochs=num_epochs)
                # Now, we are training the model
                modelling_obj.train_model()
                # Saving evaluated results
                evaluation_results_list = modelling_obj.evaluation_results_list
                for evaluation_results in evaluation_results_list:
                    print("Evaluation Results: {}".format(evaluation_results))
                    result_dict.update(evaluation_results)
                    result_list.append(result_dict)
                # Updating the stored file
                result_dynamic_dict[unique_key] = evaluation_results_list
                # Storing the updated result file
                with open('result_dynamic_dict_final_random_seed_4.json', 'w', encoding='utf-8') as f:
                    json.dump(result_dynamic_dict, f, ensure_ascii=False, indent=4)
            else:
                print("Model already trained, results are stored already!")
                for res in result_dynamic_dict[unique_key]:
                    print("Evaluation Results: {}".format(res))

Map: 100%|██████████| 12892/12892 [00:00<00:00, 32020.83 examples/s]
Map: 100%|██████████| 1611/1611 [00:00<00:00, 55202.64 examples/s]
Map: 100%|██████████| 1612/1612 [00:00<00:00, 42416.94 examples/s]
Map: 100%|██████████| 13186/13186 [00:00<00:00, 43613.25 examples/s]


**************************************************
Model Training with the following Configurations: {'model_name': 'microsoft/deberta-base', 'strategy': 'concreteness_score_addition', 'train_dataset_type': 'train'}
Model already trained, results are stored already!
Evaluation Results: {'epoch': 1, 'validation_accuracy': 0.6927374301675978, 'validation_precision': 0.4218020040624624, 'validation_recall': 0.3559372710929617, 'validation_f1': 0.3768922810205211, 'validation_roc_auc': 0.7322247730008058, 'test_accuracy': 0.7109181141439206, 'test_precision': 0.43716666396861975, 'test_recall': 0.355651127194351, 'test_f1': 0.3810411090174126, 'test_roc_auc': 0.7480621093169341}
Evaluation Results: {'epoch': 2, 'validation_accuracy': 0.6970825574177529, 'validation_precision': 0.42389880772256855, 'validation_recall': 0.3704935960884305, 'validation_f1': 0.3897601769441422, 'validation_roc_auc': 0.7471510359169014, 'test_accuracy': 0.716501240694789, 'test_precision': 0.44323430934064056, 

Map: 100%|██████████| 12892/12892 [00:00<00:00, 27778.76 examples/s]
Map: 100%|██████████| 1611/1611 [00:00<00:00, 32111.62 examples/s]
Map: 100%|██████████| 1612/1612 [00:00<00:00, 29213.19 examples/s]
Map: 100%|██████████| 13186/13186 [00:00<00:00, 27302.39 examples/s]


**************************************************
Model Training with the following Configurations: {'model_name': 'microsoft/deberta-base', 'strategy': 'normal_finetuning', 'train_dataset_type': 'train'}
Model already trained, results are stored already!
Evaluation Results: {'epoch': 1, 'validation_accuracy': 0.6641837368094351, 'validation_precision': 0.13283674736188703, 'validation_recall': 0.2, 'validation_f1': 0.15964192465497945, 'validation_roc_auc': 0.4925887407513775, 'test_accuracy': 0.6836228287841191, 'test_precision': 0.13672456575682382, 'test_recall': 0.2, 'test_f1': 0.1624170965364775, 'test_roc_auc': 0.5066870287818176}
Evaluation Results: {'epoch': 2, 'validation_accuracy': 0.6641837368094351, 'validation_precision': 0.13283674736188703, 'validation_recall': 0.2, 'validation_f1': 0.15964192465497945, 'validation_roc_auc': 0.5096583369009977, 'test_accuracy': 0.6836228287841191, 'test_precision': 0.13672456575682382, 'test_recall': 0.2, 'test_f1': 0.1624170965364775,

Map: 100%|██████████| 12892/12892 [00:00<00:00, 66126.59 examples/s]
Map: 100%|██████████| 1611/1611 [00:00<00:00, 38439.14 examples/s]
Map: 100%|██████████| 1612/1612 [00:00<00:00, 73348.79 examples/s]
Map: 100%|██████████| 13186/13186 [00:00<00:00, 53923.18 examples/s]


**************************************************
Model Training with the following Configurations: {'model_name': 'facebook/bart-base', 'strategy': 'concreteness_score_addition', 'train_dataset_type': 'train'}
Model already trained, results are stored already!
Evaluation Results: {'epoch': 1, 'validation_accuracy': 0.7070142768466791, 'validation_precision': 0.36399443772173024, 'validation_recall': 0.3070603925335841, 'validation_f1': 0.31791435746436125, 'validation_roc_auc': 0.7120372589799651, 'test_accuracy': 0.7053349875930521, 'test_precision': 0.3375714285714286, 'test_recall': 0.28835432254569604, 'test_f1': 0.29675173294200463, 'test_roc_auc': 0.7358062994449799}
Evaluation Results: {'epoch': 2, 'validation_accuracy': 0.6883923029174426, 'validation_precision': 0.42029394992688457, 'validation_recall': 0.3541590503336739, 'validation_f1': 0.36837985688588315, 'validation_roc_auc': 0.7197915116873297, 'test_accuracy': 0.7084367245657568, 'test_precision': 0.4364602969860095,

Map: 100%|██████████| 12892/12892 [00:00<00:00, 40285.02 examples/s]
Map: 100%|██████████| 1611/1611 [00:00<00:00, 9955.57 examples/s]
Map: 100%|██████████| 1612/1612 [00:00<00:00, 43072.22 examples/s]
Map: 100%|██████████| 13186/13186 [00:00<00:00, 39848.30 examples/s]


**************************************************
Model Training with the following Configurations: {'model_name': 'facebook/bart-base', 'strategy': 'normal_finetuning', 'train_dataset_type': 'train'}
Model already trained, results are stored already!
Evaluation Results: {'epoch': 1, 'validation_accuracy': 0.7039106145251397, 'validation_precision': 0.35093410985497314, 'validation_recall': 0.29744720315989026, 'validation_f1': 0.30546865069415696, 'validation_roc_auc': 0.6933118356398368, 'test_accuracy': 0.7047146401985112, 'test_precision': 0.332382548007548, 'test_recall': 0.2815780832259709, 'test_f1': 0.2887814215915707, 'test_roc_auc': 0.6762599152322426}
Evaluation Results: {'epoch': 2, 'validation_accuracy': 0.6964618249534451, 'validation_precision': 0.4232432628827786, 'validation_recall': 0.35148362068853234, 'validation_f1': 0.3724079161712285, 'validation_roc_auc': 0.7171933773532457, 'test_accuracy': 0.7096774193548387, 'test_precision': 0.440907639772888, 'test_recall'

Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-base and are newly initialized: ['classification_head.dense.weight', 'classification_head.out_proj.weight', 'classification_head.dense.bias', 'classification_head.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 0/1652 [00:00<?, ?it/s]You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 1652/1652 [08:48<00:00,  3.12it/s] 

Evaluation Results: {'epoch': 1, 'validation_accuracy': 0.6896337678460583, 'validation_precision': 0.42494599190184346, 'validation_recall': 0.26487257451843044, 'validation_f1': 0.2738568214250596, 'validation_roc_auc': 0.6824714796150582, 'test_accuracy': 0.7009925558312655, 'test_precision': 0.42362675995952737, 'test_recall': 0.2583131869086401, 'test_f1': 0.2657690587570671, 'test_roc_auc': 0.7135223173681743}
Evaluation Results: {'epoch': 2, 'validation_accuracy': 0.6890130353817505, 'validation_precision': 0.41081509094399615, 'validation_recall': 0.34014347671486594, 'validation_f1': 0.3608828302705508, 'validation_roc_auc': 0.7217884808086161, 'test_accuracy': 0.7078163771712159, 'test_precision': 0.45065748660160543, 'test_recall': 0.35938309925669637, 'test_f1': 0.3878910962055507, 'test_roc_auc': 0.737850820834112}
Evaluation Results: {'epoch': 3, 'validation_accuracy': 0.6399751707014277, 'validation_precision': 0.396843273464405, 'validation_recall': 0.3683321505686939, 




We are done with the experiments, now, let's take a look at the results. First, we will have to convert our dynamic results dictionary to a more readable format. Let's do that first

In [42]:
result_list = list()
for k, v in  result_dynamic_dict.items():
    model_name, strategy, train_dataset_type = k.split("#")
    for item in v:
        result_dict = dict()
        result_dict['model_name'] = model_name
        result_dict['strategy'] = strategy
        result_dict['train_dataset_type'] = train_dataset_type
        result_dict.update(item)
        result_list.append(result_dict)

In [43]:
results_df = pd.DataFrame(result_list)
results_df.head()

Unnamed: 0,model_name,strategy,train_dataset_type,epoch,validation_accuracy,validation_precision,validation_recall,validation_f1,validation_roc_auc,test_accuracy,test_precision,test_recall,test_f1,test_roc_auc
0,microsoft/deberta-base,concreteness_score_addition,train,1,0.692737,0.421802,0.355937,0.376892,0.732225,0.710918,0.437167,0.355651,0.381041,0.748062
1,microsoft/deberta-base,concreteness_score_addition,train,2,0.697083,0.423899,0.370494,0.38976,0.747151,0.716501,0.443234,0.375692,0.39911,0.752123
2,microsoft/deberta-base,concreteness_score_addition,train,3,0.69522,0.434807,0.381118,0.398491,0.74167,0.69727,0.41387,0.37111,0.385685,0.757999
3,microsoft/deberta-base,concreteness_score_addition,train,4,0.675978,0.414134,0.393301,0.398001,0.748236,0.687965,0.40267,0.384769,0.389635,0.761849
4,microsoft/deberta-base,concreteness_score_addition,artificial_train_combined,1,0.703911,0.466361,0.284447,0.30245,0.725522,0.704715,0.463664,0.267322,0.28241,0.726188


In [44]:
results_df.shape

(32, 14)

Now, let's take a look at the top three models with best test ROC-AUC score

In [45]:
results_df.sort_values(by=['test_roc_auc'], ascending=False).head(3)

Unnamed: 0,model_name,strategy,train_dataset_type,epoch,validation_accuracy,validation_precision,validation_recall,validation_f1,validation_roc_auc,test_accuracy,test_precision,test_recall,test_f1,test_roc_auc
27,facebook/bart-base,normal_finetuning,train,4,0.696462,0.426166,0.373465,0.392378,0.732624,0.722084,0.454838,0.391489,0.415188,0.776406
18,facebook/bart-base,concreteness_score_addition,train,3,0.698324,0.425392,0.353423,0.3761,0.743411,0.725806,0.46521,0.368696,0.398438,0.775411
26,facebook/bart-base,normal_finetuning,train,3,0.705152,0.441156,0.366145,0.38858,0.730755,0.721464,0.465068,0.365629,0.393769,0.773751


Here, we can see that BART Base model with normal finetuning and on original training set is getting .776406 ROC-AUC. Which is good.
We can also see that the BART model in which we have added concreteness score as another input sequence, is not much behind it.
Also, the test accuracy for 5 label classification is better than the one mentioned in the ADEPT paper.

Now, let's take a look at top three models with best test accuracy

In [46]:
results_df.sort_values(by=['test_accuracy'], ascending=False).head(3)

Unnamed: 0,model_name,strategy,train_dataset_type,epoch,validation_accuracy,validation_precision,validation_recall,validation_f1,validation_roc_auc,test_accuracy,test_precision,test_recall,test_f1,test_roc_auc
18,facebook/bart-base,concreteness_score_addition,train,3,0.698324,0.425392,0.353423,0.3761,0.743411,0.725806,0.46521,0.368696,0.398438,0.775411
27,facebook/bart-base,normal_finetuning,train,4,0.696462,0.426166,0.373465,0.392378,0.732624,0.722084,0.454838,0.391489,0.415188,0.776406
26,facebook/bart-base,normal_finetuning,train,3,0.705152,0.441156,0.366145,0.38858,0.730755,0.721464,0.465068,0.365629,0.393769,0.773751


Here, the our BART model with concreteness score sequence is giving better results in accuracy. 

Now, let's take a look at models which used artificially created dataset. Let's see how they are performing.

In [49]:
results_df[results_df.train_dataset_type == "artificial_train_combined"].sort_values(by=['test_roc_auc'], ascending=False).head(3)

Unnamed: 0,model_name,strategy,train_dataset_type,epoch,validation_accuracy,validation_precision,validation_recall,validation_f1,validation_roc_auc,test_accuracy,test_precision,test_recall,test_f1,test_roc_auc
6,microsoft/deberta-base,concreteness_score_addition,artificial_train_combined,3,0.700807,0.433721,0.359241,0.383321,0.750411,0.701613,0.416728,0.357466,0.379014,0.762923
7,microsoft/deberta-base,concreteness_score_addition,artificial_train_combined,4,0.685289,0.420415,0.403362,0.408644,0.751175,0.675558,0.395915,0.37608,0.382074,0.760869
5,microsoft/deberta-base,concreteness_score_addition,artificial_train_combined,2,0.700186,0.431362,0.370173,0.390604,0.754458,0.702854,0.430742,0.37248,0.39167,0.757785


Here, we can see that BART is performing better with the concreteness score and artificial data, this is interesting because when we were doing individual model experiments, DeBERTa model was getting better results with the addition of concreteness score on 2 epochs of training, but we can't see that result here (it might be due to random state in intialization of the models). We are unable to recreate it with this code but you can check it out in the following notebook: /modelling/adept/experiments/FinalModellingWithConcretenessScoreSequence[DeBERTa] - ADEPT.ipynb

In one experiment, we were getting ROC-AUC of .785589, but due to randomness, we are not able to recreate it. Maybe we will experiment and try to get it before the presentation.

In [50]:
results_df.to_csv('../../results/FinalResultsADEPTNew.csv', index=False)