In [1]:
import torch

In [2]:
torch.cuda.is_available()

True

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [4]:
!ls ../datasets/adept/train-dev-test-split/

README.md  test.json  train.json  val.json


In [5]:
import json
import nltk
import re

In [6]:
adept_data_path = "../datasets/adept/train-dev-test-split"
split = "train"

In [7]:
train_data = json.load(open('{}/{}.json'.format(adept_data_path, split), 'r'))

In [8]:
df_train = pd.DataFrame(train_data)
df_train.head()

Unnamed: 0,sentence1,sentence2,modifier,noun,label,idx
0,The effect of sleeping is rejuvenation.,The effect of additional sleeping is rejuvenat...,additional,sleeping,3,13484
1,A toothbrush is for fresh breath.,A regular toothbrush is for fresh breath.,regular,toothbrush,2,2620
2,A scene is painted.,A negative scene is painted.,negative,scene,2,3324
3,A bone breaks a tooth.,An alleged bone breaks a tooth.,alleged,bone,2,10610
4,A trip causes a happening.,A fabulous trip causes a happening.,fabulous,trip,2,14917


In [9]:
label_to_class_map = {0:"Impossible", 1:"Less Likely", 2:"Equally Likely", 3:"More Likely", 4:"Necessarily True"}

In [10]:
label_to_class_map.values()

dict_values(['Impossible', 'Less Likely', 'Equally Likely', 'More Likely', 'Necessarily True'])

In [11]:
df_train['sentence2_preprocessed'] = df_train['sentence2'].map(lambda x: re.sub('[,\.!?]', '', x))
df_train['sentence2_preprocessed'] = df_train['sentence2_preprocessed'].map(lambda x: x.lower())
# df_train['class_label'] = df_train.label.map(lambda x: label_to_class_map[x])
df_train.head(2)

Unnamed: 0,sentence1,sentence2,modifier,noun,label,idx,sentence2_preprocessed
0,The effect of sleeping is rejuvenation.,The effect of additional sleeping is rejuvenat...,additional,sleeping,3,13484,the effect of additional sleeping is rejuvenation
1,A toothbrush is for fresh breath.,A regular toothbrush is for fresh breath.,regular,toothbrush,2,2620,a regular toothbrush is for fresh breath


In [12]:
# We got this dataset for concreteness of 40k words (https://pubmed.ncbi.nlm.nih.gov/24142837/) from https://web.stanford.edu/class/linguist278/data/
concreteness_df = pd.read_csv('../datasets/concreteness/Concreteness_ratings_Brysbaert_et_al_BRM.csv')
concreteness_df.head(2)

Unnamed: 0,Word,Bigram,Conc.M,Conc.SD,Unknown,Total,Percent_known,SUBTLEX,Dom_Pos
0,roadsweeper,0,4.85,0.37,1,27,0.96,0,0
1,traindriver,0,4.54,0.71,3,29,0.9,0,0


In [13]:
word_to_concreteness_score_map = dict()
for idx, row in concreteness_df.iterrows():
    row = row.to_dict()
    word_to_concreteness_score_map[row['Word']] = row['Conc.M']/5.0 # Normalizing to a scale of 0 to 1

In [14]:
len(word_to_concreteness_score_map.keys())

39954

In [15]:
def get_concreteness_score(word):
    """
    Get the concreteness score of a word based on the Concreteness Ratings dataset.
    """
    # If the word is not found in the dataset, return a default score of 0.5
    return round(word_to_concreteness_score_map.get(word, 0.5), 3)

In [16]:
def calculate_text_concreteness(text):
    """
    Calculate the concreteness score for a given text.
    """
    words = nltk.word_tokenize(text)
    concreteness_scores = [get_concreteness_score(word) for word in words]
    # Take the average concreteness score of all words in the text
    return sum(concreteness_scores) / len(concreteness_scores)

In [17]:
def calculate_text_concreteness_sequence(text):
    """
    Calculate the concreteness score for a given text.
    """
    words = nltk.word_tokenize(text)
    concreteness_scores = [get_concreteness_score(word) for word in words]
    concreteness_scores = " ".join([str(i) for i in concreteness_scores])
    # Take the average concreteness score of all words in the text
    return concreteness_scores

In [18]:
# Example usage
text = "the laws of the world can't stop him"
concreteness_score = calculate_text_concreteness(text)
print(f"Concreteness Score: {concreteness_score}")

Concreteness Score: 0.5246666666666667


In [19]:
# Example usage
text = "car crash"
concreteness_score = calculate_text_concreteness(text)
print(f"Concreteness Score: {concreteness_score}")

Concreteness Score: 0.868


In [20]:
df_train['concreteness_score_sequence'] = df_train.sentence2_preprocessed.apply(calculate_text_concreteness_sequence)

In [21]:
df_train.head(2)

Unnamed: 0,sentence1,sentence2,modifier,noun,label,idx,sentence2_preprocessed,concreteness_score_sequence
0,The effect of sleeping is rejuvenation.,The effect of additional sleeping is rejuvenat...,additional,sleeping,3,13484,the effect of additional sleeping is rejuvenation,0.286 0.36 0.334 0.486 0.846 0.318 0.42
1,A toothbrush is for fresh breath.,A regular toothbrush is for fresh breath.,regular,toothbrush,2,2620,a regular toothbrush is for fresh breath,0.292 0.48 1.0 0.318 0.326 0.394 0.872


In [22]:
df_train.shape

(12892, 8)

In [23]:
df_validation = pd.DataFrame(json.load(open('{}/{}.json'.format(adept_data_path, "val"), 'r')))
df_test = pd.DataFrame(json.load(open('{}/{}.json'.format(adept_data_path, "test"), 'r')))
print(df_validation.shape, df_test.shape)

(1611, 6) (1612, 6)


In [24]:
df_validation['sentence2_preprocessed'] = df_validation['sentence2'].map(lambda x: re.sub('[,\.!?]', '', x))
df_validation['sentence2_preprocessed'] = df_validation['sentence2_preprocessed'].map(lambda x: x.lower())
df_validation['concreteness_score_sequence'] = df_validation.sentence2_preprocessed.apply(calculate_text_concreteness_sequence)

df_test['sentence2_preprocessed'] = df_test['sentence2'].map(lambda x: re.sub('[,\.!?]', '', x))
df_test['sentence2_preprocessed'] = df_test['sentence2_preprocessed'].map(lambda x: x.lower())
df_test['concreteness_score_sequence'] = df_test.sentence2_preprocessed.apply(calculate_text_concreteness_sequence)

### Artificial Dataset Combined

In [25]:
df_artificial_train_combined = pd.read_csv('../datasets/artificial_adept/artificial_train_combined.csv')

In [26]:
df_artificial_train_combined['sentence2_preprocessed'] = df_artificial_train_combined['sentence2'].map(lambda x: re.sub('[,\.!?]', '', x))
df_artificial_train_combined['sentence2_preprocessed'] = df_artificial_train_combined['sentence2_preprocessed'].map(lambda x: x.lower())
df_artificial_train_combined['concreteness_score_sequence'] = df_artificial_train_combined.sentence2_preprocessed.apply(calculate_text_concreteness_sequence)

In [27]:
df_artificial_train_combined.head(2)

Unnamed: 0,noun,modifier,Verb,sentence2,label,sentence1,idx,sentence2_preprocessed,concreteness_score_sequence
0,Circuit,Short,Causes,A short circuit causes a power outage.,3,,,a short circuit causes a power outage,0.292 0.722 0.766 0.5 0.292 0.408 0.566
1,Headphones,Comfortable,Listen,Comfortable headphones listen to podcasts.,3,,,comfortable headphones listen to podcasts,0.466 0.958 0.694 0.31 0.5


# Fine Tuning Bert Base Uncased on ADEPT

## Data Preparation

In [28]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import DataCollatorWithPadding, get_scheduler
from datasets import load_dataset, Dataset, DatasetDict
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [29]:
adept_data_path

'../datasets/adept/train-dev-test-split'

In [30]:
!ls '../datasets/adept/train-dev-test-split'

README.md  test.json  train.json  val.json


In [31]:
train_split = "train.json"
validation_split = "val.json"
test_split = "test.json"

In [32]:
adept_dataset = DatasetDict({
    'train': Dataset.from_pandas(df_train),
    'validation': Dataset.from_pandas(df_validation),
    'test': Dataset.from_pandas(df_test),
    'artificial_train_combined': Dataset.from_pandas(df_artificial_train_combined)
})

In [33]:
adept_dataset

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'modifier', 'noun', 'label', 'idx', 'sentence2_preprocessed', 'concreteness_score_sequence'],
        num_rows: 12892
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'modifier', 'noun', 'label', 'idx', 'sentence2_preprocessed', 'concreteness_score_sequence'],
        num_rows: 1611
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'modifier', 'noun', 'label', 'idx', 'sentence2_preprocessed', 'concreteness_score_sequence'],
        num_rows: 1612
    })
    artificial_train_combined: Dataset({
        features: ['noun', 'modifier', 'Verb', 'sentence2', 'label', 'sentence1', 'idx', 'sentence2_preprocessed', 'concreteness_score_sequence'],
        num_rows: 13069
    })
})

In [34]:
adept_dataset['train'].features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'modifier': Value(dtype='string', id=None),
 'noun': Value(dtype='string', id=None),
 'label': Value(dtype='int64', id=None),
 'idx': Value(dtype='int64', id=None),
 'sentence2_preprocessed': Value(dtype='string', id=None),
 'concreteness_score_sequence': Value(dtype='string', id=None)}

In [35]:
adept_dataset['train'][10]

{'sentence1': 'A year is made up of 365 days.',
 'sentence2': 'An outstanding year is made up of 365 days.',
 'modifier': 'outstanding',
 'noun': 'year',
 'label': 2,
 'idx': 2825,
 'sentence2_preprocessed': 'an outstanding year is made up of 365 days',
 'concreteness_score_sequence': '0.292 0.35 0.65 0.318 0.504 0.766 0.334 0.5 0.672'}

These are the best params we got after fine tuning different models and parameter using optuna

In [36]:
from torch.optim.adamw import AdamW
from torch.utils.data import DataLoader
import evaluate

In [37]:
# accuracy = evaluate.load('accuracy')
# precision = evaluate.load('precision')
# recall = evaluate.load('recall')
# f1 = evaluate.load('f1')
# roc_auc =  evaluate.load("roc_auc", "multiclass")
# metrics = [accuracy, precision, recall, f1]

In [38]:
best_params = {'learning_rate': 3.660515504756857e-05,
 'num_train_epochs': 3}

In [39]:
COLUMNS_TO_KEEP = ['label', 'input_ids', 'token_type_ids', 'attention_mask']

In [44]:
class ModellingExperiments:
    def __init__(self, model_name, dataset, train_dataset_type, batch_size, learning_rate):
        self.model_name = model_name
        self.dataset = dataset
        self.train_dataset_type = train_dataset_type
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5, ignore_mismatched_sizes=True)
        self.cols_to_keep = set(COLUMNS_TO_KEEP)
        self.device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
        self.data_collator = DataCollatorWithPadding(self.tokenizer)
        self.model.to(self.device)
        
    def tokenize_sentence_with_concreteness_score(self, item):
        return self.tokenizer(item['sentence2'], item['concreteness_score_sequence'], truncation=True)
        
    def tokenize_sentence(self, item):
        return self.tokenizer(item['sentence2'], truncation=True)
        
    def add_strategy_to_tokenizer_function_map(self):
        self.strategy_to_tokenizer_function_map = dict()
        self.strategy_to_tokenizer_function_map['normal_finetuning'] = self.tokenize_sentence_with_concreteness_score
        self.strategy_to_tokenizer_function_map['concreteness_score_addition'] = self.tokenize_sentence
        
    def prepare_dataset(self, strategy):
        self.strategy = strategy
        self.add_strategy_to_tokenizer_function_map()
        self.tokenized_dataset = self.dataset.map(self.strategy_to_tokenizer_function_map[self.strategy], batched=True)
        current_cols = set(list(self.tokenized_dataset['train'].features.keys()))
        self.tokenized_dataset = self.tokenized_dataset.remove_columns(list(current_cols - self.cols_to_keep))
        self.tokenized_dataset = self.tokenized_dataset.rename_column("label", "labels")
        self.tokenized_dataset = self.tokenized_dataset.with_format("torch")

    def prepare_dataloaders(self):
        self.train_dataloader = DataLoader(self.tokenized_dataset[self.train_dataset_type], batch_size=self.batch_size, shuffle=True, collate_fn=self.data_collator)
        self.validation_dataloader = DataLoader(self.tokenized_dataset['validation'], batch_size=self.batch_size, collate_fn=self.data_collator)
        self.test_dataloader = DataLoader(self.tokenized_dataset['test'], batch_size=self.batch_size, collate_fn=self.data_collator)

    def setup_optimizer(self, num_epochs):
        self.num_epochs = num_epochs
        self.optimizer = AdamW(self.model.parameters(), lr=self.learning_rate)
        self.num_training_steps = self.num_epochs*len(self.train_dataloader)
        self.learning_rate_scheduler = get_scheduler("linear", optimizer=self.optimizer, num_warmup_steps=0, num_training_steps=self.num_training_steps)
    
    def train_model(self):
        self.model.train()
        progress_bar = tqdm(range(self.num_training_steps))
        for epoch in range(self.num_epochs):
            for batch in self.train_dataloader:
                batch = {k:v.to(self.device) for k, v in batch.items()}
                outputs = self.model(**batch)
                loss = outputs.loss
                # calculating gradients
                loss.backward()
                # optimizing weights
                self.optimizer.step()
                # updating learning rate
                self.learning_rate_scheduler.step()
                # flushing gradients
                self.optimizer.zero_grad()
                # updating progress bar
                progress_bar.update(1)

    def initialize_metrics(self):
        self.accuracy = evaluate.load('accuracy')
        self.precision = evaluate.load('precision')
        self.recall = evaluate.load('recall')
        self.f1 = evaluate.load('f1')
        self.roc_auc =  evaluate.load("roc_auc", "multiclass")
        self.metrics = [self.accuracy, self.precision, self.recall, self.f1]
                
    def eval_model(self, dataloader):
        self.initialize_metrics()
        self.model.eval()
        for batch in dataloader:
            batch = {k: v.to(self.device) for k, v in batch.items()}
            with torch.no_grad():
                outputs = self.model(**batch)
            # Extract logits and predictions
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)
        
            # Apply softmax to convert logits to probabilities
            probabilities = torch.nn.functional.softmax(logits, dim=-1)
            
            # Extract probabilities for the positive class
            positive_probabilities = probabilities
        
            # Update metrics for accuracy, precision, recall, and F1
            for metric in self.metrics:
                metric.add_batch(predictions=predictions, references=batch['labels'])
        
            # Update ROC AUC metric
            self.roc_auc.add_batch(prediction_scores=positive_probabilities, references=batch['labels'])
        
        # # Compute metrics for accuracy, precision, recall, and F1
        self.eval_dict = {}
        self.eval_dict.update(self.accuracy.compute())
        self.eval_dict.update(self.precision.compute(average="macro"))
        self.eval_dict.update(self.recall.compute(average="macro"))
        self.eval_dict.update(self.f1.compute(average="macro"))
        self.eval_dict.update(self.roc_auc.compute(multi_class='ovo', average="macro"))        

In [45]:
kw_args = dict()
kw_args["model_name"] = "bert-base-uncased"
# kw_args["strategy"] = "concreteness_score_addition"
kw_args["dataset"] = adept_dataset
kw_args["train_dataset_type"] = "train"
# kw_args["num_epochs"] = 1
kw_args["batch_size"] = 32
kw_args["learning_rate"] = 3.660515504756857e-05

In [46]:
modelling_obj = ModellingExperiments(**kw_args)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [47]:
modelling_obj.prepare_dataset("concreteness_score_addition")

Map: 100%|██████████| 12892/12892 [00:00<00:00, 53967.95 examples/s]
Map: 100%|██████████| 1611/1611 [00:00<00:00, 44022.57 examples/s]
Map: 100%|██████████| 1612/1612 [00:00<00:00, 35777.24 examples/s]
Map: 100%|██████████| 13069/13069 [00:00<00:00, 54232.63 examples/s]


In [48]:
modelling_obj.prepare_dataloaders()

In [49]:
modelling_obj.setup_optimizer(num_epochs=1)

In [50]:
modelling_obj.train_model()

  0%|          | 0/403 [00:00<?, ?it/s]You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 403/403 [00:51<00:00,  7.81it/s]


In [51]:
modelling_obj.eval_model(modelling_obj.validation_dataloader)
modelling_obj.eval_dict

  _warn_prf(average, modifier, msg_start, len(result))


{'accuracy': 0.7094972067039106,
 'precision': 0.3638229900297088,
 'recall': 0.3167133700897108,
 'f1': 0.31684786530802905,
 'roc_auc': 0.6894649439841288}

In [52]:
modelling_obj.eval_model(modelling_obj.test_dataloader)
modelling_obj.eval_dict

  _warn_prf(average, modifier, msg_start, len(result))


{'accuracy': 0.717741935483871,
 'precision': 0.35761505789259324,
 'recall': 0.31238193596622144,
 'f1': 0.3155787692126031,
 'roc_auc': 0.683035925257667}