In [1]:
import torch

In [2]:
torch.cuda.is_available()

True

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [4]:
!ls ../datasets/adept/train-dev-test-split/

README.md  test.json  train.json  val.json


In [5]:
import json
import nltk
import re

In [6]:
adept_data_path = "../datasets/adept/train-dev-test-split"
split = "train"

In [7]:
train_data = json.load(open('{}/{}.json'.format(adept_data_path, split), 'r'))

In [8]:
df_train = pd.DataFrame(train_data)
df_train.head()

Unnamed: 0,sentence1,sentence2,modifier,noun,label,idx
0,The effect of sleeping is rejuvenation.,The effect of additional sleeping is rejuvenat...,additional,sleeping,3,13484
1,A toothbrush is for fresh breath.,A regular toothbrush is for fresh breath.,regular,toothbrush,2,2620
2,A scene is painted.,A negative scene is painted.,negative,scene,2,3324
3,A bone breaks a tooth.,An alleged bone breaks a tooth.,alleged,bone,2,10610
4,A trip causes a happening.,A fabulous trip causes a happening.,fabulous,trip,2,14917


In [9]:
label_to_class_map = {0:"Impossible", 1:"Less Likely", 2:"Equally Likely", 3:"More Likely", 4:"Necessarily True"}

In [10]:
label_to_class_map.values()

dict_values(['Impossible', 'Less Likely', 'Equally Likely', 'More Likely', 'Necessarily True'])

In [11]:
df_train['sentence2_preprocessed'] = df_train['sentence2'].map(lambda x: re.sub('[,\.!?]', '', x))
df_train['sentence2_preprocessed'] = df_train['sentence2_preprocessed'].map(lambda x: x.lower())
df_train['class_label'] = df_train.label.map(lambda x: label_to_class_map[x])
df_train.head(2)

Unnamed: 0,sentence1,sentence2,modifier,noun,label,idx,sentence2_preprocessed,class_label
0,The effect of sleeping is rejuvenation.,The effect of additional sleeping is rejuvenat...,additional,sleeping,3,13484,the effect of additional sleeping is rejuvenation,More Likely
1,A toothbrush is for fresh breath.,A regular toothbrush is for fresh breath.,regular,toothbrush,2,2620,a regular toothbrush is for fresh breath,Equally Likely


In [12]:
# We got this dataset for concreteness of 40k words (https://pubmed.ncbi.nlm.nih.gov/24142837/) from https://web.stanford.edu/class/linguist278/data/
concreteness_df = pd.read_csv('../datasets/concreteness/Concreteness_ratings_Brysbaert_et_al_BRM.csv')
concreteness_df.head(2)

Unnamed: 0,Word,Bigram,Conc.M,Conc.SD,Unknown,Total,Percent_known,SUBTLEX,Dom_Pos
0,roadsweeper,0,4.85,0.37,1,27,0.96,0,0
1,traindriver,0,4.54,0.71,3,29,0.9,0,0


In [13]:
word_to_concreteness_score_map = dict()
for idx, row in concreteness_df.iterrows():
    row = row.to_dict()
    word_to_concreteness_score_map[row['Word']] = row['Conc.M']/5.0 # Normalizing to a scale of 0 to 1

In [14]:
len(word_to_concreteness_score_map.keys())

39954

In [15]:
def get_concreteness_score(word):
    """
    Get the concreteness score of a word based on the Concreteness Ratings dataset.
    """
    # If the word is not found in the dataset, return a default score of 0.5
    return word_to_concreteness_score_map.get(word, 0.5)

In [16]:
def calculate_text_concreteness(text):
    """
    Calculate the concreteness score for a given text.
    """
    words = nltk.word_tokenize(text)
    concreteness_scores = [get_concreteness_score(word) for word in words]
    # Take the average concreteness score of all words in the text
    return sum(concreteness_scores) / len(concreteness_scores)

In [17]:
# Example usage
text = "the laws of the world can't stop him"
concreteness_score = calculate_text_concreteness(text)
print(f"Concreteness Score: {concreteness_score}")

Concreteness Score: 0.5246666666666667


In [18]:
# Example usage
text = "car crash"
concreteness_score = calculate_text_concreteness(text)
print(f"Concreteness Score: {concreteness_score}")

Concreteness Score: 0.868


In [19]:
df_train['concreteness_score'] = df_train.sentence2_preprocessed.apply(calculate_text_concreteness)

In [20]:
df_train.head(2)

Unnamed: 0,sentence1,sentence2,modifier,noun,label,idx,sentence2_preprocessed,class_label,concreteness_score
0,The effect of sleeping is rejuvenation.,The effect of additional sleeping is rejuvenat...,additional,sleeping,3,13484,the effect of additional sleeping is rejuvenation,More Likely,0.435714
1,A toothbrush is for fresh breath.,A regular toothbrush is for fresh breath.,regular,toothbrush,2,2620,a regular toothbrush is for fresh breath,Equally Likely,0.526


In [21]:
df_train.shape

(12892, 9)

# Fine Tuning Bert Base Uncased on ADEPT

## Data Preparation

In [22]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import DataCollatorWithPadding, get_scheduler
from datasets import load_dataset
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [23]:
adept_data_path

'../datasets/adept/train-dev-test-split'

In [24]:
!ls '../datasets/adept/train-dev-test-split'

README.md  test.json  train.json  val.json


In [25]:
train_split = "train.json"
validation_split = "val.json"
test_split = "test.json"

In [26]:
data_files = {
    "train": "{}/{}".format(adept_data_path, train_split), 
    "validation": "{}/{}".format(adept_data_path, validation_split), 
    "test": "{}/{}".format(adept_data_path, test_split),
}

In [27]:
adept_dataset = load_dataset("json", data_files=data_files)
adept_dataset

DatasetDict({
    train: Dataset({
        features: ['sentence2', 'label', 'idx', 'sentence1', 'modifier', 'noun'],
        num_rows: 12892
    })
    validation: Dataset({
        features: ['sentence2', 'label', 'idx', 'sentence1', 'modifier', 'noun'],
        num_rows: 1611
    })
    test: Dataset({
        features: ['sentence2', 'label', 'idx', 'sentence1', 'modifier', 'noun'],
        num_rows: 1612
    })
})

In [28]:
adept_dataset['train'].features

{'sentence2': Value(dtype='string', id=None),
 'label': Value(dtype='int64', id=None),
 'idx': Value(dtype='int64', id=None),
 'sentence1': Value(dtype='string', id=None),
 'modifier': Value(dtype='string', id=None),
 'noun': Value(dtype='string', id=None)}

In [29]:
adept_dataset['train'][10]

{'sentence2': 'An outstanding year is made up of 365 days.',
 'label': 2,
 'idx': 2825,
 'sentence1': 'A year is made up of 365 days.',
 'modifier': 'outstanding',
 'noun': 'year'}

These are the best params we got after fine tuning different models and parameter using optuna

In [30]:
best_params = {'learning_rate': 3.660515504756857e-05,
 'num_train_epochs': 3,
 'model_name': "microsoft/deberta-base"}

In [31]:
checkpoint = best_params['model_name']

In [32]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [33]:
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=5, ignore_mismatched_sizes=True)

Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['pooler.dense.bias', 'classifier.bias', 'pooler.dense.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:
def tokenize_sentence(item):
    return tokenizer(item['sentence2'], truncation=True)

In [35]:
tokenized_dataset = adept_dataset.map(tokenize_sentence, batched=True)

In [36]:
tokenized_dataset['train'].features

{'sentence2': Value(dtype='string', id=None),
 'label': Value(dtype='int64', id=None),
 'idx': Value(dtype='int64', id=None),
 'sentence1': Value(dtype='string', id=None),
 'modifier': Value(dtype='string', id=None),
 'noun': Value(dtype='string', id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [37]:
tokenized_dataset = tokenized_dataset.remove_columns(['sentence1', 'sentence2', 'idx', 'modifier', 'noun'])
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")
tokenized_dataset = tokenized_dataset.with_format("torch")
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 12892
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1611
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1612
    })
})

#### Setting Up Data Loaders

In [38]:
batch_size = 32

In [39]:
from torch.utils.data import DataLoader

In [40]:
data_collator = DataCollatorWithPadding(tokenizer)

In [41]:
train_dataloader = DataLoader(tokenized_dataset['train'], batch_size=batch_size, shuffle=True, collate_fn=data_collator)
validation_dataloader = DataLoader(tokenized_dataset['validation'], batch_size=batch_size, collate_fn=data_collator)
test_dataloader = DataLoader(tokenized_dataset['test'], batch_size=batch_size, collate_fn=data_collator)

In [42]:
device = torch.device("cuda") if torch.cuda.is_available else torch.device("cpu")
model.to(device)
print(device)

cuda


## Training

#### Setting up Optimizer with deacaying learning rate

In [43]:
from torch.optim.adamw import AdamW

In [44]:
optimizer = AdamW(model.parameters(), lr=3.660515504756857e-05)

#### Training Loop

In [45]:
def train_model(model, train_dataloader, num_epochs, optimizer):
    num_training_steps = num_epochs*len(train_dataloader)
    learning_rate_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)
    model.train()
    progress_bar = tqdm(range(num_training_steps))
    for epoch in range(num_epochs):
        for batch in train_dataloader:
            batch = {k:v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            # calculating gradients
            loss.backward()
            # optimizing weights
            optimizer.step()
            # updating learning rate
            learning_rate_scheduler.step()
            # flushing gradients
            optimizer.zero_grad()
            # updating progress bar
            progress_bar.update(1)

### Evaluation

In [46]:
import evaluate

In [47]:
accuracy = evaluate.load('accuracy')
precision = evaluate.load('precision')
recall = evaluate.load('recall')
f1 = evaluate.load('f1')
roc_auc =  evaluate.load("roc_auc", "multiclass")

In [48]:
metrics = [accuracy, precision, recall, f1]

#### Evaluation Loop

In [49]:
from sklearn.metrics import roc_auc_score

In [50]:
from datasets import load_metric

In [51]:
def eval_model(model, dataloader):
    model.eval()
    for batch in validation_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        # Extract logits and predictions
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
    
        # Apply softmax to convert logits to probabilities
        probabilities = torch.nn.functional.softmax(logits, dim=-1)
        
        # Extract probabilities for the positive class
        positive_probabilities = probabilities
    
        # Update metrics for accuracy, precision, recall, and F1
        for metric in metrics:
            metric.add_batch(predictions=predictions, references=batch['labels'])
    
        # Update ROC AUC metric
        roc_auc.add_batch(prediction_scores=positive_probabilities, references=batch['labels'])
    
    # # Compute metrics for accuracy, precision, recall, and F1
    eval_dict = {}
    
    eval_dict.update(accuracy.compute())
    eval_dict.update(precision.compute(average="macro"))
    eval_dict.update(recall.compute(average="macro"))
    eval_dict.update(f1.compute(average="macro"))
    eval_dict.update(roc_auc.compute(multi_class='ovo', average="macro"))
    return eval_dict

### Training and Testing the Models with different parameters

In [52]:
num_epochs_list = [1, 2, 3, 4, 5]

In [53]:
eval_list = list()
for num_epochs in num_epochs_list:
    data_dict = dict()
    data_dict['num_epochs'] = num_epochs
    train_model(model, train_dataloader, num_epochs, optimizer)
    validation_eval_dict = eval_model(model, validation_dataloader)
    test_eval_dict = eval_model(model, test_dataloader)
    for k, v in validation_eval_dict.items():
        data_dict["validation_{}".format(k)] = v
    for k, v in test_eval_dict.items():
        data_dict["test_{}".format(k)] = v
    print(data_dict)
    eval_list.append(data_dict)

  0%|          | 0/403 [00:00<?, ?it/s]You're using a DebertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 403/403 [01:15<00:00,  5.32it/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


NameError: name 'eval_dict' is not defined

In [None]:
results_df = pd.DataFrame(eval_list)
results_df.head(5)