In [1]:
import torch

In [2]:
torch.cuda.is_available()

True

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [4]:
!ls ../../../datasets/adept/train-dev-test-split/

ls: cannot access '../../datasets/adept/train-dev-test-split/': No such file or directory


In [5]:
import json
import nltk
import re

In [6]:
adept_data_path = "../../../datasets/adept/train-dev-test-split"
split = "train"

In [7]:
# Read the selected ADEPT split. The context manager closes the file handle as soon as
# parsing finishes instead of leaving it open until garbage collection; the encoding is
# explicit because JSON text is UTF-8 and the platform default may differ.
with open('{}/{}.json'.format(adept_data_path, split), 'r', encoding='utf-8') as train_file:
    train_data = json.load(train_file)

FileNotFoundError: [Errno 2] No such file or directory: '../../datasets/adept/train-dev-test-split/train.json'

In [None]:
df_train = pd.DataFrame(train_data)
df_train.head()

In [None]:
label_to_class_map = {0:"Impossible", 1:"Less Likely", 2:"Equally Likely", 3:"More Likely", 4:"Necessarily True"}

In [None]:
label_to_class_map.values()

In [None]:
# Strip commas, periods, exclamation marks and question marks, then lower-case.
# The pattern is a raw string: in a normal string literal `\.` is an invalid escape
# sequence that recent Python versions warn about.
df_train['sentence2_preprocessed'] = (
    df_train['sentence2']
    .str.replace(r'[,\.!?]', '', regex=True)
    .str.lower()
)
# Human-readable class name for each integer label; an unknown label raises KeyError.
df_train['class_label'] = df_train.label.map(lambda label: label_to_class_map[label])
df_train.head(2)

In [None]:
# We got this dataset for concreteness of 40k words (https://pubmed.ncbi.nlm.nih.gov/24142837/) from https://web.stanford.edu/class/linguist278/data/
concreteness_df = pd.read_csv('../../../datasets/concreteness/Concreteness_ratings_Brysbaert_et_al_BRM.csv')
concreteness_df.head(2)

In [None]:
# Build the word -> score lookup in one vectorised pass instead of iterating row by row.
# Each mean rating ('Conc.M') is divided by 5.0 so that a rating of 5 maps to 1.0
# (assumes 5 is the top of the rating scale -- TODO confirm against the dataset paper).
word_to_concreteness_score_map = dict(
    zip(concreteness_df['Word'], concreteness_df['Conc.M'] / 5.0)
)

In [None]:
len(word_to_concreteness_score_map.keys())

In [None]:
def get_concreteness_score(word):
    """
    Look up the concreteness score of a single word.

    Returns the value stored for `word` in `word_to_concreteness_score_map`,
    or a fallback score of 0.5 when the word has no entry.
    """
    try:
        return word_to_concreteness_score_map[word]
    except KeyError:
        # Word is not in the lookup table: use the fallback score.
        return 0.5

In [None]:
def calculate_text_concreteness(text):
    """
    Calculate the concreteness score for a given text.

    The text is split with `nltk.word_tokenize`, every token is scored with
    `get_concreteness_score`, and the scores are averaged.

    Args:
        text: The string to score.

    Returns:
        The mean per-token concreteness score, or 0.5 (the same fallback used
        for unknown words) when tokenization yields no tokens, e.g. for an
        empty string.
    """
    words = nltk.word_tokenize(text)
    if not words:
        # Nothing to average: return the fallback score instead of raising ZeroDivisionError.
        return 0.5
    concreteness_scores = [get_concreteness_score(word) for word in words]
    # Take the average concreteness score of all words in the text
    return sum(concreteness_scores) / len(concreteness_scores)

In [None]:
# Example usage
text = "the laws of the world can't stop him"
concreteness_score = calculate_text_concreteness(text)
print(f"Concreteness Score: {concreteness_score}")

In [None]:
# Example usage
text = "car crash"
concreteness_score = calculate_text_concreteness(text)
print(f"Concreteness Score: {concreteness_score}")

In [None]:
df_train['concreteness_score'] = df_train.sentence2_preprocessed.apply(calculate_text_concreteness)

In [None]:
df_train.head(2)

In [None]:
df_train.shape

# Fine-Tuning a Pretrained Transformer (microsoft/deberta-base) on ADEPT

## Data Preparation

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import DataCollatorWithPadding, get_scheduler
from datasets import load_dataset
from tqdm.auto import tqdm

In [None]:
adept_data_path

In [None]:
!ls '../../../datasets/adept/train-dev-test-split'

In [None]:
train_split = "train.json"
validation_split = "val.json"
test_split = "test.json"

In [None]:
# Map each split name to the path of its JSON file inside the ADEPT data directory.
data_files = {
    split_name: "{}/{}".format(adept_data_path, file_name)
    for split_name, file_name in [
        ("train", train_split),
        ("validation", validation_split),
        ("test", test_split),
    ]
}

In [None]:
adept_dataset = load_dataset("json", data_files=data_files)
adept_dataset

In [None]:
adept_dataset['train'].features

In [None]:
adept_dataset['train'][10]

These are the best parameters we found after tuning different models and hyperparameters with Optuna.

In [None]:
best_params = {'learning_rate': 3.660515504756857e-05,
 'num_train_epochs': 3,
 'model_name': "microsoft/deberta-base"}

In [None]:
checkpoint = best_params['model_name']

In [None]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=5, ignore_mismatched_sizes=True)

In [None]:
def tokenize_sentence(item):
    """
    Tokenize the `sentence2` field of a dataset item (or batch of items).

    Calls the module-level `tokenizer` with truncation enabled and returns
    its output unchanged.
    """
    sentences = item['sentence2']
    encoded = tokenizer(sentences, truncation=True)
    return encoded

In [None]:
tokenized_dataset = adept_dataset.map(tokenize_sentence, batched=True)

In [None]:
tokenized_dataset['train'].features

In [None]:
# Drop the columns that are not passed to the model, rename the target column to
# "labels", and have the dataset return torch tensors.
tokenized_dataset = (
    tokenized_dataset
    .remove_columns(['sentence1', 'sentence2', 'idx', 'modifier', 'noun'])
    .rename_column("label", "labels")
    .with_format("torch")
)
tokenized_dataset

#### Setting Up Data Loaders

In [None]:
batch_size = 32

In [None]:
from torch.utils.data import DataLoader

In [None]:
data_collator = DataCollatorWithPadding(tokenizer)

In [None]:
train_dataloader = DataLoader(tokenized_dataset['train'], batch_size=batch_size, shuffle=True, collate_fn=data_collator)
validation_dataloader = DataLoader(tokenized_dataset['validation'], batch_size=batch_size, collate_fn=data_collator)
test_dataloader = DataLoader(tokenized_dataset['test'], batch_size=batch_size, collate_fn=data_collator)

In [None]:
# `torch.cuda.is_available` has to be *called*: the bare function object is always
# truthy, so without the parentheses CUDA would be selected even on CPU-only machines.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
print(device)

## Training

#### Setting up the optimizer with a decaying learning rate

In [None]:
from torch.optim.adamw import AdamW

In [None]:
# Read the learning rate from `best_params` rather than repeating the literal, so the
# optimizer cannot drift out of sync with the recorded best hyperparameters.
optimizer = AdamW(model.parameters(), lr=best_params['learning_rate'])

#### Training Loop

In [None]:
def train_model(model, train_dataloader, num_epochs, optimizer):
    """
    Train `model` in place for `num_epochs` passes over `train_dataloader`.

    A fresh "linear" learning-rate schedule with no warmup is created for this
    call, spanning `num_epochs * len(train_dataloader)` optimizer steps. Every
    batch is moved to the module-level `device` before the forward pass.

    Args:
        model: Model whose forward output exposes a `.loss` attribute.
        train_dataloader: Iterable of dict batches; must support `len()`.
        num_epochs: Number of passes over `train_dataloader`.
        optimizer: Optimizer that updates the model's parameters.

    Returns:
        None.
    """
    total_steps = num_epochs * len(train_dataloader)
    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps,
    )
    model.train()
    progress_bar = tqdm(range(total_steps))
    for _ in range(num_epochs):
        for raw_batch in train_dataloader:
            device_batch = {name: tensor.to(device) for name, tensor in raw_batch.items()}
            loss = model(**device_batch).loss
            # Backpropagate, apply the update, then advance the schedule.
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            # Clear gradients so they do not accumulate into the next batch.
            optimizer.zero_grad()
            progress_bar.update(1)

### Evaluation

In [None]:
import evaluate

In [None]:
accuracy = evaluate.load('accuracy')
precision = evaluate.load('precision')
recall = evaluate.load('recall')
f1 = evaluate.load('f1')
roc_auc =  evaluate.load("roc_auc", "multiclass")

In [None]:
metrics = [accuracy, precision, recall, f1]

#### Evaluation Loop

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
from datasets import load_metric

In [None]:
def eval_model(model, dataloader):
    """
    Evaluate `model` on every batch of `dataloader` and return the metrics.

    The model is switched to eval mode (and left in it). For each batch the
    arg-max class is fed to every metric in the module-level `metrics` list,
    and the softmax probabilities over all classes are fed to `roc_auc`.

    Args:
        model: Model whose forward output exposes `.logits`.
        dataloader: Iterable of dict batches that contain a 'labels' entry.

    Returns:
        A dict merging the results of accuracy, macro-averaged precision,
        recall and F1, and macro-averaged one-vs-one ROC AUC.
    """
    model.eval()
    for raw_batch in dataloader:
        inputs = {name: tensor.to(device) for name, tensor in raw_batch.items()}
        with torch.no_grad():
            logits = model(**inputs).logits

        # Hard class predictions and the per-class probability distribution.
        predicted_classes = logits.argmax(dim=-1)
        class_probabilities = logits.softmax(dim=-1)
        references = inputs['labels']

        # Accuracy, precision, recall and F1 consume the hard predictions.
        for metric in metrics:
            metric.add_batch(predictions=predicted_classes, references=references)

        # ROC AUC consumes the probabilities of all classes.
        roc_auc.add_batch(prediction_scores=class_probabilities, references=references)

    # Merge the individual metric results into one dictionary.
    return {
        **accuracy.compute(),
        **precision.compute(average="macro"),
        **recall.compute(average="macro"),
        **f1.compute(average="macro"),
        **roc_auc.compute(multi_class='ovo', average="macro"),
    }

### Training and Testing the Models with different parameters

In [None]:
num_epochs_list = [1, 2, 3, 4, 5]

In [None]:
# Train and evaluate one model per entry of `num_epochs_list`.
eval_list = list()
for num_epochs in num_epochs_list:
    # Start every run from the pretrained checkpoint with a fresh optimizer. Reusing the
    # same model across iterations would accumulate training (1, 3, 6, ... epochs) while
    # the results were still reported under `num_epochs`.
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=5, ignore_mismatched_sizes=True)
    model.to(device)
    optimizer = AdamW(model.parameters(), lr=best_params['learning_rate'])

    data_dict = dict()
    data_dict['num_epochs'] = num_epochs
    train_model(model, train_dataloader, num_epochs, optimizer)
    validation_eval_dict = eval_model(model, validation_dataloader)
    test_eval_dict = eval_model(model, test_dataloader)
    # Prefix the metric names so validation and test results can share one row.
    for k, v in validation_eval_dict.items():
        data_dict["validation_{}".format(k)] = v
    for k, v in test_eval_dict.items():
        data_dict["test_{}".format(k)] = v
    print(data_dict)
    eval_list.append(data_dict)

In [None]:
results_df = pd.DataFrame(eval_list)
results_df.head(5)