In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import torch

In [3]:
torch.cuda.is_available()

True

In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [5]:
!ls ../datasets/adept/train-dev-test-split/

ls: cannot access '../datasets/adept/train-dev-test-split/': No such file or directory


In [6]:
import json
import nltk
import re

In [7]:
adept_data_path = "../../../datasets/adept/train-dev-test-split"
split = "train"

In [8]:
# Read the requested ADEPT split; the context manager closes the file handle
# as soon as the JSON has been parsed instead of leaving it open in the kernel.
with open('{}/{}.json'.format(adept_data_path, split), 'r') as train_file:
    train_data = json.load(train_file)

In [9]:
df_train = pd.DataFrame(train_data)
df_train.head()

Unnamed: 0,sentence1,sentence2,modifier,noun,label,idx
0,The effect of sleeping is rejuvenation.,The effect of additional sleeping is rejuvenat...,additional,sleeping,3,13484
1,A toothbrush is for fresh breath.,A regular toothbrush is for fresh breath.,regular,toothbrush,2,2620
2,A scene is painted.,A negative scene is painted.,negative,scene,2,3324
3,A bone breaks a tooth.,An alleged bone breaks a tooth.,alleged,bone,2,10610
4,A trip causes a happening.,A fabulous trip causes a happening.,fabulous,trip,2,14917


In [10]:
label_to_class_map = {0:"Impossible", 1:"Less Likely", 2:"Equally Likely", 3:"More Likely", 4:"Necessarily True"}

In [11]:
label_to_class_map.values()

dict_values(['Impossible', 'Less Likely', 'Equally Likely', 'More Likely', 'Necessarily True'])

In [12]:
# Strip basic punctuation (, . ! ?) from the modified sentence and lowercase it;
# this column is what the concreteness lookup is applied to later.
# The pattern is a raw string so that `\.` is not treated as an (invalid) string escape.
df_train['sentence2_preprocessed'] = df_train['sentence2'].map(
    lambda sentence: re.sub(r'[,\.!?]', '', sentence).lower()
)
# df_train['class_label'] = df_train.label.map(lambda x: label_to_class_map[x])
df_train.head(2)

Unnamed: 0,sentence1,sentence2,modifier,noun,label,idx,sentence2_preprocessed
0,The effect of sleeping is rejuvenation.,The effect of additional sleeping is rejuvenat...,additional,sleeping,3,13484,the effect of additional sleeping is rejuvenation
1,A toothbrush is for fresh breath.,A regular toothbrush is for fresh breath.,regular,toothbrush,2,2620,a regular toothbrush is for fresh breath


In [13]:
# Concreteness ratings for roughly 40k words, described in
# https://pubmed.ncbi.nlm.nih.gov/24142837/ and downloaded from
# https://web.stanford.edu/class/linguist278/data/
concreteness_df = pd.read_csv('../../../datasets/concreteness/Concreteness_ratings_Brysbaert_et_al_BRM.csv')
concreteness_df.head(2)

Unnamed: 0,Word,Bigram,Conc.M,Conc.SD,Unknown,Total,Percent_known,SUBTLEX,Dom_Pos
0,roadsweeper,0,4.85,0.37,1,27,0.96,0,0
1,traindriver,0,4.54,0.71,3,29,0.9,0,0


In [14]:
# Map each word to its mean concreteness rating ('Conc.M') divided by 5.0.
# NOTE(review): 5.0 is presumably the top of the rating scale, so scores end up
# at most 1.0 — confirm against the dataset description.
# Built in one vectorised pass instead of iterating the frame row by row.
word_to_concreteness_score_map = dict(
    zip(concreteness_df['Word'], concreteness_df['Conc.M'] / 5.0)
)

In [15]:
len(word_to_concreteness_score_map.keys())

39954

In [16]:
def get_concreteness_score(word):
    """
    Look up the concreteness score of a single word.

    The score is read from ``word_to_concreteness_score_map``; a word that is
    not present in the map gets the default score 0.5. The returned value is
    rounded to three decimal places.
    """
    score = word_to_concreteness_score_map.get(word, 0.5)
    return round(score, 3)

In [17]:
def calculate_text_concreteness(text):
    """
    Calculate the average concreteness score of a text.

    The text is split with ``nltk.word_tokenize`` and every token is scored with
    ``get_concreteness_score``; the result is the arithmetic mean of those scores.

    Args:
        text: The string to score.

    Returns:
        The mean per-token concreteness score. If tokenization yields no tokens
        (for example an empty string), 0.5 is returned — the same default that
        ``get_concreteness_score`` uses for unknown words — instead of raising
        ``ZeroDivisionError``.
    """
    words = nltk.word_tokenize(text)
    if not words:
        # Nothing to average: fall back to the default score rather than dividing by zero.
        return 0.5
    concreteness_scores = [get_concreteness_score(word) for word in words]
    return sum(concreteness_scores) / len(concreteness_scores)

In [18]:
def calculate_text_concreteness_sequence(text):
    """
    Render the per-token concreteness scores of a text as a single string.

    The text is tokenized with ``nltk.word_tokenize``, each token is scored with
    ``get_concreteness_score``, and the scores are joined with single spaces in
    token order. No averaging is performed.
    """
    tokens = nltk.word_tokenize(text)
    return " ".join(str(get_concreteness_score(token)) for token in tokens)

In [19]:
# Example usage
text = "the laws of the world can't stop him"
concreteness_score = calculate_text_concreteness(text)
print(f"Concreteness Score: {concreteness_score}")

Concreteness Score: 0.5246666666666667


In [20]:
# Example usage
text = "car crash"
concreteness_score = calculate_text_concreteness(text)
print(f"Concreteness Score: {concreteness_score}")

Concreteness Score: 0.868


In [21]:
# Attach the per-token concreteness scores (one space-separated string per row)
# computed from the preprocessed training sentences.
df_train['concreteness_score_sequence'] = df_train.sentence2_preprocessed.apply(calculate_text_concreteness_sequence)

In [22]:
df_train.head(2)

Unnamed: 0,sentence1,sentence2,modifier,noun,label,idx,sentence2_preprocessed,concreteness_score_sequence
0,The effect of sleeping is rejuvenation.,The effect of additional sleeping is rejuvenat...,additional,sleeping,3,13484,the effect of additional sleeping is rejuvenation,0.286 0.36 0.334 0.486 0.846 0.318 0.42
1,A toothbrush is for fresh breath.,A regular toothbrush is for fresh breath.,regular,toothbrush,2,2620,a regular toothbrush is for fresh breath,0.292 0.48 1.0 0.318 0.326 0.394 0.872


In [23]:
df_train.shape

(12892, 8)

In [24]:
# Load the validation and test splits. Each file is opened in a context manager
# so its handle is closed once the JSON has been parsed.
with open('{}/{}.json'.format(adept_data_path, "val"), 'r') as val_file:
    df_validation = pd.DataFrame(json.load(val_file))
with open('{}/{}.json'.format(adept_data_path, "test"), 'r') as test_file:
    df_test = pd.DataFrame(json.load(test_file))
print(df_validation.shape, df_test.shape)

(1611, 6) (1612, 6)


In [25]:
# Apply the same preprocessing as for the training frame to both held-out splits:
# strip basic punctuation (, . ! ?), lowercase, then compute the per-token
# concreteness score string. The pattern is a raw string so that `\.` is not
# treated as an (invalid) string escape.
for split_df in (df_validation, df_test):
    split_df['sentence2_preprocessed'] = split_df['sentence2'].map(
        lambda sentence: re.sub(r'[,\.!?]', '', sentence).lower()
    )
    split_df['concreteness_score_sequence'] = split_df['sentence2_preprocessed'].apply(
        calculate_text_concreteness_sequence
    )

# Fine-Tuning DeBERTa Base on ADEPT

## Data Preparation

In [26]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModel
from transformers import DataCollatorWithPadding, get_scheduler
from datasets import load_dataset, Dataset, DatasetDict
from tqdm.auto import tqdm

In [27]:
adept_data_path

'../../../datasets/adept/train-dev-test-split'

In [28]:
!ls '../../../datasets/adept/train-dev-test-split'

README.md  test.json  train.json  val.json


In [29]:
# File names of the three ADEPT splits inside the dataset directory.
# NOTE(review): these names are not referenced again in the visible part of the
# notebook — confirm whether they are still needed.
train_split = "train.json"
validation_split = "val.json"
test_split = "test.json"

In [30]:
# Wrap the three pandas frames as Hugging Face datasets, keyed by split name.
split_frames = {
    'train': df_train,
    'validation': df_validation,
    'test': df_test,
}
adept_dataset = DatasetDict(
    {split_name: Dataset.from_pandas(frame) for split_name, frame in split_frames.items()}
)

In [31]:
# adept_dataset = load_dataset("json", data_files=data_files)
adept_dataset

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'modifier', 'noun', 'label', 'idx', 'sentence2_preprocessed', 'concreteness_score_sequence'],
        num_rows: 12892
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'modifier', 'noun', 'label', 'idx', 'sentence2_preprocessed', 'concreteness_score_sequence'],
        num_rows: 1611
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'modifier', 'noun', 'label', 'idx', 'sentence2_preprocessed', 'concreteness_score_sequence'],
        num_rows: 1612
    })
})

In [32]:
adept_dataset['train'].features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'modifier': Value(dtype='string', id=None),
 'noun': Value(dtype='string', id=None),
 'label': Value(dtype='int64', id=None),
 'idx': Value(dtype='int64', id=None),
 'sentence2_preprocessed': Value(dtype='string', id=None),
 'concreteness_score_sequence': Value(dtype='string', id=None)}

In [33]:
adept_dataset['train'][10]

{'sentence1': 'A year is made up of 365 days.',
 'sentence2': 'An outstanding year is made up of 365 days.',
 'modifier': 'outstanding',
 'noun': 'year',
 'label': 2,
 'idx': 2825,
 'sentence2_preprocessed': 'an outstanding year is made up of 365 days',
 'concreteness_score_sequence': '0.292 0.35 0.65 0.318 0.504 0.766 0.334 0.5 0.672'}

In [34]:
# Hyperparameters for this run; 'model_name' selects the checkpoint loaded below.
# NOTE(review): presumably the outcome of an earlier hyperparameter search — confirm.
best_params = {'learning_rate': 3.660515504756857e-05,
 'num_train_epochs': 3,
 'model_name': 'microsoft/deberta-base'}

In [35]:
checkpoint = best_params['model_name']

In [36]:
# Short names for candidate checkpoints.
# NOTE(review): this mapping is not referenced in the visible part of the notebook;
# the checkpoint actually used comes from best_params['model_name'] — confirm whether
# this is still needed.
models_dict = {"BERT": "bert-base-uncased",
"ROBERTA": "grammarly/detexd-roberta-base",
"DEBERTA": "sileod/deberta-v3-base-tasksource-nli"}

In [37]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [38]:
# model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=5, ignore_mismatched_sizes=True)

In [39]:
def tokenize_sentence(item):
    """
    Tokenize a batch as text pairs: the modified sentence ('sentence2') first,
    its concreteness score string ('concreteness_score_sequence') second,
    with truncation enabled.
    """
    sentences = item['sentence2']
    score_strings = item['concreteness_score_sequence']
    return tokenizer(sentences, score_strings, truncation=True)

In [40]:
tokenized_dataset = adept_dataset.map(tokenize_sentence, batched=True)

Map: 100%|██████████| 12892/12892 [00:00<00:00, 24308.99 examples/s]
Map: 100%|██████████| 1611/1611 [00:00<00:00, 31540.39 examples/s]
Map: 100%|██████████| 1612/1612 [00:00<00:00, 31820.94 examples/s]


In [41]:
tokenized_dataset['train'].features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'modifier': Value(dtype='string', id=None),
 'noun': Value(dtype='string', id=None),
 'label': Value(dtype='int64', id=None),
 'idx': Value(dtype='int64', id=None),
 'sentence2_preprocessed': Value(dtype='string', id=None),
 'concreteness_score_sequence': Value(dtype='string', id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [42]:
# Keep only what the model consumes: drop the raw text/metadata columns, rename
# the target column to "labels", and have the dataset return torch tensors.
columns_to_drop = ['sentence1', 'sentence2', 'idx', 'modifier', 'noun', 'sentence2_preprocessed', "concreteness_score_sequence"]
tokenized_dataset = (
    tokenized_dataset
    .remove_columns(columns_to_drop)
    .rename_column("label", "labels")
    .with_format("torch")
)
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 12892
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1611
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1612
    })
})

In [43]:
torch.manual_seed(4)

<torch._C.Generator at 0x7f980c29ea10>

In [44]:
custom_model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=5)

Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'pooler.dense.weight', 'pooler.dense.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [45]:
custom_model

DebertaForSequenceClassification(
  (deberta): DebertaModel(
    (embeddings): DebertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=0)
      (LayerNorm): DebertaLayerNorm()
      (dropout): StableDropout()
    )
    (encoder): DebertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaLayer(
          (attention): DebertaAttention(
            (self): DisentangledSelfAttention(
              (in_proj): Linear(in_features=768, out_features=2304, bias=False)
              (pos_dropout): StableDropout()
              (pos_proj): Linear(in_features=768, out_features=768, bias=False)
              (pos_q_proj): Linear(in_features=768, out_features=768, bias=True)
              (dropout): StableDropout()
            )
            (output): DebertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): DebertaLayerNorm()
              (dropout): StableDropout()
            )
          )
          (

#### Setting Up Data Loaders

In [46]:
# Training-loop settings.
# NOTE(review): best_params['num_train_epochs'] is 3, but training runs for 2 epochs
# here — confirm which value is intended.
num_epochs = 2
batch_size = 32

In [47]:
from torch.utils.data import DataLoader

In [48]:
data_collator = DataCollatorWithPadding(tokenizer)

In [49]:
# All three loaders share the batch size and the padding collator; only the
# training loader shuffles its examples.
loader_kwargs = dict(batch_size=batch_size, collate_fn=data_collator)
train_dataloader = DataLoader(tokenized_dataset['train'], shuffle=True, **loader_kwargs)
validation_dataloader = DataLoader(tokenized_dataset['validation'], **loader_kwargs)
test_dataloader = DataLoader(tokenized_dataset['test'], **loader_kwargs)

In [50]:
# `is_available` has to be *called*: the bare function object is always truthy,
# so without the parentheses "cuda" would be selected even on a CPU-only machine.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
custom_model.to(device)
print(device)

cuda


## Training

#### Setting up Optimizer with decaying learning rate

In [51]:
from torch.optim.adamw import AdamW

In [52]:
# Read the learning rate from best_params so the value is defined in one place
# (it is the same number that was previously repeated here as a literal).
optimizer = AdamW(custom_model.parameters(), lr=best_params['learning_rate'])

In [53]:
num_training_steps = num_epochs*len(train_dataloader)

In [54]:
# "linear" learning-rate schedule with no warmup steps, spanning every optimizer
# step of the run (num_epochs * number of training batches).
learning_rate_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

#### Training Loop

In [55]:
# Fine-tuning loop: one optimizer step and one scheduler step per batch.
# Put the model in training mode before iterating.
custom_model.train()
# One progress-bar tick per optimizer step across all epochs.
progress_bar = tqdm(range(num_training_steps))
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Move every tensor of the batch to the device the model lives on.
        batch = {k:v.to(device) for k, v in batch.items()}
        # The batch includes `labels`, and the loss is read from the model output.
        outputs = custom_model(**batch)
        loss = outputs.loss
        # Backpropagate to compute gradients.
        loss.backward()
        # Apply the gradients to the weights.
        optimizer.step()
        # Advance the learning-rate schedule by one step.
        learning_rate_scheduler.step()
        # Clear gradients so they do not accumulate into the next batch.
        optimizer.zero_grad()
        # Advance the progress bar.
        progress_bar.update(1)

  0%|          | 0/806 [00:00<?, ?it/s]You're using a DebertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 806/806 [05:21<00:00,  2.68it/s]

### Evaluation

In [56]:
custom_model.eval()

DebertaForSequenceClassification(
  (deberta): DebertaModel(
    (embeddings): DebertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=0)
      (LayerNorm): DebertaLayerNorm()
      (dropout): StableDropout()
    )
    (encoder): DebertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaLayer(
          (attention): DebertaAttention(
            (self): DisentangledSelfAttention(
              (in_proj): Linear(in_features=768, out_features=2304, bias=False)
              (pos_dropout): StableDropout()
              (pos_proj): Linear(in_features=768, out_features=768, bias=False)
              (pos_q_proj): Linear(in_features=768, out_features=768, bias=True)
              (dropout): StableDropout()
            )
            (output): DebertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): DebertaLayerNorm()
              (dropout): StableDropout()
            )
          )
          (

In [57]:
import evaluate

In [58]:
# Load the evaluation metrics. ROC AUC is loaded with its "multiclass"
# configuration because the classifier has five output classes.
accuracy = evaluate.load('accuracy')
precision = evaluate.load('precision')
recall = evaluate.load('recall')
f1 = evaluate.load('f1')
roc_auc =  evaluate.load("roc_auc", "multiclass")

In [59]:
metrics = [accuracy, precision, recall, f1]

#### Evaluation Loop

In [60]:
from sklearn.metrics import roc_auc_score

In [61]:
from datasets import load_metric

In [62]:
# Accumulate predictions over the validation split, one batch at a time.
for batch in validation_dataloader:
    batch = {name: tensor.to(device) for name, tensor in batch.items()}
    # Inference only: no gradient tracking needed.
    with torch.no_grad():
        outputs = custom_model(**batch)

    logits = outputs.logits
    # Hard class predictions feed accuracy, precision, recall and F1.
    predictions = logits.argmax(dim=-1)
    # Softmax over the last axis gives one probability per class; the full
    # distribution (not a single "positive" class) is passed to multiclass ROC AUC.
    probabilities = torch.nn.functional.softmax(logits, dim=-1)
    positive_probabilities = probabilities

    for metric in metrics:
        metric.add_batch(predictions=predictions, references=batch['labels'])
    roc_auc.add_batch(prediction_scores=positive_probabilities, references=batch['labels'])

# Collect the final scores: macro-averaged precision/recall/F1 and
# one-vs-one macro-averaged ROC AUC.
validation_eval_dict = {
    **accuracy.compute(),
    **precision.compute(average="macro"),
    **recall.compute(average="macro"),
    **f1.compute(average="macro"),
    **roc_auc.compute(multi_class='ovo', average="macro"),
}


In [63]:
validation_eval_dict

{'accuracy': 0.707635009310987,
 'precision': 0.43933949561038654,
 'recall': 0.3683868322677431,
 'f1': 0.3920092483048548,
 'roc_auc': 0.7571802001571821}

In [64]:
# Accumulate predictions over the test split, one batch at a time.
for batch in test_dataloader:
    batch = {name: tensor.to(device) for name, tensor in batch.items()}
    # Inference only: no gradient tracking needed.
    with torch.no_grad():
        outputs = custom_model(**batch)

    logits = outputs.logits
    # Hard class predictions feed accuracy, precision, recall and F1.
    predictions = logits.argmax(dim=-1)
    # Softmax over the last axis gives one probability per class; the full
    # distribution (not a single "positive" class) is passed to multiclass ROC AUC.
    probabilities = torch.nn.functional.softmax(logits, dim=-1)
    positive_probabilities = probabilities

    for metric in metrics:
        metric.add_batch(predictions=predictions, references=batch['labels'])
    roc_auc.add_batch(prediction_scores=positive_probabilities, references=batch['labels'])

# Collect the final scores: macro-averaged precision/recall/F1 and
# one-vs-one macro-averaged ROC AUC.
test_eval_dict = {
    **accuracy.compute(),
    **precision.compute(average="macro"),
    **recall.compute(average="macro"),
    **f1.compute(average="macro"),
    **roc_auc.compute(multi_class='ovo', average="macro"),
}


In [65]:
test_eval_dict

{'accuracy': 0.7295285359801489,
 'precision': 0.46807625765550753,
 'recall': 0.3794498702591282,
 'f1': 0.4090051076275638,
 'roc_auc': 0.7824726370438758}

In [66]:
# custom_model.save_pretrained("deberta_with_concreteness_sequence_2_epoch")