In [1]:
from tqdm.auto import tqdm
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, Trainer, TrainingArguments
from torch.utils.data import Dataset
import torch
import os
import json
import re
from tqdm import tqdm
tqdm.pandas()
from evaluate import load
from sklearn.model_selection import train_test_split

In [2]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-bert")



In [3]:
# Initialize the model
def model_init():
    return AutoModelForSequenceClassification.from_pretrained("ai4bharat/indic-bert", num_labels=2)

In [4]:
# Set device (CUDA or CPU)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

Using device: cuda


In [5]:
# Load the data
train_df = pd.read_json(r"D:/BP/data/full_data_train.json")
test_df = pd.read_json(r"D:/BP/data/full_data_test.json")


In [6]:
# For hyperparameter search, use a sample of 10% of the data
hp_train_df = train_df.sample(frac=0.1, random_state=42).reset_index(drop=True)
hp_test_df = test_df.sample(frac=0.1, random_state=42).reset_index(drop=True)

In [7]:
# Custom Dataset class for handling the input data
class LegalDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        entry = self.data[idx]
        text_data = entry['text']

        # Combine facts and arguments + judge opinion
        combined_text = " ".join(text_data['facts-and-arguments'] + text_data['judge-opinion'])

        # Tokenize
        encoded_sent = self.tokenizer.encode_plus(
            text=combined_text,
            add_special_tokens=True,
            max_length=512,
            padding='max_length',
            return_attention_mask=True,
            truncation=True
        )

        input_ids = torch.tensor(encoded_sent['input_ids'])
        attention_mask = torch.tensor(encoded_sent['attention_mask'])
        label = torch.tensor(entry['label'])

        return {'input_ids': input_ids, 'attention_mask': attention_mask, 'label': label}


In [8]:
# Create datasets
train_dataset = LegalDataset(train_df.to_dict('records'), tokenizer)
test_dataset = LegalDataset(test_df.to_dict('records'), tokenizer)
hp_train_dataset = LegalDataset(hp_train_df.to_dict('records'), tokenizer)
hp_test_dataset = LegalDataset(hp_test_df.to_dict('records'), tokenizer)


In [9]:
# Load evaluation metrics
accuracy_metric = load("accuracy")
f1_metric = load("f1")

In [10]:

# Metric computation function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    f1 = f1_metric.compute(predictions=predictions, references=labels, average="micro")
    return {'accuracy': accuracy["accuracy"], 'f1-score': f1["f1"]}


In [11]:
# Hyperparameter search space for Optuna
def my_hp_space(trial):
    return {
        "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True),
        "weight_decay": trial.suggest_float("weight_decay", 0.005, 0.05),
        "adam_beta1": trial.suggest_float("adam_beta1", 0.75, 0.95),
        "adam_beta2": trial.suggest_float("adam_beta2", 0.99, 0.9999),
        "adam_epsilon": trial.suggest_float("adam_epsilon", 1e-9, 1e-7, log=True)
    }

In [12]:
# Training arguments
training_args = TrainingArguments(
    output_dir=os.path.join("TFIDF-INDIC", "output"),
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir=os.path.join("TFIDF-INDIC", "logs"),
    evaluation_strategy="epoch",
    logging_steps=250,
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1-score",
    save_safetensors=False,
)


# Trainer for hyperparameter search
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=hp_train_dataset,
    eval_dataset=hp_test_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Perform hyperparameter search
best_run = trainer.hyperparameter_search(n_trials=10, direction="maximize", hp_space=my_hp_space)

[I 2024-10-07 09:34:46,020] A new study created in memory with name: no-name-9611d48c-a034-4bc3-9864-b1f5a0aad3c1
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/3732 [00:00<?, ?it/s]

{'loss': 0.6788, 'grad_norm': 0.5041502714157104, 'learning_rate': 3.823437426552948e-05, 'epoch': 0.2}
{'loss': 0.6673, 'grad_norm': 0.20781925320625305, 'learning_rate': 7.646874853105895e-05, 'epoch': 0.4}
{'loss': 0.669, 'grad_norm': 0.20996421575546265, 'learning_rate': 7.055377726473323e-05, 'epoch': 0.6}
{'loss': 0.6697, 'grad_norm': 0.20376186072826385, 'learning_rate': 6.46388059984075e-05, 'epoch': 0.8}


  0%|          | 0/356 [00:00<?, ?it/s]

{'eval_loss': 0.6663722395896912, 'eval_accuracy': 0.6153305203938115, 'eval_f1-score': 0.6153305203938115, 'eval_runtime': 221.3733, 'eval_samples_per_second': 12.847, 'eval_steps_per_second': 1.608, 'epoch': 1.0}
{'loss': 0.6653, 'grad_norm': 0.5340359807014465, 'learning_rate': 5.872383473208178e-05, 'epoch': 1.0}
{'loss': 0.6674, 'grad_norm': 0.20446573197841644, 'learning_rate': 5.2808863465756056e-05, 'epoch': 1.21}
{'loss': 0.6651, 'grad_norm': 0.46533510088920593, 'learning_rate': 4.689389219943034e-05, 'epoch': 1.41}
{'loss': 0.6703, 'grad_norm': 0.16566941142082214, 'learning_rate': 4.0978920933104617e-05, 'epoch': 1.61}
{'loss': 0.6545, 'grad_norm': 0.20004145801067352, 'learning_rate': 3.506394966677889e-05, 'epoch': 1.81}


  0%|          | 0/356 [00:00<?, ?it/s]

{'eval_loss': 0.666498064994812, 'eval_accuracy': 0.6153305203938115, 'eval_f1-score': 0.6153305203938115, 'eval_runtime': 218.1568, 'eval_samples_per_second': 13.036, 'eval_steps_per_second': 1.632, 'epoch': 2.0}
{'loss': 0.6774, 'grad_norm': 0.4276879131793976, 'learning_rate': 2.9148978400453167e-05, 'epoch': 2.01}
{'loss': 0.673, 'grad_norm': 0.5328633785247803, 'learning_rate': 2.323400713412744e-05, 'epoch': 2.21}
{'loss': 0.6624, 'grad_norm': 0.5240008234977722, 'learning_rate': 1.7319035867801718e-05, 'epoch': 2.41}
{'loss': 0.6698, 'grad_norm': 0.5653764605522156, 'learning_rate': 1.1404064601475995e-05, 'epoch': 2.61}
{'loss': 0.6578, 'grad_norm': 4.545071601867676, 'learning_rate': 5.489093335150271e-06, 'epoch': 2.81}


  0%|          | 0/356 [00:00<?, ?it/s]

{'eval_loss': 0.6655568480491638, 'eval_accuracy': 0.6153305203938115, 'eval_f1-score': 0.6153305203938115, 'eval_runtime': 223.7732, 'eval_samples_per_second': 12.709, 'eval_steps_per_second': 1.591, 'epoch': 3.0}


[I 2024-10-07 11:32:04,733] Trial 0 finished with value: 1.230661040787623 and parameters: {'learning_rate': 7.646874853105895e-05, 'weight_decay': 0.007266970660715497, 'adam_beta1': 0.8677573568579126, 'adam_beta2': 0.9958853170122732, 'adam_epsilon': 1.4717535558267976e-08}. Best is trial 0 with value: 1.230661040787623.


{'train_runtime': 7037.3551, 'train_samples_per_second': 4.243, 'train_steps_per_second': 0.53, 'train_loss': 0.6678296218986552, 'epoch': 3.0}


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/3732 [00:00<?, ?it/s]

{'loss': 0.6788, 'grad_norm': 1.6254963874816895, 'learning_rate': 2.547909307035399e-05, 'epoch': 0.2}
{'loss': 0.6665, 'grad_norm': 0.18234169483184814, 'learning_rate': 5.095818614070798e-05, 'epoch': 0.4}
{'loss': 0.6679, 'grad_norm': 0.18612316250801086, 'learning_rate': 4.701649476224975e-05, 'epoch': 0.6}
{'loss': 0.6693, 'grad_norm': 0.20233172178268433, 'learning_rate': 4.307480338379152e-05, 'epoch': 0.8}


  0%|          | 0/356 [00:00<?, ?it/s]

{'eval_loss': 0.6663767099380493, 'eval_accuracy': 0.6153305203938115, 'eval_f1-score': 0.6153305203938115, 'eval_runtime': 220.8566, 'eval_samples_per_second': 12.877, 'eval_steps_per_second': 1.612, 'epoch': 1.0}
{'loss': 0.6652, 'grad_norm': 0.5417895913124084, 'learning_rate': 3.913311200533329e-05, 'epoch': 1.0}
{'loss': 0.6674, 'grad_norm': 0.20214825868606567, 'learning_rate': 3.519142062687506e-05, 'epoch': 1.21}
{'loss': 0.665, 'grad_norm': 0.46758222579956055, 'learning_rate': 3.124972924841684e-05, 'epoch': 1.41}
{'loss': 0.6702, 'grad_norm': 0.17234039306640625, 'learning_rate': 2.730803786995861e-05, 'epoch': 1.61}
{'loss': 0.6545, 'grad_norm': 0.18183135986328125, 'learning_rate': 2.336634649150038e-05, 'epoch': 1.81}


  0%|          | 0/356 [00:00<?, ?it/s]

{'eval_loss': 0.6666224002838135, 'eval_accuracy': 0.6153305203938115, 'eval_f1-score': 0.6153305203938115, 'eval_runtime': 221.6234, 'eval_samples_per_second': 12.833, 'eval_steps_per_second': 1.606, 'epoch': 2.0}
{'loss': 0.6774, 'grad_norm': 0.4249871075153351, 'learning_rate': 1.942465511304215e-05, 'epoch': 2.01}
{'loss': 0.673, 'grad_norm': 0.540720522403717, 'learning_rate': 1.5482963734583922e-05, 'epoch': 2.21}
{'loss': 0.6624, 'grad_norm': 0.5242999792098999, 'learning_rate': 1.1541272356125693e-05, 'epoch': 2.41}
{'loss': 0.6698, 'grad_norm': 0.5669934749603271, 'learning_rate': 7.5995809776674645e-06, 'epoch': 2.61}
{'loss': 0.6578, 'grad_norm': 0.2035922259092331, 'learning_rate': 3.657889599209236e-06, 'epoch': 2.81}


  0%|          | 0/356 [00:00<?, ?it/s]

{'eval_loss': 0.6655935049057007, 'eval_accuracy': 0.6153305203938115, 'eval_f1-score': 0.6153305203938115, 'eval_runtime': 218.6211, 'eval_samples_per_second': 13.009, 'eval_steps_per_second': 1.628, 'epoch': 3.0}


[I 2024-10-07 13:30:12,681] Trial 1 finished with value: 1.230661040787623 and parameters: {'learning_rate': 5.095818614070798e-05, 'weight_decay': 0.03431183228346335, 'adam_beta1': 0.8300069547892607, 'adam_beta2': 0.994837228693831, 'adam_epsilon': 9.354225721041998e-08}. Best is trial 0 with value: 1.230661040787623.


{'train_runtime': 7086.5454, 'train_samples_per_second': 4.213, 'train_steps_per_second': 0.527, 'train_loss': 0.6676642488436684, 'epoch': 3.0}


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/3732 [00:00<?, ?it/s]

{'loss': 0.682, 'grad_norm': 1.9161343574523926, 'learning_rate': 1.2107957302499056e-05, 'epoch': 0.2}
{'loss': 0.6716, 'grad_norm': 0.597087025642395, 'learning_rate': 2.4215914604998112e-05, 'epoch': 0.4}
{'loss': 0.6746, 'grad_norm': 0.7952197194099426, 'learning_rate': 2.2342777646071896e-05, 'epoch': 0.6}
{'loss': 0.673, 'grad_norm': 0.599053680896759, 'learning_rate': 2.046964068714568e-05, 'epoch': 0.8}


  0%|          | 0/356 [00:00<?, ?it/s]

{'eval_loss': 0.6665228605270386, 'eval_accuracy': 0.6153305203938115, 'eval_f1-score': 0.6153305203938115, 'eval_runtime': 218.3233, 'eval_samples_per_second': 13.027, 'eval_steps_per_second': 1.631, 'epoch': 1.0}
{'loss': 0.6689, 'grad_norm': 1.109226107597351, 'learning_rate': 1.8596503728219465e-05, 'epoch': 1.0}
{'loss': 0.6696, 'grad_norm': 0.6464194059371948, 'learning_rate': 1.672336676929325e-05, 'epoch': 1.21}
{'loss': 0.6664, 'grad_norm': 0.6577869057655334, 'learning_rate': 1.4850229810367036e-05, 'epoch': 1.41}
{'loss': 0.6732, 'grad_norm': 0.410763144493103, 'learning_rate': 1.297709285144082e-05, 'epoch': 1.61}
{'loss': 0.6545, 'grad_norm': 0.41636529564857483, 'learning_rate': 1.1103955892514604e-05, 'epoch': 1.81}


  0%|          | 0/356 [00:00<?, ?it/s]

{'eval_loss': 0.6664108633995056, 'eval_accuracy': 0.6153305203938115, 'eval_f1-score': 0.6153305203938115, 'eval_runtime': 223.7566, 'eval_samples_per_second': 12.71, 'eval_steps_per_second': 1.591, 'epoch': 2.0}
{'loss': 0.6782, 'grad_norm': 0.8310196399688721, 'learning_rate': 9.230818933588389e-06, 'epoch': 2.01}
{'loss': 0.6736, 'grad_norm': 1.0135592222213745, 'learning_rate': 7.357681974662174e-06, 'epoch': 2.21}
{'loss': 0.6638, 'grad_norm': 4.482280254364014, 'learning_rate': 5.484545015735959e-06, 'epoch': 2.41}
{'loss': 0.6212, 'grad_norm': 12.403129577636719, 'learning_rate': 3.611408056809743e-06, 'epoch': 2.61}
{'loss': 0.5652, 'grad_norm': 14.847344398498535, 'learning_rate': 1.7382710978835278e-06, 'epoch': 2.81}


  0%|          | 0/356 [00:00<?, ?it/s]

{'eval_loss': 0.5750898718833923, 'eval_accuracy': 0.7021800281293952, 'eval_f1-score': 0.7021800281293952, 'eval_runtime': 200.6568, 'eval_samples_per_second': 14.173, 'eval_steps_per_second': 1.774, 'epoch': 3.0}


[I 2024-10-07 15:27:30,588] Trial 2 finished with value: 1.4043600562587903 and parameters: {'learning_rate': 2.4215914604998112e-05, 'weight_decay': 0.03430236461534156, 'adam_beta1': 0.7694096806477002, 'adam_beta2': 0.9926678458745857, 'adam_epsilon': 2.8826904213128078e-08}. Best is trial 2 with value: 1.4043600562587903.


{'train_runtime': 7035.0941, 'train_samples_per_second': 4.244, 'train_steps_per_second': 0.53, 'train_loss': 0.6552073692329955, 'epoch': 3.0}


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/3732 [00:00<?, ?it/s]

{'loss': 0.6799, 'grad_norm': 1.9066578149795532, 'learning_rate': 1.7037552893984485e-05, 'epoch': 0.2}
{'loss': 0.6711, 'grad_norm': 0.8031437993049622, 'learning_rate': 3.407510578796897e-05, 'epoch': 0.4}
{'loss': 0.6786, 'grad_norm': 0.7027639150619507, 'learning_rate': 3.1439345748676815e-05, 'epoch': 0.6}
{'loss': 0.6746, 'grad_norm': 0.8678291440010071, 'learning_rate': 2.880358570938466e-05, 'epoch': 0.8}


  0%|          | 0/356 [00:00<?, ?it/s]

{'eval_loss': 0.6663063764572144, 'eval_accuracy': 0.6153305203938115, 'eval_f1-score': 0.6153305203938115, 'eval_runtime': 217.439, 'eval_samples_per_second': 13.08, 'eval_steps_per_second': 1.637, 'epoch': 1.0}
{'loss': 0.6697, 'grad_norm': 1.122818946838379, 'learning_rate': 2.6167825670092507e-05, 'epoch': 1.0}
{'loss': 0.6703, 'grad_norm': 0.8163560628890991, 'learning_rate': 2.353206563080035e-05, 'epoch': 1.21}
{'loss': 0.6667, 'grad_norm': 0.7110451459884644, 'learning_rate': 2.08963055915082e-05, 'epoch': 1.41}
{'loss': 0.6743, 'grad_norm': 0.6900691390037537, 'learning_rate': 1.8260545552216044e-05, 'epoch': 1.61}
{'loss': 0.6511, 'grad_norm': 1.575913667678833, 'learning_rate': 1.562478551292389e-05, 'epoch': 1.81}


  0%|          | 0/356 [00:00<?, ?it/s]

{'eval_loss': 0.6520214080810547, 'eval_accuracy': 0.6237693389592124, 'eval_f1-score': 0.6237693389592124, 'eval_runtime': 220.0067, 'eval_samples_per_second': 12.927, 'eval_steps_per_second': 1.618, 'epoch': 2.0}
{'loss': 0.6659, 'grad_norm': 8.741456031799316, 'learning_rate': 1.2989025473631736e-05, 'epoch': 2.01}
{'loss': 0.6428, 'grad_norm': 2.6647322177886963, 'learning_rate': 1.0353265434339582e-05, 'epoch': 2.21}
{'loss': 0.6204, 'grad_norm': 8.341981887817383, 'learning_rate': 7.717505395047427e-06, 'epoch': 2.41}
{'loss': 0.5767, 'grad_norm': 11.973782539367676, 'learning_rate': 5.081745355755273e-06, 'epoch': 2.61}
{'loss': 0.5341, 'grad_norm': 102.30635833740234, 'learning_rate': 2.4459853164631187e-06, 'epoch': 2.81}


  0%|          | 0/356 [00:00<?, ?it/s]

{'eval_loss': 0.5360414981842041, 'eval_accuracy': 0.7387482419127989, 'eval_f1-score': 0.7387482419127989, 'eval_runtime': 221.1731, 'eval_samples_per_second': 12.859, 'eval_steps_per_second': 1.61, 'epoch': 3.0}


[I 2024-10-07 17:24:44,617] Trial 3 finished with value: 1.4774964838255977 and parameters: {'learning_rate': 3.407510578796897e-05, 'weight_decay': 0.03531185348182403, 'adam_beta1': 0.9206002002776718, 'adam_beta2': 0.99340586644022, 'adam_epsilon': 2.691886913031349e-08}. Best is trial 3 with value: 1.4774964838255977.


{'train_runtime': 7032.6055, 'train_samples_per_second': 4.245, 'train_steps_per_second': 0.531, 'train_loss': 0.6422663890024984, 'epoch': 3.0}


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/3732 [00:00<?, ?it/s]

{'loss': 0.6922, 'grad_norm': 1.6862844228744507, 'learning_rate': 1.104436851827902e-06, 'epoch': 0.2}
{'loss': 0.6722, 'grad_norm': 4.360257625579834, 'learning_rate': 2.208873703655804e-06, 'epoch': 0.4}
{'loss': 0.6616, 'grad_norm': 3.796442747116089, 'learning_rate': 2.038014042172527e-06, 'epoch': 0.6}
{'loss': 0.6508, 'grad_norm': 19.051544189453125, 'learning_rate': 1.8671543806892502e-06, 'epoch': 0.8}


  0%|          | 0/356 [00:00<?, ?it/s]

{'eval_loss': 0.629963219165802, 'eval_accuracy': 0.6153305203938115, 'eval_f1-score': 0.6153305203938115, 'eval_runtime': 217.5873, 'eval_samples_per_second': 13.071, 'eval_steps_per_second': 1.636, 'epoch': 1.0}
{'loss': 0.6415, 'grad_norm': 21.101470947265625, 'learning_rate': 1.6962947192059732e-06, 'epoch': 1.0}
{'loss': 0.6345, 'grad_norm': 12.217804908752441, 'learning_rate': 1.5254350577226962e-06, 'epoch': 1.21}
{'loss': 0.6197, 'grad_norm': 9.992962837219238, 'learning_rate': 1.3545753962394194e-06, 'epoch': 1.41}
{'loss': 0.6376, 'grad_norm': 26.90701675415039, 'learning_rate': 1.1837157347561426e-06, 'epoch': 1.61}
{'loss': 0.5953, 'grad_norm': 41.668479919433594, 'learning_rate': 1.0128560732728656e-06, 'epoch': 1.81}


  0%|          | 0/356 [00:00<?, ?it/s]

{'eval_loss': 0.6059439182281494, 'eval_accuracy': 0.6642053445850914, 'eval_f1-score': 0.6642053445850914, 'eval_runtime': 217.3146, 'eval_samples_per_second': 13.087, 'eval_steps_per_second': 1.638, 'epoch': 2.0}
{'loss': 0.6264, 'grad_norm': 29.919490814208984, 'learning_rate': 8.419964117895887e-07, 'epoch': 2.01}
{'loss': 0.6065, 'grad_norm': 29.250368118286133, 'learning_rate': 6.711367503063117e-07, 'epoch': 2.21}
{'loss': 0.5927, 'grad_norm': 25.964027404785156, 'learning_rate': 5.002770888230348e-07, 'epoch': 2.41}
{'loss': 0.5826, 'grad_norm': 59.27925491333008, 'learning_rate': 3.2941742733975786e-07, 'epoch': 2.61}
{'loss': 0.5775, 'grad_norm': 96.77484893798828, 'learning_rate': 1.5855776585648097e-07, 'epoch': 2.81}


  0%|          | 0/356 [00:00<?, ?it/s]

{'eval_loss': 0.592306911945343, 'eval_accuracy': 0.6958509142053446, 'eval_f1-score': 0.6958509142053446, 'eval_runtime': 227.4902, 'eval_samples_per_second': 12.502, 'eval_steps_per_second': 1.565, 'epoch': 3.0}


[I 2024-10-07 19:22:03,646] Trial 4 finished with value: 1.3917018284106892 and parameters: {'learning_rate': 2.208873703655804e-06, 'weight_decay': 0.04287427710865937, 'adam_beta1': 0.8824516184010137, 'adam_beta2': 0.9971084500877749, 'adam_epsilon': 8.042512066006131e-08}. Best is trial 3 with value: 1.4774964838255977.


{'train_runtime': 7037.6457, 'train_samples_per_second': 4.242, 'train_steps_per_second': 0.53, 'train_loss': 0.6251812206076145, 'epoch': 3.0}


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/3732 [00:00<?, ?it/s]

{'loss': 0.6846, 'grad_norm': 4.233879566192627, 'learning_rate': 4.799660172905951e-06, 'epoch': 0.2}
{'loss': 0.6663, 'grad_norm': 1.1304798126220703, 'learning_rate': 9.599320345811902e-06, 'epoch': 0.4}
{'loss': 0.6698, 'grad_norm': 4.157764911651611, 'learning_rate': 8.856798660646995e-06, 'epoch': 0.6}
{'loss': 0.6733, 'grad_norm': 0.847204327583313, 'learning_rate': 8.11427697548209e-06, 'epoch': 0.8}


  0%|          | 0/356 [00:00<?, ?it/s]

{'eval_loss': 0.6704636216163635, 'eval_accuracy': 0.6153305203938115, 'eval_f1-score': 0.6153305203938115, 'eval_runtime': 227.7733, 'eval_samples_per_second': 12.486, 'eval_steps_per_second': 1.563, 'epoch': 1.0}
{'loss': 0.6695, 'grad_norm': 3.229316473007202, 'learning_rate': 7.371755290317184e-06, 'epoch': 1.0}
{'loss': 0.6709, 'grad_norm': 1.2789870500564575, 'learning_rate': 6.629233605152278e-06, 'epoch': 1.21}
{'loss': 0.6624, 'grad_norm': 6.066472053527832, 'learning_rate': 5.886711919987373e-06, 'epoch': 1.41}
{'loss': 0.6686, 'grad_norm': 2.796973705291748, 'learning_rate': 5.144190234822467e-06, 'epoch': 1.61}
{'loss': 0.6451, 'grad_norm': 5.11477518081665, 'learning_rate': 4.401668549657562e-06, 'epoch': 1.81}


  0%|          | 0/356 [00:00<?, ?it/s]

{'eval_loss': 0.6268689036369324, 'eval_accuracy': 0.6343178621659634, 'eval_f1-score': 0.6343178621659634, 'eval_runtime': 220.5235, 'eval_samples_per_second': 12.897, 'eval_steps_per_second': 1.614, 'epoch': 2.0}
{'loss': 0.659, 'grad_norm': 6.728235244750977, 'learning_rate': 3.6591468644926555e-06, 'epoch': 2.01}
{'loss': 0.629, 'grad_norm': 16.033184051513672, 'learning_rate': 2.9166251793277498e-06, 'epoch': 2.21}
{'loss': 0.6068, 'grad_norm': 34.65510177612305, 'learning_rate': 2.174103494162844e-06, 'epoch': 2.41}
{'loss': 0.5781, 'grad_norm': 30.347795486450195, 'learning_rate': 1.4315818089979383e-06, 'epoch': 2.61}
{'loss': 0.5502, 'grad_norm': 28.71012306213379, 'learning_rate': 6.890601238330325e-07, 'epoch': 2.81}


  0%|          | 0/356 [00:00<?, ?it/s]

{'eval_loss': 0.5605212450027466, 'eval_accuracy': 0.7218706047819972, 'eval_f1-score': 0.7218706047819972, 'eval_runtime': 220.1567, 'eval_samples_per_second': 12.918, 'eval_steps_per_second': 1.617, 'epoch': 3.0}


[I 2024-10-07 22:28:49,817] Trial 5 finished with value: 1.4437412095639943 and parameters: {'learning_rate': 9.599320345811902e-06, 'weight_decay': 0.044925758402875854, 'adam_beta1': 0.836436577347852, 'adam_beta2': 0.999541155094239, 'adam_epsilon': 4.012298217161426e-09}. Best is trial 3 with value: 1.4774964838255977.


{'train_runtime': 11187.3212, 'train_samples_per_second': 2.669, 'train_steps_per_second': 0.334, 'train_loss': 0.640534040117826, 'epoch': 3.0}


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/3732 [00:00<?, ?it/s]

{'loss': 0.6793, 'grad_norm': 0.6494998931884766, 'learning_rate': 2.2677128530668073e-05, 'epoch': 0.2}
{'loss': 0.6692, 'grad_norm': 0.22072833776474, 'learning_rate': 4.5354257061336145e-05, 'epoch': 0.4}
{'loss': 0.6693, 'grad_norm': 0.23259572684764862, 'learning_rate': 4.184603791983428e-05, 'epoch': 0.6}
{'loss': 0.6699, 'grad_norm': 0.22088739275932312, 'learning_rate': 3.833781877833241e-05, 'epoch': 0.8}


  0%|          | 0/356 [00:00<?, ?it/s]

{'eval_loss': 0.6660535931587219, 'eval_accuracy': 0.6153305203938115, 'eval_f1-score': 0.6153305203938115, 'eval_runtime': 218.04, 'eval_samples_per_second': 13.043, 'eval_steps_per_second': 1.633, 'epoch': 1.0}
{'loss': 0.6658, 'grad_norm': 0.6191624999046326, 'learning_rate': 3.482959963683054e-05, 'epoch': 1.0}
{'loss': 0.6674, 'grad_norm': 0.21107158064842224, 'learning_rate': 3.132138049532867e-05, 'epoch': 1.21}
{'loss': 0.665, 'grad_norm': 0.5341098308563232, 'learning_rate': 2.781316135382681e-05, 'epoch': 1.41}
{'loss': 0.6701, 'grad_norm': 0.181875079870224, 'learning_rate': 2.4304942212324942e-05, 'epoch': 1.61}
{'loss': 0.6546, 'grad_norm': 0.2089805155992508, 'learning_rate': 2.0796723070823074e-05, 'epoch': 1.81}


  0%|          | 0/356 [00:00<?, ?it/s]

[I 2024-10-07 23:47:03,376] Trial 6 pruned. 


{'eval_loss': 0.6660272479057312, 'eval_accuracy': 0.6153305203938115, 'eval_f1-score': 0.6153305203938115, 'eval_runtime': 218.3733, 'eval_samples_per_second': 13.024, 'eval_steps_per_second': 1.63, 'epoch': 2.0}


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/3732 [00:00<?, ?it/s]

{'loss': 0.6933, 'grad_norm': 1.9709734916687012, 'learning_rate': 7.993324440647616e-07, 'epoch': 0.2}
{'loss': 0.6773, 'grad_norm': 2.909510612487793, 'learning_rate': 1.5986648881295231e-06, 'epoch': 0.4}
{'loss': 0.6612, 'grad_norm': 8.1226167678833, 'learning_rate': 1.4750057847779202e-06, 'epoch': 0.6}
{'loss': 0.6561, 'grad_norm': 5.385683059692383, 'learning_rate': 1.351346681426317e-06, 'epoch': 0.8}


  0%|          | 0/356 [00:00<?, ?it/s]

{'eval_loss': 0.6399262547492981, 'eval_accuracy': 0.6153305203938115, 'eval_f1-score': 0.6153305203938115, 'eval_runtime': 221.2736, 'eval_samples_per_second': 12.853, 'eval_steps_per_second': 1.609, 'epoch': 1.0}
{'loss': 0.6467, 'grad_norm': 25.98251724243164, 'learning_rate': 1.2276875780747142e-06, 'epoch': 1.0}
{'loss': 0.6426, 'grad_norm': 19.737024307250977, 'learning_rate': 1.1040284747231113e-06, 'epoch': 1.21}
{'loss': 0.6287, 'grad_norm': 8.75995922088623, 'learning_rate': 9.803693713715084e-07, 'epoch': 1.41}
{'loss': 0.6394, 'grad_norm': 14.924677848815918, 'learning_rate': 8.567102680199053e-07, 'epoch': 1.61}
{'loss': 0.6131, 'grad_norm': 38.41958236694336, 'learning_rate': 7.330511646683024e-07, 'epoch': 1.81}


  0%|          | 0/356 [00:00<?, ?it/s]

{'eval_loss': 0.6410147547721863, 'eval_accuracy': 0.6223628691983122, 'eval_f1-score': 0.6223628691983122, 'eval_runtime': 222.3068, 'eval_samples_per_second': 12.793, 'eval_steps_per_second': 1.601, 'epoch': 2.0}
{'loss': 0.6373, 'grad_norm': 25.605606079101562, 'learning_rate': 6.093920613166994e-07, 'epoch': 2.01}
{'loss': 0.6257, 'grad_norm': 20.130859375, 'learning_rate': 4.857329579650964e-07, 'epoch': 2.21}
{'loss': 0.6103, 'grad_norm': 51.88593292236328, 'learning_rate': 3.620738546134935e-07, 'epoch': 2.41}
{'loss': 0.6063, 'grad_norm': 65.66912841796875, 'learning_rate': 2.384147512618905e-07, 'epoch': 2.61}
{'loss': 0.5985, 'grad_norm': 28.246807098388672, 'learning_rate': 1.1475564791028754e-07, 'epoch': 2.81}


  0%|          | 0/356 [00:00<?, ?it/s]

[I 2024-10-08 01:44:31,409] Trial 7 pruned. 


{'eval_loss': 0.608388364315033, 'eval_accuracy': 0.6606891701828411, 'eval_f1-score': 0.6606891701828411, 'eval_runtime': 219.126, 'eval_samples_per_second': 12.979, 'eval_steps_per_second': 1.625, 'epoch': 3.0}


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/3732 [00:00<?, ?it/s]

{'loss': 0.692, 'grad_norm': 2.4319980144500732, 'learning_rate': 1.187567621332949e-06, 'epoch': 0.2}
{'loss': 0.6732, 'grad_norm': 2.8468563556671143, 'learning_rate': 2.375135242665898e-06, 'epoch': 0.4}
{'loss': 0.6595, 'grad_norm': 8.989683151245117, 'learning_rate': 2.1914150042171124e-06, 'epoch': 0.6}
{'loss': 0.6527, 'grad_norm': 8.29416561126709, 'learning_rate': 2.007694765768327e-06, 'epoch': 0.8}


  0%|          | 0/356 [00:00<?, ?it/s]

{'eval_loss': 0.632744312286377, 'eval_accuracy': 0.6153305203938115, 'eval_f1-score': 0.6153305203938115, 'eval_runtime': 217.5067, 'eval_samples_per_second': 13.075, 'eval_steps_per_second': 1.637, 'epoch': 1.0}
{'loss': 0.6458, 'grad_norm': 18.114973068237305, 'learning_rate': 1.8239745273195415e-06, 'epoch': 1.0}
{'loss': 0.6395, 'grad_norm': 7.920067310333252, 'learning_rate': 1.6402542888707562e-06, 'epoch': 1.21}
{'loss': 0.6233, 'grad_norm': 10.667436599731445, 'learning_rate': 1.456534050421971e-06, 'epoch': 1.41}
{'loss': 0.6412, 'grad_norm': 6.184597969055176, 'learning_rate': 1.2728138119731855e-06, 'epoch': 1.61}
{'loss': 0.6094, 'grad_norm': 52.545352935791016, 'learning_rate': 1.0890935735244e-06, 'epoch': 1.81}


  0%|          | 0/356 [00:00<?, ?it/s]

{'eval_loss': 0.6259570717811584, 'eval_accuracy': 0.6526019690576652, 'eval_f1-score': 0.6526019690576652, 'eval_runtime': 219.4069, 'eval_samples_per_second': 12.962, 'eval_steps_per_second': 1.623, 'epoch': 2.0}
{'loss': 0.6298, 'grad_norm': 18.486547470092773, 'learning_rate': 9.053733350756146e-07, 'epoch': 2.01}
{'loss': 0.6189, 'grad_norm': 23.88905906677246, 'learning_rate': 7.216530966268292e-07, 'epoch': 2.21}
{'loss': 0.605, 'grad_norm': 50.71733093261719, 'learning_rate': 5.379328581780437e-07, 'epoch': 2.41}
{'loss': 0.5961, 'grad_norm': 48.17784118652344, 'learning_rate': 3.542126197292583e-07, 'epoch': 2.61}
{'loss': 0.586, 'grad_norm': 32.58961486816406, 'learning_rate': 1.7049238128047285e-07, 'epoch': 2.81}


  0%|          | 0/356 [00:00<?, ?it/s]

[I 2024-10-08 03:41:22,782] Trial 8 pruned. 


{'eval_loss': 0.5952850580215454, 'eval_accuracy': 0.6782700421940928, 'eval_f1-score': 0.6782700421940928, 'eval_runtime': 222.1236, 'eval_samples_per_second': 12.804, 'eval_steps_per_second': 1.603, 'epoch': 3.0}


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/3732 [00:00<?, ?it/s]

{'loss': 0.6864, 'grad_norm': 2.902397871017456, 'learning_rate': 3.6200583643177606e-06, 'epoch': 0.2}
{'loss': 0.6651, 'grad_norm': 3.196052074432373, 'learning_rate': 7.240116728635521e-06, 'epoch': 0.4}
{'loss': 0.6639, 'grad_norm': 2.8371500968933105, 'learning_rate': 6.680082947026957e-06, 'epoch': 0.6}
{'loss': 0.6655, 'grad_norm': 3.726550579071045, 'learning_rate': 6.120049165418392e-06, 'epoch': 0.8}


  0%|          | 0/356 [00:00<?, ?it/s]

{'eval_loss': 0.6442258358001709, 'eval_accuracy': 0.6153305203938115, 'eval_f1-score': 0.6153305203938115, 'eval_runtime': 219.1736, 'eval_samples_per_second': 12.976, 'eval_steps_per_second': 1.624, 'epoch': 1.0}
{'loss': 0.6527, 'grad_norm': 14.38537883758545, 'learning_rate': 5.560015383809828e-06, 'epoch': 1.0}
{'loss': 0.6533, 'grad_norm': 6.946536064147949, 'learning_rate': 4.999981602201263e-06, 'epoch': 1.21}
{'loss': 0.6325, 'grad_norm': 13.751725196838379, 'learning_rate': 4.439947820592699e-06, 'epoch': 1.41}
{'loss': 0.6216, 'grad_norm': 12.731352806091309, 'learning_rate': 3.879914038984135e-06, 'epoch': 1.61}
{'loss': 0.6029, 'grad_norm': 38.140438079833984, 'learning_rate': 3.31988025737557e-06, 'epoch': 1.81}


  0%|          | 0/356 [00:00<?, ?it/s]

{'eval_loss': 0.6013708710670471, 'eval_accuracy': 0.7060478199718706, 'eval_f1-score': 0.7060478199718706, 'eval_runtime': 211.5565, 'eval_samples_per_second': 13.443, 'eval_steps_per_second': 1.683, 'epoch': 2.0}
{'loss': 0.6195, 'grad_norm': 8.861154556274414, 'learning_rate': 2.7598464757670056e-06, 'epoch': 2.01}
{'loss': 0.5973, 'grad_norm': 10.730295181274414, 'learning_rate': 2.199812694158441e-06, 'epoch': 2.21}
{'loss': 0.5768, 'grad_norm': 27.689355850219727, 'learning_rate': 1.6397789125498767e-06, 'epoch': 2.41}
{'loss': 0.5472, 'grad_norm': 23.123262405395508, 'learning_rate': 1.0797451309413122e-06, 'epoch': 2.61}
{'loss': 0.5419, 'grad_norm': 35.70475769042969, 'learning_rate': 5.197113493327477e-07, 'epoch': 2.81}


  0%|          | 0/356 [00:00<?, ?it/s]

{'eval_loss': 0.5515127182006836, 'eval_accuracy': 0.7433192686357243, 'eval_f1-score': 0.7433192686357243, 'eval_runtime': 217.4566, 'eval_samples_per_second': 13.078, 'eval_steps_per_second': 1.637, 'epoch': 3.0}


[I 2024-10-08 05:38:26,743] Trial 9 finished with value: 1.4866385372714486 and parameters: {'learning_rate': 7.240116728635521e-06, 'weight_decay': 0.022606675299539355, 'adam_beta1': 0.8211102421288813, 'adam_beta2': 0.9969784333856463, 'adam_epsilon': 2.7064505537870976e-09}. Best is trial 9 with value: 1.4866385372714486.


{'train_runtime': 7022.511, 'train_samples_per_second': 4.251, 'train_steps_per_second': 0.531, 'train_loss': 0.6188587115125257, 'epoch': 3.0}


In [14]:
# Print the best hyperparameters
print("Best Hyperparameters:", best_run)

Best Hyperparameters: BestRun(run_id='9', objective=1.4866385372714486, hyperparameters={'learning_rate': 7.240116728635521e-06, 'weight_decay': 0.022606675299539355, 'adam_beta1': 0.8211102421288813, 'adam_beta2': 0.9969784333856463, 'adam_epsilon': 2.7064505537870976e-09}, run_summary=None)


In [15]:
# Garbage collection to free memory after hyperparameter tuning
del trainer
del training_args
import gc
gc.collect()

119

In [16]:
# Training with best hyperparameters
print("Starting final training...")

# Reload training arguments
training_args = TrainingArguments(
    output_dir=os.path.join("TFIDF-INDIC", "output"),
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir=os.path.join("TFIDF-INDIC", "logs"),
    evaluation_strategy="epoch",
    logging_steps=250,
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1-score",
    save_safetensors=False,
)

trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

Starting final training...


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
# Load the best hyperparameters and start training
for n, v in best_run.hyperparameters.items():
    setattr(trainer.args, n, v)

In [18]:
# Train the model
trainer.train()

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/12441 [00:00<?, ?it/s]

{'loss': 0.6874, 'grad_norm': 2.6423277854919434, 'learning_rate': 3.6200583643177606e-06, 'epoch': 0.02}
{'loss': 0.6681, 'grad_norm': 10.021020889282227, 'learning_rate': 7.240116728635521e-06, 'epoch': 0.04}
{'loss': 0.6673, 'grad_norm': 2.565160036087036, 'learning_rate': 7.088535690015733e-06, 'epoch': 0.06}
{'loss': 0.6558, 'grad_norm': 6.731729507446289, 'learning_rate': 6.936954651395947e-06, 'epoch': 0.08}
{'loss': 0.6687, 'grad_norm': 15.670312881469727, 'learning_rate': 6.785373612776159e-06, 'epoch': 0.1}
{'loss': 0.6509, 'grad_norm': 3.0118961334228516, 'learning_rate': 6.633792574156372e-06, 'epoch': 0.12}
{'loss': 0.6631, 'grad_norm': 2.480739116668701, 'learning_rate': 6.482211535536585e-06, 'epoch': 0.14}
{'loss': 0.6418, 'grad_norm': 8.68790340423584, 'learning_rate': 6.330630496916797e-06, 'epoch': 0.16}
{'loss': 0.6095, 'grad_norm': 20.069482803344727, 'learning_rate': 6.1790494582970105e-06, 'epoch': 0.18}
{'loss': 0.5676, 'grad_norm': 14.53165054321289, 'learning_

  0%|          | 0/3555 [00:00<?, ?it/s]

{'eval_loss': 0.43305346369743347, 'eval_accuracy': 0.7984596989731326, 'eval_f1-score': 0.7984596989731326, 'eval_runtime': 2219.5695, 'eval_samples_per_second': 12.811, 'eval_steps_per_second': 1.602, 'epoch': 1.0}
{'train_runtime': 23687.7936, 'train_samples_per_second': 4.201, 'train_steps_per_second': 0.525, 'train_loss': 0.5067445521818059, 'epoch': 1.0}


TrainOutput(global_step=12441, training_loss=0.5067445521818059, metrics={'train_runtime': 23687.7936, 'train_samples_per_second': 4.201, 'train_steps_per_second': 0.525, 'total_flos': 2378386883358720.0, 'train_loss': 0.5067445521818059, 'epoch': 1.0})

In [19]:
trainer.save_model(r"D:/BP/model")