In [1]:
from transformers import TrainingArguments,Trainer
from transformers import AutoModel, AutoModelForSequenceClassification,AutoTokenizer
import torch
import os

# Candidate DeBERTa-v3 checkpoints (kept for reference; the active one is `model_id` below).
models = [
    'microsoft/deberta-v3-xsmall',
    'microsoft/deberta-v3-small',
    'microsoft/deberta-v3-large',
    'microsoft/deberta-v3-base',
]

# Set the tokenizers-library parallelism flag to 'false' before any tokenizer is created.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# `device` is only consulted by the dtype expression below; nothing in this cell moves the model.
device = 'cuda'
attn_implementation = 'eager'# 'sdpa' #('flash_attention_2' if device in {'cuda', 'auto'} else 'sdpa')
# The assignments below act as manual toggles: only the LAST one takes effect (float32).
torch_dtype = (torch.bfloat16 if device in {'cuda', 'auto'} else torch.float16)
torch_dtype = torch.bfloat16
torch_dtype = torch.float32

# Same toggle pattern for the checkpoint: the last assignment (xsmall) is the one used.
model_id = 'microsoft/deberta-v3-base'
model_id = 'microsoft/deberta-v3-small'
model_id = 'microsoft/deberta-v3-xsmall'
#model_id = 'distilbert-base-uncased'
#deberta_clf = AutoModelForSequenceClassification.from_pretrained(model_id)
# Load the bare encoder (AutoModel, no classification head); the ordinal head is attached later.
deberta = AutoModel.from_pretrained(model_id, 
                                   attn_implementation=attn_implementation,
                                   torch_dtype=torch_dtype,
                                   #num_labels=3, 
                                   )

# Tokenizer matching the selected checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_id)

2025-06-09 01:15:45.348377: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-09 01:15:45.355639: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749446145.363999   37147 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749446145.366560   37147 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1749446145.373343   37147 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [2]:
# Sanity check: encode a sentence and decode it again to see which special tokens the tokenizer adds.
tokenizer.decode(tokenizer(['Hello world.'])['input_ids'][0])

'[CLS] Hello world.[SEP]'

In [3]:
# Tokenize a two-sentence toy batch into padded PyTorch tensors.
# No max_length is passed, so truncation=True falls back to no truncation (see the warning this cell emits).
model_inputs = tokenizer([ 'Hello world.', 'A news article.'], truncation=True, padding=True, return_tensors='pt')

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [4]:
import torch
from torch import nn
from torch import logit
from transformers.modeling_outputs import SequenceClassifierOutput

def _class_probabilities(cumulative_probabilities):
    P = cumulative_probabilities
    K = P.shape[-1]+1
    result = []
    for k in range(K):
        if k == 0:
            result.append( P[:,k].unsqueeze(1) )
        elif k < K-1:
            result.append( (P[:,k] - P[:,k-1]).unsqueeze(1) )
        else:
            result.append( (1 - P[:,k-1]).unsqueeze(1) )
    
    result = torch.cat(result, dim=-1)
    return result

def _predict_class(cumulative_probabilities):
    """Return, for each row of cumulative probabilities, the index of the most probable class."""
    per_class = _class_probabilities(cumulative_probabilities)
    return torch.argmax(per_class, dim=-1)

# define ordinal classification head
class OrdinalRegressionHead(nn.Module):
    """Cumulative-link ordinal regression head.

    A single linear unit maps each feature vector to a scalar score. The score is
    compared against ``num_classes - 1`` ordered thresholds ``theta``; the link
    function turns ``theta_k - score`` into the cumulative probability P(y <= k).

    Args:
        hidden_dim: size of the incoming feature vectors.
        num_classes: number of ordered classes; must be at least 2.
        link_function: module mapping threshold logits to cumulative probabilities.
            ``None`` (default) creates a fresh ``nn.Sigmoid()`` for this instance.
        dtype: accepted for backward compatibility and ignored. The head's
            parameters are kept in float32 and inputs are upcast in ``forward``.
        device: device the head is moved to on construction.

    Raises:
        ValueError: if ``num_classes`` is smaller than 2.
    """
    def __init__(self, hidden_dim, num_classes, link_function=None,
                 dtype=None, device='cpu'):
        super().__init__()
        if num_classes < 2:
            raise ValueError(f'num_classes must be >= 2, got {num_classes}')
        if link_function is None:
            # Build the default here rather than in the signature so that heads
            # never share one module instance.
            link_function = nn.Sigmoid()

        self.num_classes = num_classes
        self.linear = nn.Linear(hidden_dim, 1, bias=True)

        # Raw thresholds are squared and cumulatively summed in `theta`, which keeps
        # the effective thresholds ordered. Initial theta is [0, 1, 2, ...].
        thresh_init = torch.tensor([0]+[1]*(num_classes-2), dtype=torch.float32)
        self.raw_thresholds = nn.Parameter(thresh_init, requires_grad=True)
        self.link_function = link_function

        if isinstance(link_function, nn.Sigmoid):
            # Sigmoid link: the loss is computed on logits for numerical stability.
            self.loss_func = nn.BCEWithLogitsLoss()
        else:
            self.loss_func = nn.BCELoss()
        self.device = device
        self.to(device)

    @property
    def theta(self):
        """Ordered thresholds: cumulative sum of the squared raw thresholds."""
        return torch.cumsum(self.raw_thresholds**2, dim=0)

    def forward(self, x, targets=None, verbose=False):
        """Score features and optionally compute the ordinal (cumulative BCE) loss.

        Args:
            x: feature tensor of shape [batch, hidden_dim] (the [CLS] hidden states).
            targets: optional class indices, one per batch row (tensor or sequence).
            verbose: when True and targets are given, print intermediate tensors.

        Returns:
            Tuple ``(threshold_logits, probs, loss, logits)`` where
            ``threshold_logits`` is ``theta - score`` with shape [batch, num_classes-1],
            ``probs`` is the link function applied to it, ``loss`` is ``None`` when
            no targets are given, and ``logits`` is the raw score with shape [batch, 1].
        """
        # Upcast the features to the head's parameter dtype (float32).
        logits = self.linear(x.to(self.raw_thresholds.dtype))  # shape: [batch, 1]

        # Broadcasting [1, K-1] - [batch, 1] -> [batch, K-1]; no explicit repeat needed.
        threshold_logits = self.theta.unsqueeze(0) - logits
        probs = self.link_function(threshold_logits)

        loss = None
        if targets is not None:
            if not isinstance(targets, torch.Tensor):
                targets = torch.as_tensor(targets, dtype=torch.long)

            targets = targets.to(threshold_logits.device).unsqueeze(-1)  # [batch, 1]
            class_index = torch.arange(self.num_classes - 1,
                                       device=threshold_logits.device).unsqueeze(0)  # [1, K-1]

            # Binary target for threshold k is 1 iff y <= k. Cast to the dtype of the
            # (upcast) logits, not of x, so loss inputs agree when x is bf16/fp16.
            bce_targets = (targets <= class_index).to(threshold_logits.dtype)

            if verbose:
                print('targets', targets)
                print('bce_targets', bce_targets)
                print('logits', logits)
                print('cum_probs', probs)
                print('class probabilities', _class_probabilities(probs))
                print('theta', self.theta)
                print(self.link_function, self.loss_func)

            if isinstance(self.link_function, nn.Sigmoid):
                # use BCEWithLogitsLoss for numerical stability
                loss = self.loss_func(threshold_logits, bce_targets)
            else:
                loss = self.loss_func(probs, bce_targets)

        return threshold_logits, probs, loss, logits

class PretrainedModelForOrdinalSequenceClassification(nn.Module):
    """Pretrained encoder followed by an ordinal-regression head on the first token.

    Args:
        model: encoder exposing ``config.hidden_size``, ``device`` and returning an
            output with ``last_hidden_state`` (and ``attentions``).
        num_classes: number of ordered classes.
        link_function: link module handed to the head; ``None`` means a new
            ``nn.Sigmoid()``.
    """
    def __init__(self, model, num_classes=3, link_function=nn.Sigmoid()):
        super().__init__()
        if link_function is None:
            link_function = nn.Sigmoid()
        self.model = model
        self.num_classes = num_classes
        self.hidden_dim = model.config.hidden_size
        # The head is created on the encoder's current device. Its `dtype` argument
        # is not used by the head, so no notebook-global dtype is passed.
        self.clf_head = OrdinalRegressionHead(self.hidden_dim,
                                              num_classes,
                                              link_function=link_function,
                                              device=self.model.device)
        self.device = self.model.device

    def gradient_checkpointing_enable(self, *args, **kwargs):
        """Delegate gradient-checkpointing activation to the wrapped encoder."""
        return self.model.gradient_checkpointing_enable(*args, **kwargs)

    def forward(self, input_ids=None, attention_mask=None, labels=None, **kwargs):
        """Encode the batch and classify it with the ordinal head.

        Args:
            input_ids: token ids; moved to the encoder's device when given.
            attention_mask: optional attention mask; moved to the encoder's device
                when given.
            labels: optional class indices used to compute the loss.
            **kwargs: forwarded unchanged to the encoder.

        Returns:
            ``SequenceClassifierOutput`` with ``loss``, ``logits`` (threshold logits),
            ``hidden_states`` (the first-token vector, not a per-layer tuple) and
            ``attentions``. Three extra attributes are attached: ``class_probabilities``,
            ``predicted_class`` and ``base_logits``.
        """
        dev = self.model.device
        # Both tensors default to None in the signature, so only move what was passed.
        if input_ids is not None:
            input_ids = input_ids.to(dev)
        if attention_mask is not None:
            attention_mask = attention_mask.to(dev)
        outputs = self.model(input_ids=input_ids,
                             attention_mask=attention_mask,
                             **kwargs)
        x = outputs.last_hidden_state[:,0,:] # [CLS] token embedding
        threshold_logits, probs, loss, logits = self.clf_head(x, targets=labels)

        # NOTE: the field layout is kept as-is on purpose; compute_metrics in this
        # notebook takes element [0] of the predictions it receives.
        clf_outputs = SequenceClassifierOutput(loss=loss,
                                               logits=threshold_logits,
                                               hidden_states=x,
                                               attentions=outputs.attentions)
        clf_outputs.class_probabilities = _class_probabilities(probs)
        clf_outputs.predicted_class = _predict_class(probs)
        clf_outputs.base_logits = logits
        return clf_outputs

In [5]:
# freeze embeddings
# The optimizer built later only collects parameters with requires_grad=True,
# so this excludes the word-embedding matrix from training.
deberta.embeddings.word_embeddings.weight.requires_grad = False

In [6]:
# Wrap the encoder with the ordinal head (3 ordered classes).
model_clf = PretrainedModelForOrdinalSequenceClassification(deberta, num_classes=3)
#model_clf = deberta

### Import the dataset

In [7]:
import pandas as pd
import numpy as np
from datasets import Dataset
from sklearn.model_selection import train_test_split
import pickle as pkl
from trl import SFTConfig, SFTTrainer
import re
from tqdm import tqdm; tqdm.pandas()
import os.path as op
import os

# Annotated articles; later cells read the `title`, `body` and `FRAMING_CLASS` columns.
df_dataset = pd.read_csv('Dataset-framing_annotations-Llama-3.3-70B-Instruct-Turbo.csv')

# One output directory per base model: split indices, tokenized dataset and checkpoints live here.
output_dir = f'model_training-OrdinalClassifier-{model_id.split("/")[-1]}'

# Create the directory if needed; exist_ok keeps the cell safe to re-run.
os.makedirs(output_dir, exist_ok=True)

In [8]:
# Column subset used as the base frame for the training table.
df_dataset_ = df_dataset[['concept', 'source', 'dateTimePub', 'FRAMING_CLASS']]

# Fraction of rows held out for validation, and the seed for the split.
test_size = .025
seed = 125

# Reuse the cached positional row indices when they exist so that the split stays
# stable across runs; otherwise draw a new split and cache it.
# NOTE(review): pickle.load executes arbitrary code from the file — only safe because
# the file is written by this notebook.
try:
    with open(op.join(output_dir, 'train_test_part.pkl'), 'rb') as file:
        partition_ids = pkl.load(file)
    train, val = partition_ids['train'], partition_ids['validation']
except FileNotFoundError:
    # Split positional indices 0..len-1 (not the rows themselves).
    train, val = train_test_split(np.array(range(len(df_dataset_))), test_size=test_size, random_state=seed)
    train, val = train.squeeze(), val.squeeze()
    with open(op.join(output_dir, 'train_test_part.pkl'), 'wb') as file:
        pkl.dump({'train': train, 'validation': val}, file)

def shorten_to_n_words(text, n=1500):
    """Cut ``text`` after its ``n``-th word and flag the cut.

    Words are runs matched by ``\\b\\w+\\b``. Text containing at most ``n`` words is
    returned unchanged; otherwise everything up to the end of the ``n``-th word is
    kept (right-stripped) and the marker ``"[truncated]..."`` is appended.
    """
    word_spans = list(re.finditer(r'\b\w+\b', text))
    if len(word_spans) <= n:
        return text  # short enough already

    # End offset of the n-th word; falls back to the full length when no word has
    # ordinal n (e.g. n <= 0).
    cut = next(
        (span.end() for ordinal, span in enumerate(word_spans, start=1) if ordinal == n),
        len(text),
    )
    return text[:cut].rstrip() + "[truncated]..."

def format_prompt_with_article(title, body, max_words=2000):
    """Build the classifier input: the title, a ``[SEP]`` marker, then the body cut to ``max_words`` words."""
    body = shorten_to_n_words(body, n=max_words)
    return f'Title: {title}[SEP]{body}'

def format_prompt_from_row(row, max_words=2000):
    """Build the classifier input for one row, read from its ``title`` and ``body`` attributes."""
    title, body = row.title, row.body
    return format_prompt_with_article(title, body, max_words=max_words)


In [9]:
from datasets import NamedSplit, DatasetDict, load_from_disk

# Load the cached tokenized train/val dataset if it exists; otherwise build it from the
# dataframe, tokenize it, and cache it on disk.
try:
    ds = load_from_disk(op.join(output_dir, 'train_val_dataset.ds'))
except FileNotFoundError:
    # Prompts are built from the full frame (df_dataset_ does not carry title/body) and
    # assigned positionally onto the column subset.
    # NOTE(review): df_dataset_ was created by column selection; adding columns to it may
    # raise pandas' SettingWithCopyWarning — confirm.
    df_dataset_['text'] = [ format_prompt_from_row(row) for row in tqdm(df_dataset.iloc, total=len(df_dataset)) ]
    
    # Ordinal label = position in this list (NEUTRAL=0, LOADED=1, ALARMIST=2).
    # class_order.index raises ValueError for any annotation outside these three values.
    class_order = [ 'NEUTRAL', 'LOADED', 'ALARMIST' ]
    df_dataset_['labels'] = df_dataset_.FRAMING_CLASS.progress_apply(lambda s: class_order.index(s.strip().upper()))
        
    # Select rows by the cached positional indices.
    ds_train = Dataset.from_pandas(df_dataset_.iloc[train], split=NamedSplit('train'))
    ds_val = Dataset.from_pandas(df_dataset_.iloc[val], split=NamedSplit('validation'))
    
    #assert False
    # Helper to measure the longest tokenized example (currently unused, see below).
    def get_max_length(dataset, tokenizer=tokenizer):
        return max(len(tokenizer(example["text"])["input_ids"]) for example in tqdm(dataset))
    
    #max_length = max(get_max_length(ds_train), get_max_length(ds_val))
    # Fixed token budget per example instead of the measured maximum.
    max_length = 1500
    
    print('max length of:', max_length)
    
    # Tokenize with truncation to max_length; the padding strategy is chosen by the caller
    # (default: static padding to max_length).
    def tokenize_row(example, max_length=max_length, padding='max_length'):
        tok = tokenizer(example["text"], padding=padding, truncation=True, max_length=max_length)
        #print(tok['input_ids'])
        #print(len(tok['input_ids'][0]), len(tokenizer(example['text'])['input_ids'][0]))
        return tok
    
    # Both splits currently use padding='longest'.
    # NOTE(review): with batch_size=1 in the map calls below each batch holds a single
    # example, so 'longest' presumably adds no padding — confirm this is intended.
    tok_train = lambda ex: tokenize_row(ex, padding='longest')
    tok_val = tok_train # lambda ex: tokenize_row(ex, padding='max_length')
    
    ds_train = ds_train.map(tok_train, batched=True, batch_size=1, num_proc=1)
    ds_val = ds_val.map(tok_val, batched=True, batch_size=1, num_proc=1)
    
    # NOTE: the validation split is stored under the key 'val' here, while the pickled
    # index file uses 'validation'.
    ds = DatasetDict({'train': ds_train, 'val': ds_val})
    
    ds.save_to_disk(op.join(output_dir, 'train_val_dataset.ds'))

In [10]:
#tokenizer.decode(tok_train(ds_train[1])['input_ids'])

In [11]:
# Expose only the model inputs and labels, as torch tensors.
ds.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

ds_train, ds_val = ds['train'], ds['val']

#ds_train = ds['train']
# Probe sequence lengths: walk the training set and stop at the first example whose
# length is not 1500; lens_ keeps the lengths seen up to (and including) that example.
lens_ = []
for ex in tqdm(ds_train):
    l = len(ex['input_ids'])
    lens_.append(l)
    if l != 1500:
        break


  0%|                                      | 2/121888 [00:00<01:05, 1862.89it/s]


In [12]:
# Show the token-id tensor shape of the first training example.
ds_train[0]['input_ids'].shape

torch.Size([1500])

In [13]:
import numpy as np
#from datasets import load_metric
import evaluate
from transformers import Trainer
import numpy as np

# Per-device batch sizes; 5 accumulated micro-batches form one optimizer step.
batch_size = 1
eval_batch_size = 1
gradient_accumulation_steps = 5

# Checkpoint and evaluate on the same cadence (in optimizer steps).
save_steps = 2_500
eval_steps = save_steps

#eval_steps = 5

Num_train_examples = len(ds_train)
# NOTE: `optim` is not passed on — the matching TrainingArguments entry is commented out.
optim = "paged_adamw_32bit"
learning_rate = 1e-5
weight_decay = 0#.00001
gradient_checkpointing = False
warmup_steps = 1_000


# Total optimizer steps = examples / (batch size * accumulation steps) * epochs.
num_epochs = 25
max_steps = int(Num_train_examples/(batch_size*gradient_accumulation_steps)*num_epochs)

# Accuracy metric object used inside compute_metrics.
metric = evaluate.load("accuracy")

from transformers import TrainingArguments

# Trainer configuration: step-based schedule (max_steps instead of epochs), evaluation and
# checkpointing every `eval_steps`/`save_steps`, best checkpoint selected by accuracy.
training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    max_steps=max_steps,
    #num_train_epochs=EPOCHS,
    eval_steps=eval_steps,
    save_steps=save_steps, 
    eval_strategy="steps",
    save_strategy="steps",
    # "accuracy" is the key produced by compute_metrics (from the `evaluate` accuracy metric).
    metric_for_best_model="accuracy",
    load_best_model_at_end=True,
    weight_decay=weight_decay,
    #optim=optim, 
    lr_scheduler_type='linear',
    warmup_steps=warmup_steps,
    gradient_checkpointing=gradient_checkpointing,
)


class OrdinalTrainer(Trainer):
    """Trainer that uses the loss computed by the model itself."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run the model on ``inputs`` and return its loss (optionally with the outputs).

        The model's own ``outputs.loss`` is used; when ``num_items_in_batch`` is
        provided the loss is divided by it.
        NOTE(review): this scaling interacts with how the Trainer normalises losses
        under gradient accumulation — confirm it matches the installed version.
        """
        outputs = model(**inputs)
        loss = outputs.loss if num_items_in_batch is None else outputs.loss / num_items_in_batch
        if return_outputs:
            return (loss, outputs)
        return loss

def compute_per_class_metrics(preds, targets, num_classes=None):
    """
    Computes precision, recall, and F1 for each class.

    Args:
        preds: array-like of predicted class indices (N values; lists, tuples and
            arrays are accepted and flattened)
        targets: array-like of ground-truth class indices, same number of values
        num_classes: int, total number of classes (optional if all classes are present in data)

    Returns:
        metrics: dict with precision, recall, and f1 arrays of shape (num_classes,).
        A metric whose denominator is zero is reported as 0.0.

    Raises:
        ValueError: if preds and targets do not hold the same number of values.
    """
    # Coerce and flatten so that list inputs and (N,) vs (N, 1) shapes compare
    # element-wise instead of degenerating or broadcasting to N x N.
    preds = np.asarray(preds).ravel()
    targets = np.asarray(targets).ravel()
    if preds.shape != targets.shape:
        raise ValueError(
            f'preds and targets must have the same number of elements, '
            f'got {preds.size} and {targets.size}'
        )

    if num_classes is None:
        # Infer from the data; empty input yields zero classes instead of crashing.
        num_classes = int(max(preds.max(), targets.max())) + 1 if preds.size else 0

    classes = np.arange(num_classes)
    pred_is_cls = preds[None, :] == classes[:, None]      # (num_classes, N)
    target_is_cls = targets[None, :] == classes[:, None]  # (num_classes, N)

    tp = (pred_is_cls & target_is_cls).sum(axis=1)
    fp = (pred_is_cls & ~target_is_cls).sum(axis=1)
    fn = (~pred_is_cls & target_is_cls).sum(axis=1)

    def _safe_ratio(numerator, denominator):
        # Element-wise numerator / denominator, 0.0 where the denominator is not positive.
        out = np.zeros(num_classes, dtype=float)
        np.divide(numerator, denominator, out=out, where=denominator > 0)
        return out

    precision = _safe_ratio(tp, tp + fp)
    recall = _safe_ratio(tp, tp + fn)
    f1 = _safe_ratio(2 * precision * recall, precision + recall)

    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

#def _predict_class(logits):
#    return logits.argmax(dim=-1)

def compute_metrics(eval_pred, num_classes=3):
    """Compute accuracy plus per-class precision/recall/F1 from Trainer predictions.

    Args:
        eval_pred: pair ``(predictions, labels)``. ``predictions`` is either the
            threshold-logit array itself or a tuple whose first element is that array.
        num_classes: number of classes to report per-class metrics for.

    Returns:
        dict with ``accuracy`` and ``class{k}_precision`` / ``class{k}_recall`` /
        ``class{k}_f1`` for each class ``k``.
    """
    logits, labels = eval_pred

    # When the model returns several outputs the predictions arrive as a tuple with
    # the threshold logits first. Only unpack in that case: indexing a bare array
    # with [0] would silently select its first row.
    if isinstance(logits, tuple):
        logits = logits[0]

    # Threshold logits -> cumulative probabilities (sigmoid link) -> most probable class.
    with torch.no_grad():
        predictions = _predict_class(torch.sigmoid(torch.as_tensor(logits))).detach().cpu().numpy()

    result = metric.compute(predictions=predictions, references=labels) # dict with 'accuracy'
    # Per-class breakdown, to check that performance is balanced across classes.
    per_class_metrics = compute_per_class_metrics(predictions, labels, num_classes=num_classes)
    for cls in range(num_classes):
        for metric_name in [ 'precision', 'recall', 'f1' ]:
            metric_label = f'class{cls}_{metric_name}'
            result[metric_label] = per_class_metrics[metric_name][cls]

    return result

from transformers import TrainerCallback

class EvaluateAtStepOneCallback(TrainerCallback):
    """Callback that requests an evaluation when ``state.global_step`` reaches 1."""

    def on_step_end(self, args, state, control, **kwargs):
        """Set ``control.should_evaluate`` at global step 1; always return ``control``."""
        first_step_done = (state.global_step == 1)
        if first_step_done:
            control.should_evaluate = True
        return control

#model_clf.model.enable_input_requires_grad()
from torch.optim import AdamW, Adam, SGD
from transformers import get_linear_schedule_with_warmup

# Optimize only the trainable parameters (frozen ones, e.g. the word embeddings, are skipped).
optimizer = AdamW([ p for p in model_clf.parameters() if p.requires_grad ],
                  lr=learning_rate, weight_decay=weight_decay, )
#optimizer = SGD([ model_clf.clf_head.raw_thresholds, ], lr=1, weight_decay=0.)

# Linear warmup followed by linear decay over the whole run.
# Keep the scheduler OBJECT: calling `.step()` here would return None (so no scheduler
# would reach the Trainer) and would advance the schedule before any optimizer step.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=max_steps,
)

In [14]:
# Assemble the trainer: ordinal model, train/val splits, metric function, and the
# optimizer/scheduler objects created in the previous cell.
trainer = OrdinalTrainer(
    model=model_clf,
    args=training_args,
    train_dataset=ds_train,
    eval_dataset=ds_val, #.select(range(250)),
    compute_metrics=compute_metrics,
    #callbacks=[EvaluateAtStepOneCallback()],
    optimizers=(optimizer, scheduler), 
)

In [16]:
# Snapshot selected weights as float32 NumPy copies on the CPU (word embeddings, one
# attention query projection, the head's raw thresholds) so they can be compared later.
pre_pre = model_clf.model.embeddings.word_embeddings.weight.detach().cpu().to(torch.float32).numpy()
pre_pre_l = model_clf.model.encoder.layer[1].attention.self.query_proj.weight.detach().cpu().to(torch.float32).numpy()
pre_pre_t = model_clf.clf_head.raw_thresholds.detach().cpu().to(torch.float32).numpy()

In [17]:
# Display the live query-projection weights of encoder layer 1 (device and requires_grad included).
model_clf.model.encoder.layer[1].attention.self.query_proj.weight

Parameter containing:
tensor([[ 0.0213,  0.0244, -0.3035,  ..., -0.0818,  0.0801, -0.2236],
        [ 0.3567, -0.0761, -0.0009,  ...,  0.1059, -0.0573,  0.0448],
        [-0.1027, -0.0442,  0.0193,  ...,  0.0282,  0.2102,  0.1320],
        ...,
        [-0.0829, -0.0483, -0.0480,  ...,  0.0344,  0.0036, -0.0932],
        [ 0.0728,  0.1213, -0.1473,  ..., -0.0896, -0.0208, -0.1312],
        [ 0.1752,  0.1418,  0.1919,  ...,  0.0189,  0.1738,  0.1304]],
       device='cuda:0', requires_grad=True)

In [18]:
# Make Weights & Biases continue an existing run instead of starting a new one.
# NOTE: the run id is hard-coded; it has to be changed (or these lines removed) for a fresh run.
os.environ['WANDB_RESUME'] = 'must'
os.environ['WANDB_RUN_ID'] = 'qmt1qeqy'

In [None]:
# Resume training from the step-365000 checkpoint in output_dir.
ckpt = op.join(output_dir, 'checkpoint-365000')
#ckpt = False
trainer.train(resume_from_checkpoint=ckpt)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mmatthias-lalisse[0m ([33mmatthias-lalisse-inet[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Accuracy,Class0 Precision,Class0 Recall,Class0 F1,Class1 Precision,Class1 Recall,Class1 F1,Class2 Precision,Class2 Recall,Class2 F1
367500,0.0513,0.334678,0.882598,0.940666,0.923295,0.9319,0.790805,0.8028,0.796757,0.661202,0.770701,0.711765
370000,0.0482,0.342348,0.881958,0.938609,0.92661,0.932571,0.795082,0.792299,0.793688,0.647059,0.770701,0.703488
372500,0.0452,0.341849,0.883237,0.941233,0.925189,0.933142,0.791234,0.800467,0.795824,0.661202,0.770701,0.711765
375000,0.0498,0.333483,0.890595,0.936049,0.935606,0.935828,0.809133,0.806301,0.807715,0.726708,0.745223,0.735849
377500,0.0516,0.327757,0.891555,0.936019,0.935133,0.935576,0.811986,0.806301,0.809133,0.733333,0.770701,0.751553
380000,0.0387,0.303721,0.885797,0.946751,0.917614,0.931955,0.778867,0.834306,0.805634,0.720497,0.738854,0.72956
382500,0.0446,0.373887,0.889635,0.921983,0.951231,0.936378,0.837596,0.764294,0.799268,0.709091,0.745223,0.726708
385000,0.0498,0.335923,0.884837,0.940556,0.928977,0.934731,0.797897,0.796966,0.797431,0.657609,0.770701,0.709677
387500,0.0479,0.30919,0.886756,0.945933,0.919508,0.932533,0.783991,0.834306,0.808366,0.714286,0.732484,0.72327
390000,0.0493,0.312557,0.888996,0.947522,0.923295,0.935252,0.793722,0.826138,0.809605,0.6875,0.770701,0.726727


In [19]:
# Resume training from the step-352500 checkpoint in output_dir.
ckpt = op.join(output_dir, 'checkpoint-352500')
#ckpt = False
trainer.train(resume_from_checkpoint=ckpt)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mmatthias-lalisse[0m ([33mmatthias-lalisse-inet[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Accuracy,Class0 Precision,Class0 Recall,Class0 F1,Class1 Precision,Class1 Recall,Class1 F1,Class2 Precision,Class2 Recall,Class2 F1
355000,0.0384,0.337473,0.892834,0.938776,0.936553,0.937663,0.815511,0.809802,0.812646,0.708333,0.757962,0.732308
357500,0.0376,0.337279,0.893794,0.938418,0.937973,0.938196,0.818396,0.809802,0.814076,0.712575,0.757962,0.734568
360000,0.0406,0.332885,0.895074,0.938389,0.9375,0.937944,0.817757,0.816803,0.81728,0.7375,0.751592,0.744479
362500,0.0382,0.334167,0.894434,0.938834,0.9375,0.938166,0.817116,0.813302,0.815205,0.72561,0.757962,0.741433
365000,0.0377,0.33512,0.893474,0.938805,0.937027,0.937915,0.815728,0.810968,0.813341,0.716867,0.757962,0.736842


Could not locate the best model at model_training-OrdinalClassifier-deberta-v3-xsmall/checkpoint-107500/pytorch_model.bin, if you are running a distributed training on multiple nodes, you should activate `--save_on_each_node`.


TrainOutput(global_step=365664, training_loss=0.001416962195738434, metrics={'train_runtime': 8837.6479, 'train_samples_per_second': 206.879, 'train_steps_per_second': 41.376, 'total_flos': 0.0, 'train_loss': 0.001416962195738434, 'epoch': 15.000369191388815})

In [None]:
# Resume training from the step-237500 checkpoint in output_dir.
ckpt = op.join(output_dir, 'checkpoint-237500')
#ckpt = False
trainer.train(resume_from_checkpoint=ckpt)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mmatthias-lalisse[0m ([33mmatthias-lalisse-inet[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Accuracy,Class0 Precision,Class0 Recall,Class0 F1,Class1 Precision,Class1 Recall,Class1 F1,Class2 Precision,Class2 Recall,Class2 F1
240000,0.0858,0.26964,0.890275,0.947087,0.923769,0.935283,0.796193,0.829638,0.812571,0.699422,0.770701,0.733333
242500,0.0668,0.267697,0.887076,0.95304,0.912879,0.932527,0.778256,0.843641,0.80963,0.701149,0.77707,0.73716
245000,0.0567,0.298816,0.889315,0.943296,0.929451,0.936322,0.801843,0.812135,0.806957,0.683616,0.770701,0.724551
247500,0.0541,0.294807,0.894114,0.940561,0.936553,0.938553,0.813737,0.815636,0.814685,0.719512,0.751592,0.735202
250000,0.0612,0.290952,0.881638,0.953639,0.905777,0.929092,0.766206,0.841307,0.802002,0.681564,0.77707,0.72619
252500,0.0592,0.318336,0.883557,0.942085,0.924242,0.933078,0.790138,0.803967,0.796992,0.664835,0.770701,0.713864
255000,0.0614,0.284197,0.887076,0.948267,0.919981,0.93391,0.785635,0.829638,0.807037,0.69186,0.757962,0.723404
257500,0.0614,0.285775,0.896993,0.942721,0.935133,0.938911,0.813212,0.833139,0.823055,0.751634,0.732484,0.741935
260000,0.0595,0.278173,0.889955,0.945789,0.925189,0.935376,0.791111,0.830805,0.810472,0.725,0.738854,0.731861
262500,0.0537,0.312375,0.888996,0.939943,0.933712,0.936817,0.808009,0.800467,0.80422,0.675978,0.770701,0.720238


In [20]:
# Display the directory that checkpoints are written to and resumed from.
output_dir

'model_training-OrdinalClassifier-deberta-v3-xsmall'

In [None]:
# Resume training from the step-215000 checkpoint in output_dir.
ckpt = op.join(output_dir, 'checkpoint-215000')
#ckpt = False
trainer.train(resume_from_checkpoint=ckpt)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mmatthias-lalisse[0m ([33mmatthias-lalisse-inet[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Accuracy,Class0 Precision,Class0 Recall,Class0 F1,Class1 Precision,Class1 Recall,Class1 F1,Class2 Precision,Class2 Recall,Class2 F1
217500,0.0783,0.298002,0.888996,0.933712,0.933712,0.933712,0.807018,0.805134,0.806075,0.735849,0.745223,0.740506
220000,0.0728,0.281716,0.888036,0.943133,0.92661,0.934798,0.797714,0.814469,0.806005,0.6875,0.770701,0.726727
222500,0.0647,0.279663,0.880678,0.962379,0.896307,0.928169,0.751788,0.85881,0.801743,0.688889,0.789809,0.735905
225000,0.0565,0.328868,0.889955,0.938776,0.936553,0.937663,0.811098,0.801634,0.806338,0.680233,0.745223,0.711246
227500,0.0702,0.276041,0.889315,0.945278,0.924242,0.934642,0.790233,0.830805,0.810011,0.725,0.738854,0.731861
230000,0.0678,0.274709,0.888996,0.944848,0.924716,0.934673,0.791946,0.826138,0.808681,0.715152,0.751592,0.732919
232500,0.0667,0.302787,0.885797,0.947164,0.925189,0.936048,0.790857,0.807468,0.799076,0.654255,0.783439,0.713043
235000,0.0657,0.29971,0.896033,0.936941,0.942708,0.939816,0.827751,0.807468,0.817484,0.715152,0.751592,0.732919
237500,0.0697,0.249365,0.896353,0.953431,0.920928,0.936898,0.794181,0.859977,0.82577,0.759494,0.764331,0.761905


In [None]:
# The checkpoint path is immediately overridden with False, so this call does not
# resume from a checkpoint.
ckpt = op.join(output_dir, 'checkpoint-15000-shiftLR')
ckpt = False
trainer.train(resume_from_checkpoint=ckpt)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mmatthias-lalisse[0m ([33mmatthias-lalisse-inet[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Accuracy,Class0 Precision,Class0 Recall,Class0 F1,Class1 Precision,Class1 Recall,Class1 F1,Class2 Precision,Class2 Recall,Class2 F1
2500,0.2952,0.265326,0.703135,0.827669,0.980114,0.897464,0.0,0.0,0.0,0.2048,0.815287,0.327366
5000,0.2716,0.273856,0.710173,0.739558,0.997633,0.849426,0.0,0.0,0.0,0.407942,0.719745,0.520737
7500,0.2725,0.275872,0.698656,0.853383,0.96733,0.906791,0.0,0.0,0.0,0.192623,0.898089,0.31721
10000,0.2573,0.256707,0.708253,0.811719,0.983902,0.889555,0.0,0.0,0.0,0.240283,0.866242,0.37621
12500,0.2559,0.297255,0.714331,0.759494,0.994318,0.861185,0.0,0.0,0.0,0.368421,0.847134,0.513514
15000,0.2459,0.249019,0.706334,0.855546,0.978693,0.912986,0.0,0.0,0.0,0.198592,0.898089,0.32526
17500,0.242,0.245539,0.715931,0.811146,0.992424,0.892675,0.0,0.0,0.0,0.261993,0.904459,0.406295
20000,0.2336,0.257899,0.716891,0.796893,0.995739,0.885287,0.0,0.0,0.0,0.283368,0.878981,0.428571
22500,0.2427,0.250116,0.713372,0.822695,0.988636,0.898065,0.0,0.0,0.0,0.241497,0.904459,0.381208
25000,0.2157,0.248025,0.709213,0.851457,0.982481,0.912288,0.0,0.0,0.0,0.206096,0.904459,0.335697
