In [1]:
from transformers import TrainingArguments,Trainer
from transformers import AutoModel, AutoModelForSequenceClassification,AutoTokenizer
import torch
import os

models = [
    'microsoft/deberta-v3-xsmall',
    'microsoft/deberta-v3-small',
    'microsoft/deberta-v3-large',
    'microsoft/deberta-v3-base',
]

os.environ['TOKENIZERS_PARALLELISM'] = 'false'

device = 'cuda'
attn_implementation = 'eager'# 'sdpa' #('flash_attention_2' if device in {'cuda', 'auto'} else 'sdpa')
torch_dtype = (torch.bfloat16 if device in {'cuda', 'auto'} else torch.float16)
torch_dtype = torch.bfloat16
torch_dtype = torch.float32

model_id = 'microsoft/deberta-v3-base'
model_id = 'microsoft/deberta-v3-small'
model_id = 'microsoft/deberta-v3-xsmall'
#deberta_clf = AutoModelForSequenceClassification.from_pretrained(model_id)
deberta = AutoModelForSequenceClassification.from_pretrained(model_id, 
                                   attn_implementation=attn_implementation,
                                   torch_dtype=torch_dtype,
                                   num_labels=3, 
                                   )

tokenizer = AutoTokenizer.from_pretrained(model_id)

  warn(
2025-06-09 18:56:17.486020: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-xsmall and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
tokenizer.decode(tokenizer(['Hello world.'])['input_ids'][0])

'[CLS] Hello world.[SEP]'

In [3]:
model_inputs = tokenizer([ 'Hello world.', 'A news article.'], truncation=True, padding=True, return_tensors='pt')

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [4]:
import torch
from torch import nn
from torch import logit
from transformers.modeling_outputs import SequenceClassifierOutput

def _class_probabilities(cumulative_probabilities):
    P = cumulative_probabilities
    K = P.shape[-1]+1
    result = []
    for k in range(K):
        if k == 0:
            result.append( P[:,k].unsqueeze(1) )
        elif k < K-1:
            result.append( (P[:,k] - P[:,k-1]).unsqueeze(1) )
        else:
            result.append( (1 - P[:,k-1]).unsqueeze(1) )
    
    result = torch.cat(result, dim=-1)
    return result

def _predict_class(cumulative_probabilities):
    class_probabilities = _class_probabilities(cumulative_probabilities)
    return class_probabilities.argmax(dim=-1)

# define ordinal classification head
class OrdinalRegressionHead(nn.Module):
    def __init__(self, hidden_dim, num_classes, link_function=nn.Sigmoid(), 
                 dtype=torch_dtype, device='cpu'):
        super().__init__()
        self.num_classes = num_classes
        self.linear = nn.Linear(hidden_dim, 1, bias=True)
        
        thresh_init = torch.tensor([0]+[1]*(num_classes-2), dtype=torch.float32)
        self.raw_thresholds = nn.Parameter(thresh_init, requires_grad=True)
        self.link_function = link_function

        if isinstance(link_function, nn.Sigmoid):
            self.loss_func = nn.BCEWithLogitsLoss()
        else:
            self.loss_func = nn.BCELoss()
        #self = self.to(torch_dtype)
        self.device = device
        self = self.to(device)
        
    @property
    def theta(self):
        return torch.cumsum(self.raw_thresholds**2, dim=0)
    
    def forward(self, x, targets=None, verbose=False):
        # x is the [CLS] hidden states
        # upcast to float32 generally
        logits = self.linear(x.to(self.raw_thresholds.dtype)).squeeze(-1)  # shape: [batch]
        thresholds = self.theta 
        #thresholds = torch.cumsum(self.raw_thresholds**2, dim=0)
        #thresholds = self.raw_thresholds
        #print(thresholds)
        logits = logits.unsqueeze(1).repeat(1, thresholds.size(0))
        thresholds = thresholds.unsqueeze(0).repeat(logits.size(0), 1)
        
        #print('logits_shape', logits.shape)
        #print('thresholds_shape', thresholds.shape)
        
        batch_size = x.shape[0]

        threshold_logits = thresholds - logits
        probs = self.link_function(threshold_logits)
        
        if targets is not None:
            #print(targets, type(targets))
            if not isinstance(targets, torch.Tensor):
                targets = torch.LongTensor(targets)

            targets = targets.to(x.device).unsqueeze(-1)
            range_ = torch.arange(self.num_classes-1).unsqueeze(0).repeat_interleave(batch_size, 0).to(x.device)

            #print(targets.shape, range_.shape)
            bce_targets = (targets <= range_).to(x.dtype)
            
            #print(bce_targets)
            if verbose:
                print('targets', targets)
                #print('range', range_)
                print('bce_targets', bce_targets)
                print('class probabilities', _class_probabilities(probs))
                print('theta', self.theta)
            
            if isinstance(self.link_function, nn.Sigmoid):
                # use BCEWithLogitsLoss for numerical stability
                loss = self.loss_func(threshold_logits, bce_targets)
            else:
                loss = self.loss_func(probs, bce_targets)
        else:
            loss = None
        
        return threshold_logits, probs, loss


class PretrainedModelForOrdinalSequenceClassification(nn.Module):
    def __init__(self, model, num_classes=3, link_function=nn.Sigmoid()):
        super(PretrainedModelForOrdinalSequenceClassification, self).__init__()
        self.device = model.device
        self.model = model
        self.num_classes = num_classes
        self.hidden_dim = model.config.hidden_size
        self.clf_head = OrdinalRegressionHead(self.hidden_dim, 
                                              num_classes, 
                                              link_function=link_function,
                                              dtype=torch_dtype,
                                              device=self.model.device)
        self.device = self.model.device
    def gradient_checkpointing_enable(self, *args, **kwargs):
        return self.model.gradient_checkpointing_enable(*args, **kwargs)
    def forward(self, input_ids=None, attention_mask=None, labels=None, **kwargs):
        targets = labels
        dev = self.model.device
        outputs = self.model(input_ids=input_ids.to(dev), 
                             attention_mask=attention_mask.to(dev), 
                             **kwargs)
        x = outputs.last_hidden_state[:,0,:] # [CLS] token embedding
        #print(x.shape)
        threshold_logits, probs, loss = self.clf_head(x, targets=targets)
        
        clf_outputs = SequenceClassifierOutput(loss=loss, 
                                               logits=threshold_logits, 
                                               hidden_states=x, 
                                               attentions=outputs.attentions)
        class_probabilities = _class_probabilities(probs)
        class_predictions = _predict_class(probs)
        clf_outputs.class_probabilities = class_probabilities
        clf_outputs.predicted_class = class_predictions
        return clf_outputs

In [5]:
# freeze embeddings
deberta.deberta.embeddings.word_embeddings.weight.requires_grad = False

In [6]:
deberta

DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 384, padding_idx=0)
      (LayerNorm): LayerNorm((384,), eps=1e-07, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=384, out_features=384, bias=True)
              (key_proj): Linear(in_features=384, out_features=384, bias=True)
              (value_proj): Linear(in_features=384, out_features=384, bias=True)
              (pos_dropout): Dropout(p=0.1, inplace=False)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=384, out_features=384, bias=True)
              (LayerNorm): Layer

In [7]:
#model_clf = PretrainedModelForOrdinalSequenceClassification(deberta, num_classes=3)
model_clf = deberta

### Import the dataset

In [8]:
import pandas as pd
import numpy as np
from datasets import Dataset
from sklearn.model_selection import train_test_split
import pickle as pkl
from trl import SFTConfig, SFTTrainer
import re
from tqdm import tqdm; tqdm.pandas()
import os.path as op
import os

df_dataset = pd.read_csv('Dataset-framing_annotations-Llama-3.3-70B-Instruct-Turbo.csv')

output_dir = f'model_training-StandardClassifierSmallLR-{model_id.split("/")[-1]}'

# induce partitions
try: os.makedirs(output_dir)
except FileExistsError: pass

In [9]:
df_dataset_ = df_dataset[['concept', 'source', 'dateTimePub', 'FRAMING_CLASS']]

test_size = .025
seed = 125

try:
    with open(op.join(output_dir, 'train_test_part.pkl'), 'rb') as file:
        partition_ids = pkl.load(file)
    train, val = partition_ids['train'], partition_ids['validation']
except FileNotFoundError:
    train, val = train_test_split(np.array(range(len(df_dataset_))), test_size=test_size, random_state=seed)
    train, val = train.squeeze(), val.squeeze()
    with open(op.join(output_dir, 'train_test_part.pkl'), 'wb') as file:
        pkl.dump({'train': train, 'validation': val}, file)

def shorten_to_n_words(text, n=1500):
    words = re.findall(r'\b\w+\b', text)
    if len(words) <= n:
        return text  # no truncation needed
    
    # Find the index where the n-th word ends
    count = 0
    end_index = len(text)
    for match in re.finditer(r'\b\w+\b', text):
        count += 1
        if count == n:
            end_index = match.end()
            break
    
    return text[:end_index].rstrip() + "[truncated]..."

def format_prompt_with_article(title, body, max_words=2000):
    body = shorten_to_n_words(body, n=max_words)
    article_input = f'Title: {title}[SEP]{body}'
    return article_input

def format_prompt_from_row(row, max_words=2000):
    return format_prompt_with_article(row.title, row.body, max_words=max_words)


In [10]:
from datasets import NamedSplit, DatasetDict, load_from_disk

try:
    ds = load_from_disk(op.join(output_dir, 'train_val_dataset.ds'))
except FileNotFoundError:
    df_dataset_['text'] = [ format_prompt_from_row(row) for row in tqdm(df_dataset.iloc, total=len(df_dataset)) ]
    
    class_order = [ 'NEUTRAL', 'LOADED', 'ALARMIST' ]
    df_dataset_['labels'] = df_dataset_.FRAMING_CLASS.progress_apply(lambda s: class_order.index(s.strip().upper()))
        
    ds_train = Dataset.from_pandas(df_dataset_.iloc[train], split=NamedSplit('train'))
    ds_val = Dataset.from_pandas(df_dataset_.iloc[val], split=NamedSplit('validation'))
    
    #assert False
    def get_max_length(dataset, tokenizer=tokenizer):
        return max(len(tokenizer(example["text"])["input_ids"]) for example in tqdm(dataset))
    
    #max_length = max(get_max_length(ds_train), get_max_length(ds_val))
    max_length = 1500
    
    print('max length of:', max_length)
    
    # Tokenize with static padding
    def tokenize_row(example, max_length=max_length, padding='max_length'):
        tok = tokenizer(example["text"], padding=padding, truncation=True, max_length=max_length)
        #print(tok['input_ids'])
        #print(len(tok['input_ids'][0]), len(tokenizer(example['text'])['input_ids'][0]))
        return tok
    
    tok_train = lambda ex: tokenize_row(ex, padding='longest')
    tok_val = tok_train # lambda ex: tokenize_row(ex, padding='max_length')
    
    ds_train = ds_train.map(tok_train, batched=True, batch_size=1, num_proc=1)
    ds_val = ds_val.map(tok_val, batched=True, batch_size=1, num_proc=1)
    
    ds = DatasetDict({'train': ds_train, 'val': ds_val})
    
    ds.save_to_disk(op.join(output_dir, 'train_val_dataset.ds'))

In [11]:
#tokenizer.decode(tok_train(ds_train[1])['input_ids'])

In [12]:
ds.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

ds_train, ds_val = ds['train'], ds['val']

#ds_train = ds['train']
lens_ = []
for ex in tqdm(ds_train):
    l = len(ex['input_ids'])
    lens_.append(l)
    if l != 1500:
        break


  0%|                                       | 2/121888 [00:00<03:34, 568.18it/s]


In [13]:
ds_train[0]['input_ids'].shape

torch.Size([1500])

In [14]:
import numpy as np
#from datasets import load_metric
import evaluate
from transformers import Trainer
import numpy as np

batch_size = 1
eval_batch_size = 1
gradient_accumulation_steps = 5

save_steps = 2_500
eval_steps = save_steps

#eval_steps = 5

Num_train_examples = len(ds_train)
optim = "paged_adamw_32bit"
learning_rate = 1e-5#.005
weight_decay= 0#.00001
gradient_checkpointing = False
warmup_steps = 1_000


num_epochs = 15
max_steps = int(Num_train_examples/(batch_size*gradient_accumulation_steps)*num_epochs)

metric = evaluate.load("accuracy")

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    max_steps=max_steps,
    max_grad_norm=50.,
    #num_train_epochs=EPOCHS,
    eval_steps=eval_steps,
    save_steps=save_steps, 
    eval_strategy="steps",
    save_strategy="steps",
    metric_for_best_model="accuracy",
    load_best_model_at_end=True,
    weight_decay=weight_decay,
    #optim=optim, 
    lr_scheduler_type='linear',
    warmup_steps=warmup_steps,
    gradient_checkpointing=gradient_checkpointing,
)


class OrdinalTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        #labels = inputs.pop("labels")
        outputs = model(**inputs)
        #logits = outputs[0][:, 0]
        #loss = torch.nn.functional.mse_loss(logits, labels)
        loss = outputs.loss
        if num_items_in_batch is not None:
            loss = loss / num_items_in_batch
        return (loss, outputs) if return_outputs else loss

def compute_per_class_metrics(preds, targets, num_classes=None):
    """
    Computes precision, recall, and F1 for each class.
    
    Args:
        preds: np.ndarray of shape (N,), predicted class indices
        targets: np.ndarray of shape (N,), ground-truth class indices
        num_classes: int, total number of classes (optional if all classes are present in data)

    Returns:
        metrics: dict with precision, recall, and f1 arrays of shape (num_classes,)
    """
    if num_classes is None:
        num_classes = max(np.max(preds), np.max(targets)) + 1
    
    precision = np.zeros(num_classes)
    recall = np.zeros(num_classes)
    f1 = np.zeros(num_classes)
    
    for cls in range(num_classes):
        tp = np.sum((preds == cls) & (targets == cls))
        fp = np.sum((preds == cls) & (targets != cls))
        fn = np.sum((preds != cls) & (targets == cls))
        #print(cls, tp, fp, fn)
    
        precision[cls] = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall[cls] = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        if precision[cls] + recall[cls] > 0:
            f1[cls] = 2 * precision[cls] * recall[cls] / (precision[cls] + recall[cls])
        else:
            f1[cls] = 0.0
    
    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

def _predict_class(logits):
    return logits.argmax(dim=-1)

def compute_metrics(eval_pred, num_classes=3):
    logits, labels = eval_pred
    #print(eval_pred)
    #print('logits', logits)
    #print('labels', labels)
    #print(logits, labels)

    #print(logits[0].shape, logits[1].shape)
    logits = logits
    #predictions = np.argmax(logits, axis=-1)
    with torch.no_grad():
        predictions = _predict_class(torch.sigmoid(torch.tensor(logits))).detach().cpu().numpy()
    
    result = metric.compute(predictions=predictions, references=labels) # dict with 'accuracy'
    # partition the labels by targets and measure accuracy for each to ensure balance
    per_class_metrics = compute_per_class_metrics(predictions, labels, num_classes=num_classes)
    for cls in range(num_classes):
        for metric_name in [ 'precision', 'recall', 'f1' ]:
            metric_label = f'class{cls}_{metric_name}'
            result[metric_label] = per_class_metrics[metric_name][cls]
    
    return result

from transformers import TrainerCallback

class EvaluateAtStepOneCallback(TrainerCallback):
    def on_step_end(self, args, state, control, **kwargs):
        if state.global_step == 1:
            control.should_evaluate = True
        return control

#model_clf.model.enable_input_requires_grad()
from torch.optim import AdamW, Adam, SGD
from transformers import get_linear_schedule_with_warmup

optimizer = AdamW([ p for p in model_clf.parameters() if p.requires_grad ], 
                  lr=learning_rate, weight_decay=weight_decay, )
#optimizer = SGD([ model_clf.clf_head.raw_thresholds, ], lr=1, weight_decay=0.)
scheduler = get_linear_schedule_with_warmup(optimizer, warmup_steps, max_steps, -1).step()



In [15]:
trainer = Trainer(
    model=model_clf,
    args=training_args,
    train_dataset=ds_train,
    eval_dataset=ds_val, #.select(range(1000)),
    compute_metrics=compute_metrics,
    #callbacks=[EvaluateAtStepOneCallback()],
    optimizers=(optimizer, scheduler), 
)

In [16]:
pre_pre = model_clf.deberta.embeddings.word_embeddings.weight.detach().cpu().to(torch.float32).numpy()
pre_pre_l = model_clf.deberta.encoder.layer[1].attention.self.query_proj.weight.detach().cpu().to(torch.float32).numpy()
#pre_pre_t = model_clf.clf_head.raw_thresholds.detach().cpu().to(torch.float32).numpy()

In [17]:
model_clf.deberta.encoder.layer[1].attention.self.query_proj.weight

Parameter containing:
tensor([[ 0.0213,  0.0244, -0.3035,  ..., -0.0818,  0.0801, -0.2236],
        [ 0.3567, -0.0761, -0.0009,  ...,  0.1059, -0.0573,  0.0448],
        [-0.1027, -0.0442,  0.0193,  ...,  0.0282,  0.2102,  0.1320],
        ...,
        [-0.0829, -0.0483, -0.0480,  ...,  0.0344,  0.0036, -0.0932],
        [ 0.0728,  0.1213, -0.1473,  ..., -0.0896, -0.0208, -0.1312],
        [ 0.1752,  0.1418,  0.1919,  ...,  0.0189,  0.1738,  0.1304]],
       device='cuda:0', requires_grad=True)

In [18]:
os.environ['WANDB_RESUME'] = 'must'
os.environ['WANDB_RUN_ID'] = 'xp5ilvcy'

In [None]:
ckpt = op.join(output_dir, 'checkpoint-70000')
trainer.train(resume_from_checkpoint=ckpt)

[34m[1mwandb[0m: Currently logged in as: [33mmatthias-lalisse[0m ([33mmatthias-lalisse-inet[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Accuracy,Class0 Precision,Class0 Recall,Class0 F1,Class1 Precision,Class1 Recall,Class1 F1,Class2 Precision,Class2 Recall,Class2 F1
72500,0.1917,0.265984,0.895074,0.94648,0.929451,0.937888,0.792026,0.857643,0.823529,0.806452,0.636943,0.711744


In [19]:
ckpt = op.join(output_dir, 'checkpoint-60000')
trainer.train(resume_from_checkpoint=ckpt)

[34m[1mwandb[0m: Currently logged in as: [33mmatthias-lalisse[0m ([33mmatthias-lalisse-inet[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Accuracy,Class0 Precision,Class0 Recall,Class0 F1,Class1 Precision,Class1 Recall,Class1 F1,Class2 Precision,Class2 Recall,Class2 F1
62500,0.172,0.293485,0.888676,0.945358,0.925663,0.935407,0.797069,0.824971,0.81078,0.678363,0.738854,0.707317
65000,0.1944,0.290214,0.889635,0.909051,0.97017,0.938617,0.845347,0.752625,0.796296,0.798165,0.55414,0.654135
67500,0.2155,0.286112,0.889635,0.914711,0.959754,0.936691,0.828784,0.779463,0.803367,0.826923,0.547771,0.659004
70000,0.2085,0.273236,0.894114,0.929564,0.949811,0.939578,0.81733,0.814469,0.815897,0.798246,0.579618,0.671587
72500,0.0,,0.675624,0.675624,1.0,0.806415,0.0,0.0,0.0,0.0,0.0,0.0
75000,0.0,,0.675624,0.675624,1.0,0.806415,0.0,0.0,0.0,0.0,0.0,0.0
77500,0.0,,0.675624,0.675624,1.0,0.806415,0.0,0.0,0.0,0.0,0.0,0.0
80000,0.0,,0.675624,0.675624,1.0,0.806415,0.0,0.0,0.0,0.0,0.0,0.0
82500,0.0,,0.675624,0.675624,1.0,0.806415,0.0,0.0,0.0,0.0,0.0,0.0
85000,0.0,,0.675624,0.675624,1.0,0.806415,0.0,0.0,0.0,0.0,0.0,0.0



KeyboardInterrupt



In [19]:
ckpt = op.join(output_dir, 'checkpoint-25000')
trainer.train(resume_from_checkpoint=ckpt)

[34m[1mwandb[0m: Currently logged in as: [33mmatthias-lalisse[0m ([33mmatthias-lalisse-inet[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Accuracy,Class0 Precision,Class0 Recall,Class0 F1,Class1 Precision,Class1 Recall,Class1 F1,Class2 Precision,Class2 Recall,Class2 F1
27500,0.2772,0.283701,0.889955,0.92086,0.953125,0.936715,0.821385,0.788798,0.804762,0.794872,0.592357,0.678832
30000,0.2458,0.317294,0.882278,0.929478,0.93608,0.932767,0.806804,0.774796,0.790476,0.664773,0.745223,0.702703
32500,0.2466,0.294688,0.888676,0.934628,0.934186,0.934407,0.80533,0.810968,0.80814,0.723684,0.700637,0.711974
35000,0.2581,0.294108,0.891875,0.922477,0.952178,0.937092,0.825455,0.794632,0.80975,0.793388,0.611465,0.690647
37500,0.2812,0.293011,0.888356,0.945631,0.922348,0.933845,0.773013,0.86231,0.815223,0.818182,0.573248,0.674157
40000,0.2562,0.296146,0.889315,0.9097,0.963542,0.935847,0.840412,0.76196,0.799266,0.821429,0.585987,0.684015
42500,0.2395,0.285557,0.895074,0.935651,0.943182,0.939401,0.807736,0.828471,0.817972,0.813559,0.611465,0.698182
45000,0.2632,0.30671,0.881318,0.949729,0.912405,0.930693,0.776561,0.827305,0.80113,0.646739,0.757962,0.697947
47500,0.2547,0.302406,0.887076,0.907877,0.965909,0.935994,0.833119,0.757293,0.793399,0.84,0.535032,0.653696
50000,0.2142,0.313085,0.888036,0.943424,0.923769,0.933493,0.781148,0.841307,0.810112,0.77037,0.66242,0.712329


KeyboardInterrupt: 

In [None]:
ckpt = op.join(output_dir, 'checkpoint-25000')
trainer.train(resume_from_checkpoint=None)

[34m[1mwandb[0m: Currently logged in as: [33mmatthias-lalisse[0m ([33mmatthias-lalisse-inet[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Accuracy,Class0 Precision,Class0 Recall,Class0 F1,Class1 Precision,Class1 Recall,Class1 F1,Class2 Precision,Class2 Recall,Class2 F1
2500,0.5252,0.541941,0.805822,0.947056,0.830019,0.884683,0.600784,0.893816,0.718574,0.0,0.0,0.0
5000,0.4534,0.493655,0.852207,0.882867,0.956439,0.918182,0.761665,0.704784,0.732121,0.888889,0.254777,0.39604
7500,0.4619,0.429608,0.84453,0.931085,0.901989,0.916306,0.742169,0.718786,0.73029,0.476,0.757962,0.584767
10000,0.4273,0.490577,0.856686,0.936064,0.887311,0.911035,0.703244,0.859977,0.773753,0.881579,0.426752,0.575107
12500,0.4096,0.444643,0.87524,0.898095,0.959754,0.927901,0.847339,0.705951,0.77021,0.670968,0.66242,0.666667
15000,0.3921,0.531862,0.852527,0.951738,0.868371,0.908146,0.702729,0.841307,0.765799,0.635838,0.700637,0.666667
17500,0.3894,0.392205,0.885157,0.929742,0.939867,0.934777,0.810386,0.782964,0.796439,0.680982,0.707006,0.69375
20000,0.371,0.443447,0.879079,0.916206,0.942235,0.929038,0.795943,0.778296,0.787021,0.784483,0.579618,0.666667
22500,0.4102,0.452266,0.870761,0.949424,0.897727,0.922852,0.736308,0.847141,0.787846,0.699301,0.636943,0.666667
25000,0.3374,0.528345,0.864363,0.95063,0.893466,0.921162,0.733542,0.819137,0.77398,0.61413,0.719745,0.662757


In [46]:
ds_val[10]['input_ids'].shape

torch.Size([1253])

In [19]:
torch_dtype

torch.float32

In [None]:
model_clf.model.get_trainable_parameters()

In [None]:
for i, p in enumerate(optimizer.param_groups[0]['params']):
    if p is model_clf.clf_head.raw_thresholds:
        state = optimizer.state[p]

In [None]:
state

In [None]:
optimizer.zero_grad()
model_clf.clf_head.raw_thresholds.grad

In [None]:
model_clf.clf_head.raw_thresholds

In [None]:
for ex in tqdm(ds_train):
    if ex['labels'] == 2: break

inputs = { k: x.unsqueeze(0).cuda() for k, x in ex.items() }

In [None]:
ex['input_ids'].shape

In [None]:
o = model_clf(**inputs)
#loss = o.loss
#loss = o.logits.sum()
loss = model_clf.clf_head.raw_thresholds.sum()
loss.backward()

with torch.no_grad():
    before = model_clf.clf_head.raw_thresholds.clone().detach().cpu().numpy()
    #loss.backward()
    optimizer.step()
    after = model_clf.clf_head.raw_thresholds.clone().detach().cpu().numpy()
    print('grad', model_clf.clf_head.raw_thresholds.grad)
    print("delta:", after - before)
    #optimizer.zero_grad()

In [None]:
for ex in tqdm(ds_train):
    if ex['labels'] == 2: break

inputs = { k: x.unsqueeze(0).cuda() for k, x in ex.items() }

o = model_clf(**inputs)
#loss = o.loss
#loss = o.logits.sum()
loss = model_clf.clf_head.raw_thresholds.sum()
loss.backward()

with torch.no_grad():
    before = model_clf.clf_head.raw_thresholds.clone().detach().cpu().numpy()
    #loss.backward()
    optimizer.step()
    after = model_clf.clf_head.raw_thresholds.clone().detach().cpu().numpy()
    print('grad', model_clf.clf_head.raw_thresholds.grad)
    print("delta:", after - before)
    #optimizer.zero_grad()

In [None]:
for group in optimizer.param_groups:
    for p in group["params"]:
        print("match:", id(p) == id(model_clf.clf_head.raw_thresholds))

In [25]:
model_clf

DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-5): 6 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): Dropout(p=0.1, inplace=False)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNo

In [None]:
optimizer.param_groups[0]

In [None]:
loss

In [None]:
(after - before)[0]

In [None]:
loss.dtype, model_clf.clf_head.raw_thresholds.data.dtype

In [None]:
model_clf.clf_head.raw_thresholds.grad

In [None]:
o.loss

In [None]:
o.loss.backward()
model_clf.clf_head.raw_thresholds.grad

In [None]:
optimizer.step()
optimizer.zero_grad()

In [None]:
model_clf.clf_head.raw_thresholds.grad

In [None]:
model_clf.clf_head.raw_thresholds

In [None]:
pre_pre_t - model_clf.clf_head.raw_thresholds.detach().cpu().to(torch.float32).numpy()

In [None]:
optimizer.param_groups

In [None]:
for p in optimizer.param_groups[0]['params']: print(p.shape)

In [None]:
optimizer == trainer.optimizer.optimizer

In [None]:
optimizer.step()

In [None]:
model_clf.clf_head.raw_thresholds.grad

In [None]:
model_clf.clf_head.raw_thresholds.detach().cpu().to(torch.float32).numpy() - post_post_t

In [None]:
model_clf.clf_head.theta

In [None]:
post_post = model_clf.model.embeddings.word_embeddings.weight.detach().cpu().to(torch.float32).numpy()
post_post_l = model_clf.model.encoder.layer[1].attention.self.query_proj.weight.detach().cpu().to(torch.float32).numpy()
post_post_t = model_clf.clf_head.raw_thresholds.detach().cpu().to(torch.float32).numpy()

pre_pre - post_post

In [None]:
post_post_t

In [None]:
pre_pre_t - post_post_t

In [None]:
np.abs(post_post_l - pre_pre_l).sum()

In [None]:
post_post_t - pre_pre_t

In [None]:
post_post_t, pre_pre_t

In [None]:
trainer.optimizer.optimizer.parameter_groups

In [None]:
model_clf.clf_head.raw_thresholds

In [None]:
model_clf

In [None]:
optimizer = trainer.optimizer

for i, group in enumerate(optimizer.param_groups):
    print(f"Group {i}: lr={group['lr']} | {len(group['params'])} params")

In [None]:
next(optimizer.param_groups)

In [None]:
group['params']

In [None]:
trainer.optimizer

In [None]:
trainer.optimizer.optimizer.parameter_groups

In [None]:
model_clf.clf_head.raw_thresholds

In [None]:
model_clf.clf_head.theta

In [None]:
ds_train[10000]['input_ids'].shape

In [None]:
trainer.train()

In [None]:
df_dataset.iloc[val].FRAMING_CLASS.value_counts()

In [None]:
deberta.gradient_checkpointing_enable()

In [None]:
targets

In [None]:
ds = load_from_disk(op.join(output_dir, 'train_val_dataset.ds'))

In [None]:
from datasets import list_metrics
list_metrics()

In [None]:
per_device_train_batch_size = 1
per_device_eval_batch_size=1
gradient_accumulation_steps = 5 # @@CH
optim = "paged_adamw_8bit"
save_steps = 500
logging_steps = 5
learning_rate = 1e-4
max_grad_norm = 0.3
max_steps = 30_000
warmup_ratio = 0.01
lr_scheduler_type = "linear"
eval_steps = 500


sft_config = SFTConfig(
    #dataset_text_field="text",
    max_seq_length=max_input_len+100,
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    fp16=True,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=True, #False,#True,
    lr_scheduler_type=lr_scheduler_type,
    gradient_checkpointing=True, # @@@@checkpoint
    report_to=None,
    do_eval=True, #@@@
    eval_strategy='steps', #@@@
    eval_steps=eval_steps,
    #load_best_model_at_end=False,
    logging_first_step=True,
    #use_cache=False, 
    #padding_free=True
)


In [None]:
ds_train

In [None]:
from datasets import NamedSplit

In [None]:
df_dataset_.input

In [None]:
df_dataset_.class

In [None]:
df_dataset.FRAMING_CLASS.value_counts()

In [None]:
df_dataset.keys()

In [None]:
clf = deberta_clf(**model_inputs)

In [None]:
clf.class_probabilities

In [None]:
deberta_clf.clf_head.theta

In [None]:
deberta_clf.clf_head.raw_thresholds = nn.Parameter(torch.tensor([-1.,2.]))

In [None]:
clf.predicted_class

In [None]:
clf.predicted_class

In [None]:
clf.class_probabilities.sum(axis=-1)

In [None]:
op.last_hidden_state[:,0,:].shape

In [None]:
import transformers.modeling_outputs

In [None]:
deberta_clf.device

In [None]:
type(deberta_clf)

In [None]:
deberta_clf