In [1]:
!pip install transformers
!pip install sentencepiece
!pip install ipywidgets
!jupyter nbextension enable --py widgetsnbextension


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


In [2]:
#!pip install protobuf==3.20.*
!pip install accelerate -U


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [3]:
import torch
import transformers
import torch.nn as nn
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm

from transformers import Trainer, TrainingArguments
from transformers import RobertaTokenizer, RobertaModel, RobertaConfig
from transformers import BertPreTrainedModel, BertModel

import pandas as pd
import numpy as np
import os
from pathlib import Path

from sklearn.metrics import classification_report

# Pick the compute device in priority order: CUDA, then Apple MPS, then CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda:0'
elif torch.backends.mps.is_available():
    DEVICE = 'mps'
else:
    DEVICE = 'cpu'
print("Device:", DEVICE)
device = torch.device(DEVICE)


Device: mps


In [4]:
# Initialization Cell
# Selects the working environment and sets `content_path` / `data_path` for it.
# NOTE(review): the following "running locally" cell reassigns both paths, so
# the values chosen here only take effect if that cell is skipped.
WORKING_ENV = 'PAPERSPACE' # Can be LABS, COLAB, PAPERSPACE, SAGEMAKER
USERNAME = '' # If working on Lab Machines - Your college username
assert WORKING_ENV in ['LABS', 'COLAB', 'PAPERSPACE', 'SAGEMAKER']

# Only the PAPERSPACE branch is implemented; every other accepted value raises.
if WORKING_ENV == 'PAPERSPACE': # Using Paperspace
    !pip install ipywidgets
    content_path = '/notebooks/'
    data_path = './data/'
    
else:
  raise NotImplementedError()

# Downstream code receives the content root as a pathlib.Path.
content_path = Path(content_path)


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [5]:
# Local run: anchor the data directory and content root at the current
# working directory.
import os
data_path = os.getcwd() + '/data/'
content_path = Path(os.getcwd())

# Setting up data and utils

In [6]:
# helper function to save predictions to an output file
def labels2file(p, outf_path):
    """Write predictions to ``outf_path``, one comma-separated line per row.

    Args:
        p: iterable of rows; each row is an iterable whose items are written
            with ``str()``.
        outf_path: destination file path; the file is created or overwritten.
    """
    with open(outf_path, 'w') as sink:
        sink.writelines(','.join(map(str, row)) + '\n' for row in p)

# Defining dataset class

In [7]:
from torch.utils.data import Dataset

# Define the custom dataset class
class PCLDataset(Dataset):
    """Dataset over a dataframe of PCL paragraphs.

    Items are plain dicts holding the raw text and its label(s); tokenization
    happens per batch in :meth:`collate_fn`.

    Args:
        tokenizer: callable invoked as ``tokenizer(texts, return_tensors='pt',
            padding=True, truncation=True, max_length=...)``; its result must
            support item assignment.
        dataframe: frame with ``text`` and ``label`` columns, plus
            ``orig_label`` when ``is_multiclass`` is true.
        is_multiclass: when true, items and batches also carry the original
            multi-class label.
        max_length: truncation length handed to the tokenizer (default 128,
            the value that was previously hard-coded).
    """

    def __init__(self, tokenizer, dataframe, is_multiclass=False, max_length=128):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.is_multiclass = is_multiclass
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``{'text', 'label'}`` for row ``idx`` (positional), plus
        ``'orig_label'`` in multi-class mode."""
        item = self.data.iloc[idx]
        # The binary 'label' is always present; multi-class mode adds
        # 'orig_label' alongside it rather than replacing it.
        dict_item = {'text': item['text'], 'label': item['label']}
        if self.is_multiclass:
            dict_item['orig_label'] = item['orig_label']
        return dict_item

    def collate_fn(self, batch):
        """Tokenize a list of items and attach label tensors.

        Returns the tokenizer output with an added ``labels`` entry (int64)
        and, in multi-class mode, an ``orig_labels`` entry (int64).
        """
        texts = [item['text'] for item in batch]
        encodings = self.tokenizer(
            texts,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=self.max_length,
        )
        encodings['labels'] = torch.tensor([item['label'] for item in batch], dtype=torch.long)
        if self.is_multiclass:
            encodings['orig_labels'] = torch.tensor(
                [item['orig_label'] for item in batch], dtype=torch.long
            )
        return encodings


# Custom Roberta

In [8]:
from transformers import RobertaModel, RobertaPreTrainedModel

class RoBERTaForPCL(RobertaPreTrainedModel):
    """RoBERTa encoder followed by dropout and one linear classification head.

    The head produces a single logit in binary mode and five logits when
    ``is_multiclass`` is true. ``forward`` returns the raw logits tensor only:
    no loss is computed and the ``labels`` argument is accepted but unused.

    Args:
        config: model configuration; ``config.hidden_size`` sizes the head.
        dropout_rate: dropout probability applied before the head.
        num_frozen_layers: when positive, the embeddings and the first
            ``num_frozen_layers`` encoder layers get ``requires_grad=False``.
        is_multiclass: selects the five-logit head instead of the one-logit head.
    """

    def __init__(self, config, dropout_rate=0.1, num_frozen_layers=0, is_multiclass=False):
        super().__init__(config)
        self.roberta = RobertaModel(config)
        self.dropout = torch.nn.Dropout(dropout_rate)
        self.is_multiclass = is_multiclass
        self.classifier = torch.nn.Linear(config.hidden_size, 5 if is_multiclass else 1)

        if num_frozen_layers > 0:
            # Any positive count freezes the embeddings plus that many of the
            # bottom-most transformer layers.
            frozen_modules = [self.roberta.embeddings, *self.roberta.encoder.layer[:num_frozen_layers]]
            for module in frozen_modules:
                module.requires_grad_(False)

        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        """Run the encoder and return the classifier logits."""
        encoder_outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # Element 1 of the encoder output is used as the pooled representation.
        return self.classifier(self.dropout(encoder_outputs[1]))


In [9]:
from transformers import AdamW
def get_groupwise_lr_decay_optimizer(model, learning_rate=1e-5, weight_decay=0.01, lr_decay=0.95, num_groups=3):
    """Build an AdamW optimizer whose learning rate decays towards the input.

    The encoder layers are split into ``num_groups`` consecutive groups. The
    top group, the pooler and the classifier train at ``learning_rate``; each
    group below is scaled by one more factor of ``lr_decay``, and the
    embeddings by ``lr_decay ** num_groups``.

    Args:
        model: object exposing ``roberta.embeddings``, ``roberta.encoder.layer``
            (a sequence of modules), ``roberta.pooler`` and ``classifier``.
        learning_rate: learning rate of the top-most groups.
        weight_decay: weight decay applied to every parameter group.
        lr_decay: multiplicative decay applied per group going down the stack.
        num_groups: requested number of encoder-layer groups. Values larger
            than the number of layers are clamped to one layer per group.

    Returns:
        A ``torch.optim.AdamW`` with ``num_groups + 3`` parameter groups, in
        the order embeddings, encoder groups (bottom to top), pooler,
        classifier.

    Raises:
        ValueError: if ``num_groups`` is smaller than 1.
    """
    # transformers' own AdamW is deprecated in favour of the PyTorch
    # implementation; import locally so this function does not depend on it.
    from torch.optim import AdamW

    if num_groups < 1:
        raise ValueError(f"num_groups must be >= 1, got {num_groups}")

    encoder_layers = model.roberta.encoder.layer
    n_layers = len(encoder_layers)
    # With more groups than layers the per-group slices would run past the
    # last layer (IndexError), so fall back to one layer per group.
    num_groups = max(1, min(num_groups, n_layers))
    layers_per_group = max(n_layers // num_groups, 1)

    grouped_parameters = [{
        "params": list(model.roberta.embeddings.parameters()),
        'lr': learning_rate * (lr_decay ** num_groups),
    }]

    for group_idx in range(num_groups):
        start_layer = group_idx * layers_per_group
        # The last group absorbs any remainder layers.
        end_layer = start_layer + layers_per_group if group_idx < num_groups - 1 else n_layers
        group_params = [
            param
            for layer_idx in range(start_layer, end_layer)
            for param in encoder_layers[layer_idx].parameters()
        ]
        grouped_parameters.append({
            "params": group_params,
            'lr': learning_rate * (lr_decay ** (num_groups - 1 - group_idx)),
        })

    # Pooler and classifier sit on top of the stack and use the full rate.
    grouped_parameters.append({"params": list(model.roberta.pooler.parameters()), 'lr': learning_rate})
    grouped_parameters.append({"params": list(model.classifier.parameters()), 'lr': learning_rate})

    # eps=1e-6 mirrors the default of the previously used transformers.AdamW
    # (torch's own default is 1e-8).
    return AdamW(grouped_parameters, lr=learning_rate, weight_decay=weight_decay, eps=1e-6)


# Trainer and evaluation function definition

In [10]:
def evaluate(model, tokenizer, data_loader):
    """Score ``model`` on ``data_loader`` as a binary PCL classifier.

    Args:
        model: module returning logits and exposing ``is_multiclass``. In
            multi-class mode predicted classes 0-1 map to "Not PCL" and 2-4 to
            "PCL"; in binary mode the single logit is thresholded at 0.5 after
            a sigmoid.
        tokenizer: callable used to encode ``batch['text']``.
        data_loader: iterable of batches with a ``'text'`` entry and a
            ``'label'`` tensor of binary labels.

    Returns:
        The ``classification_report`` dictionary (``output_dict=True``) with
        target names ``"Not PCL"`` and ``"PCL"``.

    Side effects:
        Switches the model to eval mode for scoring and to train mode on exit.
    """
    model.eval()
    # Run on whatever device the model lives on instead of relying on a
    # notebook-level global.
    model_device = next(model.parameters()).device
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating"):
            inputs = tokenizer(batch['text'], return_tensors='pt', padding=True, truncation=True, max_length=128)
            inputs = {k: v.to(model_device) for k, v in inputs.items()}
            outputs = model(**inputs)

            if model.is_multiclass:
                # Collapse the five classes to binary: {0, 1} -> 0, {2, 3, 4} -> 1.
                preds = (outputs.argmax(dim=1) >= 2).long()
            else:
                # view(-1) keeps the result 1-D even for a batch of one; a bare
                # squeeze() would give a 0-d tensor whose tolist() is a scalar
                # and break list.extend below.
                probs = torch.sigmoid(outputs).view(-1)
                preds = (probs > 0.5).long()

            all_preds.extend(preds.cpu().tolist())
            # Labels are only needed for the report, so they are never moved
            # to the accelerator.
            all_labels.extend(batch['label'].view(-1).cpu().tolist())

    report = classification_report(all_labels, all_preds, target_names=["Not PCL", "PCL"], output_dict=True, zero_division=0)
    model.train()
    return report


In [11]:
import wandb

class PCLTrainer(Trainer):
    """Trainer with a task-specific loss, optimizer hook and evaluation.

    Args:
        optimizer_function: optional callable ``model -> optimizer``. When
            given, its result is installed as ``self.optimizer`` right after
            the base constructor runs.
        *args, **kwargs: forwarded unchanged to ``Trainer``.
    """

    def __init__(self, *args, optimizer_function=None, **kwargs):
        super().__init__(*args, **kwargs)
        if optimizer_function:
            self.optimizer = optimizer_function(self.model)

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the training loss for one batch.

        Multi-class mode uses cross-entropy against ``orig_labels``; binary
        mode uses BCE-with-logits against ``labels``. The consumed label entry
        is removed from ``inputs`` before the forward pass.

        ``num_items_in_batch`` is accepted because newer ``Trainer`` versions
        pass it as a keyword; it is not used by either loss.
        """
        if model.is_multiclass:
            loss_labels = inputs.pop("orig_labels")
            outputs = model(**inputs)
            loss = nn.CrossEntropyLoss()(outputs, loss_labels)
        else:
            labels = inputs.pop("labels")
            outputs = model(**inputs)
            # Flatten the (batch, 1) logits and the labels to matching 1-D shapes.
            loss = nn.BCEWithLogitsLoss()(outputs.view(-1), labels.float().view(-1))
        return (loss, outputs) if return_outputs else loss

    def evaluate(self, ignore_keys=None):
        """Evaluate on ``self.eval_dataset`` and log PCL-class metrics.

        ``self.eval_dataset`` is handed directly to the module-level
        ``evaluate`` as its batch iterable. ``ignore_keys`` is accepted for
        signature compatibility and is not used.

        Returns:
            ``{"eval_f1": <F1 of the "PCL" class>}``.
        """
        eval_results = evaluate(self.model, self.tokenizer, self.eval_dataset)
        pcl_scores = eval_results['PCL']
        f1_score = pcl_scores['f1-score']
        precision = pcl_scores['precision']
        recall = pcl_scores['recall']
        accuracy = eval_results['accuracy']

        print(f"Accuracy: {accuracy}, F1 Score: {f1_score}, Precision: {precision}, Recall: {recall}")

        # Log the results with wandb
        wandb.log({"eval_f1": f1_score, "precision": precision, "recall": recall, "accuracy": accuracy})
        return {"eval_f1": f1_score}

## Weights and Biases Hyperparameter Search

In [12]:
from dont_patronize_me import DontPatronizeMe


def _build_split_frame(data, par_ids):
    """Return one row per id in ``par_ids`` (same order) looked up in ``data``."""
    par_ids = list(par_ids)
    # Index the frame once instead of scanning it for every id; keeping the
    # first duplicate matches a "first matching row" lookup.
    lookup = data.drop_duplicates(subset='par_id').set_index('par_id')
    missing = [pid for pid in par_ids if pid not in lookup.index]
    if missing:
        # IndexError is what a failed positional lookup raised before.
        raise IndexError(f"par_id(s) not found in task-1 data: {missing[:5]}")
    selected = lookup.loc[par_ids]
    return pd.DataFrame({
        'par_id': par_ids,
        'community': selected['keyword'].to_numpy(),
        'text': selected['text'].to_numpy(),
        'label': selected['label'].to_numpy(),
        'orig_label': selected['orig_label'].astype('int64').to_numpy(),
    })


def load_datasets(downsample=False):
    """Load the internal train/dev splits of the task-1 data.

    Reads the task-1 frame through ``DontPatronizeMe('.', '.')`` and the split
    ids from ``internal_train_par_ids.csv`` / ``internal_dev_par_ids.csv`` in
    the working directory.

    Args:
        downsample: when true, the training frame keeps every ``label == 1``
            row followed by the first ``2 * n_positive`` ``label == 0`` rows
            (in split order, not a random sample).

    Returns:
        ``(train_df, dev_df)``, each with columns ``par_id``, ``community``,
        ``text``, ``label`` and ``orig_label`` (integer).

    Raises:
        IndexError: if a split id is absent from the task-1 data.
    """
    dpm = DontPatronizeMe('.', '.')
    dpm.load_task1()
    # Split ids are compared against the task-1 `par_id` column as strings.
    train_ids = pd.read_csv('internal_train_par_ids.csv').par_id.astype(str)
    dev_ids = pd.read_csv('internal_dev_par_ids.csv').par_id.astype(str)

    data = dpm.train_task1_df

    trdf1 = _build_split_frame(data, train_ids)

    if downsample:
        # Keep all positives and cap the negatives at twice their number.
        pcldf = trdf1[trdf1.label == 1]
        npos = len(pcldf)
        trdf1 = pd.concat([pcldf, trdf1[trdf1.label == 0][:npos * 2]])

    tedf1 = _build_split_frame(data, dev_ids)

    return trdf1, tedf1



In [13]:
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
train_df, dev_df = load_datasets(downsample=True)

In [14]:
import wandb
os.environ["WANDB_NOTEBOOK_NAME"] = "roberta_finetuning.ipynb"
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33malan-picucci[0m ([33malans-team[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

In [15]:
# Search space for the Weights & Biases hyperparameter sweep.
sweep_config = {
    'method': 'bayes',
    'metric': {
        # Must match a key that is actually logged: the trainer logs the
        # PCL-class F1 as "eval_f1", never as plain "f1".
        'name': 'eval_f1',
        'goal': 'maximize'
    },
    'parameters': {
        'num_train_epochs': {
            'values': [3, 5, 10]
        },
        'learning_rate': {
            'values': [5e-5, 1e-5, 5e-4, 1e-4]
        },
        'per_device_train_batch_size': {
            'values': [16, 32, 64]
        },
        # Number of bottom encoder layers (plus embeddings) to freeze.
        'frozen_layers': {
            'values': [0, 1, 4, 8, 10]
        },
        'dropout_rate': {
            'values': [0, 0.1, 0.3, 0.5]
        },
        'weight_decay': {
            'values': [0, 0.01, 0.001, 0.0001]
        },
        'scheduler': {
            'values': ['linear', 'cosine']
        },
        # Per-group learning-rate decay factor and number of encoder groups.
        'lr_decay': {
            'values': [0.8, 0.85, 0.9, 0.95, 0.99]
        },
        'num_groups': {
            'values': [1, 2, 3, 4, 6, 12]
        },
        'is_multiclass': {
            'values': [False, True]
        }
    }
}


In [16]:
def tune_hyperparameters(config=None):
    """Train and evaluate one sweep configuration inside a wandb run.

    Reads the hyperparameters from ``wandb.config``, fine-tunes a
    ``RoBERTaForPCL`` on the notebook-level ``train_df``, evaluates on
    ``dev_df`` after every epoch, reloads the best checkpoint (by
    ``eval_f1``) and saves it under ``./best_model/<run name>``.

    Args:
        config: optional default configuration forwarded to ``wandb.init``.
    """
    # Initialize a new wandb run
    with wandb.init(config=config) as run:
        config = wandb.config
        torch.manual_seed(6)

        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

        train_set = PCLDataset(tokenizer, train_df, is_multiclass=config.is_multiclass)
        dev_set_PCL = PCLDataset(tokenizer, dev_df, is_multiclass=config.is_multiclass)
        # The dev set is wrapped in a plain DataLoader (default collation)
        # because the custom evaluation tokenizes raw text per batch.
        dev_set = DataLoader(dev_set_PCL, batch_size=32)

        training_args = TrainingArguments(
            # One directory per run: sharing a single output_dir across sweep
            # runs makes checkpoints with the same step number collide.
            # Each run keeps its retained checkpoints on disk; clean up
            # ./results after the sweep if space is tight.
            output_dir=os.path.join('./results', run.id),
            num_train_epochs=config.num_train_epochs,
            learning_rate=config.learning_rate,
            per_device_train_batch_size=config.per_device_train_batch_size,
            weight_decay=config.weight_decay,
            lr_scheduler_type=config.scheduler,
            overwrite_output_dir=True,
            evaluation_strategy="epoch",
            report_to="wandb",
            run_name="roberta-finetuning-test",
            remove_unused_columns=False,
            logging_strategy='epoch',
            load_best_model_at_end=True,
            metric_for_best_model="eval_f1",
            greater_is_better=True,
            save_strategy="epoch",
            save_total_limit=1
        )

        model = RoBERTaForPCL.from_pretrained(
            'roberta-base',
            dropout_rate=config.dropout_rate,
            num_frozen_layers=config.frozen_layers,
            is_multiclass=config.is_multiclass,
        ).to(device)

        def optimizer_function(model):
            return get_groupwise_lr_decay_optimizer(
                model,
                learning_rate=config.learning_rate,
                weight_decay=config.weight_decay,
                lr_decay=config.lr_decay,
                num_groups=config.num_groups,
            )

        print(f"The model has {sum(p.numel() for p in model.parameters() if p.requires_grad)} trainable parameters")
        # Initialize Trainer
        trainer = PCLTrainer(
            model=model,
            args=training_args,
            train_dataset=train_set,
            eval_dataset=dev_set,
            data_collator=train_set.collate_fn,
            tokenizer=tokenizer,
            optimizer_function=optimizer_function
        )
        # Train the model
        trainer.train()
        # Evaluate the (reloaded best) model
        results = trainer.evaluate()

        # The summary belongs to this run only, so 'best_f1' is normally unset
        # here: the model is saved whenever its F1 is above zero, not only
        # when it beats earlier sweep runs.
        if results["eval_f1"] > run.summary.get('best_f1', 0):
            run.summary['best_f1'] = results["eval_f1"]
            model_path = os.path.join('./best_model', run.name)
            model.save_pretrained(model_path)

In [17]:
#sweep_id = wandb.sweep(sweep=sweep_config, project="NLP_CW_NEW")

In [18]:
wandb.agent(sweep_id="yn77y8ik", function=tune_hyperparameters, count=3, project="NLP_CW_NEW")

[34m[1mwandb[0m: Agent Starting Run: 2gefh30q with config:
[34m[1mwandb[0m: 	dropout_rate: 0.1
[34m[1mwandb[0m: 	frozen_layers: 10
[34m[1mwandb[0m: 	is_multiclass: False
[34m[1mwandb[0m: 	learning_rate: 5e-05
[34m[1mwandb[0m: 	lr_decay: 0.85
[34m[1mwandb[0m: 	num_groups: 3
[34m[1mwandb[0m: 	num_train_epochs: 5
[34m[1mwandb[0m: 	per_device_train_batch_size: 64
[34m[1mwandb[0m: 	scheduler: cosine
[34m[1mwandb[0m: 	weight_decay: 0.01


Some weights of RoBERTaForPCL were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


The model has 14767105 trainable parameters


  0%|          | 0/140 [00:00<?, ?it/s]

{'loss': 0.6514, 'grad_norm': 0.34549185633659363, 'learning_rate': 2.777406404176286e-05, 'epoch': 1.0}


Evaluating:   0%|          | 0/66 [00:00<?, ?it/s]

Accuracy: 0.9049665711556829, F1 Score: 0.0, Precision: 0.0, Recall: 0.0
{'loss': 0.5661, 'grad_norm': 1.588516116142273, 'learning_rate': 2.0097501541762864e-05, 'epoch': 2.0}


Evaluating:   0%|          | 0/66 [00:00<?, ?it/s]

Accuracy: 0.9106972301814709, F1 Score: 0.42813455657492355, Precision: 0.546875, Recall: 0.35175879396984927
{'loss': 0.4082, 'grad_norm': 6.188845634460449, 'learning_rate': 1.0608748458237135e-05, 'epoch': 3.0}


Evaluating:   0%|          | 0/66 [00:00<?, ?it/s]

Accuracy: 0.8576886341929322, F1 Score: 0.4808362369337979, Precision: 0.368, Recall: 0.6934673366834171
{'loss': 0.3708, 'grad_norm': 1.6785045862197876, 'learning_rate': 2.9321859582371364e-06, 'epoch': 4.0}


Evaluating:   0%|          | 0/66 [00:00<?, ?it/s]

Accuracy: 0.8782234957020058, F1 Score: 0.5124282982791587, Precision: 0.41358024691358025, Recall: 0.6733668341708543
{'loss': 0.3595, 'grad_norm': 5.092230796813965, 'learning_rate': 0.0, 'epoch': 5.0}


Evaluating:   0%|          | 0/66 [00:00<?, ?it/s]

Accuracy: 0.8605539637058262, F1 Score: 0.4859154929577465, Precision: 0.37398373983739835, Recall: 0.6934673366834171
{'train_runtime': 173.061, 'train_samples_per_second': 51.571, 'train_steps_per_second': 0.809, 'train_loss': 0.47116973740713936, 'epoch': 5.0}


Evaluating:   0%|          | 0/66 [00:00<?, ?it/s]

Accuracy: 0.8782234957020058, F1 Score: 0.5124282982791587, Precision: 0.41358024691358025, Recall: 0.6733668341708543


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▇█▁▄▁▄
eval_f1,▁▇████
precision,▁█▆▆▆▆
recall,▁▅████
train/epoch,▁▃▅▆██
train/global_step,▁▁▃▃▅▅▆▆████
train/grad_norm,▁▂█▃▇
train/learning_rate,█▆▄▂▁
train/loss,█▆▂▁▁
train/total_flos,▁

0,1
accuracy,0.87822
best_f1,0.51243
eval_f1,0.51243
precision,0.41358
recall,0.67337
train/epoch,5.0
train/global_step,140.0
train/grad_norm,5.09223
train/learning_rate,0.0
train/loss,0.3595


[34m[1mwandb[0m: Agent Starting Run: dmcfnzg6 with config:
[34m[1mwandb[0m: 	dropout_rate: 0.5
[34m[1mwandb[0m: 	frozen_layers: 8
[34m[1mwandb[0m: 	is_multiclass: False
[34m[1mwandb[0m: 	learning_rate: 5e-05
[34m[1mwandb[0m: 	lr_decay: 0.8
[34m[1mwandb[0m: 	num_groups: 1
[34m[1mwandb[0m: 	num_train_epochs: 5
[34m[1mwandb[0m: 	per_device_train_batch_size: 16
[34m[1mwandb[0m: 	scheduler: cosine
[34m[1mwandb[0m: 	weight_decay: 0


Some weights of RoBERTaForPCL were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


The model has 28942849 trainable parameters


  0%|          | 0/560 [00:00<?, ?it/s]

{'loss': 0.5316, 'grad_norm': 13.026575088500977, 'learning_rate': 3.6180339887498953e-05, 'epoch': 1.0}


Evaluating:   0%|          | 0/66 [00:00<?, ?it/s]

Checkpoint destination directory ./results/checkpoint-112 already exists and is non-empty. Saving will proceed but saved results may be invalid.


Accuracy: 0.7144221585482331, F1 Score: 0.37184873949579833, Precision: 0.2350597609561753, Recall: 0.8894472361809045
{'loss': 0.3615, 'grad_norm': 6.944854736328125, 'learning_rate': 2.618033988749895e-05, 'epoch': 2.0}


Evaluating:   0%|          | 0/66 [00:00<?, ?it/s]

Accuracy: 0.8815663801337154, F1 Score: 0.5320754716981132, Precision: 0.4259818731117825, Recall: 0.7085427135678392
{'loss': 0.2639, 'grad_norm': 2.829892158508301, 'learning_rate': 1.3819660112501054e-05, 'epoch': 3.0}


Evaluating:   0%|          | 0/66 [00:00<?, ?it/s]

Accuracy: 0.8514804202483286, F1 Score: 0.505564387917329, Precision: 0.3697674418604651, Recall: 0.7989949748743719
{'loss': 0.1882, 'grad_norm': 11.194293975830078, 'learning_rate': 3.819660112501053e-06, 'epoch': 4.0}


Evaluating:   0%|          | 0/66 [00:00<?, ?it/s]

Accuracy: 0.8806112702960841, F1 Score: 0.5335820895522388, Precision: 0.42433234421364985, Recall: 0.7185929648241206
{'loss': 0.1371, 'grad_norm': 7.966089725494385, 'learning_rate': 0.0, 'epoch': 5.0}


Evaluating:   0%|          | 0/66 [00:00<?, ?it/s]

Accuracy: 0.8629417382999045, F1 Score: 0.5192629815745393, Precision: 0.38944723618090454, Recall: 0.7788944723618091
{'train_runtime': 211.9238, 'train_samples_per_second': 42.114, 'train_steps_per_second': 2.642, 'train_loss': 0.2964510083198547, 'epoch': 5.0}


Evaluating:   0%|          | 0/66 [00:00<?, ?it/s]

Accuracy: 0.8806112702960841, F1 Score: 0.5335820895522388, Precision: 0.42433234421364985, Recall: 0.7185929648241206


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁█▇█▇█
eval_f1,▁█▇█▇█
precision,▁█▆█▇█
recall,█▁▅▁▄▁
train/epoch,▁▃▅▆██
train/global_step,▁▁▃▃▅▅▆▆████
train/grad_norm,█▄▁▇▅
train/learning_rate,█▆▄▂▁
train/loss,█▅▃▂▁
train/total_flos,▁

0,1
accuracy,0.88061
best_f1,0.53358
eval_f1,0.53358
precision,0.42433
recall,0.71859
train/epoch,5.0
train/global_step,560.0
train/grad_norm,7.96609
train/learning_rate,0.0
train/loss,0.1371


[34m[1mwandb[0m: Agent Starting Run: 84bpamhg with config:
[34m[1mwandb[0m: 	dropout_rate: 0.3
[34m[1mwandb[0m: 	frozen_layers: 10
[34m[1mwandb[0m: 	is_multiclass: True
[34m[1mwandb[0m: 	learning_rate: 1e-05
[34m[1mwandb[0m: 	lr_decay: 0.95
[34m[1mwandb[0m: 	num_groups: 12
[34m[1mwandb[0m: 	num_train_epochs: 3
[34m[1mwandb[0m: 	per_device_train_batch_size: 64
[34m[1mwandb[0m: 	scheduler: cosine
[34m[1mwandb[0m: 	weight_decay: 0.0001


Some weights of RoBERTaForPCL were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


The model has 14770181 trainable parameters


  0%|          | 0/84 [00:00<?, ?it/s]

{'loss': 1.439, 'grad_norm': 3.554593086242676, 'learning_rate': 4.052700657469775e-06, 'epoch': 1.0}


Evaluating:   0%|          | 0/66 [00:00<?, ?it/s]

Accuracy: 0.9049665711556829, F1 Score: 0.0, Precision: 0.0, Recall: 0.0
{'loss': 1.2804, 'grad_norm': 2.181725025177002, 'learning_rate': 1.3509002191565924e-06, 'epoch': 2.0}


Evaluating:   0%|          | 0/66 [00:00<?, ?it/s]

Accuracy: 0.9049665711556829, F1 Score: 0.0, Precision: 0.0, Recall: 0.0
{'loss': 1.2164, 'grad_norm': 4.125517845153809, 'learning_rate': 0.0, 'epoch': 3.0}


Evaluating:   0%|          | 0/66 [00:00<?, ?it/s]

Accuracy: 0.9049665711556829, F1 Score: 0.0, Precision: 0.0, Recall: 0.0
{'train_runtime': 102.0871, 'train_samples_per_second': 52.455, 'train_steps_per_second': 0.823, 'train_loss': 1.311917395818801, 'epoch': 3.0}


Evaluating:   0%|          | 0/66 [00:00<?, ?it/s]

Accuracy: 0.9049665711556829, F1 Score: 0.0, Precision: 0.0, Recall: 0.0


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁▁▁▁
eval_f1,▁▁▁▁
precision,▁▁▁▁
recall,▁▁▁▁
train/epoch,▁▅██
train/global_step,▁▁▅▅████
train/grad_norm,▆▁█
train/learning_rate,█▃▁
train/loss,█▃▁
train/total_flos,▁

0,1
accuracy,0.90497
eval_f1,0.0
precision,0.0
recall,0.0
train/epoch,3.0
train/global_step,84.0
train/grad_norm,4.12552
train/learning_rate,0.0
train/loss,1.2164
train/total_flos,348225041790798.0
