In [None]:
%%capture
! pip install transformers datasets evaluate
! pip install accelerate
! pip install --upgrade accelerate
! pip install huggingface_hub
! pip install wandb

In [None]:
from huggingface_hub import notebook_login

notebook_login()

In [None]:
import wandb
wandb.login()

%env WANDB_PROJECT=distilBERT_finetuning
%env WANDB_LOG_MODEL=true

[34m[1mwandb[0m: Currently logged in as: [33mchristian-159[0m. Use [1m`wandb login --relogin`[0m to force relogin


env: WANDB_PROJECT=distilBERT_finetuning
env: WANDB_LOG_MODEL=true


In [None]:
from datasets import load_dataset, Dataset, ClassLabel
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate
import numpy as np
import accelerate
from transformers import DataCollatorWithPadding

# Load data
labels = ClassLabel(num_classes=3, names=["negative", "neutral", "positive"])
id2label = {0: "negative", 1: "neutral", 2: "positive"}
label2id = {"negative": 0, "neutral": 1, "positive": 2}


# Load dataset
dataset = load_dataset("csv", data_files="/content/drive/MyDrive/masterProject/av_train.csv")
dataset = dataset.rename_column("finBERT", "label")
dataset = dataset.rename_column("summary", "text")

# Split into 80% training and 20% validation
dataset = dataset["train"].train_test_split(train_size=0.8)

# Tokenize dataset using distilbert-base-uncased
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def tokenize_function(examples):
    examples["label"] = labels.str2int(examples["label"])
    return tokenizer(examples["text"], padding=True, truncation=True)


tokenized_train = dataset["train"].map(tokenize_function, batched=True)
tokenized_test = dataset["test"].map(tokenize_function, batched=True)

# Convert to PyTorch tensors for faster training
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/3200 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

In [None]:
def model_init():
  model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", id2label=id2label, label2id=label2id, num_labels=3)
  return model

In [None]:
sweep_config = {
    'method': 'grid',
    'name': 'distilBERT_finetuning',
    'metric': {
        'goal': 'maximize',
        'name': 'eval/f1'
        },
    'parameters': {
        'per_device_train_batch_size': {
            'values': [32, 64, 128]
            },
        'learning_rate': {
            'values': [1e-5, 3e-5, 5e-5]
        },
        'weight_decay': {
            'values': [0.0, 0.25]
        }
     },
    'early_terminate': {
        'type': 'hyperband',
        'min_iter': 5
    }
}

In [None]:
sweep_id = wandb.sweep(sweep_config, project='distilBERT_finetuning')

Create sweep with ID: uahb15q7
Sweep URL: https://wandb.ai/christian-159/distilBERT_finetuning/sweeps/uahb15q7


In [1]:
def compute_metrics(eval_preds):
  metrics = dict()

  accuracy_metric = evaluate.load("accuracy")
  f1_metric = evaluate.load("f1")


  logits = eval_preds.predictions
  labels = eval_preds.label_ids
  preds = np.argmax(logits, axis=-1)

  metrics.update(accuracy_metric.compute(predictions=preds, references=labels))
  metrics.update(f1_metric.compute(predictions=preds, references=labels, average='weighted'))

  return metrics



def train(config=None):
  with wandb.init(config=config):
    # set sweep configuration
    config = wandb.config


    # set training arguments
    training_args = TrainingArguments(
        output_dir="distilBERT_finetuning",
        learning_rate=config.learning_rate,
        per_device_train_batch_size=config.per_device_train_batch_size,
        per_device_eval_batch_size=16,
        num_train_epochs=1,
        weight_decay=config.weight_decay,
        evaluation_strategy="steps",
        report_to="wandb",
        eval_steps=25,
        max_steps = 200,
        save_steps = 0,
        load_best_model_at_end=True,
        logging_steps=1
    )


    # define training loop
    trainer = Trainer(
        model_init=model_init,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_test,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )


    # start training loop
    trainer.train()

In [None]:
wandb.agent(sweep_id, train)

[34m[1mwandb[0m: Agent Starting Run: xh8a03lp with config:
[34m[1mwandb[0m: 	learning_rate: 1e-05
[34m[1mwandb[0m: 	per_device_train_batch_size: 32
[34m[1mwandb[0m: 	weight_decay: 0


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy,F1
25,1.0568,1.052234,0.42375,0.258076
50,0.9393,0.952162,0.58,0.503611
75,0.8122,0.809131,0.7175,0.6909
100,0.7575,0.71085,0.74,0.726773
125,0.6117,0.647887,0.75375,0.741447
150,0.5218,0.613825,0.77875,0.774719
175,0.6462,0.594484,0.78125,0.776899
200,0.4918,0.590579,0.77875,0.774286


0,1
eval/accuracy,▁▄▇▇▇███
eval/f1,▁▄▇▇████
eval/loss,█▆▄▃▂▁▁▁
eval/runtime,█▁▁▁▁▄▂▁
eval/samples_per_second,▁▇█▇█▄▇█
eval/steps_per_second,▁▇█▇█▄▇█
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/learning_rate,████▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁
train/loss,██▇██▇▇▇▇▆▇▆▇▅▅▄▆▄▄▃▄▄▂▃▂▂▂▃▂▂▂▃▁▂▁▃▁▁▂▁

0,1
eval/accuracy,0.77875
eval/f1,0.77429
eval/loss,0.59058
eval/runtime,3.8165
eval/samples_per_second,209.615
eval/steps_per_second,13.101
train/epoch,2.0
train/global_step,200.0
train/learning_rate,0.0
train/loss,0.4918


[34m[1mwandb[0m: Agent Starting Run: 1d9md36s with config:
[34m[1mwandb[0m: 	learning_rate: 1e-05
[34m[1mwandb[0m: 	per_device_train_batch_size: 32
[34m[1mwandb[0m: 	weight_decay: 0.25


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy,F1
25,1.0567,1.052248,0.42375,0.258076
50,0.9393,0.95217,0.58,0.503484
75,0.8122,0.809214,0.71875,0.691987
100,0.7577,0.711014,0.74,0.726773
125,0.6115,0.648125,0.75375,0.741447
150,0.5222,0.614106,0.77875,0.774719
175,0.6458,0.594745,0.78125,0.776899
200,0.4918,0.590833,0.7775,0.772887


0,1
eval/accuracy,▁▄▇▇▇███
eval/f1,▁▄▇▇████
eval/loss,█▆▄▃▂▁▁▁
eval/runtime,▁▃▂▃▃▇▇█
eval/samples_per_second,█▆▆▆▆▂▂▁
eval/steps_per_second,█▆▇▆▆▂▂▁
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/learning_rate,████▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁
train/loss,██▇██▇▇▇▇▆▇▆▇▅▅▄▆▄▄▃▄▄▂▃▂▂▂▃▂▂▂▃▁▂▁▃▁▁▂▁

0,1
eval/accuracy,0.7775
eval/f1,0.77289
eval/loss,0.59083
eval/runtime,3.8453
eval/samples_per_second,208.047
eval/steps_per_second,13.003
train/epoch,2.0
train/global_step,200.0
train/learning_rate,0.0
train/loss,0.4918


[34m[1mwandb[0m: Agent Starting Run: tiqz2qvq with config:
[34m[1mwandb[0m: 	learning_rate: 1e-05
[34m[1mwandb[0m: 	per_device_train_batch_size: 64
[34m[1mwandb[0m: 	weight_decay: 0


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy,F1
25,1.028,1.035053,0.45,0.31205
50,0.8818,0.866572,0.71375,0.689946
75,0.699,0.704664,0.75875,0.748799
100,0.6822,0.612762,0.77125,0.762123
125,0.6099,0.555349,0.805,0.802196
150,0.5121,0.530686,0.80625,0.805273
175,0.4609,0.514426,0.81,0.808335
200,0.4369,0.511185,0.8,0.798136


0,1
eval/accuracy,▁▆▇▇████
eval/f1,▁▆▇▇████
eval/loss,█▆▄▂▂▁▁▁
eval/runtime,▄▁▅▂▁█▁▁
eval/samples_per_second,▄█▃▇█▁██
eval/steps_per_second,▄█▃▇█▁██
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/learning_rate,████▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁
train/loss,████▇█▇▇▇▆▆▅▅▅▄▄▄▄▃▃▃▃▃▂▄▃▂▂▃▂▃▁▃▁▂▂▂▁▃▂

0,1
eval/accuracy,0.8
eval/f1,0.79814
eval/loss,0.51118
eval/runtime,3.5232
eval/samples_per_second,227.063
eval/steps_per_second,14.191
train/epoch,4.0
train/global_step,200.0
train/learning_rate,0.0
train/loss,0.4369


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: ecx6ulv4 with config:
[34m[1mwandb[0m: 	learning_rate: 1e-05
[34m[1mwandb[0m: 	per_device_train_batch_size: 64
[34m[1mwandb[0m: 	weight_decay: 0.25


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy,F1
25,1.028,1.035008,0.4525,0.317063
50,0.8818,0.866543,0.71375,0.689946
75,0.6988,0.704613,0.75875,0.748799
100,0.6822,0.612842,0.77125,0.762123
125,0.6099,0.555405,0.805,0.802196
150,0.5123,0.53073,0.805,0.804063
175,0.4609,0.514515,0.80875,0.807115
200,0.4368,0.511236,0.8,0.798136


0,1
eval/accuracy,▁▆▇▇████
eval/f1,▁▆▇▇████
eval/loss,█▆▄▂▂▁▁▁
eval/runtime,▁█▅▁▃▄▁▅
eval/samples_per_second,█▁▄█▆▅█▄
eval/steps_per_second,█▁▄█▆▅█▄
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/learning_rate,████▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁
train/loss,████▇█▇▇▇▆▆▅▅▅▄▄▄▄▃▃▃▃▃▂▄▃▂▂▃▂▃▁▃▁▂▂▂▁▃▂

0,1
eval/accuracy,0.8
eval/f1,0.79814
eval/loss,0.51124
eval/runtime,3.8791
eval/samples_per_second,206.231
eval/steps_per_second,12.889
train/epoch,4.0
train/global_step,200.0
train/learning_rate,0.0
train/loss,0.4368


[34m[1mwandb[0m: Agent Starting Run: 6r0af1ns with config:
[34m[1mwandb[0m: 	learning_rate: 1e-05
[34m[1mwandb[0m: 	per_device_train_batch_size: 128
[34m[1mwandb[0m: 	weight_decay: 0


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy,F1
25,1.0211,1.001426,0.47875,0.365863
50,0.7625,0.774662,0.71875,0.687657
75,0.6525,0.622844,0.7675,0.75802
100,0.4924,0.549649,0.78875,0.784697
125,0.4342,0.499853,0.7975,0.796222
150,0.4468,0.469137,0.8125,0.811873


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/accuracy,▁▆▇███
eval/f1,▁▆▇███
eval/loss,█▅▃▂▁▁
eval/runtime,▃▃▄█▂▁
eval/samples_per_second,▆▆▅▁▇█
eval/steps_per_second,▆▆▅▁▇█
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/learning_rate,████▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁
train/loss,█████▇▇▇▇▆▆▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▃▂▂▂▁▁▂▂▁▂▂▁

0,1
eval/accuracy,0.8125
eval/f1,0.81187
eval/loss,0.46914
eval/runtime,3.5017
eval/samples_per_second,228.463
eval/steps_per_second,14.279
train/epoch,6.2
train/global_step,155.0
train/learning_rate,0.0
train/loss,0.3385


[34m[1mwandb[0m: Agent Starting Run: 4offykpj with config:
[34m[1mwandb[0m: 	learning_rate: 1e-05
[34m[1mwandb[0m: 	per_device_train_batch_size: 128
[34m[1mwandb[0m: 	weight_decay: 0.25


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy,F1
25,1.0211,1.001449,0.47875,0.365863
50,0.7627,0.774815,0.71875,0.687657
75,0.6526,0.623001,0.76875,0.759547
100,0.4925,0.549699,0.78875,0.784697
125,0.4342,0.499883,0.7975,0.796222
150,0.4469,0.469175,0.81375,0.813088
175,0.3428,0.455284,0.82,0.81912
200,0.3316,0.452684,0.81875,0.818466


0,1
eval/accuracy,▁▆▇▇████
eval/f1,▁▆▇▇████
eval/loss,█▅▃▂▂▁▁▁
eval/runtime,▅▁▄▁▄▆▃█
eval/samples_per_second,▄█▅█▅▃▆▁
eval/steps_per_second,▄█▅█▅▃▆▁
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/learning_rate,████▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁
train/loss,████▇▇▇▆▅▆▅▄▄▄▄▃▄▃▃▃▃▂▂▂▂▂▂▂▂▁▁▂▁▂▁▁▂▂▁▁

0,1
eval/accuracy,0.81875
eval/f1,0.81847
eval/loss,0.45268
eval/runtime,3.5753
eval/samples_per_second,223.758
eval/steps_per_second,13.985
train/epoch,8.0
train/global_step,200.0
train/learning_rate,0.0
train/loss,0.3316


[34m[1mwandb[0m: Agent Starting Run: 0pr4qxhl with config:
[34m[1mwandb[0m: 	learning_rate: 3e-05
[34m[1mwandb[0m: 	per_device_train_batch_size: 32
[34m[1mwandb[0m: 	weight_decay: 0


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy,F1
25,0.8958,0.868266,0.69625,0.684205
50,0.5281,0.585604,0.78375,0.775582
75,0.5119,0.487917,0.80375,0.802552
100,0.6151,0.458494,0.82375,0.825314
125,0.1876,0.439527,0.82,0.819723
150,0.1814,0.412543,0.845,0.845167
175,0.4064,0.406956,0.835,0.834365
200,0.1582,0.401713,0.83875,0.838654


0,1
eval/accuracy,▁▅▆▇▇███
eval/f1,▁▅▆▇▇███
eval/loss,█▄▂▂▂▁▁▁
eval/runtime,▁█▄▃▃▄▄▄
eval/samples_per_second,█▁▅▆▆▅▅▅
eval/steps_per_second,█▁▅▆▆▅▅▅
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/learning_rate,████▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁
train/loss,██▇▇▇▆▆▅▅▄▅▄▄▄▃▄▆▂▃▃▂▂▂▂▂▂▄▂▁▃▁▃▂▃▂▂▂▁▃▁

0,1
eval/accuracy,0.83875
eval/f1,0.83865
eval/loss,0.40171
eval/runtime,3.5138
eval/samples_per_second,227.675
eval/steps_per_second,14.23
train/epoch,2.0
train/global_step,200.0
train/learning_rate,0.0
train/loss,0.1582


[34m[1mwandb[0m: Agent Starting Run: 9o0dzddn with config:
[34m[1mwandb[0m: 	learning_rate: 3e-05
[34m[1mwandb[0m: 	per_device_train_batch_size: 32
[34m[1mwandb[0m: 	weight_decay: 0.25


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy,F1
25,0.8958,0.868363,0.6975,0.68542
50,0.5287,0.585707,0.7825,0.774095
75,0.5109,0.487741,0.80375,0.802552
100,0.6154,0.458891,0.8225,0.824092
125,0.1866,0.440237,0.81875,0.818463
150,0.1827,0.413064,0.8425,0.842771
175,0.4095,0.407237,0.83625,0.835611
200,0.158,0.401882,0.83875,0.838654


0,1
eval/accuracy,▁▅▆▇▇███
eval/f1,▁▅▆▇▇███
eval/loss,█▄▂▂▂▁▁▁
eval/runtime,▁▇█▁▅▁█▇
eval/samples_per_second,█▂▁█▄█▁▂
eval/steps_per_second,█▂▁█▄█▁▂
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/learning_rate,████▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁
train/loss,██▇▇▇▆▆▅▅▄▅▄▄▄▃▄▆▂▃▃▂▂▂▂▂▂▃▂▁▃▁▃▂▃▂▂▂▁▃▁

0,1
eval/accuracy,0.83875
eval/f1,0.83865
eval/loss,0.40188
eval/runtime,3.5327
eval/samples_per_second,226.454
eval/steps_per_second,14.153
train/epoch,2.0
train/global_step,200.0
train/learning_rate,0.0
train/loss,0.158


[34m[1mwandb[0m: Agent Starting Run: ujpcnjs0 with config:
[34m[1mwandb[0m: 	learning_rate: 3e-05
[34m[1mwandb[0m: 	per_device_train_batch_size: 64
[34m[1mwandb[0m: 	weight_decay: 0


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy,F1
25,0.7227,0.762208,0.69,0.637717
50,0.5621,0.532943,0.78625,0.788836
75,0.39,0.439047,0.83125,0.830141
100,0.4437,0.407983,0.8325,0.831303
125,0.2338,0.400854,0.84375,0.844148
150,0.1601,0.390418,0.84875,0.848437
175,0.1623,0.392975,0.84625,0.846328
200,0.1434,0.393332,0.84625,0.846213


0,1
eval/accuracy,▁▅▇▇████
eval/f1,▁▆▇▇████
eval/loss,█▄▂▁▁▁▁▁
eval/runtime,▂█▁▂▇▁▃█
eval/samples_per_second,▇▁█▇▂█▅▁
eval/steps_per_second,▇▁█▇▂█▅▁
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/learning_rate,████▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁
train/loss,███▇▆▆▅▄▅▄▅▃▄▃▃▃▃▃▂▃▂▂▂▂▃▂▂▂▂▂▂▁▁▂▂▁▁▁▃▁

0,1
eval/accuracy,0.84625
eval/f1,0.84621
eval/loss,0.39333
eval/runtime,3.9152
eval/samples_per_second,204.33
eval/steps_per_second,12.771
train/epoch,4.0
train/global_step,200.0
train/learning_rate,0.0
train/loss,0.1434


[34m[1mwandb[0m: Agent Starting Run: n9rh1an3 with config:
[34m[1mwandb[0m: 	learning_rate: 3e-05
[34m[1mwandb[0m: 	per_device_train_batch_size: 64
[34m[1mwandb[0m: 	weight_decay: 0.25


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy,F1
25,0.7228,0.76244,0.69,0.637717
50,0.5624,0.533505,0.785,0.787629
75,0.39,0.439286,0.82875,0.827717
100,0.4428,0.407634,0.8325,0.83135
125,0.2341,0.401212,0.84375,0.844148
150,0.1607,0.390454,0.8475,0.847209
175,0.1627,0.393165,0.8475,0.847668
200,0.1437,0.393368,0.84625,0.846213


0,1
eval/accuracy,▁▅▇▇████
eval/f1,▁▆▇▇████
eval/loss,█▄▂▁▁▁▁▁
eval/runtime,█▂▅▂▂▇▁▁
eval/samples_per_second,▁▇▃▆▇▂██
eval/steps_per_second,▁▇▃▆▇▂██
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/learning_rate,████▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁
train/loss,███▇▆▆▅▄▅▄▅▃▄▃▃▃▃▃▂▂▂▂▂▂▃▂▂▂▂▂▂▁▁▂▂▁▁▁▃▁

0,1
eval/accuracy,0.84625
eval/f1,0.84621
eval/loss,0.39337
eval/runtime,3.5006
eval/samples_per_second,228.53
eval/steps_per_second,14.283
train/epoch,4.0
train/global_step,200.0
train/learning_rate,0.0
train/loss,0.1437


[34m[1mwandb[0m: Agent Starting Run: 368syhfe with config:
[34m[1mwandb[0m: 	learning_rate: 3e-05
[34m[1mwandb[0m: 	per_device_train_batch_size: 128
[34m[1mwandb[0m: 	weight_decay: 0


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy,F1
25,0.7094,0.671746,0.76125,0.755896
50,0.4037,0.443352,0.8225,0.821827
75,0.3016,0.399959,0.84375,0.843833
100,0.1832,0.389825,0.84875,0.847649
125,0.0796,0.390036,0.865,0.864738
150,0.1707,0.40572,0.86,0.860261
175,0.0597,0.419935,0.8625,0.862219
200,0.0428,0.416277,0.865,0.865166


0,1
eval/accuracy,▁▅▇▇████
eval/f1,▁▅▇▇████
eval/loss,█▂▁▁▁▁▂▂
eval/runtime,▃▂▂▄▂▂▁█
eval/samples_per_second,▆▇▇▅▇▇█▁
eval/steps_per_second,▆▇▇▅▇▇█▁
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/learning_rate,████▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁
train/loss,██▇▇▆▅▄▄▃▄▃▃▃▂▃▂▃▂▂▂▂▂▂▂▂▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁

0,1
eval/accuracy,0.865
eval/f1,0.86517
eval/loss,0.41628
eval/runtime,3.6442
eval/samples_per_second,219.529
eval/steps_per_second,13.721
train/epoch,8.0
train/global_step,200.0
train/learning_rate,0.0
train/loss,0.0428


[34m[1mwandb[0m: Agent Starting Run: j76ja80z with config:
[34m[1mwandb[0m: 	learning_rate: 3e-05
[34m[1mwandb[0m: 	per_device_train_batch_size: 128
[34m[1mwandb[0m: 	weight_decay: 0.25


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy,F1
25,0.7094,0.671792,0.76,0.754455
50,0.4039,0.443289,0.82125,0.820492
75,0.3017,0.399752,0.845,0.845054
100,0.1836,0.389826,0.8475,0.846305
125,0.0797,0.389903,0.865,0.864701
150,0.1708,0.405276,0.86125,0.861484
175,0.0597,0.419566,0.8625,0.862219
200,0.0429,0.415866,0.86375,0.863941


0,1
eval/accuracy,▁▅▇▇████
eval/f1,▁▅▇▇████
eval/loss,█▂▁▁▁▁▂▂
eval/runtime,█▆▄▄▂▂▁▁
eval/samples_per_second,▁▃▅▅▇▇██
eval/steps_per_second,▁▃▅▅▇▇██
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/learning_rate,████▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁
train/loss,██▇▇▆▅▄▄▃▄▃▃▃▂▃▂▃▂▂▂▂▂▂▂▂▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁

0,1
eval/accuracy,0.86375
eval/f1,0.86394
eval/loss,0.41587
eval/runtime,3.5219
eval/samples_per_second,227.152
eval/steps_per_second,14.197
train/epoch,8.0
train/global_step,200.0
train/learning_rate,0.0
train/loss,0.0429


[34m[1mwandb[0m: Agent Starting Run: 9z2a0ksj with config:
[34m[1mwandb[0m: 	learning_rate: 5e-05
[34m[1mwandb[0m: 	per_device_train_batch_size: 32
[34m[1mwandb[0m: 	weight_decay: 0


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy,F1
25,0.8671,0.735452,0.69,0.685211
50,0.534,0.56732,0.77125,0.759257
75,0.4767,0.444425,0.83,0.828382
100,0.5948,0.436632,0.81625,0.818139
125,0.1309,0.432447,0.82875,0.828873
150,0.0817,0.391767,0.84625,0.846371
175,0.3097,0.392596,0.84,0.839566
200,0.1116,0.397697,0.84,0.84026


0,1
eval/accuracy,▁▅▇▇▇███
eval/f1,▁▄▇▇▇███
eval/loss,█▅▂▂▂▁▁▁
eval/runtime,▂▅▃▂▂▁▄█
eval/samples_per_second,▇▄▆▇▇█▄▁
eval/steps_per_second,▇▄▆▇▇█▄▁
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/learning_rate,████▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁
train/loss,██▇▆▆▅▅▅▄▄▅▄▃▃▂▄▆▂▃▃▂▂▂▂▂▁▃▁▁▃▁▂▁▃▂▂▂▁▂▁

0,1
eval/accuracy,0.84
eval/f1,0.84026
eval/loss,0.3977
eval/runtime,3.9372
eval/samples_per_second,203.193
eval/steps_per_second,12.7
train/epoch,2.0
train/global_step,200.0
train/learning_rate,0.0
train/loss,0.1116


[34m[1mwandb[0m: Agent Starting Run: 0yjl9okn with config:
[34m[1mwandb[0m: 	learning_rate: 5e-05
[34m[1mwandb[0m: 	per_device_train_batch_size: 32
[34m[1mwandb[0m: 	weight_decay: 0.25


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy,F1
25,0.8648,0.732332,0.69125,0.686265
50,0.5284,0.565112,0.7725,0.760446
75,0.4798,0.446333,0.82625,0.82474
100,0.593,0.438158,0.8175,0.819376
125,0.1282,0.432959,0.825,0.82509
150,0.0848,0.392107,0.84875,0.848853
175,0.3016,0.392042,0.84,0.839541
200,0.112,0.397911,0.84,0.84026


0,1
eval/accuracy,▁▅▇▇▇███
eval/f1,▁▄▇▇▇███
eval/loss,█▅▂▂▂▁▁▁
eval/runtime,█▃▃▂▂▂▁▇
eval/samples_per_second,▁▆▆▇▇▇█▂
eval/steps_per_second,▁▆▆▇▇▇█▂
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/learning_rate,████▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁
train/loss,██▇▆▆▅▅▅▄▄▅▄▃▃▂▄▆▂▃▃▂▂▂▂▂▁▃▁▁▃▁▂▁▃▂▂▂▁▂▁

0,1
eval/accuracy,0.84
eval/f1,0.84026
eval/loss,0.39791
eval/runtime,3.6703
eval/samples_per_second,217.963
eval/steps_per_second,13.623
train/epoch,2.0
train/global_step,200.0
train/learning_rate,0.0
train/loss,0.112


[34m[1mwandb[0m: Agent Starting Run: 3g1c1jf7 with config:
[34m[1mwandb[0m: 	learning_rate: 5e-05
[34m[1mwandb[0m: 	per_device_train_batch_size: 64
[34m[1mwandb[0m: 	weight_decay: 0


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy,F1
25,0.5582,0.591705,0.76625,0.755007
50,0.4674,0.490211,0.7975,0.798012
75,0.3144,0.426215,0.83125,0.829488
100,0.4094,0.402221,0.845,0.845159
125,0.1201,0.402254,0.85,0.850403
150,0.1447,0.399734,0.85625,0.856014
175,0.0424,0.41004,0.86,0.859999
200,0.1132,0.418501,0.855,0.854816


0,1
eval/accuracy,▁▃▆▇▇███
eval/f1,▁▄▆▇▇███
eval/loss,█▄▂▁▁▁▁▂
eval/runtime,▃▁▇▂▃█▁▃
eval/samples_per_second,▆█▂▇▆▁█▆
eval/steps_per_second,▆█▂▇▆▁█▆
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/learning_rate,████▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁
train/loss,█▇▇▆▅▅▄▄▅▄▄▂▃▃▃▂▂▃▂▃▂▂▂▂▃▂▂▂▁▂▂▁▁▁▁▁▁▁▂▂

0,1
eval/accuracy,0.855
eval/f1,0.85482
eval/loss,0.4185
eval/runtime,3.6368
eval/samples_per_second,219.971
eval/steps_per_second,13.748
train/epoch,4.0
train/global_step,200.0
train/learning_rate,0.0
train/loss,0.1132


[34m[1mwandb[0m: Agent Starting Run: xkjb3g2l with config:
[34m[1mwandb[0m: 	learning_rate: 5e-05
[34m[1mwandb[0m: 	per_device_train_batch_size: 64
[34m[1mwandb[0m: 	weight_decay: 0.25


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy,F1
25,0.5589,0.591253,0.765,0.753855
50,0.4682,0.490228,0.79875,0.799156
75,0.3178,0.426917,0.8325,0.830694
100,0.4109,0.40347,0.84625,0.846377
125,0.1218,0.403005,0.84875,0.849202
150,0.1438,0.401253,0.855,0.854712


Step,Training Loss,Validation Loss,Accuracy,F1
25,0.5589,0.591253,0.765,0.753855
50,0.4682,0.490228,0.79875,0.799156
75,0.3178,0.426917,0.8325,0.830694
100,0.4109,0.40347,0.84625,0.846377
125,0.1218,0.403005,0.84875,0.849202
150,0.1438,0.401253,0.855,0.854712
175,0.0427,0.411972,0.85875,0.858786
200,0.1127,0.420169,0.8525,0.852294


0,1
eval/accuracy,▁▄▆▇▇███
eval/f1,▁▄▆▇▇███
eval/loss,█▄▂▁▁▁▁▂
eval/runtime,█▅▁▆▂▃▅▁
eval/samples_per_second,▁▄█▃▇▅▄█
eval/steps_per_second,▁▄█▃▇▅▄█
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/learning_rate,████▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁
train/loss,█▇▇▆▅▅▄▄▅▄▄▂▃▃▃▂▂▃▂▃▂▂▂▂▃▂▂▂▁▂▂▁▁▁▁▁▁▁▂▂

0,1
eval/accuracy,0.8525
eval/f1,0.85229
eval/loss,0.42017
eval/runtime,3.5409
eval/samples_per_second,225.93
eval/steps_per_second,14.121
train/epoch,4.0
train/global_step,200.0
train/learning_rate,0.0
train/loss,0.1127


[34m[1mwandb[0m: Agent Starting Run: rnjx6du0 with config:
[34m[1mwandb[0m: 	learning_rate: 5e-05
[34m[1mwandb[0m: 	per_device_train_batch_size: 128
[34m[1mwandb[0m: 	weight_decay: 0


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy,F1
25,0.616,0.528648,0.785,0.781218
50,0.3095,0.422889,0.83125,0.831444
75,0.2183,0.432609,0.84,0.841677
100,0.0643,0.401695,0.8575,0.857022
125,0.0617,0.493376,0.85375,0.852633
150,0.0525,0.523972,0.84,0.8406
175,0.0156,0.493834,0.86,0.860186
200,0.0161,0.496551,0.8575,0.857884


0,1
eval/accuracy,▁▅▆█▇▆██
eval/f1,▁▅▆█▇▆██
eval/loss,█▂▃▁▆█▆▆
eval/runtime,█▂▁▂▅▇▂▅
eval/samples_per_second,▁▇█▇▄▂▇▄
eval/steps_per_second,▁▇█▇▄▂▇▄
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/learning_rate,████▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁
train/loss,██▆▅▅▄▃▃▃▃▂▃▂▂▂▂▂▁▁▂▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
eval/accuracy,0.8575
eval/f1,0.85788
eval/loss,0.49655
eval/runtime,3.8428
eval/samples_per_second,208.181
eval/steps_per_second,13.011
train/epoch,8.0
train/global_step,200.0
train/learning_rate,0.0
train/loss,0.0161


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: ekj4imha with config:
[34m[1mwandb[0m: 	learning_rate: 5e-05
[34m[1mwandb[0m: 	per_device_train_batch_size: 128
[34m[1mwandb[0m: 	weight_decay: 0.25


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy,F1
25,0.6161,0.528935,0.785,0.781218
50,0.3093,0.422393,0.83125,0.831444
75,0.2169,0.431455,0.84375,0.845229
100,0.0639,0.401518,0.8575,0.857033
125,0.0617,0.490218,0.85125,0.850211
150,0.0564,0.518607,0.84,0.840807
175,0.0171,0.492759,0.8575,0.857725
200,0.0158,0.49325,0.85625,0.856615


0,1
eval/accuracy,▁▅▇█▇▆██
eval/f1,▁▆▇█▇▆██
eval/loss,█▂▃▁▆▇▆▆
eval/runtime,▄▁▁▅█▅▁▁
eval/samples_per_second,▄█▇▄▁▄██
eval/steps_per_second,▄█▇▄▁▄██
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/learning_rate,████▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁
train/loss,██▆▅▅▄▃▃▃▃▂▃▂▂▂▂▂▁▁▂▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
eval/accuracy,0.85625
eval/f1,0.85662
eval/loss,0.49325
eval/runtime,3.5331
eval/samples_per_second,226.43
eval/steps_per_second,14.152
train/epoch,8.0
train/global_step,200.0
train/learning_rate,0.0
train/loss,0.0158


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Sweep Agent: Exiting.
