In [1]:
import os
import csv
import pandas as pd
import torch
import re
import nltk
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, matthews_corrcoef
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback, DistilBertTokenizer, DistilBertForSequenceClassification
import torch.nn as nn
import wandb

from google.colab import auth
from google.colab import drive
from google.colab import userdata

In [2]:
# =========================
# STEP 0: mount to drive
# =========================
#auth.authenticate_user()
drive.mount('/content/drive')


Mounted at /content/drive


In [3]:
# =========================
# STEP 1: Initialize WandB
# =========================
# Retrieve API Key from Colab Secrets
wandb_api_key = userdata.get('WANDB_API_KEY')

if wandb_api_key:
    os.environ["WANDB_API_KEY"] = wandb_api_key
    wandb.login(key=wandb_api_key)
    print("✅ WandB Logged in Securely")
else:
    print("❌ Error: WANDB_API_KEY not found. Set it in Colab Secrets.")

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33maalhaizaey[0m ([33mabdulrahim-alhaizaey[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


✅ WandB Logged in Securely


In [4]:
# =========================
# STEP 2: LOAD DATA & CLEAN TEXT
# =========================
nltk.download('stopwords')
nltk.download('wordnet')

multi_data = pd.read_csv("/content/10006_dataset_Multi.csv", encoding='ISO-8859-1')

def clean_text(text):
    if pd.isnull(text):
        return ""
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text).lower()
    return text

multi_data['cleaned_text'] = multi_data['text'].apply(clean_text)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [5]:
# =========================
# STEP 3: PREPARE DATASETS & LABEL ENCODING (Train Only)
# =========================
multi_train, multi_val = train_test_split(multi_data, test_size=0.2, stratify=multi_data['label'], random_state=42)

# Fit LabelEncoder only on training data to prevent leakage
multi_le = LabelEncoder()
multi_le.fit(multi_train['label'])
multi_train['encoded_label'] = multi_le.transform(multi_train['label'])
multi_val['encoded_label'] = multi_le.transform(multi_val['label'])


In [6]:
# =========================
# STEP 4: TOKENIZATION (Train Data Only for max_length)
# =========================
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Calculate max_length using training data
max_length = min(tokenizer.model_max_length, max(multi_train['cleaned_text'].apply(lambda x: len(tokenizer.tokenize(x)))))
print(f"Using max_length from training set only: {max_length}")

def tokenize_data(df, label_col):
    encodings = tokenizer(df['cleaned_text'].tolist(), truncation=True, padding=True, max_length=max_length, return_tensors="pt")
    labels = torch.tensor(df[label_col].tolist())
    return encodings, labels

train_multi_enc, train_multi_labels = tokenize_data(multi_train, 'encoded_label')
val_multi_enc, val_multi_labels = tokenize_data(multi_val, 'encoded_label')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Using max_length from training set only: 128


In [7]:
# =========================
# STEP 5: DATASET CLASS & METRICS
# =========================
class TextDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted', zero_division=0)
    mcc = matthews_corrcoef(labels, predictions)
    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "mcc": mcc
    }


In [8]:
# =========================
# STEP 6: FOCAL LOSS & CUSTOM TRAINER
# =========================
class FocalLoss(nn.Module):
    def __init__(self, alpha=0.25, gamma=2.0, reduction='mean'):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, inputs, targets):
        ce_loss = nn.CrossEntropyLoss(reduction='none')(inputs, targets)
        pt = torch.exp(-ce_loss)
        focal_loss = self.alpha * (1 - pt) ** self.gamma * ce_loss
        return focal_loss.mean() if self.reduction == 'mean' else focal_loss.sum()

class CustomTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.training_loss = []
        self.validation_loss = []
        self.results = []

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fn = FocalLoss(alpha=0.25, gamma=2.0)
        loss = loss_fn(logits, labels)
        return (loss, outputs) if return_outputs else loss

    def evaluate(self, *args, **kwargs):
        output = super().evaluate(*args, **kwargs)
        self.validation_loss.append(output['eval_loss'])
        self.results.append(output.copy())
        return output

    def log(self, logs, *args, **kwargs):
        super().log(logs, *args, **kwargs)
        if 'loss' in logs:
            self.training_loss.append(logs['loss'])


In [9]:
# =========================
# STEP 7: ADD PLOTTING (Confusion Matrices & Loss Curves)
# =========================
def plot_confusion_matrix(labels, predictions, label_encoder, output_dir, run_name):
    class_names = label_encoder.classes_
    conf_matrix = confusion_matrix(labels, predictions)
    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.savefig(f"{output_dir}/{run_name}_confusion_numbers.png")
    plt.close()

def plot_confusion_matrix_percent(labels, predictions, label_encoder, output_dir, run_name):
    class_names = label_encoder.classes_
    conf_matrix = confusion_matrix(labels, predictions, normalize='true')
    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='.2f', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.savefig(f"{output_dir}/{run_name}_confusion_percent.png")
    plt.close()

def plot_confusion_matrix_class_weighted(labels, predictions, label_encoder, output_dir, run_name):
    conf_matrix = confusion_matrix(labels, predictions, normalize='true')
    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='.2f', cmap='coolwarm', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.savefig(f"{output_dir}/{run_name}_confusion_weighted.png")
    plt.close()

def plot_loss(training_loss, validation_loss, output_dir, run_name):
    min_length = min(len(training_loss), len(validation_loss))
    training_loss = training_loss[:min_length]
    validation_loss = validation_loss[:min_length]
    epochs = range(1, min_length + 1)
    plt.figure(figsize=(8, 6))
    plt.plot(epochs, training_loss, label='Training Loss', marker='o')
    plt.plot(epochs, validation_loss, label='Validation Loss', marker='s', linestyle='--')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.grid(True)
    plt.savefig(f"{output_dir}/{run_name}_loss_curves.png")
    plt.close()


In [10]:
# =========================
# STEP 8: TRAIN FUNCTION
# =========================
def train_distilbert(train_dataset, eval_dataset, num_labels, output_dir, label_encoder, config=None):
    with wandb.init(config=config):
        config = wandb.config
        run_name = wandb.run.name

        base_dir = "/content/drive/MyDrive/DistilBERT_8_Results"
        run_dir = os.path.join(base_dir, run_name)
        os.makedirs(run_dir, exist_ok=True)

        model = DistilBertForSequenceClassification.from_pretrained(
            "distilbert-base-uncased",
            num_labels=num_labels,
            dropout=config.dropout
        )

        num_devices = torch.cuda.device_count()  # Will be 1 on Colab
        examples_per_step = config.batch_size * num_devices * config.gradient_accumulation_steps
        total_steps = int(np.ceil(len(train_dataset) / examples_per_step) * config.num_train_epochs)

        warmup_steps = int(0.1 * total_steps)
        print(f"Total Steps: {total_steps}, Warmup Steps (10%): {warmup_steps}")

        training_args = TrainingArguments(
            output_dir=output_dir,
            run_name=run_name,
            num_train_epochs=config.num_train_epochs,
            per_device_train_batch_size=config.batch_size,
            per_device_eval_batch_size=config.batch_size,
            learning_rate=config.learning_rate,
            warmup_steps=warmup_steps,
            weight_decay=config.weight_decay,
            evaluation_strategy="epoch",
            logging_strategy="epoch",
            save_strategy="epoch",
            fp16=True,
            gradient_checkpointing=True,
            save_total_limit=2,
            gradient_accumulation_steps=config.gradient_accumulation_steps,
            max_grad_norm=config.max_grad_norm,
            lr_scheduler_type="cosine_with_restarts",
            load_best_model_at_end=True,
            metric_for_best_model="eval_loss",
            report_to="wandb"
        )

        trainer = CustomTrainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            compute_metrics=compute_metrics,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
        )

        trainer.train()

        # Save Best Model
        best_model_path = os.path.join(run_dir, "best_model")
        trainer.save_model(best_model_path)
        print(f"✅ Best model saved to {best_model_path}")

        # Evaluate Best Model
        results = trainer.evaluate(eval_dataset=eval_dataset)

        predictions_obj = trainer.predict(eval_dataset)
        predictions = np.argmax(predictions_obj.predictions, axis=1)
        labels = predictions_obj.label_ids

        wandb.log({
            "eval_loss": results.get("eval_loss"),
            "eval_accuracy": results.get("eval_accuracy"),
            "eval_precision": results.get("eval_precision"),
            "eval_recall": results.get("eval_recall"),
            "eval_f1": results.get("eval_f1"),
            "eval_mcc": results.get("eval_mcc")
        })

        plot_confusion_matrix(labels, predictions, label_encoder, run_dir, run_name)
        plot_confusion_matrix_percent(labels, predictions, label_encoder, run_dir, run_name)
        plot_confusion_matrix_class_weighted(labels, predictions, label_encoder, run_dir, run_name)
        plot_loss(trainer.training_loss, trainer.validation_loss, run_dir, run_name)

        # Save run parameters and results to a CSV file
        results_csv = os.path.join(run_dir, "results.csv")
        with open(results_csv, mode='w', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(["Parameter", "Value"])
            for key, value in config.items():
                writer.writerow([key, value])
            writer.writerow(["Epoch", "Training Loss", "Validation Loss", "Accuracy", "Precision", "Recall", "F1", "Mcc"])
            for epoch, result in enumerate(trainer.results, start=1):
                writer.writerow([
                    epoch,
                    trainer.training_loss[epoch - 1] if epoch - 1 < len(trainer.training_loss) else None,
                    trainer.validation_loss[epoch - 1] if epoch - 1 < len(trainer.validation_loss) else None,
                    result.get("eval_accuracy"),
                    result.get("eval_precision"),
                    result.get("eval_recall"),
                    result.get("eval_f1"),
                    result.get("eval_mcc")
                ])

        return results


In [11]:
# =========================
# STEP 9: RUN SWEEP CONFIGURATION
# =========================
train_dataset = TextDataset(train_multi_enc, train_multi_labels)
val_dataset = TextDataset(val_multi_enc, val_multi_labels)

sweep_config = {
    'method': 'bayes',
    'metric': {'name': 'eval_loss', 'goal': 'minimize'},
    'parameters': {
        'num_train_epochs': {'values': [12]},
        'learning_rate': {'min': 1e-5, 'max': 5e-5},
        'batch_size': {'values': [8, 16, 32]},
        'weight_decay': {'min': 0.01, 'max': 0.05},
        'dropout': {'min': 0.05, 'max': 0.5},
        'gradient_accumulation_steps': {'values': [2, 4]},
        'max_grad_norm': {'values': [1.0, 2.0]}
    },
}

In [12]:
sweep_id = wandb.sweep(sweep=sweep_config, project='DistilBERT_8')

wandb.agent(sweep_id, function=lambda: train_distilbert(
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    num_labels=len(multi_le.classes_),
    output_dir="multi_output",
    label_encoder=multi_le
))


Create sweep with ID: 2p6dq1ds
Sweep URL: https://wandb.ai/abdulrahim-alhaizaey/DistilBERT_8/sweeps/2p6dq1ds


[34m[1mwandb[0m: Agent Starting Run: vu2d8npe with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	dropout: 0.3245944036381566
[34m[1mwandb[0m: 	gradient_accumulation_steps: 2
[34m[1mwandb[0m: 	learning_rate: 3.6124132549024086e-05
[34m[1mwandb[0m: 	max_grad_norm: 1
[34m[1mwandb[0m: 	num_train_epochs: 12
[34m[1mwandb[0m: 	weight_decay: 0.04725294921360067
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total Steps: 3012, Warmup Steps (10%): 301




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc
1,0.2134,0.090775,0.811189,0.803721,0.811189,0.802539,0.655183
2,0.0586,0.053151,0.850649,0.876225,0.850649,0.85698,0.754337
3,0.0281,0.059096,0.848651,0.877383,0.848651,0.855942,0.762452
4,0.0156,0.039495,0.888112,0.903331,0.888112,0.891628,0.814056
5,0.01,0.048471,0.882617,0.89868,0.882617,0.887139,0.805785
6,0.0062,0.043456,0.896104,0.905919,0.896104,0.899066,0.826589
7,0.0044,0.047547,0.885614,0.900956,0.885614,0.889698,0.812138


✅ Best model saved to /content/drive/MyDrive/DistilBERT_8_Results/super-sweep-1/best_model


0,1
eval/accuracy,▁▄▄▇▇█▇▇
eval/f1,▁▅▅▇▇█▇▇
eval/loss,█▃▄▁▂▂▂▁
eval/mcc,▁▅▅▇▇█▇▇
eval/precision,▁▆▆█████
eval/recall,▁▄▄▇▇█▇▇
eval/runtime,▄▁▃▂█▄▄▅
eval/samples_per_second,▄█▆▇▁▅▅▄
eval/steps_per_second,▄█▆▇▁▅▅▄
eval_accuracy,▁

0,1
eval/accuracy,0.88811
eval/f1,0.89163
eval/loss,0.03949
eval/mcc,0.81406
eval/precision,0.90333
eval/recall,0.88811
eval/runtime,0.8987
eval/samples_per_second,2227.678
eval/steps_per_second,140.203
eval_accuracy,0.88811


[34m[1mwandb[0m: Agent Starting Run: jvbuh3jz with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	dropout: 0.3592106730370679
[34m[1mwandb[0m: 	gradient_accumulation_steps: 4
[34m[1mwandb[0m: 	learning_rate: 4.66033951250658e-05
[34m[1mwandb[0m: 	max_grad_norm: 2
[34m[1mwandb[0m: 	num_train_epochs: 12
[34m[1mwandb[0m: 	weight_decay: 0.021170780614055168


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total Steps: 1512, Warmup Steps (10%): 151




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc
1,0.2322,0.147487,0.705794,0.702826,0.705794,0.69542,0.494911
2,0.0807,0.052953,0.858142,0.874335,0.858142,0.857997,0.755862
3,0.0317,0.049224,0.867632,0.887199,0.867632,0.873308,0.783974
4,0.0184,0.038135,0.892607,0.902922,0.892607,0.895226,0.816923
5,0.0126,0.05555,0.840659,0.881634,0.840659,0.8514,0.754092
6,0.0093,0.039086,0.898102,0.907653,0.898102,0.900981,0.829586
7,0.0065,0.03957,0.891608,0.901692,0.891608,0.894424,0.817828


✅ Best model saved to /content/drive/MyDrive/DistilBERT_8_Results/ethereal-sweep-2/best_model


0,1
eval/accuracy,▁▇▇█▆███
eval/f1,▁▇▇█▆███
eval/loss,█▂▂▁▂▁▁▁
eval/mcc,▁▆▇█▆███
eval/precision,▁▇▇█▇███
eval/recall,▁▇▇█▆███
eval/runtime,▂▅▁▄▃█▅█
eval/samples_per_second,▇▄█▅▅▁▃▁
eval/steps_per_second,▇▄█▅▅▁▃▁
eval_accuracy,▁

0,1
eval/accuracy,0.89261
eval/f1,0.89523
eval/loss,0.03814
eval/mcc,0.81692
eval/precision,0.90292
eval/recall,0.89261
eval/runtime,0.8992
eval/samples_per_second,2226.414
eval/steps_per_second,140.124
eval_accuracy,0.89261


[34m[1mwandb[0m: Agent Starting Run: dw5j1ovo with config:
[34m[1mwandb[0m: 	batch_size: 8
[34m[1mwandb[0m: 	dropout: 0.12693357322818857
[34m[1mwandb[0m: 	gradient_accumulation_steps: 4
[34m[1mwandb[0m: 	learning_rate: 2.2040750388114553e-05
[34m[1mwandb[0m: 	max_grad_norm: 1
[34m[1mwandb[0m: 	num_train_epochs: 12
[34m[1mwandb[0m: 	weight_decay: 0.039913677796123405


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total Steps: 3012, Warmup Steps (10%): 301




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc
1,0.2217,0.107274,0.783716,0.756975,0.783716,0.756202,0.602208
2,0.0625,0.039461,0.892607,0.901673,0.892607,0.89333,0.814072
3,0.0218,0.040957,0.878621,0.892272,0.878621,0.881977,0.797288
4,0.0096,0.03177,0.914585,0.916627,0.914585,0.914472,0.849895
5,0.0053,0.037978,0.896104,0.908385,0.896104,0.899483,0.825577
6,0.0026,0.037745,0.923077,0.923701,0.923077,0.922221,0.862715
7,0.0015,0.035797,0.91958,0.921514,0.91958,0.920046,0.859282


✅ Best model saved to /content/drive/MyDrive/DistilBERT_8_Results/wandering-sweep-3/best_model


0,1
eval/accuracy,▁▆▆█▇███
eval/f1,▁▇▆█▇███
eval/loss,█▂▂▁▂▂▁▁
eval/mcc,▁▇▆█▇███
eval/precision,▁▇▇█▇███
eval/recall,▁▆▆█▇███
eval/runtime,▁▂▄▂▂▅▁█
eval/samples_per_second,█▇▅▇▇▃█▁
eval/steps_per_second,█▇▅▇▇▃█▁
eval_accuracy,▁

0,1
eval/accuracy,0.91459
eval/f1,0.91447
eval/loss,0.03177
eval/mcc,0.84989
eval/precision,0.91663
eval/recall,0.91459
eval/runtime,1.8279
eval/samples_per_second,1095.252
eval/steps_per_second,137.317
eval_accuracy,0.91459


[34m[1mwandb[0m: Agent Starting Run: ibtspjgl with config:
[34m[1mwandb[0m: 	batch_size: 8
[34m[1mwandb[0m: 	dropout: 0.31461629590516493
[34m[1mwandb[0m: 	gradient_accumulation_steps: 4
[34m[1mwandb[0m: 	learning_rate: 1.148278128161659e-05
[34m[1mwandb[0m: 	max_grad_norm: 2
[34m[1mwandb[0m: 	num_train_epochs: 12
[34m[1mwandb[0m: 	weight_decay: 0.026480087598756506


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total Steps: 3012, Warmup Steps (10%): 301




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc
1,0.258,0.165821,0.686314,0.598686,0.686314,0.623102,0.36358
2,0.1236,0.074672,0.840659,0.84594,0.840659,0.838441,0.71623
3,0.0589,0.064593,0.827672,0.856634,0.827672,0.835784,0.726241
4,0.0321,0.038627,0.882617,0.893444,0.882617,0.884404,0.799824
5,0.024,0.048461,0.858641,0.882812,0.858641,0.865207,0.770887
6,0.0183,0.033579,0.8996,0.906038,0.8996,0.901658,0.828438
7,0.0144,0.03758,0.884615,0.897097,0.884615,0.888037,0.808103
8,0.0127,0.038118,0.880619,0.897753,0.880619,0.885294,0.803437
9,0.0105,0.037696,0.882118,0.898244,0.882118,0.886602,0.805338


✅ Best model saved to /content/drive/MyDrive/DistilBERT_8_Results/bright-sweep-4/best_model


0,1
eval/accuracy,▁▆▆▇▇██▇▇█
eval/f1,▁▆▆█▇█████
eval/loss,█▃▃▁▂▁▁▁▁▁
eval/mcc,▁▆▆█▇█████
eval/precision,▁▇▇█▇█████
eval/recall,▁▆▆▇▇██▇▇█
eval/runtime,▂▂▁▂▃▂▃▃▃█
eval/samples_per_second,▇▇█▇▆▇▆▆▆▁
eval/steps_per_second,▇▇█▇▆▇▆▆▆▁
eval_accuracy,▁

0,1
eval/accuracy,0.8996
eval/f1,0.90166
eval/loss,0.03358
eval/mcc,0.82844
eval/precision,0.90604
eval/recall,0.8996
eval/runtime,1.8222
eval/samples_per_second,1098.694
eval/steps_per_second,137.748
eval_accuracy,0.8996


[34m[1mwandb[0m: Agent Starting Run: w9igwoj1 with config:
[34m[1mwandb[0m: 	batch_size: 8
[34m[1mwandb[0m: 	dropout: 0.3765339926005968
[34m[1mwandb[0m: 	gradient_accumulation_steps: 4
[34m[1mwandb[0m: 	learning_rate: 1.520879894346316e-05
[34m[1mwandb[0m: 	max_grad_norm: 2
[34m[1mwandb[0m: 	num_train_epochs: 12
[34m[1mwandb[0m: 	weight_decay: 0.025676336596033952


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total Steps: 3012, Warmup Steps (10%): 301




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc
1,0.2428,0.152654,0.732268,0.682346,0.732268,0.702978,0.49225
2,0.1064,0.061998,0.842158,0.860384,0.842158,0.845275,0.732445
3,0.0473,0.074255,0.8002,0.853616,0.8002,0.813054,0.698677
4,0.029,0.044855,0.862637,0.883354,0.862637,0.867569,0.776992
5,0.0221,0.050924,0.84965,0.884618,0.84965,0.857969,0.764971
6,0.017,0.043597,0.866134,0.887817,0.866134,0.871876,0.785928
7,0.0136,0.045062,0.871129,0.892265,0.871129,0.876301,0.793749
8,0.012,0.046461,0.866633,0.890466,0.866633,0.872556,0.787594
9,0.0104,0.046014,0.867133,0.89206,0.867133,0.873305,0.789361


✅ Best model saved to /content/drive/MyDrive/DistilBERT_8_Results/sweet-sweep-5/best_model


0,1
eval/accuracy,▁▇▄█▇█████
eval/f1,▁▇▅█▇█████
eval/loss,█▂▃▁▁▁▁▁▁▁
eval/mcc,▁▇▆█▇█████
eval/precision,▁▇▇███████
eval/recall,▁▇▄█▇█████
eval/runtime,▇▄▃▆▃▅▇██▁
eval/samples_per_second,▂▅▆▃▆▄▂▁▁█
eval/steps_per_second,▂▅▆▃▆▄▂▁▁█
eval_accuracy,▁

0,1
eval/accuracy,0.86613
eval/f1,0.87188
eval/loss,0.0436
eval/mcc,0.78593
eval/precision,0.88782
eval/recall,0.86613
eval/runtime,1.7095
eval/samples_per_second,1171.119
eval/steps_per_second,146.829
eval_accuracy,0.86613


[34m[1mwandb[0m: Agent Starting Run: 1m9ndrdn with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	dropout: 0.4764892013991829
[34m[1mwandb[0m: 	gradient_accumulation_steps: 4
[34m[1mwandb[0m: 	learning_rate: 1.7424884435051778e-05
[34m[1mwandb[0m: 	max_grad_norm: 2
[34m[1mwandb[0m: 	num_train_epochs: 12
[34m[1mwandb[0m: 	weight_decay: 0.034219326890440195


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total Steps: 1512, Warmup Steps (10%): 151




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc
1,0.2672,0.186433,0.671828,0.578518,0.671828,0.580674,0.280427
2,0.1461,0.126836,0.734765,0.799824,0.734765,0.738284,0.567618
3,0.089,0.102033,0.764236,0.824701,0.764236,0.779439,0.64427
4,0.0533,0.067407,0.813187,0.855375,0.813187,0.822664,0.706624
5,0.041,0.090686,0.747253,0.83716,0.747253,0.765848,0.644554
6,0.0334,0.064567,0.807692,0.857103,0.807692,0.819394,0.712155
7,0.0287,0.068284,0.803696,0.858718,0.803696,0.816172,0.709835
8,0.0252,0.063713,0.814685,0.86314,0.814685,0.826175,0.720769
9,0.025,0.067699,0.805195,0.860711,0.805195,0.81785,0.710283
10,0.0216,0.074621,0.789211,0.857576,0.789211,0.804355,0.69621


✅ Best model saved to /content/drive/MyDrive/DistilBERT_8_Results/balmy-sweep-6/best_model


0,1
eval/accuracy,▁▄▆█▅█▇██▇▇█
eval/f1,▁▅▇█▆████▇▇█
eval/loss,█▅▃▁▃▁▁▁▁▂▂▁
eval/mcc,▁▆▇█▇███████
eval/precision,▁▆▇█▇███████
eval/recall,▁▄▆█▅█▇██▇▇█
eval/runtime,▁▅▄▄▇▆▃▁▆█▄▃
eval/samples_per_second,█▄▅▄▂▃▆█▃▁▅▆
eval/steps_per_second,█▄▅▄▂▃▆█▃▁▅▆
eval_accuracy,▁

0,1
eval/accuracy,0.81469
eval/f1,0.82617
eval/loss,0.06371
eval/mcc,0.72077
eval/precision,0.86314
eval/recall,0.81469
eval/runtime,0.8919
eval/samples_per_second,2244.542
eval/steps_per_second,141.265
eval_accuracy,0.81469


[34m[1mwandb[0m: Agent Starting Run: 9c37on8q with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	dropout: 0.192976532120409
[34m[1mwandb[0m: 	gradient_accumulation_steps: 2
[34m[1mwandb[0m: 	learning_rate: 2.360003150383677e-05
[34m[1mwandb[0m: 	max_grad_norm: 1
[34m[1mwandb[0m: 	num_train_epochs: 12
[34m[1mwandb[0m: 	weight_decay: 0.01648886999900559


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total Steps: 3012, Warmup Steps (10%): 301




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc
1,0.2257,0.107322,0.782717,0.78402,0.782717,0.772914,0.604327
2,0.0623,0.037802,0.889111,0.895631,0.889111,0.888945,0.805313
3,0.0228,0.044072,0.872128,0.889679,0.872128,0.876924,0.792528
4,0.0115,0.032383,0.905594,0.911746,0.905594,0.90736,0.838051
5,0.007,0.035456,0.913586,0.917528,0.913586,0.914358,0.849672
6,0.004,0.033199,0.924575,0.92518,0.924575,0.924454,0.866426
7,0.0022,0.035773,0.912587,0.915765,0.912587,0.913526,0.848342


✅ Best model saved to /content/drive/MyDrive/DistilBERT_8_Results/autumn-sweep-7/best_model


0,1
eval/accuracy,▁▆▅▇▇█▇▇
eval/f1,▁▆▆▇██▇▇
eval/loss,█▂▂▁▁▁▁▁
eval/mcc,▁▆▆▇███▇
eval/precision,▁▇▆▇███▇
eval/recall,▁▆▅▇▇█▇▇
eval/runtime,▂▂▂█▆▄▁▅
eval/samples_per_second,▇▇▇▁▃▅█▄
eval/steps_per_second,▇▇▇▁▃▅█▄
eval_accuracy,▁

0,1
eval/accuracy,0.90559
eval/f1,0.90736
eval/loss,0.03238
eval/mcc,0.83805
eval/precision,0.91175
eval/recall,0.90559
eval/runtime,0.9039
eval/samples_per_second,2214.9
eval/steps_per_second,139.399
eval_accuracy,0.90559


[34m[1mwandb[0m: Agent Starting Run: 4ew13c5y with config:
[34m[1mwandb[0m: 	batch_size: 8
[34m[1mwandb[0m: 	dropout: 0.1399249642973313
[34m[1mwandb[0m: 	gradient_accumulation_steps: 4
[34m[1mwandb[0m: 	learning_rate: 2.419688184108871e-05
[34m[1mwandb[0m: 	max_grad_norm: 1
[34m[1mwandb[0m: 	num_train_epochs: 12
[34m[1mwandb[0m: 	weight_decay: 0.01775891204458119


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total Steps: 3012, Warmup Steps (10%): 301




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc
1,0.218,0.10148,0.794705,0.805972,0.794705,0.7744,0.626285
2,0.0588,0.038391,0.893107,0.900454,0.893107,0.89348,0.813614
3,0.0216,0.040883,0.882118,0.896568,0.882118,0.885379,0.803264
4,0.0094,0.034375,0.91009,0.914235,0.91009,0.911051,0.843832
5,0.0051,0.037057,0.903097,0.910479,0.903097,0.905226,0.834973
6,0.0024,0.038606,0.915085,0.915456,0.915085,0.913896,0.848244
7,0.0016,0.037476,0.915584,0.918964,0.915584,0.916599,0.853973


✅ Best model saved to /content/drive/MyDrive/DistilBERT_8_Results/hopeful-sweep-8/best_model


0,1
eval/accuracy,▁▇▆█▇███
eval/f1,▁▇▆█▇███
eval/loss,█▁▂▁▁▁▁▁
eval/mcc,▁▇▆█▇███
eval/precision,▁▇▇█▇███
eval/recall,▁▇▆█▇███
eval/runtime,▅▇▂█▇▃▂▁
eval/samples_per_second,▄▂▇▁▂▆▇█
eval/steps_per_second,▄▂▇▁▂▆▇█
eval_accuracy,▁

0,1
eval/accuracy,0.91009
eval/f1,0.91105
eval/loss,0.03438
eval/mcc,0.84383
eval/precision,0.91423
eval/recall,0.91009
eval/runtime,1.7281
eval/samples_per_second,1158.509
eval/steps_per_second,145.248
eval_accuracy,0.91009


[34m[1mwandb[0m: Agent Starting Run: mm5sm5ck with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	dropout: 0.06580443235321605
[34m[1mwandb[0m: 	gradient_accumulation_steps: 2
[34m[1mwandb[0m: 	learning_rate: 3.113683044124743e-05
[34m[1mwandb[0m: 	max_grad_norm: 1
[34m[1mwandb[0m: 	num_train_epochs: 12
[34m[1mwandb[0m: 	weight_decay: 0.02089670351351492


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total Steps: 3012, Warmup Steps (10%): 301




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc
1,0.2046,0.083928,0.823177,0.816492,0.823177,0.812079,0.671524
2,0.0453,0.037162,0.892607,0.903442,0.892607,0.894209,0.814449
3,0.0149,0.032471,0.908092,0.911822,0.908092,0.909169,0.840653
4,0.0061,0.035572,0.915584,0.91717,0.915584,0.91546,0.850778
5,0.0026,0.035583,0.914086,0.918056,0.914086,0.914817,0.848112
6,0.0012,0.037432,0.925075,0.927526,0.925075,0.924883,0.867257


✅ Best model saved to /content/drive/MyDrive/DistilBERT_8_Results/noble-sweep-9/best_model


0,1
eval/accuracy,▁▆▇▇▇█▇
eval/f1,▁▆▇▇▇█▇
eval/loss,█▂▁▁▁▂▁
eval/mcc,▁▆▇▇▇█▇
eval/precision,▁▆▇▇▇█▇
eval/recall,▁▆▇▇▇█▇
eval/runtime,▁▂▁▂▂▂█
eval/samples_per_second,█▇█▆▇▇▁
eval/steps_per_second,█▇█▆▇▇▁
eval_accuracy,▁

0,1
eval/accuracy,0.90809
eval/f1,0.90917
eval/loss,0.03247
eval/mcc,0.84065
eval/precision,0.91182
eval/recall,0.90809
eval/runtime,0.9431
eval/samples_per_second,2122.875
eval/steps_per_second,133.608
eval_accuracy,0.90809


[34m[1mwandb[0m: Agent Starting Run: fnewntrh with config:
[34m[1mwandb[0m: 	batch_size: 8
[34m[1mwandb[0m: 	dropout: 0.11240494464895526
[34m[1mwandb[0m: 	gradient_accumulation_steps: 4
[34m[1mwandb[0m: 	learning_rate: 1.3908418281028728e-05
[34m[1mwandb[0m: 	max_grad_norm: 2
[34m[1mwandb[0m: 	num_train_epochs: 12
[34m[1mwandb[0m: 	weight_decay: 0.027977705012024956


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total Steps: 3012, Warmup Steps (10%): 301




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc
1,0.2482,0.132265,0.741259,0.711963,0.741259,0.705101,0.512134
2,0.0833,0.043327,0.886114,0.888789,0.886114,0.88361,0.794914
3,0.0291,0.04066,0.885614,0.895258,0.885614,0.887964,0.806029
4,0.0144,0.030542,0.911089,0.915768,0.911089,0.912067,0.845522
5,0.0082,0.040542,0.878122,0.896892,0.878122,0.883588,0.799035
6,0.0046,0.034127,0.919081,0.919742,0.919081,0.917033,0.854299
7,0.0032,0.031801,0.916084,0.917459,0.916084,0.916461,0.85283


✅ Best model saved to /content/drive/MyDrive/DistilBERT_8_Results/skilled-sweep-10/best_model


0,1
eval/accuracy,▁▇▇█▆███
eval/f1,▁▇▇█▇███
eval/loss,█▂▂▁▂▁▁▁
eval/mcc,▁▇▇█▇███
eval/precision,▁▇▇█▇███
eval/recall,▁▇▇█▆███
eval/runtime,▄▄▄▄▃▁█▇
eval/samples_per_second,▅▅▅▅▆█▁▂
eval/steps_per_second,▅▅▅▅▆█▁▂
eval_accuracy,▁

0,1
eval/accuracy,0.91109
eval/f1,0.91207
eval/loss,0.03054
eval/mcc,0.84552
eval/precision,0.91577
eval/recall,0.91109
eval/runtime,1.7668
eval/samples_per_second,1133.141
eval/steps_per_second,142.067
eval_accuracy,0.91109


[34m[1mwandb[0m: Agent Starting Run: e64lj3he with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	dropout: 0.0717332755436515
[34m[1mwandb[0m: 	gradient_accumulation_steps: 4
[34m[1mwandb[0m: 	learning_rate: 1.2506320103950937e-05
[34m[1mwandb[0m: 	max_grad_norm: 2
[34m[1mwandb[0m: 	num_train_epochs: 12
[34m[1mwandb[0m: 	weight_decay: 0.0287220060001618


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total Steps: 1512, Warmup Steps (10%): 151




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc
1,0.2739,0.178402,0.645355,0.416483,0.645355,0.506253,0.0
2,0.1358,0.090286,0.833167,0.846174,0.833167,0.831636,0.70621
3,0.0584,0.042786,0.887612,0.887898,0.887612,0.886,0.799196
4,0.0268,0.034193,0.908092,0.907686,0.908092,0.906464,0.83492
5,0.0155,0.033884,0.907592,0.911136,0.907592,0.908314,0.838398
6,0.0101,0.032596,0.912088,0.913078,0.912088,0.910732,0.842805
7,0.0072,0.03196,0.917083,0.917323,0.917083,0.916245,0.852267
8,0.0053,0.032408,0.915085,0.917716,0.915085,0.915798,0.851546
9,0.0043,0.031341,0.915584,0.917222,0.915584,0.915725,0.851129
10,0.0038,0.031885,0.918581,0.920713,0.918581,0.919058,0.85715


✅ Best model saved to /content/drive/MyDrive/DistilBERT_8_Results/bumbling-sweep-11/best_model


0,1
eval/accuracy,▁▆▇██████████
eval/f1,▁▆▇██████████
eval/loss,█▄▂▁▁▁▁▁▁▁▁▁▁
eval/mcc,▁▇▇██████████
eval/precision,▁▇███████████
eval/recall,▁▆▇██████████
eval/runtime,▆▅▃▅▆▆█▆▇█▁▇█
eval/samples_per_second,▃▄▆▄▃▃▁▃▂▁█▂▁
eval/steps_per_second,▃▄▆▄▃▃▁▃▂▁█▂▁
eval_accuracy,▁

0,1
eval/accuracy,0.91558
eval/f1,0.91573
eval/loss,0.03134
eval/mcc,0.85113
eval/precision,0.91722
eval/recall,0.91558
eval/runtime,0.9145
eval/samples_per_second,2189.29
eval/steps_per_second,137.787
eval_accuracy,0.91558


[34m[1mwandb[0m: Agent Starting Run: q6ino9iq with config:
[34m[1mwandb[0m: 	batch_size: 8
[34m[1mwandb[0m: 	dropout: 0.1269044011653552
[34m[1mwandb[0m: 	gradient_accumulation_steps: 4
[34m[1mwandb[0m: 	learning_rate: 1.546111785088522e-05
[34m[1mwandb[0m: 	max_grad_norm: 2
[34m[1mwandb[0m: 	num_train_epochs: 12
[34m[1mwandb[0m: 	weight_decay: 0.035509247894209


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total Steps: 3012, Warmup Steps (10%): 301




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc
1,0.2464,0.139839,0.751249,0.744358,0.751249,0.723492,0.521458
2,0.0826,0.043202,0.882617,0.88702,0.882617,0.88223,0.793199
3,0.0288,0.039621,0.891109,0.899216,0.891109,0.892926,0.813647
4,0.0139,0.029814,0.912587,0.9132,0.912587,0.911789,0.84513
5,0.0081,0.044257,0.873626,0.897214,0.873626,0.880238,0.794111
6,0.0049,0.034745,0.919081,0.921075,0.919081,0.917638,0.85588
7,0.0029,0.032159,0.922577,0.923307,0.922577,0.922778,0.863639


✅ Best model saved to /content/drive/MyDrive/DistilBERT_8_Results/drawn-sweep-12/best_model


0,1
eval/accuracy,▁▆▇█▆███
eval/f1,▁▇▇█▇███
eval/loss,█▂▂▁▂▁▁▁
eval/mcc,▁▇▇█▇███
eval/precision,▁▇▇█▇███
eval/recall,▁▆▇█▆███
eval/runtime,▄█▁▅▃▄▃█
eval/samples_per_second,▅▁█▄▆▅▆▁
eval/steps_per_second,▅▁█▄▆▅▆▁
eval_accuracy,▁

0,1
eval/accuracy,0.91259
eval/f1,0.91179
eval/loss,0.02981
eval/mcc,0.84513
eval/precision,0.9132
eval/recall,0.91259
eval/runtime,1.7666
eval/samples_per_second,1133.24
eval/steps_per_second,142.079
eval_accuracy,0.91259


[34m[1mwandb[0m: Agent Starting Run: zlflbojy with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	dropout: 0.10051019260850746
[34m[1mwandb[0m: 	gradient_accumulation_steps: 4
[34m[1mwandb[0m: 	learning_rate: 1.007206013823518e-05
[34m[1mwandb[0m: 	max_grad_norm: 2
[34m[1mwandb[0m: 	num_train_epochs: 12
[34m[1mwandb[0m: 	weight_decay: 0.018553780082742624


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total Steps: 1512, Warmup Steps (10%): 151




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc
1,0.283,0.186723,0.645355,0.416483,0.645355,0.506253,0.0
2,0.1525,0.113768,0.785215,0.808515,0.785215,0.769126,0.618598
3,0.0807,0.05566,0.875624,0.875862,0.875624,0.874728,0.778686
4,0.0398,0.03925,0.893107,0.892485,0.893107,0.890645,0.807522
5,0.0242,0.037903,0.895105,0.901846,0.895105,0.896821,0.81894
6,0.017,0.033087,0.913087,0.913785,0.913087,0.9123,0.845302
7,0.0125,0.032881,0.912088,0.91316,0.912088,0.911971,0.844792
8,0.0096,0.033582,0.908092,0.911753,0.908092,0.909131,0.840009
9,0.0083,0.031952,0.911588,0.913407,0.911588,0.911948,0.844595
10,0.0073,0.032807,0.911588,0.914388,0.911588,0.912376,0.845518


✅ Best model saved to /content/drive/MyDrive/DistilBERT_8_Results/giddy-sweep-13/best_model


0,1
eval/accuracy,▁▅▇▇█████████
eval/f1,▁▆▇██████████
eval/loss,█▅▂▁▁▁▁▁▁▁▁▁▁
eval/mcc,▁▆▇██████████
eval/precision,▁▇▇██████████
eval/recall,▁▅▇▇█████████
eval/runtime,▃▄▃▁▄▃▄█▃▂▄▄█
eval/samples_per_second,▆▅▆█▅▆▅▁▆▇▅▅▁
eval/steps_per_second,▆▅▆█▅▆▅▁▆▇▅▅▁
eval_accuracy,▁

0,1
eval/accuracy,0.91159
eval/f1,0.91195
eval/loss,0.03195
eval/mcc,0.84459
eval/precision,0.91341
eval/recall,0.91159
eval/runtime,0.9157
eval/samples_per_second,2186.396
eval/steps_per_second,137.605
eval_accuracy,0.91159


[34m[1mwandb[0m: Agent Starting Run: igbjyarx with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	dropout: 0.06534638460022768
[34m[1mwandb[0m: 	gradient_accumulation_steps: 4
[34m[1mwandb[0m: 	learning_rate: 1.0191160337213354e-05
[34m[1mwandb[0m: 	max_grad_norm: 2
[34m[1mwandb[0m: 	num_train_epochs: 12
[34m[1mwandb[0m: 	weight_decay: 0.03887586582484019


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total Steps: 1512, Warmup Steps (10%): 151




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc
1,0.2975,0.182545,0.663337,0.479942,0.663337,0.5517,0.216954
2,0.15,0.106004,0.81019,0.82675,0.81019,0.800969,0.651711
3,0.0739,0.054983,0.871129,0.874969,0.871129,0.870407,0.773004
4,0.0356,0.038133,0.908092,0.907438,0.908092,0.904399,0.833087
5,0.0211,0.038019,0.887612,0.895161,0.887612,0.888606,0.804799
6,0.0142,0.032258,0.910589,0.912142,0.910589,0.910178,0.841301
7,0.0104,0.032905,0.90959,0.911268,0.90959,0.90976,0.841023
8,0.0079,0.031596,0.913087,0.914771,0.913087,0.913516,0.847016
9,0.0064,0.031589,0.915085,0.916244,0.915085,0.915253,0.850304
10,0.0059,0.031673,0.917582,0.918545,0.917582,0.917675,0.854488


✅ Best model saved to /content/drive/MyDrive/DistilBERT_8_Results/proud-sweep-14/best_model


0,1
eval/accuracy,▁▅▇█▇████████
eval/f1,▁▆▇█▇████████
eval/loss,█▄▂▁▁▁▁▁▁▁▁▁▁
eval/mcc,▁▆▇█▇████████
eval/precision,▁▇▇██████████
eval/recall,▁▅▇█▇████████
eval/runtime,▂▁▂▃▃▂▇▄▇▄▃█▃
eval/samples_per_second,▇█▇▅▆▆▂▅▂▅▆▁▆
eval/steps_per_second,▇█▇▅▆▆▂▅▂▅▆▁▆
eval_accuracy,▁

0,1
eval/accuracy,0.91508
eval/f1,0.91525
eval/loss,0.03159
eval/mcc,0.8503
eval/precision,0.91624
eval/recall,0.91508
eval/runtime,0.9035
eval/samples_per_second,2215.921
eval/steps_per_second,139.464
eval_accuracy,0.91508


[34m[1mwandb[0m: Agent Starting Run: 2zri2g7m with config:
[34m[1mwandb[0m: 	batch_size: 8
[34m[1mwandb[0m: 	dropout: 0.1103567637653902
[34m[1mwandb[0m: 	gradient_accumulation_steps: 4
[34m[1mwandb[0m: 	learning_rate: 1.3276041700598086e-05
[34m[1mwandb[0m: 	max_grad_norm: 2
[34m[1mwandb[0m: 	num_train_epochs: 12
[34m[1mwandb[0m: 	weight_decay: 0.03013755121284876


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total Steps: 3012, Warmup Steps (10%): 301




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc
1,0.2529,0.148715,0.723776,0.678863,0.723776,0.685959,0.46168
2,0.0914,0.047407,0.875125,0.878837,0.875125,0.874342,0.779021
3,0.0319,0.040887,0.888611,0.897678,0.888611,0.890997,0.809787
4,0.0157,0.030026,0.914585,0.915835,0.914585,0.913875,0.848947
5,0.0093,0.042523,0.875125,0.896661,0.875125,0.881418,0.795383
6,0.0056,0.034762,0.915085,0.918438,0.915085,0.913251,0.84832
7,0.0037,0.030893,0.926573,0.925933,0.926573,0.926137,0.869181


✅ Best model saved to /content/drive/MyDrive/DistilBERT_8_Results/comic-sweep-15/best_model


0,1
eval/accuracy,▁▆▇█▆███
eval/f1,▁▆▇█▇███
eval/loss,█▂▂▁▂▁▁▁
eval/mcc,▁▆▇█▇███
eval/precision,▁▇▇█▇███
eval/recall,▁▆▇█▆███
eval/runtime,█▂█▁▅▃█▅
eval/samples_per_second,▁▇▁█▄▆▁▄
eval/steps_per_second,▁▇▁█▄▆▁▄
eval_accuracy,▁

0,1
eval/accuracy,0.91459
eval/f1,0.91387
eval/loss,0.03003
eval/mcc,0.84895
eval/precision,0.91583
eval/recall,0.91459
eval/runtime,1.7525
eval/samples_per_second,1142.363
eval/steps_per_second,143.223
eval_accuracy,0.91459


[34m[1mwandb[0m: Agent Starting Run: p23gbc6j with config:
[34m[1mwandb[0m: 	batch_size: 8
[34m[1mwandb[0m: 	dropout: 0.1035209655674956
[34m[1mwandb[0m: 	gradient_accumulation_steps: 4
[34m[1mwandb[0m: 	learning_rate: 1.474716677293401e-05
[34m[1mwandb[0m: 	max_grad_norm: 2
[34m[1mwandb[0m: 	num_train_epochs: 12
[34m[1mwandb[0m: 	weight_decay: 0.04311066580523427


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total Steps: 3012, Warmup Steps (10%): 301




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc
1,0.2391,0.142147,0.728771,0.647304,0.728771,0.679158,0.467229
2,0.084,0.043324,0.891608,0.896913,0.891608,0.88991,0.806439
3,0.028,0.043341,0.873626,0.887598,0.873626,0.87661,0.788036
4,0.0132,0.030656,0.914086,0.91694,0.914086,0.914254,0.849354
5,0.0076,0.041542,0.881119,0.897885,0.881119,0.885781,0.802967
6,0.0042,0.035019,0.918581,0.919369,0.918581,0.917184,0.85403
7,0.0026,0.031633,0.924076,0.924399,0.924076,0.923915,0.865677


✅ Best model saved to /content/drive/MyDrive/DistilBERT_8_Results/vivid-sweep-16/best_model


0,1
eval/accuracy,▁▇▆█▆███
eval/f1,▁▇▇█▇███
eval/loss,█▂▂▁▂▁▁▁
eval/mcc,▁▇▇█▇███
eval/precision,▁▇▇█▇███
eval/recall,▁▇▆█▆███
eval/runtime,▅▁▆▅▅█▁▇
eval/samples_per_second,▄█▃▄▄▁█▂
eval/steps_per_second,▄█▃▄▄▁█▂
eval_accuracy,▁

0,1
eval/accuracy,0.91409
eval/f1,0.91425
eval/loss,0.03066
eval/mcc,0.84935
eval/precision,0.91694
eval/recall,0.91409
eval/runtime,1.7612
eval/samples_per_second,1136.752
eval/steps_per_second,142.52
eval_accuracy,0.91409


[34m[1mwandb[0m: Agent Starting Run: awnivulo with config:
[34m[1mwandb[0m: 	batch_size: 8
[34m[1mwandb[0m: 	dropout: 0.08909482476552874
[34m[1mwandb[0m: 	gradient_accumulation_steps: 4
[34m[1mwandb[0m: 	learning_rate: 1.4259075192506923e-05
[34m[1mwandb[0m: 	max_grad_norm: 2
[34m[1mwandb[0m: 	num_train_epochs: 12
[34m[1mwandb[0m: 	weight_decay: 0.024946088816666732


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total Steps: 3012, Warmup Steps (10%): 301




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc
1,0.2401,0.145317,0.728272,0.643283,0.728272,0.676636,0.463136
2,0.0849,0.043981,0.887612,0.893419,0.887612,0.885827,0.799326
3,0.0281,0.044133,0.873127,0.886988,0.873127,0.876307,0.787699
4,0.0132,0.03064,0.916084,0.918658,0.916084,0.916218,0.852301
5,0.0072,0.03972,0.889111,0.903076,0.889111,0.892982,0.81495
6,0.0043,0.03617,0.919081,0.919706,0.919081,0.917192,0.854523
7,0.0026,0.031786,0.921578,0.921671,0.921578,0.920995,0.860621


✅ Best model saved to /content/drive/MyDrive/DistilBERT_8_Results/cerulean-sweep-17/best_model


0,1
eval/accuracy,▁▇▆█▇███
eval/f1,▁▇▇█▇███
eval/loss,█▂▂▁▂▁▁▁
eval/mcc,▁▇▇█▇███
eval/precision,▁▇▇█████
eval/recall,▁▇▆█▇███
eval/runtime,▁▂▅▄▃▂█▃
eval/samples_per_second,█▇▄▅▆▇▁▆
eval/steps_per_second,█▇▄▅▆▇▁▆
eval_accuracy,▁

0,1
eval/accuracy,0.91608
eval/f1,0.91622
eval/loss,0.03064
eval/mcc,0.8523
eval/precision,0.91866
eval/recall,0.91608
eval/runtime,1.7479
eval/samples_per_second,1145.386
eval/steps_per_second,143.602
eval_accuracy,0.91608


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: o0yjk5f5 with config:
[34m[1mwandb[0m: 	batch_size: 8
[34m[1mwandb[0m: 	dropout: 0.06629592954398247
[34m[1mwandb[0m: 	gradient_accumulation_steps: 4
[34m[1mwandb[0m: 	learning_rate: 1.375803638151593e-05
[34m[1mwandb[0m: 	max_grad_norm: 2
[34m[1mwandb[0m: 	num_train_epochs: 12
[34m[1mwandb[0m: 	weight_decay: 0.03209370135720997


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total Steps: 3012, Warmup Steps (10%): 301




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc
1,0.2406,0.146231,0.728272,0.640776,0.728272,0.674977,0.459475
2,0.0843,0.044148,0.886114,0.890556,0.886114,0.883762,0.795826
3,0.0272,0.042882,0.879121,0.890503,0.879121,0.881914,0.796249
4,0.0125,0.029794,0.919081,0.921582,0.919081,0.91918,0.85741
5,0.0067,0.037893,0.895604,0.907064,0.895604,0.898691,0.823547
6,0.0037,0.035073,0.91958,0.91969,0.91958,0.917528,0.855242
7,0.0024,0.031124,0.923576,0.922971,0.923576,0.922549,0.863129


✅ Best model saved to /content/drive/MyDrive/DistilBERT_8_Results/pious-sweep-18/best_model


0,1
eval/accuracy,▁▇▆█▇███
eval/f1,▁▇▇█▇███
eval/loss,█▂▂▁▁▁▁▁
eval/mcc,▁▇▇█▇███
eval/precision,▁▇▇█████
eval/recall,▁▇▆█▇███
eval/runtime,▁▅▃▂█▃▂▇
eval/samples_per_second,█▃▆▇▁▆▇▂
eval/steps_per_second,█▃▆▇▁▆▇▂
eval_accuracy,▁

0,1
eval/accuracy,0.91908
eval/f1,0.91918
eval/loss,0.02979
eval/mcc,0.85741
eval/precision,0.92158
eval/recall,0.91908
eval/runtime,1.7527
eval/samples_per_second,1142.216
eval/steps_per_second,143.205
eval_accuracy,0.91908


[34m[1mwandb[0m: Agent Starting Run: kwv9c8ms with config:
[34m[1mwandb[0m: 	batch_size: 8
[34m[1mwandb[0m: 	dropout: 0.10592329650872234
[34m[1mwandb[0m: 	gradient_accumulation_steps: 2
[34m[1mwandb[0m: 	learning_rate: 1.228609193617499e-05
[34m[1mwandb[0m: 	max_grad_norm: 2
[34m[1mwandb[0m: 	num_train_epochs: 12
[34m[1mwandb[0m: 	weight_decay: 0.029873264756975625


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total Steps: 6012, Warmup Steps (10%): 601




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc
1,0.2214,0.111411,0.778721,0.770887,0.778721,0.749715,0.57894
2,0.0654,0.043619,0.870629,0.893204,0.870629,0.875699,0.785734
3,0.0229,0.03332,0.904595,0.906876,0.904595,0.905288,0.832614
4,0.0108,0.033263,0.905594,0.913774,0.905594,0.90768,0.839764
5,0.0062,0.035655,0.903097,0.911196,0.903097,0.905464,0.834546
6,0.003,0.03753,0.921578,0.921248,0.921578,0.919423,0.858257
7,0.0018,0.032696,0.924575,0.925623,0.924575,0.924772,0.867497
8,0.0013,0.033146,0.921578,0.923082,0.921578,0.921808,0.862284
9,0.0008,0.034064,0.921079,0.922468,0.921079,0.921381,0.861347
10,0.0007,0.035079,0.922577,0.924744,0.922577,0.923069,0.864546


✅ Best model saved to /content/drive/MyDrive/DistilBERT_8_Results/radiant-sweep-19/best_model


0,1
eval/accuracy,▁▅▇▇▇██████
eval/f1,▁▆▇▇▇██████
eval/loss,█▂▁▁▁▁▁▁▁▁▁
eval/mcc,▁▆▇▇▇██████
eval/precision,▁▇▇▇▇██████
eval/recall,▁▅▇▇▇██████
eval/runtime,▃▃▃▁▅▄▄▃█▇▅
eval/samples_per_second,▆▅▆█▄▅▄▅▁▂▄
eval/steps_per_second,▆▅▆█▄▅▄▅▁▂▄
eval_accuracy,▁

0,1
eval/accuracy,0.92458
eval/f1,0.92477
eval/loss,0.0327
eval/mcc,0.8675
eval/precision,0.92562
eval/recall,0.92458
eval/runtime,1.7466
eval/samples_per_second,1146.235
eval/steps_per_second,143.709
eval_accuracy,0.92458


[34m[1mwandb[0m: Agent Starting Run: j4sclrtl with config:
[34m[1mwandb[0m: 	batch_size: 8
[34m[1mwandb[0m: 	dropout: 0.07542780865255073
[34m[1mwandb[0m: 	gradient_accumulation_steps: 4
[34m[1mwandb[0m: 	learning_rate: 1.1045120471514964e-05
[34m[1mwandb[0m: 	max_grad_norm: 1
[34m[1mwandb[0m: 	num_train_epochs: 12
[34m[1mwandb[0m: 	weight_decay: 0.03833867224631064


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total Steps: 3012, Warmup Steps (10%): 301




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc
1,0.251,0.153086,0.717283,0.650935,0.717283,0.676093,0.455794
2,0.0968,0.052816,0.87962,0.882818,0.87962,0.878492,0.785632
3,0.0353,0.043553,0.884116,0.893038,0.884116,0.886692,0.802982
4,0.0175,0.031125,0.916583,0.915948,0.916583,0.915008,0.850007
5,0.0098,0.034151,0.909091,0.914915,0.909091,0.910354,0.841996
6,0.0055,0.03405,0.916583,0.916498,0.916583,0.914314,0.849543
7,0.0037,0.031601,0.91958,0.920169,0.91958,0.919626,0.857575


✅ Best model saved to /content/drive/MyDrive/DistilBERT_8_Results/gallant-sweep-20/best_model


0,1
eval/accuracy,▁▇▇█████
eval/f1,▁▇▇█████
eval/loss,█▂▂▁▁▁▁▁
eval/mcc,▁▇▇█████
eval/precision,▁▇▇█████
eval/recall,▁▇▇█████
eval/runtime,▁▇▃▃▇▄█▁
eval/samples_per_second,█▂▆▆▂▅▁█
eval/steps_per_second,█▂▆▆▂▅▁█
eval_accuracy,▁

0,1
eval/accuracy,0.91658
eval/f1,0.91501
eval/loss,0.03112
eval/mcc,0.85001
eval/precision,0.91595
eval/recall,0.91658
eval/runtime,1.7387
eval/samples_per_second,1151.416
eval/steps_per_second,144.358
eval_accuracy,0.91658


[34m[1mwandb[0m: Agent Starting Run: nazxm3he with config:
[34m[1mwandb[0m: 	batch_size: 8
[34m[1mwandb[0m: 	dropout: 0.05597009355361285
[34m[1mwandb[0m: 	gradient_accumulation_steps: 4
[34m[1mwandb[0m: 	learning_rate: 1.2968560226325156e-05
[34m[1mwandb[0m: 	max_grad_norm: 2
[34m[1mwandb[0m: 	num_train_epochs: 12
[34m[1mwandb[0m: 	weight_decay: 0.025449738720148128


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total Steps: 3012, Warmup Steps (10%): 301




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc
1,0.2427,0.15051,0.712787,0.626311,0.712787,0.653989,0.422576
2,0.0877,0.045852,0.881618,0.887546,0.881618,0.878988,0.787832
3,0.0284,0.04234,0.879121,0.889997,0.879121,0.88174,0.795679
4,0.0131,0.02992,0.92008,0.921059,0.92008,0.919568,0.858066
5,0.0068,0.037644,0.893107,0.904728,0.893107,0.896384,0.819512
6,0.0038,0.034914,0.922078,0.923152,0.922078,0.920695,0.860351
7,0.0026,0.031605,0.924076,0.923687,0.924076,0.923134,0.864105


✅ Best model saved to /content/drive/MyDrive/DistilBERT_8_Results/fanciful-sweep-21/best_model


0,1
eval/accuracy,▁▇▇█▇███
eval/f1,▁▇▇█▇███
eval/loss,█▂▂▁▁▁▁▁
eval/mcc,▁▇▇█▇███
eval/precision,▁▇▇█████
eval/recall,▁▇▇█▇███
eval/runtime,▃▃▁▃▃█▃▃
eval/samples_per_second,▆▆█▆▆▁▆▆
eval/steps_per_second,▆▆█▆▆▁▆▆
eval_accuracy,▁

0,1
eval/accuracy,0.92008
eval/f1,0.91957
eval/loss,0.02992
eval/mcc,0.85807
eval/precision,0.92106
eval/recall,0.92008
eval/runtime,1.7472
eval/samples_per_second,1145.831
eval/steps_per_second,143.658
eval_accuracy,0.92008


[34m[1mwandb[0m: Agent Starting Run: iwzjc072 with config:
[34m[1mwandb[0m: 	batch_size: 8
[34m[1mwandb[0m: 	dropout: 0.05272152404998275
[34m[1mwandb[0m: 	gradient_accumulation_steps: 2
[34m[1mwandb[0m: 	learning_rate: 1.040851268473058e-05
[34m[1mwandb[0m: 	max_grad_norm: 1
[34m[1mwandb[0m: 	num_train_epochs: 12
[34m[1mwandb[0m: 	weight_decay: 0.017630531802060236


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total Steps: 6012, Warmup Steps (10%): 601




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc
1,0.2252,0.120415,0.763736,0.757886,0.763736,0.728156,0.553854
2,0.0717,0.044972,0.875624,0.89014,0.875624,0.878275,0.78882
3,0.0256,0.033466,0.912088,0.911371,0.912088,0.911017,0.843116
4,0.0121,0.031711,0.915085,0.917718,0.915085,0.915022,0.849961
5,0.0061,0.03432,0.918581,0.921741,0.918581,0.918936,0.856306
6,0.0028,0.03723,0.923077,0.923558,0.923077,0.922142,0.862058
7,0.0017,0.034935,0.923576,0.924403,0.923576,0.923587,0.865002


✅ Best model saved to /content/drive/MyDrive/DistilBERT_8_Results/generous-sweep-22/best_model


0,1
eval/accuracy,▁▆▇█████
eval/f1,▁▆██████
eval/loss,█▂▁▁▁▁▁▁
eval/mcc,▁▆██████
eval/precision,▁▇▇█████
eval/recall,▁▆▇█████
eval/runtime,▁▂▃▅▅▅▆█
eval/samples_per_second,█▇▅▄▄▄▃▁
eval/steps_per_second,█▇▅▄▄▄▃▁
eval_accuracy,▁

0,1
eval/accuracy,0.91508
eval/f1,0.91502
eval/loss,0.03171
eval/mcc,0.84996
eval/precision,0.91772
eval/recall,0.91508
eval/runtime,1.7566
eval/samples_per_second,1139.686
eval/steps_per_second,142.888
eval_accuracy,0.91508


[34m[1mwandb[0m: Agent Starting Run: bwnlbp2w with config:
[34m[1mwandb[0m: 	batch_size: 8
[34m[1mwandb[0m: 	dropout: 0.0947113067791214
[34m[1mwandb[0m: 	gradient_accumulation_steps: 4
[34m[1mwandb[0m: 	learning_rate: 1.2618458944103992e-05
[34m[1mwandb[0m: 	max_grad_norm: 2
[34m[1mwandb[0m: 	num_train_epochs: 12
[34m[1mwandb[0m: 	weight_decay: 0.014716058861721488


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total Steps: 3012, Warmup Steps (10%): 301




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc
1,0.2454,0.15614,0.699301,0.607661,0.699301,0.632001,0.386892
2,0.0947,0.047669,0.886114,0.888973,0.886114,0.883448,0.794708
3,0.0322,0.04533,0.871628,0.884477,0.871628,0.874371,0.783837
4,0.0156,0.030115,0.914585,0.916419,0.914585,0.914492,0.849358
5,0.0086,0.042234,0.884116,0.900268,0.884116,0.888518,0.807
6,0.0054,0.034759,0.916084,0.917151,0.916084,0.914677,0.849704
7,0.0034,0.031336,0.921578,0.921454,0.921578,0.921073,0.860567


✅ Best model saved to /content/drive/MyDrive/DistilBERT_8_Results/glad-sweep-23/best_model


0,1
eval/accuracy,▁▇▆█▇███
eval/f1,▁▇▇█▇███
eval/loss,█▂▂▁▂▁▁▁
eval/mcc,▁▇▇█▇███
eval/precision,▁▇▇█████
eval/recall,▁▇▆█▇███
eval/runtime,▁▁█▁▅▆▄▅
eval/samples_per_second,██▁█▄▃▅▄
eval/steps_per_second,██▁█▄▃▅▄
eval_accuracy,▁

0,1
eval/accuracy,0.91459
eval/f1,0.91449
eval/loss,0.03011
eval/mcc,0.84936
eval/precision,0.91642
eval/recall,0.91459
eval/runtime,1.7547
eval/samples_per_second,1140.919
eval/steps_per_second,143.042
eval_accuracy,0.91459


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 7pdalmow with config:
[34m[1mwandb[0m: 	batch_size: 8
[34m[1mwandb[0m: 	dropout: 0.050284456224176155
[34m[1mwandb[0m: 	gradient_accumulation_steps: 4
[34m[1mwandb[0m: 	learning_rate: 1.162623298528116e-05
[34m[1mwandb[0m: 	max_grad_norm: 2
[34m[1mwandb[0m: 	num_train_epochs: 12
[34m[1mwandb[0m: 	weight_decay: 0.04097070051717882


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total Steps: 3012, Warmup Steps (10%): 301




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc
1,0.247,0.157403,0.697802,0.62371,0.697802,0.629767,0.381167
2,0.0948,0.049091,0.882118,0.886783,0.882118,0.879288,0.787817
3,0.0315,0.043338,0.878122,0.889375,0.878122,0.880819,0.794399
4,0.0149,0.030093,0.919081,0.919564,0.919081,0.918353,0.855731
5,0.0081,0.036446,0.898601,0.907674,0.898601,0.901062,0.826585
6,0.0046,0.03407,0.918581,0.918645,0.918581,0.917056,0.853673
7,0.003,0.031242,0.921578,0.921103,0.921578,0.920871,0.859843


✅ Best model saved to /content/drive/MyDrive/DistilBERT_8_Results/apricot-sweep-24/best_model


0,1
eval/accuracy,▁▇▇█▇███
eval/f1,▁▇▇█████
eval/loss,█▂▂▁▁▁▁▁
eval/mcc,▁▇▇█████
eval/precision,▁▇▇█████
eval/recall,▁▇▇█▇███
eval/runtime,▁▄▃█▅▂▆▆
eval/samples_per_second,█▅▆▁▄▇▃▃
eval/steps_per_second,█▅▆▁▄▇▃▃
eval_accuracy,▁

0,1
eval/accuracy,0.91908
eval/f1,0.91835
eval/loss,0.03009
eval/mcc,0.85573
eval/precision,0.91956
eval/recall,0.91908
eval/runtime,1.756
eval/samples_per_second,1140.075
eval/steps_per_second,142.936
eval_accuracy,0.91908


[34m[1mwandb[0m: Agent Starting Run: kyzitu1o with config:
[34m[1mwandb[0m: 	batch_size: 8
[34m[1mwandb[0m: 	dropout: 0.18589929535875577
[34m[1mwandb[0m: 	gradient_accumulation_steps: 4
[34m[1mwandb[0m: 	learning_rate: 1.08647395666264e-05
[34m[1mwandb[0m: 	max_grad_norm: 2
[34m[1mwandb[0m: 	num_train_epochs: 12
[34m[1mwandb[0m: 	weight_decay: 0.03158916512698756


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total Steps: 3012, Warmup Steps (10%): 301




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc
1,0.2548,0.167871,0.678821,0.557882,0.678821,0.596544,0.322497
2,0.1163,0.062861,0.863137,0.863132,0.863137,0.857935,0.750161
3,0.0477,0.055001,0.855145,0.874875,0.855145,0.86061,0.763165
4,0.0244,0.03354,0.900599,0.9066,0.900599,0.901292,0.827077
5,0.0167,0.042823,0.874126,0.89534,0.874126,0.879634,0.794173
6,0.0115,0.032711,0.914086,0.916552,0.914086,0.914732,0.850031
7,0.0083,0.033493,0.90959,0.913568,0.90959,0.910673,0.843383
8,0.0067,0.032665,0.914086,0.91898,0.914086,0.91533,0.851093
9,0.0053,0.033316,0.913087,0.917737,0.913087,0.914216,0.849863
10,0.0045,0.033613,0.915584,0.919777,0.915584,0.916648,0.853893


✅ Best model saved to /content/drive/MyDrive/DistilBERT_8_Results/zesty-sweep-25/best_model


0,1
eval/accuracy,▁▆▆█▇███████
eval/f1,▁▇▇█▇███████
eval/loss,█▃▂▁▂▁▁▁▁▁▁▁
eval/mcc,▁▇▇█▇███████
eval/precision,▁▇▇█████████
eval/recall,▁▆▆█▇███████
eval/runtime,▄▇▄▅█▅▁▂▄▇█▇
eval/samples_per_second,▅▂▅▄▁▄█▇▅▂▁▂
eval/steps_per_second,▅▂▅▄▁▄█▇▅▂▁▂
eval_accuracy,▁

0,1
eval/accuracy,0.91409
eval/f1,0.91533
eval/loss,0.03266
eval/mcc,0.85109
eval/precision,0.91898
eval/recall,0.91409
eval/runtime,1.7554
eval/samples_per_second,1140.504
eval/steps_per_second,142.99
eval_accuracy,0.91409


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Sweep Agent: Exiting.
