In [1]:
import os
import csv
import pandas as pd
import torch
import re
import nltk
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, matthews_corrcoef
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback, BertTokenizer, BertForSequenceClassification
import torch.nn as nn
import wandb

from google.colab import auth
from google.colab import drive
from google.colab import userdata


In [2]:
# =========================
# STEP 0: mount to drive
# =========================
#auth.authenticate_user()
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# =========================
# STEP 1: Initialize WandB
# =========================
# Retrieve API Key from Colab Secrets
wandb_api_key = userdata.get('WANDB_API_KEY')

if wandb_api_key:
    os.environ["WANDB_API_KEY"] = wandb_api_key
    wandb.login(key=wandb_api_key)
    print("✅ WandB Logged in Securely")
else:
    print("❌ Error: WANDB_API_KEY not found. Set it in Colab Secrets.")

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33maalhaizaey[0m ([33mabdulrahim-alhaizaey[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


✅ WandB Logged in Securely


In [4]:
# =========================
# STEP 2: LOAD DATA & CLEAN TEXT
# =========================
nltk.download('stopwords')
nltk.download('wordnet')

multi_data = pd.read_csv("/content/10006_dataset_Multi.csv", encoding='ISO-8859-1')

def clean_text(text):
    if pd.isnull(text):
        return ""
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text).lower()
    return text

multi_data['cleaned_text'] = multi_data['text'].apply(clean_text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [5]:
# =========================
# STEP 3: PREPARE DATASETS & LABEL ENCODING (Train Only)
# =========================
multi_train, multi_val = train_test_split(multi_data, test_size=0.2, stratify=multi_data['label'], random_state=42)

# Fit LabelEncoder only on training data to prevent leakage
multi_le = LabelEncoder()
multi_le.fit(multi_train['label'])
multi_train['encoded_label'] = multi_le.transform(multi_train['label'])
multi_val['encoded_label'] = multi_le.transform(multi_val['label'])

In [6]:
# =========================
# STEP 4: TOKENIZATION (Train Data Only for max_length)
# =========================
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Calculate max_length using training data
max_length = min(tokenizer.model_max_length, max(multi_train['cleaned_text'].apply(lambda x: len(tokenizer.tokenize(x)))))
print(f"Using max_length from training set only: {max_length}")

def tokenize_data(df, label_col):
    encodings = tokenizer(df['cleaned_text'].tolist(), truncation=True, padding=True, max_length=max_length, return_tensors="pt")
    labels = torch.tensor(df[label_col].tolist())
    return encodings, labels

train_multi_enc, train_multi_labels = tokenize_data(multi_train, 'encoded_label')
val_multi_enc, val_multi_labels = tokenize_data(multi_val, 'encoded_label')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Using max_length from training set only: 128


In [7]:
# =========================
# STEP 5: DATASET CLASS & METRICS
# =========================
class TextDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted', zero_division=0)
    mcc = matthews_corrcoef(labels, predictions)
    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "mcc": mcc
    }


In [8]:
# =========================
# STEP 6: FOCAL LOSS & CUSTOM TRAINER
# =========================
class FocalLoss(nn.Module):
    def __init__(self, alpha=0.25, gamma=2.0, reduction='mean'):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, inputs, targets):
        ce_loss = nn.CrossEntropyLoss(reduction='none')(inputs, targets)
        pt = torch.exp(-ce_loss)
        focal_loss = self.alpha * (1 - pt) ** self.gamma * ce_loss
        return focal_loss.mean() if self.reduction == 'mean' else focal_loss.sum()

class CustomTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.training_loss = []
        self.validation_loss = []
        self.results = []

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fn = FocalLoss(alpha=0.25, gamma=2.0)
        loss = loss_fn(logits, labels)
        return (loss, outputs) if return_outputs else loss

    def evaluate(self, *args, **kwargs):
        output = super().evaluate(*args, **kwargs)
        self.validation_loss.append(output['eval_loss'])
        self.results.append(output.copy())
        return output

    def log(self, logs, *args, **kwargs):
        super().log(logs, *args, **kwargs)
        if 'loss' in logs:
            self.training_loss.append(logs['loss'])


In [9]:
# =========================
# STEP 7: ADD PLOTTING (Confusion Matrices & Loss Curves)
# =========================
def plot_confusion_matrix(labels, predictions, label_encoder, output_dir, run_name):
    class_names = label_encoder.classes_
    conf_matrix = confusion_matrix(labels, predictions)
    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.savefig(f"{output_dir}/{run_name}_confusion_numbers.png")
    plt.close()

def plot_confusion_matrix_percent(labels, predictions, label_encoder, output_dir, run_name):
    class_names = label_encoder.classes_
    conf_matrix = confusion_matrix(labels, predictions, normalize='true')
    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='.2f', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.savefig(f"{output_dir}/{run_name}_confusion_percent.png")
    plt.close()


def plot_confusion_matrix_class_weighted(labels, predictions, label_encoder, output_dir, run_name):
    conf_matrix = confusion_matrix(labels, predictions, normalize='true')
    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='.2f', cmap='coolwarm', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.savefig(f"{output_dir}/{run_name}_confusion_weighted.png")
    plt.close()


def plot_loss(training_loss, validation_loss, output_dir, run_name):
    min_length = min(len(training_loss), len(validation_loss))
    training_loss = training_loss[:min_length]
    validation_loss = validation_loss[:min_length]
    epochs = range(1, min_length + 1)
    plt.figure(figsize=(8, 6))
    plt.plot(epochs, training_loss, label='Training Loss', marker='o')
    plt.plot(epochs, validation_loss, label='Validation Loss', marker='s', linestyle='--')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.grid(True)
    plt.savefig(f"{output_dir}/{run_name}_loss_curves.png")
    plt.close()


In [10]:
# =========================
# STEP 8: TRAIN FUNCTION
# =========================
def train_bert(train_dataset, eval_dataset, num_labels, output_dir, label_encoder, config=None):
    with wandb.init(config=config):
        config = wandb.config
        run_name = wandb.run.name

        base_dir = "/content/drive/MyDrive/BERT_8_Results"
        run_dir = os.path.join(base_dir, run_name)
        os.makedirs(run_dir, exist_ok=True)

        model = BertForSequenceClassification.from_pretrained(
            "bert-base-uncased",
            num_labels=num_labels,
            hidden_dropout_prob=config.dropout,
            attention_probs_dropout_prob=config.dropout
        )


        num_devices = torch.cuda.device_count()  # Will be 1 on Colab
        examples_per_step = config.batch_size * num_devices * config.gradient_accumulation_steps
        total_steps = int(np.ceil(len(train_dataset) / examples_per_step) * config.num_train_epochs)

        warmup_steps = int(0.1 * total_steps)
        print(f"Total Steps: {total_steps}, Warmup Steps (10%): {warmup_steps}")


        training_args = TrainingArguments(
            output_dir=output_dir,
            run_name=run_name,
            num_train_epochs=config.num_train_epochs,
            per_device_train_batch_size=config.batch_size,
            per_device_eval_batch_size=config.batch_size,
            learning_rate=config.learning_rate,
            warmup_steps=warmup_steps,
            weight_decay=config.weight_decay,
            evaluation_strategy="epoch",
            logging_strategy="epoch",
            save_strategy="epoch",
            fp16=True,
            gradient_checkpointing=True,
            save_total_limit=2,
            gradient_accumulation_steps=config.gradient_accumulation_steps,
            max_grad_norm=config.max_grad_norm,
            lr_scheduler_type="cosine_with_restarts",
            load_best_model_at_end=True,
            metric_for_best_model="eval_loss",
            report_to="wandb"
        )

        trainer = CustomTrainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            compute_metrics=compute_metrics,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
        )

        trainer.train()

        # Save Best Model
        best_model_path = os.path.join(run_dir, "best_model")
        trainer.save_model(best_model_path)
        print(f"✅ Best model saved to {best_model_path}")

        # Evaluate Best Model
        results = trainer.evaluate(eval_dataset=eval_dataset)


        # Evaluate the best model
        results = trainer.evaluate(eval_dataset=eval_dataset)

        predictions_obj = trainer.predict(eval_dataset)
        predictions = np.argmax(predictions_obj.predictions, axis=1)
        labels = predictions_obj.label_ids

        wandb.log({
            "eval_loss": results.get("eval_loss"),
            "eval_accuracy": results.get("eval_accuracy"),
            "eval_precision": results.get("eval_precision"),
            "eval_recall": results.get("eval_recall"),
            "eval_f1": results.get("eval_f1"),
            "eval_mcc": results.get("eval_mcc")
        })

        plot_confusion_matrix(labels, predictions, label_encoder, run_dir, run_name)
        plot_confusion_matrix_percent(labels, predictions, label_encoder, run_dir, run_name)
        plot_confusion_matrix_class_weighted(labels, predictions, label_encoder, run_dir, run_name)
        plot_loss(trainer.training_loss, trainer.validation_loss, run_dir, run_name)

        # Save run parameters and results to a CSV file
        results_csv = os.path.join(run_dir, "results.csv")
        with open(results_csv, mode='w', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(["Parameter", "Value"])
            for key, value in config.items():
                writer.writerow([key, value])
            writer.writerow(["Epoch", "Training Loss", "Validation Loss", "Accuracy", "Precision", "Recall", "F1", "Mcc"])
            for epoch, result in enumerate(trainer.results, start=1):
                writer.writerow([
                    epoch,
                    trainer.training_loss[epoch - 1] if epoch - 1 < len(trainer.training_loss) else None,
                    trainer.validation_loss[epoch - 1] if epoch - 1 < len(trainer.validation_loss) else None,
                    result.get("eval_accuracy"),
                    result.get("eval_precision"),
                    result.get("eval_recall"),
                    result.get("eval_f1"),
                    result.get("eval_mcc")
                ])


        return results


In [11]:
# =========================
# STEP 9: RUN SWEEP CONFIGURATION
# =========================
train_dataset = TextDataset(train_multi_enc, train_multi_labels)
val_dataset = TextDataset(val_multi_enc, val_multi_labels)

sweep_config = {
    'method': 'bayes',
    'metric': {'name': 'eval_loss', 'goal': 'minimize'},
    'parameters': {
        'num_train_epochs': {'values': [12]},
        'learning_rate': {'min': 1e-5, 'max': 5e-5},
        'batch_size': {'values': [8, 16, 32]},
        'weight_decay': {'min': 0.01, 'max': 0.05},
        'dropout': {'min': 0.05, 'max': 0.5},
        'gradient_accumulation_steps': {'values': [2, 4]},
        'max_grad_norm': {'values': [1.0, 2.0]}
    },
}

In [12]:
sweep_id = wandb.sweep(sweep=sweep_config, project='BERT_8')

wandb.agent(sweep_id, function=lambda: train_bert(
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    num_labels=len(multi_le.classes_),
    output_dir="multi_output",
    label_encoder=multi_le
))


Create sweep with ID: et31bcok
Sweep URL: https://wandb.ai/abdulrahim-alhaizaey/BERT_8/sweeps/et31bcok


[34m[1mwandb[0m: Agent Starting Run: q8glb30j with config:
[34m[1mwandb[0m: 	batch_size: 8
[34m[1mwandb[0m: 	dropout: 0.12432466295856226
[34m[1mwandb[0m: 	gradient_accumulation_steps: 4
[34m[1mwandb[0m: 	learning_rate: 1.0311635707000778e-05
[34m[1mwandb[0m: 	max_grad_norm: 1
[34m[1mwandb[0m: 	num_train_epochs: 12
[34m[1mwandb[0m: 	weight_decay: 0.03754108407150582
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112778155554932, max=1.0…

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total Steps: 3012, Warmup Steps (10%): 301




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc
1,0.2284,0.162369,0.689311,0.714523,0.689311,0.679022,0.468448
2,0.1024,0.057037,0.863636,0.871801,0.863636,0.863728,0.7608
3,0.04,0.048006,0.864136,0.880494,0.864136,0.869215,0.775854
4,0.0201,0.033395,0.9001,0.903661,0.9001,0.899952,0.824417
5,0.0115,0.035017,0.906593,0.914133,0.906593,0.908943,0.841409
6,0.0072,0.031225,0.918082,0.920307,0.918082,0.918678,0.857032
7,0.0053,0.033582,0.908092,0.915949,0.908092,0.910459,0.843318
8,0.0032,0.035461,0.913586,0.919215,0.913586,0.915227,0.851743
9,0.0026,0.035279,0.914086,0.918998,0.914086,0.915742,0.852862


✅ Best model saved to /content/drive/MyDrive/BERT_8_Results/celestial-sweep-1/best_model


0,1
eval/accuracy,▁▆▆▇███████
eval/f1,▁▆▇▇███████
eval/loss,█▂▂▁▁▁▁▁▁▁▁
eval/mcc,▁▆▇▇███████
eval/precision,▁▆▇▇███████
eval/recall,▁▆▆▇███████
eval/runtime,▁▁▃▂▃▁▂▂▁▂█
eval/samples_per_second,█▇▆▇▆▇▇▇█▇▁
eval/steps_per_second,█▇▆▇▆▇▇▇█▇▁
eval_accuracy,▁

0,1
eval/accuracy,0.91808
eval/f1,0.91868
eval/loss,0.03123
eval/mcc,0.85703
eval/precision,0.92031
eval/recall,0.91808
eval/runtime,3.6979
eval/samples_per_second,541.388
eval/steps_per_second,67.876
eval_accuracy,0.91808


[34m[1mwandb[0m: Agent Starting Run: 5uxtto66 with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	dropout: 0.3607907645422034
[34m[1mwandb[0m: 	gradient_accumulation_steps: 2
[34m[1mwandb[0m: 	learning_rate: 3.1616449610943626e-05
[34m[1mwandb[0m: 	max_grad_norm: 2
[34m[1mwandb[0m: 	num_train_epochs: 12
[34m[1mwandb[0m: 	weight_decay: 0.03995035593826364


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total Steps: 3012, Warmup Steps (10%): 301




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc
1,0.2393,0.147683,0.718781,0.728095,0.718781,0.710603,0.517754
2,0.105,0.105563,0.719281,0.811486,0.719281,0.738984,0.59672
3,0.0526,0.109554,0.747752,0.831005,0.747752,0.766609,0.643203
4,0.0361,0.078201,0.796703,0.854526,0.796703,0.808611,0.703552
5,0.0281,0.067418,0.818681,0.872337,0.818681,0.830218,0.734594
6,0.0221,0.06317,0.832667,0.878374,0.832667,0.843218,0.750685
7,0.0173,0.066311,0.82967,0.876504,0.82967,0.839873,0.747872
8,0.0142,0.077847,0.80969,0.867453,0.80969,0.821896,0.72382
9,0.0116,0.080603,0.805694,0.868798,0.805694,0.818807,0.722146


✅ Best model saved to /content/drive/MyDrive/BERT_8_Results/proud-sweep-2/best_model


0,1
eval/accuracy,▁▁▃▆▇██▇▆██
eval/f1,▁▂▄▆▇██▇▇██
eval/loss,█▅▅▂▁▁▁▂▂▁▁
eval/mcc,▁▃▅▇███▇▇██
eval/precision,▁▅▆▇███▇███
eval/recall,▁▁▃▆▇██▇▆██
eval/runtime,▁▅▁▂▃▃▂▂▁▂█
eval/samples_per_second,█▄█▇▆▆▆▆█▇▁
eval/steps_per_second,█▄█▇▆▆▆▆█▇▁
eval_accuracy,▁

0,1
eval/accuracy,0.83267
eval/f1,0.84322
eval/loss,0.06317
eval/mcc,0.75069
eval/precision,0.87837
eval/recall,0.83267
eval/runtime,1.8505
eval/samples_per_second,1081.867
eval/steps_per_second,68.09
eval_accuracy,0.83267


[34m[1mwandb[0m: Agent Starting Run: 2t4xjpvf with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	dropout: 0.08015150902533692
[34m[1mwandb[0m: 	gradient_accumulation_steps: 2
[34m[1mwandb[0m: 	learning_rate: 4.0388407301960246e-05
[34m[1mwandb[0m: 	max_grad_norm: 2
[34m[1mwandb[0m: 	num_train_epochs: 12
[34m[1mwandb[0m: 	weight_decay: 0.029565793403874885


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total Steps: 1512, Warmup Steps (10%): 151




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc
1,0.2248,0.091074,0.80969,0.810888,0.80969,0.797351,0.657666
2,0.046,0.037027,0.899101,0.903985,0.899101,0.899081,0.822455
3,0.0154,0.028569,0.916583,0.917275,0.916583,0.915797,0.850888
4,0.0058,0.032676,0.915085,0.918926,0.915085,0.915135,0.850156
5,0.0021,0.034619,0.921079,0.92407,0.921079,0.921483,0.859962
6,0.0011,0.03382,0.92008,0.922417,0.92008,0.920947,0.860323


✅ Best model saved to /content/drive/MyDrive/BERT_8_Results/vivid-sweep-3/best_model


0,1
eval/accuracy,▁▇██████
eval/f1,▁▇██████
eval/loss,█▂▁▁▂▂▁▁
eval/mcc,▁▇██████
eval/precision,▁▇██████
eval/recall,▁▇██████
eval/runtime,▂▂▄▁▂▃▄█
eval/samples_per_second,▇▇▅█▇▆▅▁
eval/steps_per_second,▇▇▅█▇▆▅▁
eval_accuracy,▁

0,1
eval/accuracy,0.91658
eval/f1,0.9158
eval/loss,0.02857
eval/mcc,0.85089
eval/precision,0.91727
eval/recall,0.91658
eval/runtime,0.9616
eval/samples_per_second,2081.967
eval/steps_per_second,65.516
eval_accuracy,0.91658


[34m[1mwandb[0m: Agent Starting Run: b09ll822 with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	dropout: 0.1660609910653761
[34m[1mwandb[0m: 	gradient_accumulation_steps: 4
[34m[1mwandb[0m: 	learning_rate: 1.0278969730263126e-05
[34m[1mwandb[0m: 	max_grad_norm: 1
[34m[1mwandb[0m: 	num_train_epochs: 12
[34m[1mwandb[0m: 	weight_decay: 0.045682074484633475


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total Steps: 1512, Warmup Steps (10%): 151




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc
1,0.2559,0.18131,0.671828,0.491118,0.671828,0.567335,0.275253
2,0.1377,0.093835,0.820679,0.842273,0.820679,0.824023,0.700965
3,0.065,0.056566,0.858142,0.874227,0.858142,0.862458,0.764038
4,0.0367,0.038966,0.887113,0.894843,0.887113,0.888931,0.80694
5,0.0237,0.053113,0.860639,0.884823,0.860639,0.867004,0.777303
6,0.0168,0.037197,0.901099,0.909222,0.901099,0.90342,0.833355
7,0.0126,0.039666,0.9001,0.909713,0.9001,0.902749,0.832527
8,0.0104,0.038002,0.901099,0.910569,0.901099,0.903858,0.834307
9,0.0089,0.041331,0.894605,0.907331,0.894605,0.898125,0.826249


✅ Best model saved to /content/drive/MyDrive/BERT_8_Results/avid-sweep-4/best_model


0,1
eval/accuracy,▁▆▇█▇██████
eval/f1,▁▆▇█▇██████
eval/loss,█▄▂▁▂▁▁▁▁▁▁
eval/mcc,▁▆▇█▇██████
eval/precision,▁▇▇████████
eval/recall,▁▆▇█▇██████
eval/runtime,▂▁▁▂▁▁▁▂▂▃█
eval/samples_per_second,▇█▇▇███▇▇▆▁
eval/steps_per_second,▇█▇▇███▇▇▆▁
eval_accuracy,▁

0,1
eval/accuracy,0.9011
eval/f1,0.90342
eval/loss,0.0372
eval/mcc,0.83335
eval/precision,0.90922
eval/recall,0.9011
eval/runtime,1.9305
eval/samples_per_second,1037.024
eval/steps_per_second,65.267
eval_accuracy,0.9011


[34m[1mwandb[0m: Agent Starting Run: p8kls9ek with config:
[34m[1mwandb[0m: 	batch_size: 8
[34m[1mwandb[0m: 	dropout: 0.2181321096173653
[34m[1mwandb[0m: 	gradient_accumulation_steps: 4
[34m[1mwandb[0m: 	learning_rate: 1.226913286147192e-05
[34m[1mwandb[0m: 	max_grad_norm: 2
[34m[1mwandb[0m: 	num_train_epochs: 12
[34m[1mwandb[0m: 	weight_decay: 0.04150061853758102


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total Steps: 3012, Warmup Steps (10%): 301




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc
1,0.2617,0.15951,0.696803,0.647118,0.696803,0.653318,0.437507
2,0.1082,0.060782,0.843157,0.856729,0.843157,0.846213,0.734987
3,0.0464,0.073639,0.81968,0.860616,0.81968,0.82976,0.72545
4,0.0276,0.044853,0.87013,0.888262,0.87013,0.874858,0.787351
5,0.0192,0.044366,0.869131,0.891523,0.869131,0.874853,0.791146
6,0.0144,0.049767,0.861638,0.889684,0.861638,0.868764,0.783627
7,0.011,0.041973,0.879121,0.902598,0.879121,0.885097,0.806875
8,0.009,0.050563,0.867632,0.893285,0.867632,0.874133,0.791768
9,0.0072,0.050921,0.871129,0.897818,0.871129,0.877651,0.798382
10,0.0064,0.054472,0.865634,0.896255,0.865634,0.872955,0.791966


✅ Best model saved to /content/drive/MyDrive/BERT_8_Results/genial-sweep-5/best_model


0,1
eval/accuracy,▁▇▆██▇███▇██
eval/f1,▁▇▆█████████
eval/loss,█▂▃▁▁▁▁▂▂▂▁▁
eval/mcc,▁▇▆█████████
eval/precision,▁▇▇█████████
eval/recall,▁▇▆██▇███▇██
eval/runtime,▂▂▃▂▁▂▂▂▂▁▂█
eval/samples_per_second,▇▇▆▇█▆▇▆▇█▇▁
eval/steps_per_second,▇▇▆▇█▆▇▆▇█▇▁
eval_accuracy,▁

0,1
eval/accuracy,0.87912
eval/f1,0.8851
eval/loss,0.04197
eval/mcc,0.80688
eval/precision,0.9026
eval/recall,0.87912
eval/runtime,3.6967
eval/samples_per_second,541.563
eval/steps_per_second,67.898
eval_accuracy,0.87912


[34m[1mwandb[0m: Agent Starting Run: u3bk1ap4 with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	dropout: 0.10023212756042851
[34m[1mwandb[0m: 	gradient_accumulation_steps: 2
[34m[1mwandb[0m: 	learning_rate: 4.9754283621939545e-05
[34m[1mwandb[0m: 	max_grad_norm: 2
[34m[1mwandb[0m: 	num_train_epochs: 12
[34m[1mwandb[0m: 	weight_decay: 0.03167974103668743


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total Steps: 1512, Warmup Steps (10%): 151




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc
1,0.2388,0.120943,0.752248,0.751397,0.752248,0.726671,0.531064
2,0.0659,0.042832,0.87013,0.886991,0.87013,0.874403,0.780588
3,0.02,0.039578,0.902098,0.911813,0.902098,0.903112,0.827967
4,0.0083,0.033933,0.913087,0.919189,0.913087,0.914866,0.849899
5,0.0044,0.034839,0.921578,0.922091,0.921578,0.921435,0.860195
6,0.0019,0.035552,0.920579,0.922132,0.920579,0.920923,0.86054
7,0.0009,0.033458,0.924575,0.92606,0.924575,0.925148,0.867518
8,0.0004,0.03536,0.926573,0.928884,0.926573,0.927407,0.871705
9,0.0003,0.035394,0.927572,0.929256,0.927572,0.928204,0.872874
10,0.0002,0.035677,0.925574,0.927796,0.925574,0.926345,0.869944


✅ Best model saved to /content/drive/MyDrive/BERT_8_Results/fluent-sweep-6/best_model


0,1
eval/accuracy,▁▆▇▇████████
eval/f1,▁▆▇█████████
eval/loss,█▂▁▁▁▁▁▁▁▁▁▁
eval/mcc,▁▆▇█████████
eval/precision,▁▆▇█████████
eval/recall,▁▆▇▇████████
eval/runtime,▂▁▃▂▂▁▂▁▁▃▅█
eval/samples_per_second,▇█▅▆▇▇▇█▇▆▄▁
eval/steps_per_second,▇█▅▆▇▇▇█▇▆▄▁
eval_accuracy,▁

0,1
eval/accuracy,0.92458
eval/f1,0.92515
eval/loss,0.03346
eval/mcc,0.86752
eval/precision,0.92606
eval/recall,0.92458
eval/runtime,1.0408
eval/samples_per_second,1923.559
eval/steps_per_second,60.532
eval_accuracy,0.92458


[34m[1mwandb[0m: Agent Starting Run: dmsb3gyd with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	dropout: 0.09767994595326587
[34m[1mwandb[0m: 	gradient_accumulation_steps: 2
[34m[1mwandb[0m: 	learning_rate: 4.954342553674353e-05
[34m[1mwandb[0m: 	max_grad_norm: 2
[34m[1mwandb[0m: 	num_train_epochs: 12
[34m[1mwandb[0m: 	weight_decay: 0.017062719390024027


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total Steps: 1512, Warmup Steps (10%): 151




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc
1,0.24,0.133248,0.720779,0.730918,0.720779,0.719106,0.519827
2,0.0658,0.041433,0.877622,0.899252,0.877622,0.882531,0.793447
3,0.0195,0.042237,0.898102,0.907219,0.898102,0.898986,0.823182
4,0.0083,0.033561,0.918082,0.919981,0.918082,0.917666,0.854325
5,0.0031,0.042606,0.900599,0.910776,0.900599,0.903609,0.83106
6,0.0017,0.040594,0.919081,0.920823,0.919081,0.918569,0.856121
7,0.0011,0.037017,0.924575,0.926081,0.924575,0.92512,0.867288


✅ Best model saved to /content/drive/MyDrive/BERT_8_Results/swept-sweep-7/best_model


0,1
eval/accuracy,▁▆▇█▇████
eval/f1,▁▇▇█▇████
eval/loss,█▂▂▁▂▁▁▁▁
eval/mcc,▁▇▇█▇████
eval/precision,▁▇▇█▇████
eval/recall,▁▆▇█▇████
eval/runtime,▂▃▁▁▃▂▃▂█
eval/samples_per_second,▆▆██▅▇▆▆▁
eval/steps_per_second,▆▆██▅▇▆▆▁
eval_accuracy,▁

0,1
eval/accuracy,0.91808
eval/f1,0.91767
eval/loss,0.03356
eval/mcc,0.85433
eval/precision,0.91998
eval/recall,0.91808
eval/runtime,0.9843
eval/samples_per_second,2034.01
eval/steps_per_second,64.007
eval_accuracy,0.91808


[34m[1mwandb[0m: Agent Starting Run: on80t2wp with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	dropout: 0.06596443108581512
[34m[1mwandb[0m: 	gradient_accumulation_steps: 4
[34m[1mwandb[0m: 	learning_rate: 2.673225170708427e-05
[34m[1mwandb[0m: 	max_grad_norm: 2
[34m[1mwandb[0m: 	num_train_epochs: 12
[34m[1mwandb[0m: 	weight_decay: 0.017460583358440376


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total Steps: 756, Warmup Steps (10%): 75




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc
1,0.2459,0.166122,0.680819,0.618014,0.680819,0.636756,0.399995
2,0.1242,0.085929,0.814186,0.850335,0.814186,0.81817,0.692099
3,0.0456,0.043098,0.882118,0.891912,0.882118,0.883166,0.794789
4,0.0181,0.032193,0.917582,0.916618,0.917582,0.915985,0.851662
5,0.0084,0.031377,0.911089,0.914069,0.911089,0.911814,0.844416
6,0.0043,0.033108,0.911588,0.914782,0.911588,0.912504,0.845187
7,0.0024,0.031886,0.917083,0.91714,0.917083,0.916993,0.85293
8,0.0017,0.03317,0.914086,0.915895,0.914086,0.914702,0.849366


✅ Best model saved to /content/drive/MyDrive/BERT_8_Results/unique-sweep-8/best_model


0,1
eval/accuracy,▁▅▇███████
eval/f1,▁▆▇███████
eval/loss,█▄▂▁▁▁▁▁▁▁
eval/mcc,▁▆▇███████
eval/precision,▁▆▇███████
eval/recall,▁▅▇███████
eval/runtime,▂▂▁▂▁▂▂▂▂█
eval/samples_per_second,▇▇█▇█▇▇▇▇▁
eval/steps_per_second,▇▇█▇█▇▇▇▇▁
eval_accuracy,▁

0,1
eval/accuracy,0.91109
eval/f1,0.91181
eval/loss,0.03138
eval/mcc,0.84442
eval/precision,0.91407
eval/recall,0.91109
eval/runtime,1.0211
eval/samples_per_second,1960.594
eval/steps_per_second,61.697
eval_accuracy,0.91109


[34m[1mwandb[0m: Agent Starting Run: r1081sse with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	dropout: 0.08979420846445316
[34m[1mwandb[0m: 	gradient_accumulation_steps: 4
[34m[1mwandb[0m: 	learning_rate: 1.4852662152681313e-05
[34m[1mwandb[0m: 	max_grad_norm: 1
[34m[1mwandb[0m: 	num_train_epochs: 12
[34m[1mwandb[0m: 	weight_decay: 0.015477773972318542


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total Steps: 1512, Warmup Steps (10%): 151




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc
1,0.2547,0.153289,0.695305,0.648012,0.695305,0.639404,0.406083
2,0.1018,0.063042,0.836663,0.863563,0.836663,0.843052,0.72959
3,0.0348,0.039496,0.893606,0.898673,0.893606,0.892264,0.812717
4,0.016,0.031877,0.912587,0.914898,0.912587,0.912761,0.846519
5,0.0082,0.033678,0.907093,0.914318,0.907093,0.909188,0.84042
6,0.0047,0.029308,0.922577,0.92328,0.922577,0.922546,0.862956
7,0.0032,0.030321,0.916583,0.918645,0.916583,0.917329,0.853846
8,0.0021,0.031363,0.920579,0.922836,0.920579,0.921284,0.860958
9,0.0016,0.032898,0.919081,0.922427,0.919081,0.920112,0.859161


✅ Best model saved to /content/drive/MyDrive/BERT_8_Results/lemon-sweep-9/best_model


0,1
eval/accuracy,▁▅▇████████
eval/f1,▁▆▇████████
eval/loss,█▃▂▁▁▁▁▁▁▁▁
eval/mcc,▁▆▇████████
eval/precision,▁▆▇████████
eval/recall,▁▅▇████████
eval/runtime,▂▂▂▂▃▂▂▁▂▃█
eval/samples_per_second,▇▇▇▆▆▇▇█▇▆▁
eval/steps_per_second,▇▇▇▆▆▇▇█▇▆▁
eval_accuracy,▁

0,1
eval/accuracy,0.92258
eval/f1,0.92255
eval/loss,0.02931
eval/mcc,0.86296
eval/precision,0.92328
eval/recall,0.92258
eval/runtime,1.899
eval/samples_per_second,1054.253
eval/steps_per_second,66.352
eval_accuracy,0.92258


[34m[1mwandb[0m: Agent Starting Run: flaei46q with config:
[34m[1mwandb[0m: 	batch_size: 8
[34m[1mwandb[0m: 	dropout: 0.10735684270452696
[34m[1mwandb[0m: 	gradient_accumulation_steps: 4
[34m[1mwandb[0m: 	learning_rate: 1.636433108259683e-05
[34m[1mwandb[0m: 	max_grad_norm: 1
[34m[1mwandb[0m: 	num_train_epochs: 12
[34m[1mwandb[0m: 	weight_decay: 0.01730357622994756


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total Steps: 3012, Warmup Steps (10%): 301




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc
1,0.2352,0.113904,0.787712,0.768258,0.787712,0.766369,0.607743
2,0.063,0.037646,0.887113,0.896399,0.887113,0.888461,0.805618
3,0.022,0.041133,0.895604,0.908571,0.895604,0.898841,0.824658
4,0.0096,0.03135,0.915085,0.916368,0.915085,0.914996,0.850156
5,0.0049,0.048126,0.884116,0.900567,0.884116,0.888788,0.81098
6,0.0026,0.033336,0.919081,0.921984,0.919081,0.919971,0.859325
7,0.0018,0.032184,0.922577,0.923953,0.922577,0.922943,0.86399


✅ Best model saved to /content/drive/MyDrive/BERT_8_Results/grateful-sweep-10/best_model


0,1
eval/accuracy,▁▆▇█▆████
eval/f1,▁▆▇█▆████
eval/loss,█▂▂▁▂▁▁▁▁
eval/mcc,▁▆▇█▇████
eval/precision,▁▇▇█▇████
eval/recall,▁▆▇█▆████
eval/runtime,▁▃▂▂▃▃▂▂█
eval/samples_per_second,█▆▇▆▆▆▇▇▁
eval/steps_per_second,█▆▇▆▆▆▇▇▁
eval_accuracy,▁

0,1
eval/accuracy,0.91508
eval/f1,0.915
eval/loss,0.03135
eval/mcc,0.85016
eval/precision,0.91637
eval/recall,0.91508
eval/runtime,3.6908
eval/samples_per_second,542.425
eval/steps_per_second,68.006
eval_accuracy,0.91508


[34m[1mwandb[0m: Agent Starting Run: k7hdn97n with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	dropout: 0.1002012010554752
[34m[1mwandb[0m: 	gradient_accumulation_steps: 4
[34m[1mwandb[0m: 	learning_rate: 2.215125969913568e-05
[34m[1mwandb[0m: 	max_grad_norm: 1
[34m[1mwandb[0m: 	num_train_epochs: 12
[34m[1mwandb[0m: 	weight_decay: 0.012378069040828152


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total Steps: 756, Warmup Steps (10%): 75




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc
1,0.2553,0.170128,0.67033,0.615995,0.67033,0.614305,0.366393
2,0.1321,0.101449,0.793706,0.846356,0.793706,0.798407,0.668527
3,0.0558,0.05194,0.862138,0.879575,0.862138,0.865258,0.766485
4,0.0245,0.033461,0.914086,0.913902,0.914086,0.91186,0.844624
5,0.013,0.032652,0.900599,0.908805,0.900599,0.902729,0.828942
6,0.0074,0.032356,0.905095,0.908865,0.905095,0.90639,0.834709
7,0.0052,0.030722,0.917083,0.917723,0.917083,0.917196,0.853742
8,0.0036,0.032259,0.914585,0.917283,0.914585,0.915538,0.850808
9,0.0028,0.032884,0.912088,0.914282,0.912088,0.912835,0.846263
10,0.0025,0.03291,0.913087,0.915553,0.913087,0.913942,0.84848


✅ Best model saved to /content/drive/MyDrive/BERT_8_Results/hearty-sweep-11/best_model


0,1
eval/accuracy,▁▅▆█████████
eval/f1,▁▅▇█████████
eval/loss,█▅▂▁▁▁▁▁▁▁▁▁
eval/mcc,▁▅▇█████████
eval/precision,▁▆▇█████████
eval/recall,▁▅▆█████████
eval/runtime,▂▁▁▂▁▂▂▁▃▂▄█
eval/samples_per_second,▆█▇▇█▆▇█▆▇▄▁
eval/steps_per_second,▆█▇▇█▆▇█▆▇▄▁
eval_accuracy,▁

0,1
eval/accuracy,0.91708
eval/f1,0.9172
eval/loss,0.03072
eval/mcc,0.85374
eval/precision,0.91772
eval/recall,0.91708
eval/runtime,1.0192
eval/samples_per_second,1964.329
eval/steps_per_second,61.815
eval_accuracy,0.91708


[34m[1mwandb[0m: Agent Starting Run: e4qm4f48 with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	dropout: 0.0777199613726914
[34m[1mwandb[0m: 	gradient_accumulation_steps: 4
[34m[1mwandb[0m: 	learning_rate: 1.931379532875502e-05
[34m[1mwandb[0m: 	max_grad_norm: 1
[34m[1mwandb[0m: 	num_train_epochs: 12
[34m[1mwandb[0m: 	weight_decay: 0.016592798221360564


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total Steps: 1512, Warmup Steps (10%): 151




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc
1,0.2679,0.195758,0.635864,0.671906,0.635864,0.631358,0.426639
2,0.1089,0.056619,0.866633,0.876296,0.866633,0.869852,0.770129
3,0.0334,0.035747,0.902098,0.90524,0.902098,0.901711,0.826415
4,0.014,0.030716,0.913087,0.915303,0.913087,0.913338,0.846611
5,0.006,0.031131,0.921578,0.924451,0.921578,0.921439,0.860812
6,0.0034,0.03188,0.924076,0.923462,0.924076,0.922871,0.863368
7,0.002,0.032699,0.922577,0.92455,0.922577,0.922961,0.863941


✅ Best model saved to /content/drive/MyDrive/BERT_8_Results/balmy-sweep-12/best_model


0,1
eval/accuracy,▁▇▇██████
eval/f1,▁▇▇██████
eval/loss,█▂▁▁▁▁▁▁▁
eval/mcc,▁▆▇██████
eval/precision,▁▇▇██████
eval/recall,▁▇▇██████
eval/runtime,▁▁▂▁▁▃▃▃█
eval/samples_per_second,██▇█▇▅▆▆▁
eval/steps_per_second,██▇█▇▅▆▆▁
eval_accuracy,▁

0,1
eval/accuracy,0.91309
eval/f1,0.91334
eval/loss,0.03072
eval/mcc,0.84661
eval/precision,0.9153
eval/recall,0.91309
eval/runtime,1.9212
eval/samples_per_second,1042.049
eval/steps_per_second,65.583
eval_accuracy,0.91309


[34m[1mwandb[0m: Agent Starting Run: 7lr6lcsy with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	dropout: 0.07941534100677443
[34m[1mwandb[0m: 	gradient_accumulation_steps: 4
[34m[1mwandb[0m: 	learning_rate: 1.1594375452856098e-05
[34m[1mwandb[0m: 	max_grad_norm: 1
[34m[1mwandb[0m: 	num_train_epochs: 12
[34m[1mwandb[0m: 	weight_decay: 0.01847333554332322


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total Steps: 756, Warmup Steps (10%): 75




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc
1,0.2748,0.191002,0.647852,0.526809,0.647852,0.575776,0.290621
2,0.1569,0.130126,0.759241,0.752258,0.759241,0.747199,0.570963
3,0.0945,0.072499,0.843656,0.85247,0.843656,0.84313,0.72614
4,0.05,0.045486,0.894106,0.894619,0.894106,0.891058,0.80787
5,0.0289,0.038534,0.892108,0.899718,0.892108,0.894002,0.812724
6,0.0189,0.035545,0.8996,0.903748,0.8996,0.900871,0.825134
7,0.0138,0.031803,0.908591,0.908366,0.908591,0.908316,0.837785
8,0.0105,0.032947,0.903596,0.906548,0.903596,0.904598,0.831077
9,0.0088,0.032601,0.908092,0.91197,0.908092,0.909335,0.839501
10,0.0081,0.032729,0.906593,0.909636,0.906593,0.907728,0.837039


✅ Best model saved to /content/drive/MyDrive/BERT_8_Results/jolly-sweep-13/best_model


0,1
eval/accuracy,▁▄▆█████████
eval/f1,▁▅▇█████████
eval/loss,█▅▃▂▁▁▁▁▁▁▁▁
eval/mcc,▁▅▇█████████
eval/precision,▁▅▇█████████
eval/recall,▁▄▆█████████
eval/runtime,▂▃▂▃▂▄▁▃▄▄▂█
eval/samples_per_second,▇▆▇▆▇▅█▆▅▅▇▁
eval/steps_per_second,▇▆▇▆▇▅█▆▅▅▇▁
eval_accuracy,▁

0,1
eval/accuracy,0.90859
eval/f1,0.90832
eval/loss,0.0318
eval/mcc,0.83779
eval/precision,0.90837
eval/recall,0.90859
eval/runtime,0.9794
eval/samples_per_second,2044.104
eval/steps_per_second,64.325
eval_accuracy,0.90859


[34m[1mwandb[0m: Agent Starting Run: qidrltxe with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	dropout: 0.07868731301233534
[34m[1mwandb[0m: 	gradient_accumulation_steps: 4
[34m[1mwandb[0m: 	learning_rate: 1.3329572996100492e-05
[34m[1mwandb[0m: 	max_grad_norm: 1
[34m[1mwandb[0m: 	num_train_epochs: 12
[34m[1mwandb[0m: 	weight_decay: 0.014204397139517185


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total Steps: 1512, Warmup Steps (10%): 151




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc
1,0.2822,0.182004,0.681319,0.525663,0.681319,0.592213,0.325605
2,0.125,0.075377,0.84016,0.849824,0.84016,0.841036,0.721459
3,0.0484,0.03855,0.897103,0.899291,0.897103,0.897887,0.819251
4,0.0211,0.036678,0.902098,0.907947,0.902098,0.90383,0.831513
5,0.0102,0.039182,0.896104,0.90802,0.896104,0.899437,0.825849
6,0.0066,0.033597,0.917582,0.917346,0.917582,0.91653,0.852155
7,0.0034,0.034334,0.920579,0.922506,0.920579,0.920842,0.860184
8,0.0025,0.034591,0.921079,0.923893,0.921079,0.921833,0.862023
9,0.0019,0.034207,0.924076,0.926436,0.924076,0.924585,0.866548


✅ Best model saved to /content/drive/MyDrive/BERT_8_Results/super-sweep-14/best_model


0,1
eval/accuracy,▁▆▇▇▇██████
eval/f1,▁▆▇█▇██████
eval/loss,█▃▁▁▁▁▁▁▁▁▁
eval/mcc,▁▆▇█▇██████
eval/precision,▁▇█████████
eval/recall,▁▆▇▇▇██████
eval/runtime,▁▁▁▃▃▂▃▂▂▃█
eval/samples_per_second,███▆▆▆▆▇▆▆▁
eval/steps_per_second,███▆▆▆▆▇▆▆▁
eval_accuracy,▁

0,1
eval/accuracy,0.91758
eval/f1,0.91653
eval/loss,0.0336
eval/mcc,0.85216
eval/precision,0.91735
eval/recall,0.91758
eval/runtime,2.1634
eval/samples_per_second,925.394
eval/steps_per_second,58.242
eval_accuracy,0.91758


[34m[1mwandb[0m: Agent Starting Run: tqyjpqnk with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	dropout: 0.11537765382250896
[34m[1mwandb[0m: 	gradient_accumulation_steps: 4
[34m[1mwandb[0m: 	learning_rate: 1.9271320761899984e-05
[34m[1mwandb[0m: 	max_grad_norm: 1
[34m[1mwandb[0m: 	num_train_epochs: 12
[34m[1mwandb[0m: 	weight_decay: 0.016541142584019296


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total Steps: 1512, Warmup Steps (10%): 151




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc
1,0.2603,0.157313,0.6998,0.685446,0.6998,0.670572,0.466078
2,0.0888,0.045663,0.874126,0.886532,0.874126,0.877096,0.78699
3,0.0283,0.034644,0.907093,0.908777,0.907093,0.906043,0.834905
4,0.0148,0.030043,0.910589,0.911847,0.910589,0.909909,0.841607
5,0.007,0.037814,0.898601,0.910303,0.898601,0.90195,0.830438
6,0.0043,0.030344,0.921578,0.922996,0.921578,0.921803,0.862098
7,0.0029,0.030669,0.924575,0.927477,0.924575,0.925332,0.868419


✅ Best model saved to /content/drive/MyDrive/BERT_8_Results/sparkling-sweep-15/best_model


0,1
eval/accuracy,▁▆▇█▇████
eval/f1,▁▇▇█▇████
eval/loss,█▂▁▁▁▁▁▁▁
eval/mcc,▁▇▇█▇████
eval/precision,▁▇▇██████
eval/recall,▁▆▇█▇████
eval/runtime,▁▁▁▁▁▁▁▁█
eval/samples_per_second,███████▇▁
eval/steps_per_second,███████▇▁
eval_accuracy,▁

0,1
eval/accuracy,0.91059
eval/f1,0.90991
eval/loss,0.03004
eval/mcc,0.84161
eval/precision,0.91185
eval/recall,0.91059
eval/runtime,2.198
eval/samples_per_second,910.81
eval/steps_per_second,57.324
eval_accuracy,0.91059


[34m[1mwandb[0m: Agent Starting Run: gpgnrlv5 with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	dropout: 0.09615175622097138
[34m[1mwandb[0m: 	gradient_accumulation_steps: 4
[34m[1mwandb[0m: 	learning_rate: 1.8868395610289017e-05
[34m[1mwandb[0m: 	max_grad_norm: 2
[34m[1mwandb[0m: 	num_train_epochs: 12
[34m[1mwandb[0m: 	weight_decay: 0.023927330201964452


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total Steps: 756, Warmup Steps (10%): 75




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc
1,0.2633,0.179293,0.671329,0.531498,0.671329,0.58323,0.314416
2,0.1461,0.110108,0.778721,0.794302,0.778721,0.769644,0.628482
3,0.0692,0.056429,0.856144,0.870231,0.856144,0.858383,0.752417
4,0.0311,0.03568,0.913087,0.913198,0.913087,0.9116,0.843306
5,0.0169,0.035039,0.895604,0.903347,0.895604,0.897911,0.821682
6,0.0102,0.032827,0.904595,0.908431,0.904595,0.90581,0.833668
7,0.0076,0.031172,0.914585,0.915046,0.914585,0.914711,0.849065
8,0.005,0.032688,0.911089,0.91425,0.911089,0.9122,0.845266
9,0.0041,0.032796,0.914086,0.91678,0.914086,0.914954,0.850142
10,0.0035,0.033354,0.911089,0.914769,0.911089,0.912375,0.845976


✅ Best model saved to /content/drive/MyDrive/BERT_8_Results/upbeat-sweep-16/best_model


0,1
eval/accuracy,▁▄▆█▇███████
eval/f1,▁▅▇█████████
eval/loss,█▅▂▁▁▁▁▁▁▁▁▁
eval/mcc,▁▅▇█████████
eval/precision,▁▆▇█████████
eval/recall,▁▄▆█▇███████
eval/runtime,▄▃▂▁▃▁▂▁▁▁▃█
eval/samples_per_second,▅▆▇█▆█▇███▆▁
eval/steps_per_second,▅▆▇█▆█▇███▆▁
eval_accuracy,▁

0,1
eval/accuracy,0.91459
eval/f1,0.91471
eval/loss,0.03117
eval/mcc,0.84907
eval/precision,0.91505
eval/recall,0.91459
eval/runtime,1.0739
eval/samples_per_second,1864.317
eval/steps_per_second,58.667
eval_accuracy,0.91459


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: mgt79n2n with config:
[34m[1mwandb[0m: 	batch_size: 8
[34m[1mwandb[0m: 	dropout: 0.0945738635937638
[34m[1mwandb[0m: 	gradient_accumulation_steps: 4
[34m[1mwandb[0m: 	learning_rate: 1.557598742283547e-05
[34m[1mwandb[0m: 	max_grad_norm: 1
[34m[1mwandb[0m: 	num_train_epochs: 12
[34m[1mwandb[0m: 	weight_decay: 0.029960777217273547


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total Steps: 3012, Warmup Steps (10%): 301




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc
1,0.2443,0.125698,0.74026,0.733785,0.74026,0.70368,0.520519
2,0.0739,0.044371,0.876124,0.891085,0.876124,0.880169,0.792277
3,0.0243,0.037031,0.904595,0.911361,0.904595,0.905697,0.833653
4,0.0103,0.031521,0.917582,0.918348,0.917582,0.916653,0.853334
5,0.0051,0.041999,0.898601,0.908103,0.898601,0.901273,0.829581
6,0.0022,0.036204,0.924575,0.923436,0.924575,0.923073,0.864466
7,0.0017,0.034949,0.923077,0.924142,0.923077,0.923163,0.864151


✅ Best model saved to /content/drive/MyDrive/BERT_8_Results/desert-sweep-17/best_model


0,1
eval/accuracy,▁▆▇█▇████
eval/f1,▁▇▇█▇████
eval/loss,█▂▁▁▂▁▁▁▁
eval/mcc,▁▇▇█▇████
eval/precision,▁▇██▇████
eval/recall,▁▆▇█▇████
eval/runtime,▁▁▂▂▁▂▂▄█
eval/samples_per_second,█▇▇▇█▇▇▅▁
eval/steps_per_second,█▇▇▇█▇▇▅▁
eval_accuracy,▁

0,1
eval/accuracy,0.91758
eval/f1,0.91665
eval/loss,0.03152
eval/mcc,0.85333
eval/precision,0.91835
eval/recall,0.91758
eval/runtime,3.7971
eval/samples_per_second,527.238
eval/steps_per_second,66.102
eval_accuracy,0.91758


[34m[1mwandb[0m: Agent Starting Run: rv98tazt with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	dropout: 0.06988870437736563
[34m[1mwandb[0m: 	gradient_accumulation_steps: 4
[34m[1mwandb[0m: 	learning_rate: 2.0300075559258756e-05
[34m[1mwandb[0m: 	max_grad_norm: 1
[34m[1mwandb[0m: 	num_train_epochs: 12
[34m[1mwandb[0m: 	weight_decay: 0.0272158686010677


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total Steps: 756, Warmup Steps (10%): 75




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc
1,0.2528,0.166883,0.68032,0.587864,0.68032,0.622067,0.379282
2,0.1313,0.096265,0.807692,0.842638,0.807692,0.808872,0.678773
3,0.0546,0.048431,0.880619,0.88904,0.880619,0.882539,0.792282
4,0.0238,0.034713,0.913087,0.912145,0.913087,0.910611,0.842415
5,0.0118,0.032211,0.905594,0.908843,0.905594,0.9066,0.835178
6,0.0066,0.032787,0.903596,0.907924,0.903596,0.905056,0.832336
7,0.0041,0.031617,0.915584,0.915373,0.915584,0.915399,0.850137
8,0.0028,0.032217,0.911089,0.91209,0.911089,0.911476,0.843027
9,0.0024,0.032763,0.915085,0.916223,0.915085,0.915446,0.850111
10,0.002,0.03311,0.912088,0.913623,0.912088,0.912678,0.845591


✅ Best model saved to /content/drive/MyDrive/BERT_8_Results/logical-sweep-18/best_model


0,1
eval/accuracy,▁▅▇█████████
eval/f1,▁▅▇█████████
eval/loss,█▄▂▁▁▁▁▁▁▁▁▁
eval/mcc,▁▅▇█████████
eval/precision,▁▆▇█████████
eval/recall,▁▅▇█████████
eval/runtime,▂▂▂▂▂▃▂▃▂▂▁█
eval/samples_per_second,▇▇▇▇▇▅▇▆▇▆█▁
eval/steps_per_second,▇▇▇▇▇▅▇▆▇▆█▁
eval_accuracy,▁

0,1
eval/accuracy,0.91558
eval/f1,0.9154
eval/loss,0.03162
eval/mcc,0.85014
eval/precision,0.91537
eval/recall,0.91558
eval/runtime,0.9668
eval/samples_per_second,2070.781
eval/steps_per_second,65.164
eval_accuracy,0.91558


[34m[1mwandb[0m: Agent Starting Run: lhbwadrt with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	dropout: 0.06475614867026021
[34m[1mwandb[0m: 	gradient_accumulation_steps: 2
[34m[1mwandb[0m: 	learning_rate: 2.9714798637752895e-05
[34m[1mwandb[0m: 	max_grad_norm: 2
[34m[1mwandb[0m: 	num_train_epochs: 12
[34m[1mwandb[0m: 	weight_decay: 0.020532736091327948


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total Steps: 1512, Warmup Steps (10%): 151




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc
1,0.2567,0.141278,0.726773,0.699542,0.726773,0.683005,0.487075
2,0.0858,0.046858,0.862637,0.884136,0.862637,0.868574,0.774124
3,0.024,0.033878,0.905594,0.90833,0.905594,0.904169,0.830954
4,0.0079,0.031404,0.917582,0.920231,0.917582,0.918103,0.854577
5,0.0032,0.032226,0.924575,0.925114,0.924575,0.924589,0.866155
6,0.0018,0.033072,0.925075,0.924713,0.925075,0.923785,0.865655
7,0.0011,0.031009,0.934066,0.933651,0.934066,0.933475,0.88197
8,0.0006,0.032599,0.925075,0.927774,0.925075,0.925893,0.868688
9,0.0004,0.03315,0.923576,0.926374,0.923576,0.924279,0.865831
10,0.0003,0.032688,0.928571,0.930262,0.928571,0.929022,0.873983


✅ Best model saved to /content/drive/MyDrive/BERT_8_Results/rosy-sweep-19/best_model


0,1
eval/accuracy,▁▆▇▇████████
eval/f1,▁▆▇█████████
eval/loss,█▂▁▁▁▁▁▁▁▁▁▁
eval/mcc,▁▆▇█████████
eval/precision,▁▇▇█████████
eval/recall,▁▆▇▇████████
eval/runtime,▂▃▃▃▄▄▄▁▂▃▂█
eval/samples_per_second,▇▆▆▆▅▄▅█▇▆▇▁
eval/steps_per_second,▇▆▆▆▅▄▅█▇▆▇▁
eval_accuracy,▁

0,1
eval/accuracy,0.93407
eval/f1,0.93348
eval/loss,0.03101
eval/mcc,0.88197
eval/precision,0.93365
eval/recall,0.93407
eval/runtime,0.9698
eval/samples_per_second,2064.275
eval/steps_per_second,64.96
eval_accuracy,0.93407


[34m[1mwandb[0m: Agent Starting Run: y8z66lz9 with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	dropout: 0.14616350630503
[34m[1mwandb[0m: 	gradient_accumulation_steps: 4
[34m[1mwandb[0m: 	learning_rate: 1.3841779179479247e-05
[34m[1mwandb[0m: 	max_grad_norm: 1
[34m[1mwandb[0m: 	num_train_epochs: 12
[34m[1mwandb[0m: 	weight_decay: 0.02748111241894356


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total Steps: 1512, Warmup Steps (10%): 151




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc
1,0.2855,0.190434,0.676324,0.527204,0.676324,0.571206,0.282328
2,0.1325,0.08815,0.816184,0.845032,0.816184,0.821628,0.69294
3,0.0589,0.048054,0.870629,0.883675,0.870629,0.875106,0.783344
4,0.0292,0.035923,0.897602,0.907538,0.897602,0.899632,0.825007
5,0.0178,0.049623,0.860639,0.891818,0.860639,0.868385,0.781429
6,0.0111,0.034735,0.905095,0.915849,0.905095,0.908097,0.839975
7,0.0078,0.033522,0.913586,0.91765,0.913586,0.914586,0.850097
8,0.006,0.039928,0.904096,0.914302,0.904096,0.906871,0.8391
9,0.0049,0.038054,0.908591,0.918491,0.908591,0.911138,0.84574
10,0.0041,0.038866,0.907093,0.916634,0.907093,0.909661,0.843488


✅ Best model saved to /content/drive/MyDrive/BERT_8_Results/stilted-sweep-20/best_model


0,1
eval/accuracy,▁▅▇█▆███████
eval/f1,▁▆▇█▇███████
eval/loss,█▃▂▁▂▁▁▁▁▁▁▁
eval/mcc,▁▆▇█▇███████
eval/precision,▁▇▇█████████
eval/recall,▁▅▇█▆███████
eval/runtime,▁▂▂▂▁▂▃▂▂▁▂█
eval/samples_per_second,█▇▆▇█▇▆▇▇█▇▁
eval/steps_per_second,█▇▆▇█▇▆▇▇█▇▁
eval_accuracy,▁

0,1
eval/accuracy,0.91359
eval/f1,0.91459
eval/loss,0.03352
eval/mcc,0.8501
eval/precision,0.91765
eval/recall,0.91359
eval/runtime,1.916
eval/samples_per_second,1044.892
eval/steps_per_second,65.762
eval_accuracy,0.91359


[34m[1mwandb[0m: Agent Starting Run: s30u4vke with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	dropout: 0.1034387037541841
[34m[1mwandb[0m: 	gradient_accumulation_steps: 4
[34m[1mwandb[0m: 	learning_rate: 4.031975859107902e-05
[34m[1mwandb[0m: 	max_grad_norm: 2
[34m[1mwandb[0m: 	num_train_epochs: 12
[34m[1mwandb[0m: 	weight_decay: 0.021479709725969433


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total Steps: 756, Warmup Steps (10%): 75




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc
1,0.282,0.198397,0.656344,0.573921,0.656344,0.553232,0.211904
2,0.1338,0.089032,0.786713,0.830848,0.786713,0.796474,0.657293
3,0.0464,0.05832,0.847652,0.87498,0.847652,0.856268,0.750778
4,0.0177,0.029516,0.916084,0.916863,0.916084,0.914571,0.849726
5,0.0086,0.037052,0.896104,0.910955,0.896104,0.900386,0.82776
6,0.0046,0.034954,0.91009,0.914934,0.91009,0.911616,0.844539
7,0.0026,0.032618,0.917582,0.92062,0.917582,0.918645,0.856686


✅ Best model saved to /content/drive/MyDrive/BERT_8_Results/snowy-sweep-21/best_model


0,1
eval/accuracy,▁▄▆█▇████
eval/f1,▁▆▇██████
eval/loss,█▃▂▁▁▁▁▁▁
eval/mcc,▁▆▇██████
eval/precision,▁▆▇██████
eval/recall,▁▄▆█▇████
eval/runtime,▅▃▃▂▃▅▁▁█
eval/samples_per_second,▄▆▆▆▆▃██▁
eval/steps_per_second,▄▆▆▆▆▃██▁
eval_accuracy,▁

0,1
eval/accuracy,0.91608
eval/f1,0.91457
eval/loss,0.02952
eval/mcc,0.84973
eval/precision,0.91686
eval/recall,0.91608
eval/runtime,0.9676
eval/samples_per_second,2069.098
eval/steps_per_second,65.111
eval_accuracy,0.91608


[34m[1mwandb[0m: Agent Starting Run: se7s8jwa with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	dropout: 0.05297402711625454
[34m[1mwandb[0m: 	gradient_accumulation_steps: 2
[34m[1mwandb[0m: 	learning_rate: 3.6645358497319774e-05
[34m[1mwandb[0m: 	max_grad_norm: 2
[34m[1mwandb[0m: 	num_train_epochs: 12
[34m[1mwandb[0m: 	weight_decay: 0.02394420900794704


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total Steps: 1512, Warmup Steps (10%): 151




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc
1,0.2059,0.090774,0.81968,0.795754,0.81968,0.798575,0.666006
2,0.048,0.03237,0.905594,0.906091,0.905594,0.904366,0.831161
3,0.0139,0.033991,0.907592,0.915466,0.907592,0.908017,0.837626
4,0.0051,0.027954,0.924076,0.923244,0.924076,0.923099,0.8636
5,0.0023,0.030886,0.925075,0.924432,0.925075,0.92421,0.865175
6,0.0008,0.030131,0.927073,0.927195,0.927073,0.926909,0.870194
7,0.0004,0.030892,0.925075,0.924559,0.925075,0.924596,0.866209


✅ Best model saved to /content/drive/MyDrive/BERT_8_Results/helpful-sweep-22/best_model


0,1
eval/accuracy,▁▇▇██████
eval/f1,▁▇▇██████
eval/loss,█▁▂▁▁▁▁▁▁
eval/mcc,▁▇▇██████
eval/precision,▁▇▇██████
eval/recall,▁▇▇██████
eval/runtime,▂▁▃▁▂▂▂▃█
eval/samples_per_second,▇█▆█▇▆▇▆▁
eval/steps_per_second,▇█▆█▇▆▇▆▁
eval_accuracy,▁

0,1
eval/accuracy,0.92408
eval/f1,0.9231
eval/loss,0.02795
eval/mcc,0.8636
eval/precision,0.92324
eval/recall,0.92408
eval/runtime,1.0355
eval/samples_per_second,1933.394
eval/steps_per_second,60.841
eval_accuracy,0.92408


[34m[1mwandb[0m: Agent Starting Run: upe8iyqg with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	dropout: 0.0781574559193419
[34m[1mwandb[0m: 	gradient_accumulation_steps: 4
[34m[1mwandb[0m: 	learning_rate: 3.549230386605554e-05
[34m[1mwandb[0m: 	max_grad_norm: 1
[34m[1mwandb[0m: 	num_train_epochs: 12
[34m[1mwandb[0m: 	weight_decay: 0.01672352636719021


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total Steps: 756, Warmup Steps (10%): 75




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc
1,0.2386,0.150986,0.706294,0.62887,0.706294,0.647712,0.41871
2,0.0922,0.052856,0.854146,0.874851,0.854146,0.858744,0.758624
3,0.0293,0.03303,0.910589,0.913134,0.910589,0.909579,0.840102
4,0.012,0.030766,0.919081,0.919667,0.919081,0.916565,0.853796
5,0.0057,0.029794,0.918082,0.923202,0.918082,0.919496,0.857631
6,0.0028,0.0322,0.921578,0.922894,0.921578,0.921866,0.861662
7,0.0015,0.03241,0.924076,0.925227,0.924076,0.924228,0.865676
8,0.0008,0.033705,0.91958,0.922339,0.91958,0.920625,0.8599


✅ Best model saved to /content/drive/MyDrive/BERT_8_Results/giddy-sweep-23/best_model


0,1
eval/accuracy,▁▆████████
eval/f1,▁▆████████
eval/loss,█▂▁▁▁▁▁▁▁▁
eval/mcc,▁▆████████
eval/precision,▁▇████████
eval/recall,▁▆████████
eval/runtime,▃▃▂▂▃▂▂▁▄█
eval/samples_per_second,▅▆▇▇▅▇▇█▅▁
eval/steps_per_second,▅▆▇▇▅▇▇█▅▁
eval_accuracy,▁

0,1
eval/accuracy,0.91808
eval/f1,0.9195
eval/loss,0.02979
eval/mcc,0.85763
eval/precision,0.9232
eval/recall,0.91808
eval/runtime,1.0175
eval/samples_per_second,1967.608
eval/steps_per_second,61.918
eval_accuracy,0.91808


[34m[1mwandb[0m: Agent Starting Run: ke597gtb with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	dropout: 0.0692508584088774
[34m[1mwandb[0m: 	gradient_accumulation_steps: 4
[34m[1mwandb[0m: 	learning_rate: 4.521523068570861e-05
[34m[1mwandb[0m: 	max_grad_norm: 2
[34m[1mwandb[0m: 	num_train_epochs: 12
[34m[1mwandb[0m: 	weight_decay: 0.017258985383454616


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total Steps: 756, Warmup Steps (10%): 75




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc
1,0.253,0.165434,0.672827,0.6466,0.672827,0.631361,0.408119
2,0.0907,0.053654,0.846154,0.890071,0.846154,0.857346,0.759595
3,0.0261,0.034985,0.909091,0.912225,0.909091,0.907999,0.838305
4,0.0088,0.033353,0.920579,0.919122,0.920579,0.919075,0.856775
5,0.0038,0.034758,0.914086,0.920419,0.914086,0.915899,0.851557
6,0.0014,0.035203,0.916084,0.919135,0.916084,0.917067,0.853219
7,0.0009,0.036115,0.918082,0.921988,0.918082,0.919077,0.856794


✅ Best model saved to /content/drive/MyDrive/BERT_8_Results/vocal-sweep-24/best_model


0,1
eval/accuracy,▁▆███████
eval/f1,▁▆███████
eval/loss,█▂▁▁▁▁▁▁▁
eval/mcc,▁▆███████
eval/precision,▁▇███████
eval/recall,▁▆███████
eval/runtime,▂▁▃▂▂▁▁▂█
eval/samples_per_second,▆█▆▇▇█▇▆▁
eval/steps_per_second,▆█▆▇▇█▇▆▁
eval_accuracy,▁

0,1
eval/accuracy,0.92058
eval/f1,0.91907
eval/loss,0.03335
eval/mcc,0.85677
eval/precision,0.91912
eval/recall,0.92058
eval/runtime,1.0647
eval/samples_per_second,1880.295
eval/steps_per_second,59.17
eval_accuracy,0.92058


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: vxi8djhc with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	dropout: 0.05447772881056161
[34m[1mwandb[0m: 	gradient_accumulation_steps: 4
[34m[1mwandb[0m: 	learning_rate: 4.2548179081638306e-05
[34m[1mwandb[0m: 	max_grad_norm: 2
[34m[1mwandb[0m: 	num_train_epochs: 12
[34m[1mwandb[0m: 	weight_decay: 0.02593966059115587


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total Steps: 756, Warmup Steps (10%): 75




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc
1,0.2337,0.148874,0.706793,0.661273,0.706793,0.656537,0.432699
2,0.0867,0.050618,0.848651,0.87132,0.848651,0.853189,0.749949
3,0.026,0.033501,0.912587,0.913084,0.912587,0.910595,0.842184
4,0.0094,0.029491,0.921079,0.920924,0.921079,0.919899,0.857891
5,0.0039,0.029944,0.918581,0.920598,0.918581,0.91908,0.856771
6,0.0016,0.032239,0.92008,0.91998,0.92008,0.91982,0.857773
7,0.0009,0.032764,0.917582,0.918239,0.917582,0.917513,0.853583


✅ Best model saved to /content/drive/MyDrive/BERT_8_Results/rare-sweep-25/best_model


0,1
eval/accuracy,▁▆███████
eval/f1,▁▆███████
eval/loss,█▂▁▁▁▁▁▁▁
eval/mcc,▁▆███████
eval/precision,▁▇███████
eval/recall,▁▆███████
eval/runtime,▂▁▃▂▂▂▁▂█
eval/samples_per_second,▇▇▆▇▇▇█▇▁
eval/steps_per_second,▇▇▆▇▇▇█▇▁
eval_accuracy,▁

0,1
eval/accuracy,0.92108
eval/f1,0.9199
eval/loss,0.02949
eval/mcc,0.85789
eval/precision,0.92092
eval/recall,0.92108
eval/runtime,1.0099
eval/samples_per_second,1982.399
eval/steps_per_second,62.383
eval_accuracy,0.92108


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Sweep Agent: Exiting.


In [13]:
#from transformers import BertForSequenceClassification
# Load best model (for example from Run 3)
#best_model_path = "/content/drive/MyDrive/BERT_8_Results/sweep_run_3/best_model"
#model = BertForSequenceClassification.from_pretrained(best_model_path)
#print("✅ Model loaded successfully!")


"""Each best_model/ directory contains the following files:
config.json (Model configuration)
pytorch_model.bin (Model weights)
tokenizer_config.json and vocab.txt (Tokenizer)
training_args.bin (Training arguments)
config.json (Model configuration)
pytorch_model.bin (Model weights)
tokenizer_config.json and vocab.txt (Tokenizer)
training_args.bin (Training arguments)"""

'Each best_model/ directory contains the following files:\nconfig.json (Model configuration)\npytorch_model.bin (Model weights)\ntokenizer_config.json and vocab.txt (Tokenizer)\ntraining_args.bin (Training arguments)\nconfig.json (Model configuration)\npytorch_model.bin (Model weights)\ntokenizer_config.json and vocab.txt (Tokenizer)\ntraining_args.bin (Training arguments)'