In [None]:
# Install the extra runtime dependency for this notebook.
# NOTE(review): presumably needed by the `datasets` audio decoding used further down — confirm,
# and consider pinning a version so a fresh runtime reproduces the same environment.
!pip install torchcodec



In [None]:
import torch
from accelerate import Accelerator
from accelerate.logging import get_logger
from datasets import load_from_disk, DatasetDict
from torch.utils.data.dataloader import DataLoader
from tqdm.auto import tqdm
from dataclasses import dataclass
from typing import Optional, Union
import logging
import datasets
import math

import transformers
from transformers import (
    SchedulerType,
    Wav2Vec2Config,
    Wav2Vec2FeatureExtractor,
    Wav2Vec2ForPreTraining,
    get_scheduler,
    is_wandb_available,
    set_seed,
)
from transformers.models.wav2vec2.modeling_wav2vec2 import _compute_mask_indices, _sample_negative_indices


In [None]:
# Mount Google Drive (Colab only); the datasets loaded later are read from
# paths under /content/drive.
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# --- Run configuration -------------------------------------------------------
# Durations are in seconds; they are converted to sample counts with the
# feature extractor's sampling rate before use.
MAX_DURATION = 10.0  # longer clips are truncated to this length in prepare_dataset
MIN_DURATION = 5.0  # clips whose length is not greater than this are filtered out
GRADIENT_CHECKPOINTING = True  # calls model.gradient_checkpointing_enable() when True
# None -> fall back to the value stored in the model config.
MASK_TIME_PROB = None
MASK_TIME_LENGTH = None
TRAIN_BATCH_SIZE = 16
VAL_BATCH_SIZE = 16
LEARNING_RATE = 1e-4 #5e-5
# AdamW hyper-parameters.
ADAM_BETA1 = 0.9
ADAM_BETA2 = 0.98
ADAM_EPSILON = 1e-8
GRADIENT_ACCUMULATION_STEPS = 4  # micro-batches per optimizer step
# None -> derived as (update steps per epoch) * NUM_TRAIN_EPOCHS.
MAX_TRAINING_STEPS = None
NUM_TRAIN_EPOCHS = 50
LR_SCHEDULER_TYPE = "linear"  # passed as `name` to transformers.get_scheduler
NUM_WARMUP_STEPS = 100 #0
# Gumbel temperature schedule used in the training loop:
#   temp = max(MAX_GUMBEL_TEMPERATURE * GUMBEL_TEMPERATURE ** completed_steps, MIN_GUMBEL_TEMPERATURE)
MAX_GUMBEL_TEMPERATURE = 2.0
MIN_GUMBEL_TEMPERATURE = 1.0 #0.5
GUMBEL_TEMPERATURE = 0.999999 #0.999995
# Both intervals are multiplied by GRADIENT_ACCUMULATION_STEPS and compared
# against the within-epoch batch index in the training loop.
LOGGING_STEPS = 10
SAVING_STEPS = 500
OUTPUT_DIR = "/outputs"  # target of save_pretrained; None disables saving
INTER_CB_SIMILARITY_WEIGHT = 100  # weight of the inter-codebook similarity term added to the loss

In [None]:
# Accelerate-aware logger; basicConfig(force=True) replaces any handlers that
# were already installed on the root logger so INFO messages are shown.
logger = get_logger(__name__)
logging.basicConfig(level=logging.INFO, force=True)

In [None]:
@dataclass
class DataCollatorForWav2Vec2Pretraining:
  """Pad a list of examples and attach the masking inputs for pre-training.

  For every batch the collator:
    1. pads the examples with ``feature_extractor.pad``,
    2. derives the feature-level sequence length from the padded waveform
       length via ``model._get_feat_extract_output_lengths``,
    3. samples ``mask_time_indices`` with ``_compute_mask_indices``,
    4. samples ``sampled_negative_indices`` with ``_sample_negative_indices``.

  Attributes:
    model: model whose length/attention-mask helpers and
      ``config.num_negatives`` are used.
    feature_extractor: used only for padding.
    padding: padding strategy forwarded to ``feature_extractor.pad``.
    pad_to_multiple_of: optional multiple forwarded to ``feature_extractor.pad``.
    mask_time_prob: forwarded to ``_compute_mask_indices``.
    mask_time_length: forwarded to ``_compute_mask_indices``.
  """

  model: Wav2Vec2ForPreTraining
  feature_extractor: Wav2Vec2FeatureExtractor
  padding: str = "longest"
  pad_to_multiple_of: Optional[int] = None
  mask_time_prob: float = 0.65
  mask_time_length: int = 10

  def __call__(self, features: list[dict[str, Union[list[int], torch.Tensor]]]) -> dict[str, torch.Tensor]:
    """Collate ``features`` into a padded batch with masking tensors.

    Args:
      features: examples accepted by ``feature_extractor.pad``.

    Returns:
      The padded batch with ``mask_time_indices`` and
      ``sampled_negative_indices`` added, plus ``sub_attention_mask`` when the
      padded batch carries an ``attention_mask``.
    """
    batch = self.feature_extractor.pad(
        features,
        padding=self.padding,
        pad_to_multiple_of=self.pad_to_multiple_of,
        return_tensors="pt",
    )

    input_values = batch["input_values"]
    device = input_values.device
    batch_size = input_values.shape[0]

    # Length of the sequence after the convolutional feature encoder.
    mask_indices_seq_length = int(
        self.model._get_feat_extract_output_lengths(input_values.shape[-1])
    )

    # Feature-level attention mask, so masking can take padding into account.
    if batch.get("attention_mask") is not None:
      batch["sub_attention_mask"] = self.model._get_feature_vector_attention_mask(
          mask_indices_seq_length, batch["attention_mask"]
      )

    features_shape = (batch_size, mask_indices_seq_length)

    # Sample randomly masked indices
    mask_time_indices = _compute_mask_indices(
        features_shape,
        self.mask_time_prob,
        self.mask_time_length,
        attention_mask=batch.get("sub_attention_mask"),
    )

    # Sample negative indices
    sampled_negative_indices = _sample_negative_indices(
        features_shape,
        self.model.config.num_negatives,
        mask_time_indices=mask_time_indices,
    )

    batch["mask_time_indices"] = torch.tensor(mask_time_indices, dtype=torch.long, device=device)
    batch["sampled_negative_indices"] = torch.tensor(sampled_negative_indices, dtype=torch.long, device=device)

    return batch

In [None]:
def multiply_grads(params, c):
  """Scale the gradient of every parameter in ``params`` in place by ``c``.

  Parameters without a gradient are skipped. When ``c`` is a tensor it is
  moved to the device of the gradient before multiplying.
  """
  for param in params:
    grad = param.grad
    if grad is None:
      continue
    if torch.is_tensor(c):
      c = c.to(grad.device)
    grad.data.mul_(c)

In [None]:
def get_grad_norm(params, scale=1):
  """Return the global L2 norm of the gradients in ``params`` as a float.

  Each gradient is divided by ``scale`` before its norm is taken; parameters
  without a gradient are ignored.
  """
  squared_norms = (
      (param.grad.detach().data / scale).norm(2).item() ** 2
      for param in params
      if param.grad is not None
  )
  return sum(squared_norms, 0.0) ** 0.5

In [None]:
# Create the Accelerator and log its state from every process.
accelerator = Accelerator()
logger.info(accelerator.state, main_process_only=False)

# Only the local main process gets verbose library logging and starts the
# wandb run; all other processes are limited to errors.
if accelerator.is_local_main_process:
  datasets.utils.logging.set_verbosity_warning()
  transformers.utils.logging.set_verbosity_info()

  if is_wandb_available():
    print("wandb have installed")
    # Imported lazily so the notebook still runs when wandb is not installed.
    import wandb
    wandb.init(project="wav2vec2-fromscratch")

else:
  datasets.utils.logging.set_verbosity_error()
  transformers.utils.logging.set_verbosity_error()

INFO:__main__:Distributed environment: DistributedType.NO
Num processes: 1
Process index: 0
Local process index: 0
Device: cuda

Mixed precision type: no



wandb have installed


[34m[1mwandb[0m: Currently logged in as: [33mrasarathathsarana63[0m ([33mrasarathathsarana63-university-of-moratuwa[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
# Fix the random seed for reproducibility.
set_seed(42)

In [None]:
# Synchronise all processes before loading data.
accelerator.wait_for_everyone()

In [None]:
# Load the pre-saved train / validation splits from Google Drive
# (requires the drive mount above).
raw_datasets = DatasetDict()

raw_datasets['train'] = load_from_disk("/content/drive/MyDrive/SP/SP/librispeech_datasets/dataset_10h")
raw_datasets['val'] = load_from_disk("/content/drive/MyDrive/SP/SP/librispeech_datasets/dataset_val_clean")

In [None]:
# Drop the "duration" column from the train split only.
# NOTE(review): not idempotent — re-running this cell presumably fails once the column is gone;
# also confirm the "val" split has no such column.
raw_datasets["train"] = raw_datasets["train"].remove_columns("duration")

In [None]:
# Reuse the feature-extractor settings of facebook/wav2vec2-base, but ask it
# to return an attention mask (needed by the data collator for padded batches).
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
    "facebook/wav2vec2-base",
    return_attention_mask = True,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

loading configuration file preprocessor_config.json from cache at /root/.cache/huggingface/hub/models--facebook--wav2vec2-base/snapshots/0b5b8e868dd84f03fd87d01f9c4ff0f080fecfe8/preprocessor_config.json
Feature extractor Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": true,
  "sampling_rate": 16000
}



In [None]:
# Cast the "audio" column so it uses the feature extractor's sampling rate.
raw_datasets = raw_datasets.cast_column(
    'audio', datasets.features.Audio(sampling_rate=feature_extractor.sampling_rate)
)

In [None]:
# Only training on normalized inputs is supported: fail fast if the feature
# extractor is configured with do_normalize=False.
if not feature_extractor.do_normalize:
  raise ValueError(
      "Training is only supported for normalized inputs. Make sure ``feature_extractor.do_normalize == True``"
  )

In [None]:
# Convert the max / min durations (seconds) into sample counts using the
# feature extractor's sampling rate.
max_length = int(MAX_DURATION * feature_extractor.sampling_rate)
min_length = int(MIN_DURATION * feature_extractor.sampling_rate)

In [None]:
def prepare_dataset(batch):
  """Run the feature extractor on one example and record its length.

  Reads ``batch["audio"]`` (its ``"array"`` and ``"sampling_rate"`` entries),
  truncates to the module-level ``max_length`` and stores the result under
  ``"input_values"`` together with its length under ``"input_length"``.
  Returns the same ``batch`` mapping.
  """
  audio = batch["audio"]
  extracted = feature_extractor(
      audio["array"],
      sampling_rate=audio["sampling_rate"],
      max_length=max_length,
      truncation=True,
  )
  values = extracted.input_values[0]
  batch["input_values"] = values
  batch["input_length"] = len(values)
  return batch

In [None]:
# Run feature extraction on every split. main_process_first() lets the main
# process execute the block before the other processes do.
with accelerator.main_process_first():
  # Replace the raw columns (taken from the train split) by "input_values" /
  # "input_length" produced by prepare_dataset.
  vectorized_datasets = raw_datasets.map(
      prepare_dataset,
      num_proc=None,
      remove_columns=raw_datasets["train"].column_names,
  )

  # Keep only examples strictly longer than min_length samples.
  if min_length > 0.0:
    vectorized_datasets = vectorized_datasets.filter(
        lambda x: x > min_length,
        num_proc=None,
        input_columns=["input_length"]
    )

  # The length was only needed for filtering.
  vectorized_datasets = vectorized_datasets.remove_columns("input_length")

In [None]:
# Load only the architecture/config of facebook/wav2vec2-base (no weights are loaded here).
config = Wav2Vec2Config.from_pretrained("facebook/wav2vec2-base")

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--facebook--wav2vec2-base/snapshots/0b5b8e868dd84f03fd87d01f9c4ff0f080fecfe8/config.json
Model config Wav2Vec2Config {
  "activation_dropout": 0.0,
  "adapter_attn_dim": null,
  "adapter_kernel_size": 3,
  "adapter_stride": 2,
  "add_adapter": false,
  "apply_spec_augment": true,
  "architectures": [
    "Wav2Vec2ForPreTraining"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 1,
  "classifier_proj_size": 256,
  "codevector_dim": 256,
  "contrastive_logits_temperature": 0.1,
  "conv_bias": false,
  "conv_dim": [
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ],
  "conv_kernel": [
    10,
    3,
    3,
    3,
    3,
    2,
    2
  ],
  "conv_stride": [
    5,
    2,
    2,
    2,
    2,
    2,
    2
  ],
  "ctc_loss_reduction": "sum",
  "ctc_zero_infinity": false,
  "diversity_loss_weight": 0.1,
  "do_stable_layer_norm": false,
  "eos_token_id": 2,
  "feat_extract_activation": "gel

In [None]:
import torch.nn.functional as F

def inter_codebook_similarity_loss(codebook_vectors, num_groups=None, num_vectors_per_group=None):
    """Mean pairwise cosine similarity between codevectors of different groups.

    The codebook is reshaped to ``[G, V, D]``. For every unordered pair of
    distinct groups ``(i, j)`` the ``[V, V]`` cosine-similarity matrix is
    averaged, and the result is the mean over all group pairs.

    Args:
        codebook_vectors: tensor holding ``G * V`` codevectors; any shape that
            can be reshaped to ``[G, V, -1]``.
        num_groups: number of groups ``G``. Defaults to
            ``config.num_codevector_groups`` (module-level ``config``).
        num_vectors_per_group: codevectors per group ``V``. Defaults to
            ``config.num_codevectors_per_group``.

    Returns:
        Scalar tensor. A zero scalar on the codebook's device/dtype is returned
        when there is only a single group (no pairs to compare).
    """
    if num_groups is None:
        num_groups = config.num_codevector_groups
    if num_vectors_per_group is None:
        num_vectors_per_group = config.num_codevectors_per_group

    # codebook_vectors: [G, V, D]
    grouped = codebook_vectors.reshape(num_groups, num_vectors_per_group, -1)

    # Normalise every codevector once instead of once per group pair.
    unit = F.normalize(grouped, dim=-1)

    pair_means = []
    for i in range(num_groups):
        for j in range(i + 1, num_groups):
            # Pairwise cosine similarity between group i and group j: [V, V]
            sim = torch.matmul(unit[i], unit[j].T)
            pair_means.append(sim.mean())

    if not pair_means:
        return grouped.new_zeros(())
    return torch.stack(pair_means).mean()


In [None]:
# Alternative (disabled): start from the pretrained facebook/wav2vec2-base weights.
# model = Wav2Vec2ForPreTraining.from_pretrained(
#     "facebook/wav2vec2-base",
#     config=config
# )

# Build the model from the config only, i.e. without loading pretrained weights.
model = Wav2Vec2ForPreTraining(
    config=config
)

In [None]:
# Activate gradient checkpointing when requested in the configuration cell.
if GRADIENT_CHECKPOINTING:
  model.gradient_checkpointing_enable()

In [None]:
# Resolve the masking hyper-parameters: an explicit override from the
# configuration cell wins, otherwise the model config value is used.
mask_time_prob = config.mask_time_prob if MASK_TIME_PROB is None else MASK_TIME_PROB
mask_time_length = config.mask_time_length if MASK_TIME_LENGTH is None else MASK_TIME_LENGTH

In [None]:
# Collator shared by the train and validation dataloaders.
data_collator = DataCollatorForWav2Vec2Pretraining(
    model=model,
    feature_extractor=feature_extractor,
    mask_time_prob=mask_time_prob,
    mask_time_length=mask_time_length
)

In [None]:
# Shuffled training batches, padded and masked by the data collator.
train_dataloader = DataLoader(
    vectorized_datasets["train"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=TRAIN_BATCH_SIZE,
)

In [None]:
# Validation batches in dataset order; the same collator is used, so
# validation batches are masked too.
val_dataloader = DataLoader(
    vectorized_datasets["val"],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=VAL_BATCH_SIZE,
)

In [None]:
# AdamW over all model parameters, configured from the constants cell.
optimizer = torch.optim.AdamW(
    list(model.parameters()),
    lr=LEARNING_RATE,
    betas=[ADAM_BETA1, ADAM_BETA2],
    eps=ADAM_EPSILON
)

In [None]:
# Let Accelerate wrap the model, optimizer and dataloaders.
# NOTE: len(train_dataloader) is read *after* this call when the number of
# update steps is computed.
model, optimizer, train_dataloader, val_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, val_dataloader
)

In [None]:
# Number of optimizer updates per epoch (the last, possibly partial,
# accumulation window also triggers an update, hence the ceil).
num_update_steps_per_epcoh = math.ceil(len(train_dataloader) / GRADIENT_ACCUMULATION_STEPS)

# Total number of optimizer updates: explicit override or epochs * updates/epoch.
if MAX_TRAINING_STEPS is None:
  max_train_steps = num_update_steps_per_epcoh * NUM_TRAIN_EPOCHS
else:
  max_train_steps = MAX_TRAINING_STEPS

lr_scheduler = get_scheduler(
    name=LR_SCHEDULER_TYPE,
    optimizer=optimizer,
    num_warmup_steps=NUM_WARMUP_STEPS,
    num_training_steps=max_train_steps,
)

# Re-derive the epoch count so it is consistent with max_train_steps.
# NOTE: this overwrites the constant defined in the configuration cell.
NUM_TRAIN_EPOCHS = math.ceil(max_train_steps / num_update_steps_per_epcoh)

In [None]:
# Names of all parameters that will receive gradients.
trainable_params = [name for name, param in model.named_parameters() if param.requires_grad]
print("Trainable parameters:", list(trainable_params))

Trainable parameters: ['wav2vec2.masked_spec_embed', 'wav2vec2.feature_extractor.conv_layers.0.conv.weight', 'wav2vec2.feature_extractor.conv_layers.0.layer_norm.weight', 'wav2vec2.feature_extractor.conv_layers.0.layer_norm.bias', 'wav2vec2.feature_extractor.conv_layers.1.conv.weight', 'wav2vec2.feature_extractor.conv_layers.2.conv.weight', 'wav2vec2.feature_extractor.conv_layers.3.conv.weight', 'wav2vec2.feature_extractor.conv_layers.4.conv.weight', 'wav2vec2.feature_extractor.conv_layers.5.conv.weight', 'wav2vec2.feature_extractor.conv_layers.6.conv.weight', 'wav2vec2.feature_projection.layer_norm.weight', 'wav2vec2.feature_projection.layer_norm.bias', 'wav2vec2.feature_projection.projection.weight', 'wav2vec2.feature_projection.projection.bias', 'wav2vec2.encoder.pos_conv_embed.conv.bias', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.encoder.layer_norm.weight', 'wav2vec2.

In [None]:
# Display the same list of names via the notebook's rich repr (one name per line).
trainable_params

['wav2vec2.masked_spec_embed',
 'wav2vec2.feature_extractor.conv_layers.0.conv.weight',
 'wav2vec2.feature_extractor.conv_layers.0.layer_norm.weight',
 'wav2vec2.feature_extractor.conv_layers.0.layer_norm.bias',
 'wav2vec2.feature_extractor.conv_layers.1.conv.weight',
 'wav2vec2.feature_extractor.conv_layers.2.conv.weight',
 'wav2vec2.feature_extractor.conv_layers.3.conv.weight',
 'wav2vec2.feature_extractor.conv_layers.4.conv.weight',
 'wav2vec2.feature_extractor.conv_layers.5.conv.weight',
 'wav2vec2.feature_extractor.conv_layers.6.conv.weight',
 'wav2vec2.feature_projection.layer_norm.weight',
 'wav2vec2.feature_projection.layer_norm.bias',
 'wav2vec2.feature_projection.projection.weight',
 'wav2vec2.feature_projection.projection.bias',
 'wav2vec2.encoder.pos_conv_embed.conv.bias',
 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0',
 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1',
 'wav2vec2.encoder.layer_norm.weight',
 'wav2vec2.enco

In [None]:
# -----------------------------------------------------------------------------
# Training loop with per-epoch validation and checkpointing.
#
# Per micro-batch: forward pass, add the inter-codebook similarity term,
# backward with the loss normalised by the number of masked frames.
# Every GRADIENT_ACCUMULATION_STEPS micro-batches (or at the end of the
# epoch): clip gradients, optimizer step, LR scheduler step, update the Gumbel
# temperature, and optionally log / save.
# After every epoch: evaluate on the validation set and save the model.
# -----------------------------------------------------------------------------
total_batch_size = TRAIN_BATCH_SIZE * accelerator.num_processes * GRADIENT_ACCUMULATION_STEPS
logger.info("***** Runing training *****")
logger.info(f" Num examples = {len(vectorized_datasets['train'])}")
logger.info(f" Num Epochs = {NUM_TRAIN_EPOCHS}")
logger.info(f"  Instantaneous batch size per device = {TRAIN_BATCH_SIZE}")
logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
logger.info(f"  Gradient Accumulation steps = {GRADIENT_ACCUMULATION_STEPS}")
# Report the resolved step count: MAX_TRAINING_STEPS is None whenever the
# number of steps is derived from the number of epochs.
logger.info(f"  Total optimization steps = {max_train_steps}")

progress_bar = tqdm(range(max_train_steps), disable=not accelerator.is_local_main_process)
completed_steps = 0
starting_epoch = 0

for epoch in range(starting_epoch, NUM_TRAIN_EPOCHS):
  model.train()
  for step, batch in enumerate(train_dataloader):
    # Number of masked positions in this micro-batch; used to normalise the loss.
    num_losses = batch["mask_time_indices"].sum()
    # The feature-level attention mask is only needed for the statistics
    # below; it is popped so it is not passed to the model.
    sub_attention_mask = batch.pop("sub_attention_mask", None)
    if sub_attention_mask is None:
      sub_attention_mask = torch.ones_like(batch["mask_time_indices"])
    percent_masked = num_losses / sub_attention_mask.sum()

    outputs = model(**batch)

    # Unwrap first so the quantizer is reachable when the model is wrapped
    # for distributed training.
    inter_sim_loss = inter_codebook_similarity_loss(
        accelerator.unwrap_model(model).quantizer.codevectors
    )

    # NOTE(review): only outputs.loss is divided by GRADIENT_ACCUMULATION_STEPS,
    # so the similarity term is accumulated once per micro-batch. Kept as-is;
    # confirm this weighting is intended.
    loss = (outputs.loss / GRADIENT_ACCUMULATION_STEPS) + INTER_CB_SIMILARITY_WEIGHT * inter_sim_loss

    # Normalise by the number of masked frames *before* backward. Scaling the
    # loss is equivalent to scaling this micro-batch's gradient, but unlike
    # multiplying the accumulated .grad after every backward it does not
    # re-scale the gradients of earlier micro-batches in the same
    # accumulation window.
    if accelerator.state.num_processes > 1:
      # Sum over devices; the num_processes factor compensates for the
      # gradient averaging done across processes.
      num_losses = accelerator.gather_for_metrics(num_losses).sum()
      grad_scale = accelerator.state.num_processes / num_losses
    else:
      grad_scale = 1 / num_losses
    accelerator.backward(loss * grad_scale)

    # Update step
    if (step + 1) % GRADIENT_ACCUMULATION_STEPS == 0 or step == len(train_dataloader) - 1:
      # Current loss scale of the mixed-precision scaler (1.0 without one),
      # so the reported gradient norm is in unscaled units.
      scale = (
          accelerator.scaler._scale.item()
          if hasattr(accelerator, "scaler") and accelerator.scaler is not None
          else 1.0
      )
      if accelerator.state.num_processes > 1:
        grad_norm = get_grad_norm(model.module.parameters(), scale)
      else:
        grad_norm = get_grad_norm(model.parameters(), scale)

      accelerator.clip_grad_norm_(model.parameters(), max_norm=1.0)

      optimizer.step()
      optimizer.zero_grad()

      # Only advance the LR schedule when the optimizer actually stepped.
      if not accelerator.optimizer_step_was_skipped:
        lr_scheduler.step()
      elif accelerator.is_local_main_process:
        progress_bar.write(
          f"Gradients have overflown - skipping update step... Updating gradient scale to {scale}..."
        )

      # update gumbel temperature
      gumble_temperature = max(
          MAX_GUMBEL_TEMPERATURE * GUMBEL_TEMPERATURE**completed_steps,
          MIN_GUMBEL_TEMPERATURE,
      )

      if hasattr(model, "module"):
        model.module.set_gumbel_temperature(gumble_temperature)
      else:
        model.set_gumbel_temperature(gumble_temperature)

      progress_bar.update(1)
      completed_steps += 1

      # Log all results
      if (step + 1) % (GRADIENT_ACCUMULATION_STEPS * LOGGING_STEPS) == 0:
        # detach() returns a new tensor, so the results must be kept.
        log_loss = loss.detach()
        contrastive_loss = outputs.contrastive_loss.detach()
        diversity_loss = outputs.diversity_loss.detach()

        if accelerator.state.num_processes > 1:
          log_loss = accelerator.gather_for_metrics(log_loss).sum()
          contrastive_loss = accelerator.gather_for_metrics(contrastive_loss).sum()
          diversity_loss = accelerator.gather_for_metrics(diversity_loss).sum()
          percent_masked = accelerator.gather_for_metrics(percent_masked).sum()

        train_logs = {
            "loss": (log_loss * GRADIENT_ACCUMULATION_STEPS) / num_losses,
            "contrast_loss": contrastive_loss / num_losses,
            "div_loss": diversity_loss / num_losses,
            "inter_sim_loss": inter_sim_loss.detach(),
            "%_mask_idx": percent_masked / accelerator.num_processes,
            "ppl": outputs.codevector_perplexity.detach(),
            "lr": torch.tensor(optimizer.param_groups[0]["lr"]),
            "temp": torch.tensor(gumble_temperature),
            "grad_norm": torch.tensor(grad_norm),
        }

        log_str = ""
        for k, v in train_logs.items():
          log_str += f"| {k}: {v.item():.3e}"

        if accelerator.is_local_main_process:
          progress_bar.write(log_str)
          if is_wandb_available():
            wandb.log(train_logs)

      # save model
      # NOTE(review): `step` is the within-epoch batch index, so this only
      # fires when an epoch has at least GRADIENT_ACCUMULATION_STEPS *
      # SAVING_STEPS batches; the end-of-epoch save below always runs.
      if (step + 1) % (GRADIENT_ACCUMULATION_STEPS * SAVING_STEPS) == 0:
        if OUTPUT_DIR is not None:
          accelerator.wait_for_everyone()
          unwrapped_model = accelerator.unwrap_model(model)
          unwrapped_model.save_pretrained(
              OUTPUT_DIR,
              is_main_process=accelerator.is_main_process,
              save_function=accelerator.save
          )

      # Leaves the batch loop only; validation for this epoch still runs.
      if completed_steps >= max_train_steps:
        break

  # ---- Validation ----
  model.eval()

  val_logs = {
      "val_loss": 0,
      "val_contrastive_loss": 0,
      "val_diversity_loss": 0,
      "val_num_losses": 0,
  }

  for step, batch in enumerate(val_dataloader):
    with torch.no_grad():
      batch.pop("sub_attention_mask", None)
      outputs = model(**batch)

    val_logs["val_loss"] += outputs.loss
    val_logs["val_contrastive_loss"] += outputs.contrastive_loss
    val_logs["val_diversity_loss"] += outputs.diversity_loss
    val_logs["val_num_losses"] += batch["mask_time_indices"].sum()

  if accelerator.num_processes > 1:
    val_logs = {k: accelerator.gather_for_metrics(v).sum() for k, v in val_logs.items()}

  # Normalise every entry by the total number of masked frames
  # ("val_num_losses" itself therefore becomes 1).
  val_logs = {k: v / val_logs["val_num_losses"] for k, v in val_logs.items()}

  log_str = ""
  for k, v in val_logs.items():
    log_str += f"| {k}: {v.item():.3e}"

  if accelerator.is_local_main_process:
    progress_bar.write(log_str)
    if is_wandb_available():
      wandb.log(val_logs)

  # Save the model at the end of every epoch.
  if OUTPUT_DIR is not None:
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(
        OUTPUT_DIR, is_main_process=accelerator.is_main_process, save_function=accelerator.save
    )

INFO:__main__:***** Runing training *****
INFO:__main__: Num examples = 2641
INFO:__main__: Num Epochs = 50
INFO:__main__:  Instantaneous batch size per device = 16
INFO:__main__:  Total train batch size (w. parallel, distributed & accumulation) = 64
INFO:__main__:  Gradient Accumulation steps = 4
INFO:__main__:  Total optimization steps = None


  0%|          | 0/2100 [00:00<?, ?it/s]

| loss: 5.360e+00| contrast_loss: 4.623e+00| div_loss: 5.945e-01| inter_sim_loss: 7.514e-01| %_mask_idx: 5.841e-02| ppl: 2.595e+02| lr: 1.000e-05| temp: 2.000e+00| grad_norm: 5.486e-01
| loss: 5.369e+00| contrast_loss: 4.633e+00| div_loss: 4.992e-01| inter_sim_loss: 7.513e-01| %_mask_idx: 5.688e-02| ppl: 3.205e+02| lr: 2.000e-05| temp: 2.000e+00| grad_norm: 3.650e-01
| loss: 5.352e+00| contrast_loss: 4.624e+00| div_loss: 5.217e-01| inter_sim_loss: 7.513e-01| %_mask_idx: 5.745e-02| ppl: 3.061e+02| lr: 3.000e-05| temp: 2.000e+00| grad_norm: 2.548e-01
| loss: 5.640e+00| contrast_loss: 4.618e+00| div_loss: 4.981e-01| inter_sim_loss: 7.511e-01| %_mask_idx: 3.870e-02| ppl: 3.212e+02| lr: 4.000e-05| temp: 2.000e+00| grad_norm: 2.645e-01


Configuration saved in /outputs/config.json


| val_loss: 4.675e+00| val_contrastive_loss: 4.617e+00| val_diversity_loss: 5.789e-01| val_num_losses: 1.000e+00


Model weights saved in /outputs/model.safetensors


| loss: 5.317e+00| contrast_loss: 4.616e+00| div_loss: 4.111e-01| inter_sim_loss: 7.508e-01| %_mask_idx: 5.824e-02| ppl: 3.769e+02| lr: 5.200e-05| temp: 2.000e+00| grad_norm: 1.960e-01
| loss: 5.332e+00| contrast_loss: 4.616e+00| div_loss: 3.826e-01| inter_sim_loss: 7.505e-01| %_mask_idx: 6.008e-02| ppl: 3.952e+02| lr: 6.200e-05| temp: 2.000e+00| grad_norm: 2.087e-01
| loss: 5.601e+00| contrast_loss: 4.617e+00| div_loss: 4.596e-01| inter_sim_loss: 7.502e-01| %_mask_idx: 4.008e-02| ppl: 3.459e+02| lr: 7.200e-05| temp: 2.000e+00| grad_norm: 2.241e-01
| loss: 5.312e+00| contrast_loss: 4.619e+00| div_loss: 3.486e-01| inter_sim_loss: 7.498e-01| %_mask_idx: 5.984e-02| ppl: 4.169e+02| lr: 8.200e-05| temp: 2.000e+00| grad_norm: 2.188e-01


Configuration saved in /outputs/config.json


| val_loss: 4.667e+00| val_contrastive_loss: 4.615e+00| val_diversity_loss: 5.204e-01| val_num_losses: 1.000e+00


Model weights saved in /outputs/model.safetensors


| loss: 5.664e+00| contrast_loss: 4.618e+00| div_loss: 4.681e-01| inter_sim_loss: 7.491e-01| %_mask_idx: 3.962e-02| ppl: 3.404e+02| lr: 9.400e-05| temp: 2.000e+00| grad_norm: 2.274e-01
| loss: 5.280e+00| contrast_loss: 4.615e+00| div_loss: 3.811e-01| inter_sim_loss: 7.486e-01| %_mask_idx: 6.020e-02| ppl: 3.961e+02| lr: 9.980e-05| temp: 2.000e+00| grad_norm: 1.795e-01
| loss: 5.620e+00| contrast_loss: 4.616e+00| div_loss: 3.843e-01| inter_sim_loss: 7.482e-01| %_mask_idx: 4.151e-02| ppl: 3.941e+02| lr: 9.930e-05| temp: 2.000e+00| grad_norm: 2.000e-01
| loss: 5.297e+00| contrast_loss: 4.614e+00| div_loss: 3.793e-01| inter_sim_loss: 7.478e-01| %_mask_idx: 5.812e-02| ppl: 3.972e+02| lr: 9.880e-05| temp: 2.000e+00| grad_norm: 1.768e-01


Configuration saved in /outputs/config.json


| val_loss: 4.666e+00| val_contrastive_loss: 4.615e+00| val_diversity_loss: 5.160e-01| val_num_losses: 1.000e+00


Model weights saved in /outputs/model.safetensors


| loss: 5.589e+00| contrast_loss: 4.617e+00| div_loss: 3.846e-01| inter_sim_loss: 7.470e-01| %_mask_idx: 4.045e-02| ppl: 3.939e+02| lr: 9.820e-05| temp: 2.000e+00| grad_norm: 2.239e-01
| loss: 5.402e+00| contrast_loss: 4.613e+00| div_loss: 4.023e-01| inter_sim_loss: 7.465e-01| %_mask_idx: 5.283e-02| ppl: 3.826e+02| lr: 9.770e-05| temp: 2.000e+00| grad_norm: 1.797e-01
| loss: 5.621e+00| contrast_loss: 4.617e+00| div_loss: 4.154e-01| inter_sim_loss: 7.460e-01| %_mask_idx: 4.097e-02| ppl: 3.741e+02| lr: 9.720e-05| temp: 2.000e+00| grad_norm: 2.032e-01
| loss: 5.296e+00| contrast_loss: 4.616e+00| div_loss: 3.348e-01| inter_sim_loss: 7.455e-01| %_mask_idx: 5.844e-02| ppl: 4.257e+02| lr: 9.670e-05| temp: 2.000e+00| grad_norm: 1.759e-01


Configuration saved in /outputs/config.json


| val_loss: 4.666e+00| val_contrastive_loss: 4.614e+00| val_diversity_loss: 5.215e-01| val_num_losses: 1.000e+00


Model weights saved in /outputs/model.safetensors


| loss: 5.637e+00| contrast_loss: 4.616e+00| div_loss: 4.785e-01| inter_sim_loss: 7.447e-01| %_mask_idx: 3.924e-02| ppl: 3.338e+02| lr: 9.610e-05| temp: 2.000e+00| grad_norm: 1.986e-01
| loss: 5.347e+00| contrast_loss: 4.617e+00| div_loss: 3.904e-01| inter_sim_loss: 7.442e-01| %_mask_idx: 5.767e-02| ppl: 3.901e+02| lr: 9.560e-05| temp: 2.000e+00| grad_norm: 1.724e-01
| loss: 5.586e+00| contrast_loss: 4.614e+00| div_loss: 4.222e-01| inter_sim_loss: 7.437e-01| %_mask_idx: 4.084e-02| ppl: 3.698e+02| lr: 9.510e-05| temp: 2.000e+00| grad_norm: 1.831e-01
| loss: 5.667e+00| contrast_loss: 4.610e+00| div_loss: 4.671e-01| inter_sim_loss: 7.432e-01| %_mask_idx: 3.806e-02| ppl: 3.411e+02| lr: 9.460e-05| temp: 2.000e+00| grad_norm: 1.940e-01


Configuration saved in /outputs/config.json


| val_loss: 4.666e+00| val_contrastive_loss: 4.614e+00| val_diversity_loss: 5.187e-01| val_num_losses: 1.000e+00


Model weights saved in /outputs/model.safetensors


| loss: 5.285e+00| contrast_loss: 4.614e+00| div_loss: 3.719e-01| inter_sim_loss: 7.424e-01| %_mask_idx: 5.964e-02| ppl: 4.020e+02| lr: 9.400e-05| temp: 2.000e+00| grad_norm: 1.601e-01
| loss: 5.316e+00| contrast_loss: 4.615e+00| div_loss: 3.336e-01| inter_sim_loss: 7.419e-01| %_mask_idx: 5.965e-02| ppl: 4.265e+02| lr: 9.350e-05| temp: 2.000e+00| grad_norm: 1.684e-01
| loss: 5.308e+00| contrast_loss: 4.613e+00| div_loss: 3.619e-01| inter_sim_loss: 7.414e-01| %_mask_idx: 5.965e-02| ppl: 4.084e+02| lr: 9.300e-05| temp: 2.000e+00| grad_norm: 1.657e-01
| loss: 5.285e+00| contrast_loss: 4.618e+00| div_loss: 3.109e-01| inter_sim_loss: 7.410e-01| %_mask_idx: 6.020e-02| ppl: 4.410e+02| lr: 9.250e-05| temp: 2.000e+00| grad_norm: 1.567e-01


Configuration saved in /outputs/config.json


| val_loss: 4.665e+00| val_contrastive_loss: 4.614e+00| val_diversity_loss: 5.075e-01| val_num_losses: 1.000e+00


Model weights saved in /outputs/model.safetensors


| loss: 5.652e+00| contrast_loss: 4.619e+00| div_loss: 4.607e-01| inter_sim_loss: 7.402e-01| %_mask_idx: 4.021e-02| ppl: 3.452e+02| lr: 9.190e-05| temp: 1.999e+00| grad_norm: 1.875e-01
| loss: 5.584e+00| contrast_loss: 4.616e+00| div_loss: 4.300e-01| inter_sim_loss: 7.397e-01| %_mask_idx: 4.008e-02| ppl: 3.648e+02| lr: 9.140e-05| temp: 1.999e+00| grad_norm: 1.908e-01
| loss: 5.298e+00| contrast_loss: 4.618e+00| div_loss: 3.101e-01| inter_sim_loss: 7.392e-01| %_mask_idx: 5.804e-02| ppl: 4.415e+02| lr: 9.090e-05| temp: 1.999e+00| grad_norm: 1.529e-01
| loss: 5.609e+00| contrast_loss: 4.616e+00| div_loss: 3.949e-01| inter_sim_loss: 7.387e-01| %_mask_idx: 4.215e-02| ppl: 3.872e+02| lr: 9.040e-05| temp: 1.999e+00| grad_norm: 1.678e-01


Configuration saved in /outputs/config.json


| val_loss: 4.665e+00| val_contrastive_loss: 4.614e+00| val_diversity_loss: 5.146e-01| val_num_losses: 1.000e+00


Model weights saved in /outputs/model.safetensors


| loss: 5.289e+00| contrast_loss: 4.617e+00| div_loss: 3.110e-01| inter_sim_loss: 7.379e-01| %_mask_idx: 5.821e-02| ppl: 4.410e+02| lr: 8.980e-05| temp: 1.999e+00| grad_norm: 1.626e-01
| loss: 5.280e+00| contrast_loss: 4.615e+00| div_loss: 3.123e-01| inter_sim_loss: 7.374e-01| %_mask_idx: 5.967e-02| ppl: 4.402e+02| lr: 8.930e-05| temp: 1.999e+00| grad_norm: 1.474e-01
| loss: 5.709e+00| contrast_loss: 4.615e+00| div_loss: 4.174e-01| inter_sim_loss: 7.369e-01| %_mask_idx: 3.820e-02| ppl: 3.729e+02| lr: 8.880e-05| temp: 1.999e+00| grad_norm: 1.890e-01
| loss: 5.604e+00| contrast_loss: 4.614e+00| div_loss: 3.968e-01| inter_sim_loss: 7.364e-01| %_mask_idx: 3.988e-02| ppl: 3.860e+02| lr: 8.830e-05| temp: 1.999e+00| grad_norm: 1.804e-01


Configuration saved in /outputs/config.json


| val_loss: 4.664e+00| val_contrastive_loss: 4.614e+00| val_diversity_loss: 4.995e-01| val_num_losses: 1.000e+00


Model weights saved in /outputs/model.safetensors


| loss: 5.271e+00| contrast_loss: 4.616e+00| div_loss: 3.162e-01| inter_sim_loss: 7.356e-01| %_mask_idx: 5.912e-02| ppl: 4.376e+02| lr: 8.770e-05| temp: 1.999e+00| grad_norm: 1.573e-01
| loss: 5.605e+00| contrast_loss: 4.614e+00| div_loss: 4.182e-01| inter_sim_loss: 7.350e-01| %_mask_idx: 4.002e-02| ppl: 3.724e+02| lr: 8.720e-05| temp: 1.999e+00| grad_norm: 1.737e-01
| loss: 5.579e+00| contrast_loss: 4.617e+00| div_loss: 4.435e-01| inter_sim_loss: 7.345e-01| %_mask_idx: 4.044e-02| ppl: 3.562e+02| lr: 8.670e-05| temp: 1.999e+00| grad_norm: 1.809e-01
| loss: 5.626e+00| contrast_loss: 4.615e+00| div_loss: 4.204e-01| inter_sim_loss: 7.340e-01| %_mask_idx: 3.907e-02| ppl: 3.709e+02| lr: 8.620e-05| temp: 1.999e+00| grad_norm: 1.775e-01


Configuration saved in /outputs/config.json


| val_loss: 4.663e+00| val_contrastive_loss: 4.612e+00| val_diversity_loss: 5.051e-01| val_num_losses: 1.000e+00


Model weights saved in /outputs/model.safetensors


| loss: 5.297e+00| contrast_loss: 4.615e+00| div_loss: 3.178e-01| inter_sim_loss: 7.332e-01| %_mask_idx: 5.776e-02| ppl: 4.366e+02| lr: 8.560e-05| temp: 1.999e+00| grad_norm: 1.572e-01
| loss: 5.637e+00| contrast_loss: 4.612e+00| div_loss: 4.223e-01| inter_sim_loss: 7.326e-01| %_mask_idx: 3.911e-02| ppl: 3.697e+02| lr: 8.510e-05| temp: 1.999e+00| grad_norm: 1.634e-01
| loss: 5.605e+00| contrast_loss: 4.611e+00| div_loss: 4.594e-01| inter_sim_loss: 7.321e-01| %_mask_idx: 3.973e-02| ppl: 3.460e+02| lr: 8.460e-05| temp: 1.999e+00| grad_norm: 1.798e-01
| loss: 5.272e+00| contrast_loss: 4.617e+00| div_loss: 3.093e-01| inter_sim_loss: 7.316e-01| %_mask_idx: 5.888e-02| ppl: 4.421e+02| lr: 8.410e-05| temp: 1.999e+00| grad_norm: 1.513e-01


Configuration saved in /outputs/config.json


| val_loss: 4.663e+00| val_contrastive_loss: 4.614e+00| val_diversity_loss: 4.981e-01| val_num_losses: 1.000e+00


Model weights saved in /outputs/model.safetensors


| loss: 5.604e+00| contrast_loss: 4.615e+00| div_loss: 4.678e-01| inter_sim_loss: 7.308e-01| %_mask_idx: 4.111e-02| ppl: 3.406e+02| lr: 8.350e-05| temp: 1.999e+00| grad_norm: 1.791e-01
| loss: 5.565e+00| contrast_loss: 4.613e+00| div_loss: 3.914e-01| inter_sim_loss: 7.303e-01| %_mask_idx: 4.008e-02| ppl: 3.895e+02| lr: 8.300e-05| temp: 1.999e+00| grad_norm: 1.873e-01
| loss: 5.609e+00| contrast_loss: 4.616e+00| div_loss: 4.458e-01| inter_sim_loss: 7.299e-01| %_mask_idx: 4.017e-02| ppl: 3.547e+02| lr: 8.250e-05| temp: 1.999e+00| grad_norm: 1.761e-01
| loss: 5.288e+00| contrast_loss: 4.615e+00| div_loss: 3.581e-01| inter_sim_loss: 7.294e-01| %_mask_idx: 5.906e-02| ppl: 4.108e+02| lr: 8.200e-05| temp: 1.999e+00| grad_norm: 1.402e-01


Configuration saved in /outputs/config.json


| val_loss: 4.664e+00| val_contrastive_loss: 4.613e+00| val_diversity_loss: 5.121e-01| val_num_losses: 1.000e+00


Model weights saved in /outputs/model.safetensors


| loss: 5.608e+00| contrast_loss: 4.618e+00| div_loss: 4.048e-01| inter_sim_loss: 7.287e-01| %_mask_idx: 3.878e-02| ppl: 3.809e+02| lr: 8.140e-05| temp: 1.999e+00| grad_norm: 1.867e-01
| loss: 5.597e+00| contrast_loss: 4.616e+00| div_loss: 4.146e-01| inter_sim_loss: 7.282e-01| %_mask_idx: 4.003e-02| ppl: 3.747e+02| lr: 8.090e-05| temp: 1.999e+00| grad_norm: 1.622e-01
| loss: 5.591e+00| contrast_loss: 4.614e+00| div_loss: 4.068e-01| inter_sim_loss: 7.278e-01| %_mask_idx: 3.918e-02| ppl: 3.796e+02| lr: 8.040e-05| temp: 1.999e+00| grad_norm: 1.775e-01
| loss: 5.280e+00| contrast_loss: 4.615e+00| div_loss: 3.287e-01| inter_sim_loss: 7.274e-01| %_mask_idx: 6.008e-02| ppl: 4.296e+02| lr: 7.990e-05| temp: 1.999e+00| grad_norm: 1.536e-01


Configuration saved in /outputs/config.json


| val_loss: 4.663e+00| val_contrastive_loss: 4.613e+00| val_diversity_loss: 4.988e-01| val_num_losses: 1.000e+00


Model weights saved in /outputs/model.safetensors


| loss: 5.337e+00| contrast_loss: 4.615e+00| div_loss: 3.305e-01| inter_sim_loss: 7.267e-01| %_mask_idx: 5.686e-02| ppl: 4.285e+02| lr: 7.930e-05| temp: 1.999e+00| grad_norm: 1.544e-01
| loss: 5.274e+00| contrast_loss: 4.618e+00| div_loss: 2.847e-01| inter_sim_loss: 7.263e-01| %_mask_idx: 5.885e-02| ppl: 4.578e+02| lr: 7.880e-05| temp: 1.999e+00| grad_norm: 1.411e-01
| loss: 5.570e+00| contrast_loss: 4.616e+00| div_loss: 4.059e-01| inter_sim_loss: 7.259e-01| %_mask_idx: 4.073e-02| ppl: 3.802e+02| lr: 7.830e-05| temp: 1.999e+00| grad_norm: 1.643e-01
| loss: 5.297e+00| contrast_loss: 4.616e+00| div_loss: 3.100e-01| inter_sim_loss: 7.254e-01| %_mask_idx: 5.725e-02| ppl: 4.416e+02| lr: 7.780e-05| temp: 1.999e+00| grad_norm: 1.431e-01


Configuration saved in /outputs/config.json


| val_loss: 4.664e+00| val_contrastive_loss: 4.614e+00| val_diversity_loss: 5.049e-01| val_num_losses: 1.000e+00


Model weights saved in /outputs/model.safetensors


| loss: 5.592e+00| contrast_loss: 4.617e+00| div_loss: 4.077e-01| inter_sim_loss: 7.247e-01| %_mask_idx: 3.996e-02| ppl: 3.791e+02| lr: 7.720e-05| temp: 1.999e+00| grad_norm: 1.644e-01
| loss: 5.590e+00| contrast_loss: 4.614e+00| div_loss: 4.205e-01| inter_sim_loss: 7.242e-01| %_mask_idx: 3.990e-02| ppl: 3.709e+02| lr: 7.670e-05| temp: 1.999e+00| grad_norm: 1.739e-01
| loss: 5.322e+00| contrast_loss: 4.615e+00| div_loss: 3.310e-01| inter_sim_loss: 7.238e-01| %_mask_idx: 5.684e-02| ppl: 4.282e+02| lr: 7.620e-05| temp: 1.999e+00| grad_norm: 1.577e-01
| loss: 5.604e+00| contrast_loss: 4.617e+00| div_loss: 4.175e-01| inter_sim_loss: 7.234e-01| %_mask_idx: 3.917e-02| ppl: 3.728e+02| lr: 7.570e-05| temp: 1.999e+00| grad_norm: 1.508e-01


Configuration saved in /outputs/config.json


| val_loss: 4.663e+00| val_contrastive_loss: 4.614e+00| val_diversity_loss: 4.944e-01| val_num_losses: 1.000e+00


Model weights saved in /outputs/model.safetensors


| loss: 5.275e+00| contrast_loss: 4.614e+00| div_loss: 3.294e-01| inter_sim_loss: 7.227e-01| %_mask_idx: 5.942e-02| ppl: 4.292e+02| lr: 7.510e-05| temp: 1.999e+00| grad_norm: 1.607e-01
| loss: 5.248e+00| contrast_loss: 4.616e+00| div_loss: 3.059e-01| inter_sim_loss: 7.223e-01| %_mask_idx: 6.012e-02| ppl: 4.442e+02| lr: 7.460e-05| temp: 1.999e+00| grad_norm: 1.460e-01
| loss: 5.590e+00| contrast_loss: 4.616e+00| div_loss: 4.227e-01| inter_sim_loss: 7.220e-01| %_mask_idx: 4.034e-02| ppl: 3.695e+02| lr: 7.410e-05| temp: 1.999e+00| grad_norm: 1.811e-01
| loss: 5.587e+00| contrast_loss: 4.615e+00| div_loss: 4.111e-01| inter_sim_loss: 7.216e-01| %_mask_idx: 3.961e-02| ppl: 3.769e+02| lr: 7.360e-05| temp: 1.999e+00| grad_norm: 1.683e-01


Configuration saved in /outputs/config.json


| val_loss: 4.662e+00| val_contrastive_loss: 4.613e+00| val_diversity_loss: 4.976e-01| val_num_losses: 1.000e+00


Model weights saved in /outputs/model.safetensors


| loss: 5.584e+00| contrast_loss: 4.614e+00| div_loss: 3.958e-01| inter_sim_loss: 7.210e-01| %_mask_idx: 4.007e-02| ppl: 3.867e+02| lr: 7.300e-05| temp: 1.999e+00| grad_norm: 1.642e-01
| loss: 5.320e+00| contrast_loss: 4.617e+00| div_loss: 3.060e-01| inter_sim_loss: 7.205e-01| %_mask_idx: 5.628e-02| ppl: 4.441e+02| lr: 7.250e-05| temp: 1.999e+00| grad_norm: 1.504e-01
| loss: 5.551e+00| contrast_loss: 4.613e+00| div_loss: 3.793e-01| inter_sim_loss: 7.201e-01| %_mask_idx: 4.008e-02| ppl: 3.973e+02| lr: 7.200e-05| temp: 1.999e+00| grad_norm: 1.579e-01
| loss: 5.326e+00| contrast_loss: 4.617e+00| div_loss: 3.543e-01| inter_sim_loss: 7.197e-01| %_mask_idx: 5.741e-02| ppl: 4.133e+02| lr: 7.150e-05| temp: 1.999e+00| grad_norm: 1.442e-01


Configuration saved in /outputs/config.json


| val_loss: 4.662e+00| val_contrastive_loss: 4.614e+00| val_diversity_loss: 4.841e-01| val_num_losses: 1.000e+00


Model weights saved in /outputs/model.safetensors
