In [None]:
!pip install pytorch_lightning  seqeval evaluate

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from torch.nn.utils.rnn import pad_sequence
from sklearn.metrics import f1_score
from datasets import load_dataset, Dataset, concatenate_datasets
from evaluate import load as load_metric
from transformers import AutoTokenizer
from pytorch_lightning.callbacks import EarlyStopping
import random
import pandas as pd

device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
#dataset = load_dataset("sst2")
#dataset.remove_columns(['idx']) # This dataset does not have an 'id' column

#### **Dataset**

دیتاست استفاده شده یک دیتاست 20 کلاسه هستش که فایل های دیتاست رو روی سیستم داشتم

In [None]:
from google.colab import files

def upload_dataset_from_system():
  """Ask for a file through the Colab upload widget and load it as a dataset.

  The first uploaded file is read as Parquet into a pandas DataFrame and then
  wrapped with ``Dataset.from_pandas``.

  Returns:
    The ``Dataset`` built from the uploaded Parquet file.
  """
  uploaded_files = files.upload()
  # Only the first uploaded entry is used; any additional files are ignored.
  parquet_name = list(uploaded_files)[0]
  return Dataset.from_pandas(pd.read_parquet(parquet_name))

# --- return dataset classes ---
def dataset_classes(dataset):
  """Print the distinct labels found in ``dataset`` and return how many there are.

  Args:
    dataset: iterable of mapping-like rows, each exposing a ``'label'`` entry.

  Returns:
    dict: ``{"num_classes": <number of distinct labels>}``.
  """
  unique_labels = {row['label'] for row in dataset}
  print(f"classes :{unique_labels}")
  return {"num_classes": len(unique_labels)}

# --- tokenizer ---
# Load the pretrained "bert-base-uncased" tokenizer. In this notebook it is used
# as a module-level global by the preprocessing and collate functions.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# Vocabulary size of that tokenizer; the model classes use it as the default
# size of their word-embedding table.
VOCAB_SIZE = tokenizer.vocab_size

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
# --- load dataset ---
# Each call opens its own upload prompt. Files must be chosen in the order
# train -> validation -> test so that each variable holds the matching split.
train_data = upload_dataset_from_system()
val_data = upload_dataset_from_system()
test_data = upload_dataset_from_system()

Saving train-00000-of-00001.parquet to train-00000-of-00001.parquet


Saving validation-00000-of-00001.parquet to validation-00000-of-00001.parquet


Saving test-00000-of-00001.parquet to test-00000-of-00001.parquet


In [None]:
# Show the features and row count of each split as uploaded.
print(f"train_dataset :{train_data}")
print(f"val_dataset :{val_data}")
print(f"test_dataset :{test_data}")

train_dataset :Dataset({
    features: ['text', 'label'],
    num_rows: 45000
})
val_dataset :Dataset({
    features: ['text', 'label'],
    num_rows: 5000
})
test_dataset :Dataset({
    features: ['text', 'label'],
    num_rows: 50000
})


In [None]:
# Print the distinct labels of the training split and display the class count.
dataset_classes(train_data)

classes :{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19}


{'num_classes': 20}

In [None]:
# Re-split: append the first 45,000 rows of the uploaded test file to the training
# set and keep only the remaining rows (indices 45,000-49,999) as the test set.
# The two index ranges do not overlap, so no retained test row is used for training.
# NOTE(review): the indices are hard-coded for a 50,000-row test file and rows are
# taken in file order — confirm the file is shuffled, otherwise the splits may be skewed.
# NOTE(review): this cell is not idempotent; both names are overwritten, so running
# it a second time selects from the already-shrunk `test_data`. Run once per session.
train_data = concatenate_datasets([train_data, test_data.select(range(45000))])
test_data = test_data.select(range(45000,50000))

In [None]:
# Show the splits again to confirm the row counts after the re-split above.
print(f"train_dataset :{train_data}")
print(f"val_dataset :{val_data}")
print(f"test_dataset :{test_data}")

train_dataset :Dataset({
    features: ['text', 'label'],
    num_rows: 90000
})
val_dataset :Dataset({
    features: ['text', 'label'],
    num_rows: 5000
})
test_dataset :Dataset({
    features: ['text', 'label'],
    num_rows: 5000
})


#### **preprocess & Dataset-utilities**

In [None]:
from datasets import Dataset
def preproces_data(example, num_classes=20, max_length=512):
  """Tokenize one example and expose its label under the key ``labels``.

  Args:
    example: mapping with a ``'text'`` entry and a ``'label'`` class index.
    num_classes: unused; kept so existing callers keep working.
    max_length: maximum number of tokens kept per example. Longer texts are
      truncated so that position indices stay inside the 512-row positional
      embedding table (``MAX_LENGTH``) used by the models in this notebook.

  Returns:
    dict with ``input_ids`` and ``attention_mask`` exactly as returned by the
    tokenizer, and ``labels`` holding the original class index (not one-hot).
  """
  # Without truncation a long text produces more tokens than the models have
  # position embeddings for, which makes the embedding lookup fail.
  encoded = tokenizer(example['text'], truncation=True, max_length=max_length)

  return {
      "input_ids": encoded['input_ids'],
      "attention_mask": encoded['attention_mask'],
      "labels": example['label'],
  }

In [None]:
# Tokenize every split one example at a time and drop the raw 'text'/'label'
# columns, leaving only 'input_ids', 'attention_mask' and 'labels'.
train_dataset = train_data.map(preproces_data,remove_columns=['text','label'])
val_dataset = val_data.map(preproces_data,remove_columns=['text','label'])
test_dataset = test_data.map(preproces_data,remove_columns=['text','label'])
# Bare expression: displays the processed training split as the cell output.
train_dataset

Map:   0%|          | 0/90000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 90000
})

In [None]:
def collate_fn(batch):
    """Turn a list of tokenized examples into padded batch tensors.

    Args:
        batch: list of dicts with ``input_ids``, ``attention_mask`` and
            ``labels``; each value may already be a tensor or a plain Python
            list / number.

    Returns:
        dict with
        ``input_ids``      -- int64 tensor (batch_size, longest_seq_len),
                              right-padded with ``tokenizer.pad_token_id``;
        ``attention_mask`` -- int64 tensor of the same shape, right-padded with 0;
        ``labels``         -- int64 tensor made by stacking the per-example
                              labels, i.e. (batch_size,) for scalar class indices.
    """
    def _to_long(value):
        # Existing tensors are cast to int64; anything else becomes an int64 tensor.
        if isinstance(value, torch.Tensor):
            return value.long()
        return torch.tensor(value, dtype=torch.long)

    token_ids = [_to_long(example['input_ids']) for example in batch]
    masks = [_to_long(example['attention_mask']) for example in batch]
    targets = torch.stack([_to_long(example['labels']) for example in batch])

    return {
        "input_ids": pad_sequence(
            token_ids, batch_first=True, padding_value=tokenizer.pad_token_id
        ),
        "attention_mask": pad_sequence(masks, batch_first=True, padding_value=0),
        "labels": targets,
    }

#### **Model**

##### **LitModule**

In [None]:
import torchmetrics
from torch.optim.lr_scheduler import ReduceLROnPlateau # Import ReduceLROnPlateau

class LitModule(pl.LightningModule):
    """Lightning wrapper that trains and evaluates a text classifier with cross-entropy.

    The wrapped ``model`` must expose ``model.classifier.out_features`` (used as
    the number of classes) and map ``input_ids`` to logits whose last dimension
    is the number of classes. Only ``input_ids`` and ``labels`` are read from a
    batch; ``attention_mask`` is not passed to the model.

    Args:
        model: the network to train.
        tokenizer: accepted for interface compatibility; not used by this class.
        lr: learning rate for Adam.
        train_dataset, val_dataset, test_dataset: datasets wrapped by the
            dataloaders. The defaults are the notebook globals as they exist
            when this class is defined, so re-run this cell after rebuilding them.
        batch_size: batch size for all three dataloaders (default 1, the value
            that was previously hard-coded).
    """

    def __init__(self, model, tokenizer, lr=1e-3, train_dataset=train_dataset,
                 val_dataset=val_dataset, test_dataset=test_dataset, batch_size=1):
        super().__init__()
        self.model = model
        self.loss_fn = nn.CrossEntropyLoss()
        self.lr = lr
        self.batch_size = batch_size
        self.train_dataset = train_dataset
        self.validation_dataset = val_dataset
        self.test_dataset = test_dataset

        num_classes = self.model.classifier.out_features
        # NOTE(review): with torchmetrics' default averaging for multiclass
        # F1Score the value appears to coincide with accuracy (the recorded test
        # run reports identical test_acc and test_f1). Pass average="macro" if a
        # class-balanced F1 is wanted — confirm against the torchmetrics docs.
        self.train_accuracy = torchmetrics.Accuracy(task="multiclass", num_classes=num_classes)
        self.val_accuracy = torchmetrics.Accuracy(task="multiclass", num_classes=num_classes)
        self.train_f1 = torchmetrics.F1Score(task="multiclass", num_classes=num_classes)
        self.val_f1 = torchmetrics.F1Score(task="multiclass", num_classes=num_classes)
        self.test_accuracy = torchmetrics.Accuracy(task="multiclass", num_classes=num_classes)
        self.test_f1 = torchmetrics.F1Score(task="multiclass", num_classes=num_classes)

    def forward(self, input_ids):
        """Return the wrapped model's logits for ``input_ids``."""
        return self.model(input_ids)

    def train_dataloader(self):
        """Shuffled loader over the training set."""
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True, collate_fn=collate_fn)

    def val_dataloader(self):
        """Loader over the validation set (no shuffling)."""
        return DataLoader(self.validation_dataset, batch_size=self.batch_size, collate_fn=collate_fn)

    def test_dataloader(self):
        """Loader over the test set (no shuffling)."""
        return DataLoader(self.test_dataset, batch_size=self.batch_size, collate_fn=collate_fn)

    def _shared_step(self, batch):
        """Run the model on one batch and return ``(loss, preds, labels)``.

        Logits are flattened to (N, num_classes) and labels to (N,), so a model
        that returns (batch, 1, num_classes) is handled the same way as one that
        returns (batch, num_classes).
        """
        labels = batch['labels'].view(-1)
        logits = self.model(batch['input_ids'])
        logits = logits.view(-1, logits.shape[-1])
        loss = self.loss_fn(logits, labels)
        preds = torch.argmax(logits, dim=-1)
        return loss, preds, labels

    def training_step(self, batch, batch_idx):
        """Compute the training loss and log per-step loss, accuracy and F1."""
        loss, preds, labels = self._shared_step(batch)
        self.log("train_loss", loss)

        self.train_accuracy.update(preds, labels)
        self.train_f1.update(preds, labels)
        self.log('train_acc', self.train_accuracy, on_step=True, on_epoch=False)
        self.log('train_f1', self.train_f1, on_step=True, on_epoch=False)

        return loss

    def validation_step(self, batch, batch_idx):
        """Log validation loss and accumulate epoch-level accuracy and F1."""
        loss, preds, labels = self._shared_step(batch)
        self.log("val_loss", loss)

        self.val_accuracy.update(preds, labels)
        self.val_f1.update(preds, labels)
        self.log('val_acc', self.val_accuracy, on_step=False, on_epoch=True)
        self.log('val_f1', self.val_f1, on_step=False, on_epoch=True)

    def test_step(self, batch, batch_idx):
        """Log test loss and accumulate epoch-level accuracy and F1."""
        loss, preds, labels = self._shared_step(batch)
        self.log("test_loss", loss)

        self.test_accuracy.update(preds, labels)
        self.test_f1.update(preds, labels)
        self.log('test_acc', self.test_accuracy, on_step=False, on_epoch=True)
        self.log('test_f1', self.test_f1, on_step=False, on_epoch=True)

    def configure_optimizers(self):
        """Adam plus a plateau scheduler driven by the validation loss."""
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
        # The monitored quantity is a loss, so the scheduler must run in 'min'
        # mode: the LR is reduced when val_loss stops decreasing. In 'max' mode
        # a steadily improving (falling) loss counts as "no improvement" and the
        # learning rate is cut while training is still progressing.
        scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=3)
        return {"optimizer": optimizer, "lr_scheduler": {"scheduler": scheduler, "monitor": "val_loss"}}

#### **Estandard_Model**

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# 🔹 یک بلاک ترنسفورمر
class TransformerBlock(nn.Module):
    """Transformer encoder block: self-attention followed by a feed-forward MLP.

    Each sub-layer is wrapped as ``LayerNorm(x + Dropout(sublayer(x)))``.

    Args:
        hidden_dim: feature size of the input and output.
        num_heads: number of attention heads.
        ff_dim: width of the hidden layer in the feed-forward MLP.
        dropout: dropout probability applied to each sub-layer output.
    """

    def __init__(self, hidden_dim=128, num_heads=4, ff_dim=256, dropout=0.1):
        super().__init__()
        # batch_first=True: inputs and outputs are (batch, seq_len, hidden_dim).
        self.attn = nn.MultiheadAttention(hidden_dim, num_heads, batch_first=True)
        self.ln1 = nn.LayerNorm(hidden_dim)
        self.ln2 = nn.LayerNorm(hidden_dim)

        # Position-wise feed-forward network.
        self.ff = nn.Sequential(
            nn.Linear(hidden_dim, ff_dim),
            nn.ReLU(),
            nn.Linear(ff_dim, hidden_dim),
        )

        self.drop1 = nn.Dropout(dropout)
        self.drop2 = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Apply the block to ``x`` of shape (B, T, H); ``mask`` is passed as ``attn_mask``."""
        # Self-attention sub-layer with residual connection; attention weights are discarded.
        attended = self.attn(x, x, x, attn_mask=mask)[0]
        x = self.ln1(x + self.drop1(attended))

        # Feed-forward sub-layer with residual connection.
        return self.ln2(x + self.drop2(self.ff(x)))


# 🔹 مدل کامل با چند بلاک ترنسفورمر
MAX_LENGTH = 512  # number of rows in the learned positional-embedding table
class Estandard_Model(nn.Module):
    """Baseline classifier: token + position embeddings, a stack of
    ``TransformerBlock`` layers, mean pooling over the sequence, linear head.

    Args:
        vocab_size: rows of the word-embedding table (default bound to
            ``VOCAB_SIZE`` when the class is defined).
        hidden_dim: embedding / hidden feature size.
        num_layers: number of Transformer blocks.
        num_heads: attention heads per block.
        ff_dim: feed-forward width per block.
        num_labels: number of output classes.
    """

    def __init__(self, vocab_size=VOCAB_SIZE, hidden_dim=128, num_layers=3, num_heads=4, ff_dim=256, num_labels=20):
        super().__init__()
        self.word_embed = nn.Embedding(vocab_size, hidden_dim)
        self.pos_embed = nn.Embedding(MAX_LENGTH, hidden_dim)

        blocks = []
        for _ in range(num_layers):
            blocks.append(
                TransformerBlock(hidden_dim=hidden_dim, num_heads=num_heads, ff_dim=ff_dim)
            )
        self.layers = nn.ModuleList(blocks)

        self.classifier = nn.Linear(hidden_dim, num_labels)
        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Normal(0, 0.02) init for Linear and Embedding weights; zero Linear biases."""
        if not isinstance(m, (nn.Linear, nn.Embedding)):
            return
        nn.init.normal_(m.weight, mean=0.0, std=0.02)
        if isinstance(m, nn.Linear) and m.bias is not None:
            nn.init.zeros_(m.bias)

    def forward(self, input_ids):
        """Map ``input_ids`` of shape (B, T) to logits of shape (B, num_labels).

        ``T`` must not exceed ``MAX_LENGTH``; larger position indices are out of
        range for the positional-embedding table. No padding mask is applied, so
        every position (padding included) contributes to the mean pooling.
        """
        batch_size, seq_len = input_ids.size()
        position_ids = torch.arange(seq_len, device=input_ids.device)
        position_ids = position_ids.unsqueeze(0).expand(batch_size, seq_len)

        # Sum of token and position embeddings: (B, T, H).
        hidden = self.word_embed(input_ids) + self.pos_embed(position_ids)

        for block in self.layers:
            hidden = block(hidden)

        # Mean over the sequence dimension gives one vector per example: (B, H).
        pooled = hidden.mean(dim=1)
        return self.classifier(pooled)


#### **Model_1**

In [None]:
class conv_Attention(nn.Module):
    """Single-head scaled dot-product self-attention with convolutional projections.

    Query, key and value are each produced by a width-3, same-padded ``Conv1d``
    over the sequence axis instead of a per-token linear layer. Dropout (p=0.3)
    is applied to the attention weights and to the output. No mask is applied.

    Args:
        hidden_dim: channel size of the input and of all three projections.
    """

    def __init__(self, hidden_dim=128):
        super().__init__()
        self.hidden_dim = hidden_dim
        # Conv1d works on (batch, channels, length) tensors.
        self.key = nn.Conv1d(hidden_dim, hidden_dim, kernel_size=3, padding='same')
        self.value = nn.Conv1d(hidden_dim, hidden_dim, kernel_size=3, padding='same')
        self.query = nn.Conv1d(hidden_dim, hidden_dim, kernel_size=3, padding='same')
        self.attn_drop = nn.Dropout(0.3)
        self.resid_drop = nn.Dropout(0.3)

    def forward(self, x):
        """Attend over ``x`` of shape (B, T, C) and return a tensor of the same shape."""
        _, _, channels = x.size()

        # (B, T, C) -> (B, C, T) for the convolutions, then back to (B, T, C).
        channels_first = x.transpose(1, 2)
        keys = self.key(channels_first).transpose(1, 2)
        queries = self.query(channels_first).transpose(1, 2)
        values = self.value(channels_first).transpose(1, 2)

        # (B, T, T) similarity scores, scaled by sqrt(C) and normalised per query.
        scores = torch.matmul(queries, keys.transpose(-2, -1)) / math.sqrt(channels)
        weights = self.attn_drop(F.softmax(scores, dim=-1))

        return self.resid_drop(torch.matmul(weights, values))

In [None]:
class EncoderBlck_1(nn.Module):
    """Encoder block: convolutional attention, then two parallel convolutions
    whose outputs are concatenated and projected back to ``hidden_dim``.

    Pipeline (shapes for an input of (B, T, H)):
        conv_Attention -> Dropout(0.5) -> LayerNorm             (B, T, H)
        conv1 and conv2 in parallel, concatenated on features   (B, T, 2H)
        Linear(2H -> H) -> Dropout(0.3) -> LayerNorm            (B, T, H)

    The block has no residual connections.
    """

    def __init__(self, hidden_dim=128):
        super().__init__()
        self.conv_attn = conv_Attention(hidden_dim)

        self.drop_attn = nn.Dropout(0.5)
        self.drop_proj = nn.Dropout(0.3)

        self.ln_attn = nn.LayerNorm(hidden_dim)
        self.ln_proj = nn.LayerNorm(hidden_dim)

        self.conv1 = nn.Conv1d(hidden_dim, hidden_dim, kernel_size=3, padding="same")
        self.conv2 = nn.Conv1d(hidden_dim, hidden_dim, kernel_size=3, padding="same")

        self.proj = nn.Linear(hidden_dim * 2, hidden_dim)

        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Per-module-type initialisation applied to this block and its submodules."""
        if isinstance(m, nn.LSTM):
            # Xavier for input weights, orthogonal for recurrent weights, zero biases.
            for name, param in m.named_parameters():
                if "weight_ih" in name:
                    nn.init.xavier_uniform_(param.data)
                elif "weight_hh" in name:
                    nn.init.orthogonal_(param.data)
                elif "bias" in name:
                    param.data.fill_(0)
            return
        if isinstance(m, nn.Conv1d):
            nn.init.kaiming_uniform_(m.weight, nonlinearity="relu")
            if m.bias is not None:
                nn.init.zeros_(m.bias)
            return
        if isinstance(m, nn.LayerNorm):
            nn.init.ones_(m.weight)
            nn.init.zeros_(m.bias)
            return
        if isinstance(m, nn.Linear):
            nn.init.normal_(m.weight, mean=0.0, std=0.02)
            if m.bias is not None:
                nn.init.zeros_(m.bias)

    def forward(self, input_ids):
        """Transform hidden states ``input_ids`` of shape (B, T, H); returns (B, T, H).

        Despite its name, the argument holds embedded features, not token ids.
        """
        # Convolutional self-attention keeps the (B, T, H) layout.
        attended = self.drop_attn(self.conv_attn(input_ids))
        normed = self.ln_attn(attended)

        # Conv1d expects (B, C, L); convert once and share it between both branches.
        channels_first = normed.permute(0, 2, 1)
        branch_a = self.conv1(channels_first).permute(0, 2, 1)  # (B, T, H)
        branch_b = self.conv2(channels_first).permute(0, 2, 1)  # (B, T, H)

        merged = torch.cat([branch_a, branch_b], dim=-1)        # (B, T, 2H)
        projected = self.drop_proj(self.proj(merged))
        return self.ln_proj(projected)

In [None]:
MAX_LENGTH = 512  # number of rows in the learned positional-embedding table
class Model_1(nn.Module):
    """Text classifier built from a stack of ``EncoderBlck_1`` layers.

    Token and position embeddings are summed, passed through ``num_layers``
    encoder blocks, mean-pooled over the sequence and classified linearly.

    Args:
        vocab_size: rows of the word-embedding table (default bound to
            ``VOCAB_SIZE`` when the class is defined).
        hidden_dim: embedding / hidden feature size.
        num_layers: number of encoder blocks.
        num_labels: number of output classes.
    """

    def __init__(self, vocab_size=VOCAB_SIZE, hidden_dim=128,num_layers=3, num_labels=20):
        super().__init__()
        self.word_embed = nn.Embedding(vocab_size, hidden_dim)
        self.pos_embed = nn.Embedding(MAX_LENGTH, hidden_dim)

        blocks = []
        for _ in range(num_layers):
            blocks.append(EncoderBlck_1(hidden_dim=hidden_dim))
        self.layers = nn.ModuleList(blocks)

        # The encoder output already has hidden_dim features, so no shared
        # projection / LayerNorm is inserted between the blocks and the head.
        self.classifier = nn.Linear(hidden_dim, num_labels)
        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Normal(0, 0.02) init for Linear and Embedding weights; zero Linear biases."""
        if not isinstance(m, (nn.Linear, nn.Embedding)):
            return
        nn.init.normal_(m.weight, mean=0.0, std=0.02)
        if isinstance(m, nn.Linear) and m.bias is not None:
            nn.init.zeros_(m.bias)

    def forward(self, input_ids):
        """Map ``input_ids`` of shape (B, T) to logits of shape (B, num_labels).

        ``T`` must not exceed ``MAX_LENGTH``; larger position indices are out of
        range for the positional-embedding table.
        """
        batch_size, seq_len = input_ids.size()
        position_ids = torch.arange(seq_len, device=input_ids.device)
        position_ids = position_ids.unsqueeze(0).expand(batch_size, seq_len)

        hidden = self.word_embed(input_ids) + self.pos_embed(position_ids)
        for block in self.layers:
            hidden = block(hidden)

        # Mean over the sequence axis yields one representation per example: (B, H).
        pooled = hidden.mean(dim=1)
        return self.classifier(pooled)

In [None]:
# ===== Training =====
# Build Model_1 and wrap it in the Lightning module; the datasets come from
# LitModule's default arguments.
model_1 = Model_1(num_labels=20,hidden_dim=128,num_layers=3)
lit_module_1 = LitModule(model_1,tokenizer)

# Stop early when the epoch-level validation F1 stops improving.
# NOTE(review): this name is reassigned in the later training cells.
early_stop_callback = EarlyStopping(
    monitor='val_f1',  # Metric to monitor
    min_delta=0.00,           # Minimum change to qualify as an improvement
    patience=3,               # Number of epochs with no improvement after which training will be stopped
    verbose=True,             # Log information to the console
    mode='max'                # 'max' mode means we want to maximize the monitored quantity
)

# `device` is the "cuda"/"cpu" string chosen in the imports cell.
trainer_1 = pl.Trainer(
    max_epochs=6,
    accelerator=device,
    logger=False, # Set to True to use a logger for better tracking (e.g., TensorBoard)
    callbacks=[early_stop_callback], # Pass the early stopping callback
    gradient_clip_val=1.0 # Add gradient clipping
)
trainer_1.fit(lit_module_1)

# Evaluate on the held-out test split using the module's test_dataloader.
trainer_1.test(lit_module_1)

#### **Model_2**

In [None]:
class gru_Attention(nn.Module):
    """Attention over GRU sequence summaries.

    Three independent single-layer GRUs read the input sequence; only their
    final hidden states are used, as query, key and value. Attention is
    computed separately for every example in the batch, so an example's output
    never depends on which other examples share its batch.

    NOTE(review): each example has exactly one key, so the softmax weight is
    always 1 and, outside of dropout, the output equals the value GRU's final
    state. The query and key GRUs therefore do not influence the result. This
    was already the case for batch size 1; attending over the GRUs' per-step
    outputs would be needed for the attention to be non-trivial.

    Args:
        hidden_dim: input feature size and hidden size of each GRU.
    """

    def __init__(self, hidden_dim=128):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.key = nn.GRU(hidden_dim, hidden_dim, batch_first=True)
        self.value = nn.GRU(hidden_dim, hidden_dim, batch_first=True)
        self.query = nn.GRU(hidden_dim, hidden_dim, batch_first=True)
        self.attn_drop = nn.Dropout(0.3)
        self.resid_drop = nn.Dropout(0.3)

    def forward(self, x):
        """Summarise ``x`` of shape (B, T, C) into a tensor of shape (B, hidden_dim)."""
        B, T, C = x.size()

        # Final hidden state of each GRU: (1, B, H).
        _, k = self.key(x)
        _, q = self.query(x)
        _, v = self.value(x)

        # (1, B, H) -> (B, 1, H). Keeping the batch axis first makes the matmuls
        # below batched per example. Squeezing to (B, H) instead would turn
        # q @ k^T into a (B, B) matrix that mixes different examples.
        k = k.transpose(0, 1)
        q = q.transpose(0, 1)
        v = v.transpose(0, 1)

        att = (q @ k.transpose(-2, -1)) / math.sqrt(C)  # (B, 1, 1)
        att = F.softmax(att, dim=-1)
        att = self.attn_drop(att)

        y = att @ v                                      # (B, 1, H)
        y = self.resid_drop(y)
        return y.squeeze(1)                              # (B, H)

In [None]:
class EncoderBlck_2(nn.Module):
    """Encoder block: GRU attention summary, two parallel convolutions, then an LSTM.

    Pipeline (shapes for an input of (B, T, H)):
        gru_Attention -> unsqueeze -> Dropout(0.5) -> LayerNorm   (B, 1, H)
        conv1 and conv2 in parallel, concatenated on features     (B, 1, 2H)
        LSTM(2H -> H) -> Dropout(0.5) -> LayerNorm                (B, 1, H)

    The sequence axis is collapsed to length 1 by the attention summary, so the
    output has a single time step regardless of the input length.
    """

    def __init__(self, hidden_dim=128):
        super().__init__()
        self.lstm = nn.LSTM(hidden_dim * 2, hidden_dim, batch_first=True, bidirectional=False)
        self.gru_attn = gru_Attention(hidden_dim)

        self.drop_lstm = nn.Dropout(0.5)
        self.drop_attn = nn.Dropout(0.5)

        self.ln_lstm = nn.LayerNorm(hidden_dim)
        self.ln_attn = nn.LayerNorm(hidden_dim)

        self.conv1 = nn.Conv1d(hidden_dim, hidden_dim, kernel_size=3, padding="same")
        self.conv2 = nn.Conv1d(hidden_dim, hidden_dim, kernel_size=3, padding="same")

        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Per-module-type initialisation applied to this block and its submodules."""
        if isinstance(m, nn.LSTM):
            # Xavier for input weights, orthogonal for recurrent weights, zero biases.
            for name, param in m.named_parameters():
                if "weight_ih" in name:
                    nn.init.xavier_uniform_(param.data)
                elif "weight_hh" in name:
                    nn.init.orthogonal_(param.data)
                elif "bias" in name:
                    param.data.fill_(0)
            return
        if isinstance(m, nn.Conv1d):
            nn.init.kaiming_uniform_(m.weight, nonlinearity="relu")
            if m.bias is not None:
                nn.init.zeros_(m.bias)
            return
        if isinstance(m, nn.LayerNorm):
            nn.init.ones_(m.weight)
            nn.init.zeros_(m.bias)
            return
        if isinstance(m, nn.Linear):
            nn.init.normal_(m.weight, mean=0.0, std=0.02)
            if m.bias is not None:
                nn.init.zeros_(m.bias)

    def forward(self, input_ids):
        """Transform hidden states ``input_ids`` of shape (B, T, H); returns (B, 1, H).

        Despite its name, the argument holds embedded features, not token ids.
        """
        # GRU attention yields one vector per example; restore a length-1 sequence axis.
        summary = self.gru_attn(input_ids).unsqueeze(1)          # (B, 1, H)
        normed = self.ln_attn(self.drop_attn(summary))

        # Conv1d expects (B, C, L); convert once and share it between both branches.
        channels_first = normed.permute(0, 2, 1)
        branch_a = self.conv1(channels_first).permute(0, 2, 1)   # (B, 1, H)
        branch_b = self.conv2(channels_first).permute(0, 2, 1)   # (B, 1, H)
        merged = torch.cat([branch_a, branch_b], dim=-1)         # (B, 1, 2H)

        # Keep the LSTM's per-step outputs and discard its final (h, c) state.
        recurrent_out = self.lstm(merged)[0]                     # (B, 1, H)
        return self.ln_lstm(self.drop_lstm(recurrent_out))


In [None]:
MAX_LENGTH = 512  # number of rows in the learned positional-embedding table
class Model_2(nn.Module):
    """Text classifier built from a stack of ``EncoderBlck_2`` layers.

    After every encoder block the same ``shared_proj`` linear layer and
    ``ln_proj`` LayerNorm are applied (their weights are shared across layers).
    There is no pooling step: with ``EncoderBlck_2`` as defined in this notebook
    the hidden state has a length-1 sequence axis after the first block, so the
    logits come out as (B, 1, num_labels).

    Args:
        vocab_size: rows of the word-embedding table (default bound to
            ``VOCAB_SIZE`` when the class is defined).
        hidden_dim: embedding / hidden feature size.
        num_layers: number of encoder blocks.
        num_labels: number of output classes.
    """

    def __init__(self, vocab_size=VOCAB_SIZE, hidden_dim=128,num_layers=3, num_labels=20):
        super().__init__()
        self.word_embed = nn.Embedding(vocab_size, hidden_dim)
        self.pos_embed = nn.Embedding(MAX_LENGTH, hidden_dim)

        blocks = []
        for _ in range(num_layers):
            blocks.append(EncoderBlck_2(hidden_dim=hidden_dim))
        self.layers = nn.ModuleList(blocks)

        # The encoder output already has hidden_dim features, so the shared
        # projection maps hidden_dim -> hidden_dim.
        self.shared_proj = nn.Linear(hidden_dim, hidden_dim)
        self.ln_proj = nn.LayerNorm(hidden_dim)
        self.classifier = nn.Linear(hidden_dim, num_labels)
        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Normal(0, 0.02) init for Linear and Embedding weights; zero Linear biases."""
        if not isinstance(m, (nn.Linear, nn.Embedding)):
            return
        nn.init.normal_(m.weight, mean=0.0, std=0.02)
        if isinstance(m, nn.Linear) and m.bias is not None:
            nn.init.zeros_(m.bias)

    def forward(self, input_ids):
        """Map ``input_ids`` of shape (B, T) to logits; see the class docstring for the shape.

        ``T`` must not exceed ``MAX_LENGTH``; larger position indices are out of
        range for the positional-embedding table.
        """
        batch_size, seq_len = input_ids.size()
        position_ids = torch.arange(seq_len, device=input_ids.device)
        position_ids = position_ids.unsqueeze(0).expand(batch_size, seq_len)

        hidden = self.word_embed(input_ids) + self.pos_embed(position_ids)
        for block in self.layers:
            hidden = self.ln_proj(self.shared_proj(block(hidden)))

        return self.classifier(hidden)

In [None]:
# ===== Training =====
# Build Model_2 with its default sizes and wrap it in the Lightning module; the
# datasets come from LitModule's default arguments.
model_2 = Model_2(num_labels=20)
lit_module_2 = LitModule(model_2,tokenizer)

# Stop early when the epoch-level validation F1 stops improving.
# NOTE(review): this reassigns the callback name used in the Model_1 cell.
early_stop_callback = EarlyStopping(
    monitor='val_f1',  # Metric to monitor
    min_delta=0.00,           # Minimum change to qualify as an improvement
    patience=3,               # Number of epochs with no improvement after which training will be stopped
    verbose=True,             # Log information to the console
    mode='max'                # 'max' mode means we want to maximize the monitored quantity
)


# `device` is the "cuda"/"cpu" string chosen in the imports cell.
trainer_2 = pl.Trainer(
    max_epochs=12,
    accelerator=device,
    logger=False, # Set to True to use a logger for better tracking (e.g., TensorBoard)
    callbacks=[early_stop_callback], # Pass the early stopping callback
    gradient_clip_val=1.0 # Add gradient clipping
)
trainer_2.fit(lit_module_2)

# ===== Testing =====
# Evaluate on the held-out test split using the module's test_dataloader.
trainer_2.test(lit_module_2)

INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name           | Type               | Params | Mode 
--------------------------------------------------------------
0 | model          | Model_2            | 5.8 M  | train
1 | loss_fn        | CrossEntropyLoss   | 0      | train
2 | train_accuracy | MulticlassAccuracy | 0      | train
3 | val_accuracy   | MulticlassAccuracy | 0      | train
4 | train_f1       | Multicl

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_f1 improved. New best score: 0.211


Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Monitored metric val_f1 did not improve in the last 3 records. Best score: 0.211. Signaling Trainer to stop.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]

[{'test_loss': 2.8626790046691895,
  'test_acc': 0.21359999477863312,
  'test_f1': 0.21359999477863312}]

نتیجه : عملکرد ناموفق

دقت ~20% برای ۲۰ کلاس از «کاملاً شانسی» (5%) خیلی بهتره، اما هنوز جای پیشرفت زیاد هست

پیش بینی شانسی برای ۲۰ کلاس ≈ 1/20 = 5%. پس 21% یعنی مدل واقعاً چیزی آموخته

علت عملکرد پایین ⁉

⏹
اولا که 4 ایپاک اموزشی خیلی کمه، مدل توی ایپاک 5 تا 10 تازه داره یادمیگیره

در واقع early-stopping خیلی سختگیرانه تنظیم شده

پیشنهاد بهتر:
 - patience -> 5-10
 - min-delta -> 1e-4

برای وقتی که مدل رو از صفر (**from-scratch**) اموزش میدیم:
- تعداد ایپاک بین 10 تا 30 خوبه

⏹
مدل و همچنین مکانیزم gru_attention برای تسک طبقه بندی متن زیادیه

در تسک های طبقه بندی، مدل به یک دید کلی برای دسته بندی متن ها احتیاج داره. ولی این مدل جزییات و پیچیدگی های متن رو تا حدودی یاد گرفته. دیگه اون دید کلی رو برای دسته بندی متون نداره، دیدش ریزبینانه و جزئی تر هست

این مکانیزم توجه برای تسک های seq2seq و text-generation خیلی خوبه

ولی برای این تسک که فقط یک طبقه بندی سادست زیادی بوده

#### **Model_3**

In [None]:
MAX_LENGTH = 512  # number of rows in the learned positional-embedding table
class Model_3(nn.Module):
    """Text classifier that alternates ``EncoderBlck_2`` and ``EncoderBlck_1`` layers.

    For each of the ``num_layers`` stages the recurrent block (``EncoderBlck_2``)
    runs first and the convolutional block (``EncoderBlck_1``) second. The result
    is mean-pooled over the sequence axis and classified linearly.

    Args:
        vocab_size: rows of the word-embedding table (default bound to
            ``VOCAB_SIZE`` when the class is defined).
        hidden_dim: embedding / hidden feature size.
        num_layers: number of (recurrent, convolutional) stage pairs.
        num_labels: number of output classes.
    """

    def __init__(self, vocab_size=VOCAB_SIZE, hidden_dim=128,num_layers=2, num_labels=20):
        super().__init__()
        self.word_embed = nn.Embedding(vocab_size, hidden_dim)
        self.pos_embed = nn.Embedding(MAX_LENGTH, hidden_dim)

        conv_blocks = []
        for _ in range(num_layers):
            conv_blocks.append(EncoderBlck_1(hidden_dim=hidden_dim))
        self.encod_blocks_1 = nn.ModuleList(conv_blocks)

        recurrent_blocks = []
        for _ in range(num_layers):
            recurrent_blocks.append(EncoderBlck_2(hidden_dim=hidden_dim))
        self.encod_blocks_2 = nn.ModuleList(recurrent_blocks)

        # The encoder output already has hidden_dim features, so no shared
        # projection / LayerNorm is inserted between the blocks and the head.
        self.classifier = nn.Linear(hidden_dim, num_labels)
        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Normal(0, 0.02) init for Linear and Embedding weights; zero Linear biases."""
        if not isinstance(m, (nn.Linear, nn.Embedding)):
            return
        nn.init.normal_(m.weight, mean=0.0, std=0.02)
        if isinstance(m, nn.Linear) and m.bias is not None:
            nn.init.zeros_(m.bias)

    def forward(self, input_ids):
        """Map ``input_ids`` of shape (B, T) to logits of shape (B, num_labels).

        ``T`` must not exceed ``MAX_LENGTH``; larger position indices are out of
        range for the positional-embedding table.
        """
        batch_size, seq_len = input_ids.size()
        position_ids = torch.arange(seq_len, device=input_ids.device)
        position_ids = position_ids.unsqueeze(0).expand(batch_size, seq_len)

        hidden = self.word_embed(input_ids) + self.pos_embed(position_ids)
        for conv_block, recurrent_block in zip(self.encod_blocks_1, self.encod_blocks_2):
            # The recurrent block is applied before the convolutional one.
            hidden = recurrent_block(hidden)
            hidden = conv_block(hidden)

        # Pool over the sequence axis so the task is text-level rather than
        # token-level classification.
        pooled = hidden.mean(dim=1)
        return self.classifier(pooled)

In [None]:
# ===== Training: Model_3 with its default sizes =====
# Seed Python, NumPy and torch before the model is built so that weight
# initialisation (and any shuffling done by the data loaders) is repeatable.
pl.seed_everything(42, workers=True)

# No datasets are passed here, so LitModule uses whatever defaults it defines.
model_3 = Model_3(num_labels=20)
lit_module_3 = LitModule(model_3, tokenizer)

# Stop once val_f1 has failed to improve for 3 consecutive validation runs.
early_stop_callback = EarlyStopping(
    monitor='val_f1',  # metric to watch
    min_delta=0.00,    # any increase counts as an improvement
    patience=3,        # validation runs without improvement before stopping
    verbose=True,      # log improvements / stop decisions
    mode='max'         # higher val_f1 is better
)


trainer_3 = pl.Trainer(
    max_epochs=15,
    accelerator=device,
    logger=False,  # set to True to record metrics with a logger (e.g. TensorBoard)
    callbacks=[early_stop_callback],
    gradient_clip_val=1.0  # clip gradients to 1.0
)
trainer_3.fit(lit_module_3)

# NOTE(review): this evaluates the weights of the last trained epoch, not the
# epoch with the best val_f1 — consider a ModelCheckpoint monitoring val_f1 and
# testing with ckpt_path="best".
trainer_3.test(lit_module_3)

NameError: name 'EncoderBlck_2' is not defined

#### Labs

##### Lab_1

In [None]:
# ===== Lab_1 / Estandard_Model: hidden_dim=16, num_heads=2, num_layers=3 =====
# Seed Python, NumPy and torch before the model is built so that weight
# initialisation (and any shuffling done by the data loaders) is repeatable
# and this run can be compared with the other Lab_1 models.
pl.seed_everything(42, workers=True)

est_model = Estandard_Model(num_labels=20, hidden_dim=16, num_heads=2, num_layers=3)
# Small fixed subsets (first 3000 / 300 / 500 rows) keep the experiment fast.
lit_est_module = LitModule(
    est_model,
    tokenizer,
    train_dataset=train_dataset.select(range(3000)),
    val_dataset=val_dataset.select(range(300)),
    test_dataset=test_dataset.select(range(500)),
)

# Stop once val_f1 has failed to improve for 3 consecutive validation runs.
early_stop_callback = EarlyStopping(
    monitor='val_f1',  # metric to watch
    min_delta=0.00,    # any increase counts as an improvement
    patience=3,        # validation runs without improvement before stopping
    verbose=True,      # log improvements / stop decisions
    mode='max'         # higher val_f1 is better
)

est_trainer = pl.Trainer(
    max_epochs=6,
    accelerator=device,
    logger=False,  # set to True to record metrics with a logger (e.g. TensorBoard)
    callbacks=[early_stop_callback],
    gradient_clip_val=1.0  # clip gradients to 1.0
)
est_trainer.fit(lit_est_module)

# NOTE(review): this evaluates the weights of the last trained epoch, not the
# epoch with the best val_f1 — consider a ModelCheckpoint monitoring val_f1 and
# testing with ckpt_path="best".
est_trainer.test(lit_est_module)

INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.callbacks.model_summary:
  | Name           | Type               | Params | Mode 
--------------------------------------------------------------
0 | model          | Estandard_Model    | 525 K  | train
1 | loss_fn        | CrossEntropyLoss   | 0      | train
2 | train_accuracy | MulticlassAccuracy | 0      | train
3 | val_accuracy   | MulticlassAccuracy | 0      | train
4 | train_f1       | MulticlassF1Score  | 0      | train
5 | val_f1         | MulticlassF1Score  | 0      | train
6 

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_f1 improved. New best score: 0.217


Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Monitored metric val_f1 did not improve in the last 3 records. Best score: 0.217. Signaling Trainer to stop.


Testing: |          | 0/? [00:00<?, ?it/s]

[{'test_loss': 4.7213568687438965,
  'test_acc': 0.20800000429153442,
  'test_f1': 0.20800000429153442}]

In [None]:
# ===== Lab_1 / Model_1: hidden_dim=16, num_layers=3 =====
# Seed Python, NumPy and torch before the model is built so that weight
# initialisation (and any shuffling done by the data loaders) is repeatable
# and this run can be compared with the other Lab_1 models.
pl.seed_everything(42, workers=True)

model_1 = Model_1(num_labels=20, hidden_dim=16, num_layers=3)
# Small fixed subsets (first 3000 / 300 / 500 rows) keep the experiment fast.
lit_module_1 = LitModule(
    model_1,
    tokenizer,
    train_dataset=train_dataset.select(range(3000)),
    val_dataset=val_dataset.select(range(300)),
    test_dataset=test_dataset.select(range(500)),
)

# Stop once val_f1 has failed to improve for 3 consecutive validation runs.
early_stop_callback = EarlyStopping(
    monitor='val_f1',  # metric to watch
    min_delta=0.00,    # any increase counts as an improvement
    patience=3,        # validation runs without improvement before stopping
    verbose=True,      # log improvements / stop decisions
    mode='max'         # higher val_f1 is better
)

trainer_1 = pl.Trainer(
    max_epochs=6,
    accelerator=device,
    logger=False,  # set to True to record metrics with a logger (e.g. TensorBoard)
    callbacks=[early_stop_callback],
    gradient_clip_val=1.0  # clip gradients to 1.0
)
trainer_1.fit(lit_module_1)

# NOTE(review): this evaluates the weights of the last trained epoch, not the
# epoch with the best val_f1 — consider a ModelCheckpoint monitoring val_f1 and
# testing with ckpt_path="best".
trainer_1.test(lit_module_1)

INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.callbacks.model_summary:
  | Name           | Type               | Params | Mode 
--------------------------------------------------------------
0 | model          | Model_1            | 510 K  | train
1 | loss_fn        | CrossEntropyLoss   | 0      | train
2 | train_accuracy | MulticlassAccuracy | 0      | train
3 | val_accuracy   | MulticlassAccuracy | 0      | train
4 | train_f1       | MulticlassF1Score  | 0      | train
5 | val_f1         | MulticlassF1Score  | 0      | train
6 

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_f1 improved. New best score: 0.220


Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Monitored metric val_f1 did not improve in the last 3 records. Best score: 0.220. Signaling Trainer to stop.


Testing: |          | 0/? [00:00<?, ?it/s]

[{'test_loss': 2.7551193237304688,
  'test_acc': 0.20200000703334808,
  'test_f1': 0.20200000703334808}]

In [None]:
# ===== Lab_1 / Model_2: hidden_dim=16, num_layers=3 =====
# Seed Python, NumPy and torch before the model is built so that weight
# initialisation (and any shuffling done by the data loaders) is repeatable
# and this run can be compared with the other Lab_1 models.
pl.seed_everything(42, workers=True)

model_2 = Model_2(num_labels=20, hidden_dim=16, num_layers=3)
# Small fixed subsets (first 3000 / 300 / 500 rows) keep the experiment fast.
lit_module_2 = LitModule(
    model_2,
    tokenizer,
    train_dataset=train_dataset.select(range(3000)),
    val_dataset=val_dataset.select(range(300)),
    test_dataset=test_dataset.select(range(500)),
)

# Stop once val_f1 has failed to improve for 3 consecutive validation runs.
early_stop_callback = EarlyStopping(
    monitor='val_f1',  # metric to watch
    min_delta=0.00,    # any increase counts as an improvement
    patience=3,        # validation runs without improvement before stopping
    verbose=True,      # log improvements / stop decisions
    mode='max'         # higher val_f1 is better
)

trainer_2 = pl.Trainer(
    max_epochs=6,
    accelerator=device,
    logger=False,  # set to True to record metrics with a logger (e.g. TensorBoard)
    callbacks=[early_stop_callback],
    gradient_clip_val=1.0  # clip gradients to 1.0
)
trainer_2.fit(lit_module_2)

# NOTE(review): this evaluates the weights of the last trained epoch, not the
# epoch with the best val_f1 — consider a ModelCheckpoint monitoring val_f1 and
# testing with ckpt_path="best".
trainer_2.test(lit_module_2)

INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
/usr/local/lib/python3.12/dist-packages/pytorch_lightning/callbacks/model_checkpoint.py:751: Checkpoint directory /content/checkpoints exists and is not empty.
INFO:pytorch_lightning.callbacks.model_summary:
  | Name           | Type               | Params | Mode 
--------------------------------------------------------------
0 | model          | Model_2            | 526 K  | train
1 | loss_fn        | CrossEntropyLoss   | 0      | train
2 | train_accuracy | MulticlassAccuracy | 0      | train
3 | val_accur

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_f1 improved. New best score: 0.220


Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Monitored metric val_f1 did not improve in the last 3 records. Best score: 0.220. Signaling Trainer to stop.


Testing: |          | 0/? [00:00<?, ?it/s]

[{'test_loss': 2.7458322048187256,
  'test_acc': 0.20200000703334808,
  'test_f1': 0.20200000703334808}]

In [None]:
# ===== Lab_1 / Model_3: hidden_dim=16, num_layers=1 =====
# Seed Python, NumPy and torch before the model is built so that weight
# initialisation (and any shuffling done by the data loaders) is repeatable
# and this run can be compared with the other Lab_1 models.
pl.seed_everything(42, workers=True)

model_3 = Model_3(num_labels=20, hidden_dim=16, num_layers=1)
# Small fixed subsets (first 3000 / 300 / 500 rows) keep the experiment fast.
lit_module_3 = LitModule(
    model_3,
    tokenizer,
    train_dataset=train_dataset.select(range(3000)),
    val_dataset=val_dataset.select(range(300)),
    test_dataset=test_dataset.select(range(500)),
)

# Stop once val_f1 has failed to improve for 3 consecutive validation runs.
early_stop_callback = EarlyStopping(
    monitor='val_f1',  # metric to watch
    min_delta=0.00,    # any increase counts as an improvement
    patience=3,        # validation runs without improvement before stopping
    verbose=True,      # log improvements / stop decisions
    mode='max'         # higher val_f1 is better
)

trainer_3 = pl.Trainer(
    max_epochs=6,
    accelerator=device,
    logger=False,  # set to True to record metrics with a logger (e.g. TensorBoard)
    callbacks=[early_stop_callback],
    gradient_clip_val=1.0  # clip gradients to 1.0
)
trainer_3.fit(lit_module_3)

# NOTE(review): this evaluates the weights of the last trained epoch, not the
# epoch with the best val_f1 — consider a ModelCheckpoint monitoring val_f1 and
# testing with ckpt_path="best".
trainer_3.test(lit_module_3)

INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
/usr/local/lib/python3.12/dist-packages/pytorch_lightning/callbacks/model_checkpoint.py:751: Checkpoint directory /content/checkpoints exists and is not empty.
INFO:pytorch_lightning.callbacks.model_summary:
  | Name           | Type               | Params | Mode 
--------------------------------------------------------------
0 | model          | Model_3            | 511 K  | train
1 | loss_fn        | CrossEntropyLoss   | 0      | train
2 | train_accuracy | MulticlassAccuracy | 0      | train
3 | val_accur

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_f1 improved. New best score: 0.220


Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Monitored metric val_f1 did not improve in the last 3 records. Best score: 0.220. Signaling Trainer to stop.


Testing: |          | 0/? [00:00<?, ?it/s]

[{'test_loss': 2.759507656097412,
  'test_acc': 0.20200000703334808,
  'test_f1': 0.20200000703334808}]

##### Lab_2

In [None]:
# ===== Lab_2 / Estandard_Model: hidden_dim=32, num_heads=2, num_layers=2 =====
# Seed Python, NumPy and torch before the model is built so that weight
# initialisation (and any shuffling done by the data loaders) is repeatable
# and this run can be compared with the other Lab_2 models.
pl.seed_everything(42, workers=True)

est_model = Estandard_Model(num_labels=20, hidden_dim=32, num_heads=2, num_layers=2)
# Small fixed subsets (first 3000 / 300 / 500 rows) keep the experiment fast.
lit_est_module = LitModule(
    est_model,
    tokenizer,
    train_dataset=train_dataset.select(range(3000)),
    val_dataset=val_dataset.select(range(300)),
    test_dataset=test_dataset.select(range(500)),
)

# Stop once val_f1 has failed to improve for 3 consecutive validation runs.
early_stop_callback = EarlyStopping(
    monitor='val_f1',  # metric to watch
    min_delta=0.00,    # any increase counts as an improvement
    patience=3,        # validation runs without improvement before stopping
    verbose=True,      # log improvements / stop decisions
    mode='max'         # higher val_f1 is better
)

est_trainer = pl.Trainer(
    max_epochs=6,
    accelerator=device,
    logger=False,  # set to True to record metrics with a logger (e.g. TensorBoard)
    callbacks=[early_stop_callback],
    gradient_clip_val=1.0  # clip gradients to 1.0
)
est_trainer.fit(lit_est_module)

# NOTE(review): this evaluates the weights of the last trained epoch, not the
# epoch with the best val_f1 — consider a ModelCheckpoint monitoring val_f1 and
# testing with ckpt_path="best".
est_trainer.test(lit_est_module)

INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.callbacks.model_summary:
  | Name           | Type               | Params | Mode 
--------------------------------------------------------------
0 | model          | Estandard_Model    | 1.0 M  | train
1 | loss_fn        | CrossEntropyLoss   | 0      | train
2 | train_accuracy | MulticlassAccuracy | 0      | train
3 | val_accuracy   | MulticlassAccuracy | 0      | train
4 | train_f1       | MulticlassF1Score  | 0      | train
5 | val_f1         | MulticlassF1Score  | 0      | train
6 

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_f1 improved. New best score: 0.223


Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Monitored metric val_f1 did not improve in the last 3 records. Best score: 0.223. Signaling Trainer to stop.


Testing: |          | 0/? [00:00<?, ?it/s]

[{'test_loss': 4.999692440032959,
  'test_acc': 0.18000000715255737,
  'test_f1': 0.18000000715255737}]

In [None]:
# ===== Lab_2 / Model_1: hidden_dim=32, num_layers=2 =====
# Seed Python, NumPy and torch before the model is built so that weight
# initialisation (and any shuffling done by the data loaders) is repeatable
# and this run can be compared with the other Lab_2 models.
pl.seed_everything(42, workers=True)

model_1 = Model_1(num_labels=20, hidden_dim=32, num_layers=2)
# Small fixed subsets (first 3000 / 300 / 500 rows) keep the experiment fast.
lit_module_1 = LitModule(
    model_1,
    tokenizer,
    train_dataset=train_dataset.select(range(3000)),
    val_dataset=val_dataset.select(range(300)),
    test_dataset=test_dataset.select(range(500)),
)

# Stop once val_f1 has failed to improve for 3 consecutive validation runs.
early_stop_callback = EarlyStopping(
    monitor='val_f1',  # metric to watch
    min_delta=0.00,    # any increase counts as an improvement
    patience=3,        # validation runs without improvement before stopping
    verbose=True,      # log improvements / stop decisions
    mode='max'         # higher val_f1 is better
)

trainer_1 = pl.Trainer(
    max_epochs=6,
    accelerator=device,
    logger=False,  # set to True to record metrics with a logger (e.g. TensorBoard)
    callbacks=[early_stop_callback],
    gradient_clip_val=1.0  # clip gradients to 1.0
)
trainer_1.fit(lit_module_1)

# NOTE(review): this evaluates the weights of the last trained epoch, not the
# epoch with the best val_f1 — consider a ModelCheckpoint monitoring val_f1 and
# testing with ckpt_path="best".
trainer_1.test(lit_module_1)

INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.callbacks.model_summary:
  | Name           | Type               | Params | Mode 
--------------------------------------------------------------
0 | model          | Model_1            | 1.0 M  | train
1 | loss_fn        | CrossEntropyLoss   | 0      | train
2 | train_accuracy | MulticlassAccuracy | 0      | train
3 | val_accuracy   | MulticlassAccuracy | 0      | train
4 | train_f1       | MulticlassF1Score  | 0      | train
5 | val_f1         | MulticlassF1Score  | 0      | train
6 

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_f1 improved. New best score: 0.220


Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_f1 improved by 0.003 >= min_delta = 0.0. New best score: 0.223


Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Monitored metric val_f1 did not improve in the last 3 records. Best score: 0.223. Signaling Trainer to stop.
INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=6` reached.


Testing: |          | 0/? [00:00<?, ?it/s]

[{'test_loss': 3.592057228088379,
  'test_acc': 0.1720000058412552,
  'test_f1': 0.1720000058412552}]

In [None]:
# ===== Lab_2 / Model_2: hidden_dim=32, num_layers=2 =====
# Seed Python, NumPy and torch before the model is built so that weight
# initialisation (and any shuffling done by the data loaders) is repeatable
# and this run can be compared with the other Lab_2 models.
pl.seed_everything(42, workers=True)

model_2 = Model_2(num_labels=20, hidden_dim=32, num_layers=2)
# Small fixed subsets (first 3000 / 300 / 500 rows) keep the experiment fast.
lit_module_2 = LitModule(
    model_2,
    tokenizer,
    train_dataset=train_dataset.select(range(3000)),
    val_dataset=val_dataset.select(range(300)),
    test_dataset=test_dataset.select(range(500)),
)

# Stop once val_f1 has failed to improve for 3 consecutive validation runs.
early_stop_callback = EarlyStopping(
    monitor='val_f1',  # metric to watch
    min_delta=0.00,    # any increase counts as an improvement
    patience=3,        # validation runs without improvement before stopping
    verbose=True,      # log improvements / stop decisions
    mode='max'         # higher val_f1 is better
)

trainer_2 = pl.Trainer(
    max_epochs=6,
    accelerator=device,
    logger=False,  # set to True to record metrics with a logger (e.g. TensorBoard)
    callbacks=[early_stop_callback],
    gradient_clip_val=1.0  # clip gradients to 1.0
)
trainer_2.fit(lit_module_2)

# NOTE(review): this evaluates the weights of the last trained epoch, not the
# epoch with the best val_f1 — consider a ModelCheckpoint monitoring val_f1 and
# testing with ckpt_path="best".
trainer_2.test(lit_module_2)

INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.callbacks.model_summary:
  | Name           | Type               | Params | Mode 
--------------------------------------------------------------
0 | model          | Model_2            | 1.1 M  | train
1 | loss_fn        | CrossEntropyLoss   | 0      | train
2 | train_accuracy | MulticlassAccuracy | 0      | train
3 | val_accuracy   | MulticlassAccuracy | 0      | train
4 | train_f1       | MulticlassF1Score  | 0      | train
5 | val_f1         | MulticlassF1Score  | 0      | train
6 

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_f1 improved. New best score: 0.220


Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Monitored metric val_f1 did not improve in the last 3 records. Best score: 0.220. Signaling Trainer to stop.


Testing: |          | 0/? [00:00<?, ?it/s]

[{'test_loss': 2.7362406253814697,
  'test_acc': 0.20200000703334808,
  'test_f1': 0.20200000703334808}]

In [None]:
# ===== Lab_2 / Model_3: hidden_dim=32, num_layers=1 =====
# Seed Python, NumPy and torch before the model is built so that weight
# initialisation (and any shuffling done by the data loaders) is repeatable
# and this run can be compared with the other Lab_2 models.
pl.seed_everything(42, workers=True)

model_3 = Model_3(num_labels=20, hidden_dim=32, num_layers=1)
# Small fixed subsets (first 3000 / 300 / 500 rows) keep the experiment fast.
lit_module_3 = LitModule(
    model_3,
    tokenizer,
    train_dataset=train_dataset.select(range(3000)),
    val_dataset=val_dataset.select(range(300)),
    test_dataset=test_dataset.select(range(500)),
)

# Stop once val_f1 has failed to improve for 3 consecutive validation runs.
early_stop_callback = EarlyStopping(
    monitor='val_f1',  # metric to watch
    min_delta=0.00,    # any increase counts as an improvement
    patience=3,        # validation runs without improvement before stopping
    verbose=True,      # log improvements / stop decisions
    mode='max'         # higher val_f1 is better
)

trainer_3 = pl.Trainer(
    max_epochs=6,
    accelerator=device,
    logger=False,  # set to True to record metrics with a logger (e.g. TensorBoard)
    callbacks=[early_stop_callback],
    gradient_clip_val=1.0  # clip gradients to 1.0
)
trainer_3.fit(lit_module_3)

# NOTE(review): this evaluates the weights of the last trained epoch, not the
# epoch with the best val_f1 — consider a ModelCheckpoint monitoring val_f1 and
# testing with ckpt_path="best".
trainer_3.test(lit_module_3)

INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.callbacks.model_summary:
  | Name           | Type               | Params | Mode 
--------------------------------------------------------------
0 | model          | Model_3            | 1.0 M  | train
1 | loss_fn        | CrossEntropyLoss   | 0      | train
2 | train_accuracy | MulticlassAccuracy | 0      | train
3 | val_accuracy   | MulticlassAccuracy | 0      | train
4 | train_f1       | MulticlassF1Score  | 0      | train
5 | val_f1         | MulticlassF1Score  | 0      | train
6 

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_f1 improved. New best score: 0.220


Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Monitored metric val_f1 did not improve in the last 3 records. Best score: 0.220. Signaling Trainer to stop.


Testing: |          | 0/? [00:00<?, ?it/s]

[{'test_loss': 2.775315284729004,
  'test_acc': 0.20200000703334808,
  'test_f1': 0.20200000703334808}]

##### Lab_3

In [None]:
# ===== Lab_3 / Estandard_Model: hidden_dim=16, num_heads=2, num_layers=4 =====
# Seed Python, NumPy and torch before the model is built so that weight
# initialisation (and any shuffling done by the data loaders) is repeatable
# and this run can be compared with the other Lab_3 models.
pl.seed_everything(42, workers=True)

est_model = Estandard_Model(num_labels=20, hidden_dim=16, num_heads=2, num_layers=4)
# Small fixed subsets (first 3000 / 300 / 500 rows) keep the experiment fast.
lit_est_module = LitModule(
    est_model,
    tokenizer,
    train_dataset=train_dataset.select(range(3000)),
    val_dataset=val_dataset.select(range(300)),
    test_dataset=test_dataset.select(range(500)),
)

# Stop once val_f1 has failed to improve by at least 1e-3 for 3 consecutive
# validation runs.
early_stop_callback = EarlyStopping(
    monitor='val_f1',  # metric to watch
    min_delta=1e-3,    # smaller gains do not count as an improvement
    patience=3,        # validation runs without improvement before stopping
    verbose=True,      # log improvements / stop decisions
    mode='max'         # higher val_f1 is better
)

est_trainer = pl.Trainer(
    max_epochs=6,
    accelerator=device,
    logger=False,  # set to True to record metrics with a logger (e.g. TensorBoard)
    callbacks=[early_stop_callback],
    gradient_clip_val=1.0  # clip gradients to 1.0
)
est_trainer.fit(lit_est_module)

# NOTE(review): this evaluates the weights of the last trained epoch, not the
# epoch with the best val_f1 — consider a ModelCheckpoint monitoring val_f1 and
# testing with ckpt_path="best".
est_trainer.test(lit_est_module)

INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.callbacks.model_summary:
  | Name           | Type               | Params | Mode 
--------------------------------------------------------------
0 | model          | Estandard_Model    | 535 K  | train
1 | loss_fn        | CrossEntropyLoss   | 0      | train
2 | train_accuracy | MulticlassAccuracy | 0      | train
3 | val_accuracy   | MulticlassAccuracy | 0      | train
4 | train_f1       | MulticlassF1Score  | 0      | train
5 | val_f1         | MulticlassF1Score  | 0      | train
6 

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_f1 improved. New best score: 0.220


Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Monitored metric val_f1 did not improve in the last 3 records. Best score: 0.220. Signaling Trainer to stop.


Testing: |          | 0/? [00:00<?, ?it/s]

[{'test_loss': 3.744661808013916,
  'test_acc': 0.164000004529953,
  'test_f1': 0.164000004529953}]

In [None]:
# ===== Lab_3 / Model_1: hidden_dim=16, num_layers=4 =====
# Seed Python, NumPy and torch before the model is built so that weight
# initialisation (and any shuffling done by the data loaders) is repeatable
# and this run can be compared with the other Lab_3 models.
pl.seed_everything(42, workers=True)

model_1 = Model_1(num_labels=20, hidden_dim=16, num_layers=4)
# Small fixed subsets (first 3000 / 300 / 500 rows) keep the experiment fast.
lit_module_1 = LitModule(
    model_1,
    tokenizer,
    train_dataset=train_dataset.select(range(3000)),
    val_dataset=val_dataset.select(range(300)),
    test_dataset=test_dataset.select(range(500)),
)

# Stop once val_f1 has failed to improve by at least 1e-3 for 3 consecutive
# validation runs.
early_stop_callback = EarlyStopping(
    monitor='val_f1',  # metric to watch
    min_delta=1e-3,    # smaller gains do not count as an improvement
    patience=3,        # validation runs without improvement before stopping
    verbose=True,      # log improvements / stop decisions
    mode='max'         # higher val_f1 is better
)

trainer_1 = pl.Trainer(
    max_epochs=6,
    accelerator=device,
    logger=False,  # set to True to record metrics with a logger (e.g. TensorBoard)
    callbacks=[early_stop_callback],
    gradient_clip_val=1.0  # clip gradients to 1.0
)
trainer_1.fit(lit_module_1)

# NOTE(review): this evaluates the weights of the last trained epoch, not the
# epoch with the best val_f1 — consider a ModelCheckpoint monitoring val_f1 and
# testing with ckpt_path="best".
trainer_1.test(lit_module_1)

INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.callbacks.model_summary:
  | Name           | Type               | Params | Mode 
--------------------------------------------------------------
0 | model          | Model_1            | 514 K  | train
1 | loss_fn        | CrossEntropyLoss   | 0      | train
2 | train_accuracy | MulticlassAccuracy | 0      | train
3 | val_accuracy   | MulticlassAccuracy | 0      | train
4 | train_f1       | MulticlassF1Score  | 0      | train
5 | val_f1         | MulticlassF1Score  | 0      | train
6 

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_f1 improved. New best score: 0.220


Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Monitored metric val_f1 did not improve in the last 3 records. Best score: 0.220. Signaling Trainer to stop.


Testing: |          | 0/? [00:00<?, ?it/s]

[{'test_loss': 2.7757880687713623,
  'test_acc': 0.20200000703334808,
  'test_f1': 0.20200000703334808}]

In [None]:
# ===== Lab_3 / Model_2: hidden_dim=16, num_layers=4 =====
# Seed Python, NumPy and torch before the model is built so that weight
# initialisation (and any shuffling done by the data loaders) is repeatable
# and this run can be compared with the other Lab_3 models.
pl.seed_everything(42, workers=True)

model_2 = Model_2(num_labels=20, hidden_dim=16, num_layers=4)
# Small fixed subsets (first 3000 / 300 / 500 rows) keep the experiment fast.
lit_module_2 = LitModule(
    model_2,
    tokenizer,
    train_dataset=train_dataset.select(range(3000)),
    val_dataset=val_dataset.select(range(300)),
    test_dataset=test_dataset.select(range(500)),
)

# Stop once val_f1 has failed to improve by at least 1e-3 for 3 consecutive
# validation runs.
early_stop_callback = EarlyStopping(
    monitor='val_f1',  # metric to watch
    min_delta=1e-3,    # smaller gains do not count as an improvement
    patience=3,        # validation runs without improvement before stopping
    verbose=True,      # log improvements / stop decisions
    mode='max'         # higher val_f1 is better
)

trainer_2 = pl.Trainer(
    max_epochs=6,
    accelerator=device,
    logger=False,  # set to True to record metrics with a logger (e.g. TensorBoard)
    callbacks=[early_stop_callback],
    gradient_clip_val=1.0  # clip gradients to 1.0
)
trainer_2.fit(lit_module_2)

# NOTE(review): this evaluates the weights of the last trained epoch, not the
# epoch with the best val_f1 — consider a ModelCheckpoint monitoring val_f1 and
# testing with ckpt_path="best".
trainer_2.test(lit_module_2)

INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
/usr/local/lib/python3.12/dist-packages/pytorch_lightning/callbacks/model_checkpoint.py:751: Checkpoint directory /content/checkpoints exists and is not empty.
INFO:pytorch_lightning.callbacks.model_summary:
  | Name           | Type               | Params | Mode 
--------------------------------------------------------------
0 | model          | Model_2            | 536 K  | train
1 | loss_fn        | CrossEntropyLoss   | 0      | train
2 | train_accuracy | MulticlassAccuracy | 0      | train
3 | val_accur

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_f1 improved. New best score: 0.220


Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Monitored metric val_f1 did not improve in the last 3 records. Best score: 0.220. Signaling Trainer to stop.


Testing: |          | 0/? [00:00<?, ?it/s]

[{'test_loss': 2.749612331390381,
  'test_acc': 0.20200000703334808,
  'test_f1': 0.20200000703334808}]

In [None]:
# ===== Training: Model_3 (hidden_dim=16, 4 layers) =====
# Build the network and hand it, the tokenizer and index-based slices of each
# split to the Lightning module that drives training.
model_3 = Model_3(num_labels=20, hidden_dim=16, num_layers=4)
lit_module_3 = LitModule(
    model_3,
    tokenizer,
    train_dataset=train_dataset.select(range(3000)),  # indices 0..2999
    val_dataset=val_dataset.select(range(300)),       # indices 0..299
    test_dataset=test_dataset.select(range(500)),     # indices 0..499
)

# Early stopping on validation F1 (higher is better): training halts once
# val_f1 has not improved by more than min_delta for `patience` consecutive
# validation checks.
early_stop_callback = EarlyStopping(
    monitor="val_f1",
    mode="max",
    min_delta=1e-3,
    patience=3,
    verbose=True,  # print improvement / stop decisions in the cell output
)

trainer_3 = pl.Trainer(
    accelerator=device,               # "cuda" or "cpu", chosen in the setup cell
    max_epochs=6,
    gradient_clip_val=1.0,            # gradient clipping threshold
    callbacks=[early_stop_callback],
    logger=False,                     # no experiment logger for this run
)
trainer_3.fit(lit_module_3)

# Last expression of the cell, so the test metrics are shown as its output.
trainer_3.test(lit_module_3)

INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
/usr/local/lib/python3.12/dist-packages/pytorch_lightning/callbacks/model_checkpoint.py:751: Checkpoint directory /content/checkpoints exists and is not empty.
INFO:pytorch_lightning.callbacks.model_summary:
  | Name           | Type               | Params | Mode 
--------------------------------------------------------------
0 | model          | Model_3            | 553 K  | train
1 | loss_fn        | CrossEntropyLoss   | 0      | train
2 | train_accuracy | MulticlassAccuracy | 0      | train
3 | val_accur

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_f1 improved. New best score: 0.220


Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Monitored metric val_f1 did not improve in the last 3 records. Best score: 0.220. Signaling Trainer to stop.


Testing: |          | 0/? [00:00<?, ?it/s]

[{'test_loss': 2.7445192337036133,
  'test_acc': 0.20200000703334808,
  'test_f1': 0.20200000703334808}]

##### Lab_4

In [None]:
# ===== Training: Estandard_Model (hidden_dim=64, 4 heads, 7 layers) =====
# Build the network and hand it, the tokenizer and index-based slices of each
# split to the Lightning module that drives training.
est_model = Estandard_Model(
    num_labels=20,
    hidden_dim=64,
    num_heads=4,
    num_layers=7,
)
lit_est_module = LitModule(
    est_model,
    tokenizer,
    train_dataset=train_dataset.select(range(5000)),  # indices 0..4999
    val_dataset=val_dataset.select(range(650)),       # indices 0..649
    test_dataset=test_dataset.select(range(800)),     # indices 0..799
)

# Early stopping on validation F1 (higher is better): training halts once
# val_f1 has not improved by more than min_delta for `patience` consecutive
# validation checks.
early_stop_callback = EarlyStopping(
    monitor="val_f1",
    mode="max",
    min_delta=1e-3,
    patience=3,
    verbose=True,  # print improvement / stop decisions in the cell output
)

est_trainer = pl.Trainer(
    accelerator=device,               # "cuda" or "cpu", chosen in the setup cell
    max_epochs=6,
    gradient_clip_val=1.0,            # gradient clipping threshold
    callbacks=[early_stop_callback],
    logger=False,                     # no experiment logger for this run
)
est_trainer.fit(lit_est_module)

# Last expression of the cell, so the test metrics are shown as its output.
est_trainer.test(lit_est_module)

INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name           | Type               | Params | Mode 
--------------------------------------------------------------
0 | model          | Estandard_Model    | 2.3 M  | train
1 | loss_fn        | CrossEntropyLoss   | 0      | train
2 | train_accuracy | MulticlassAccuracy | 0      | train
3 | val_accuracy   | MulticlassAccuracy | 0      | train
4 | train_f1       | Multicl

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_f1 improved. New best score: 0.205


Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Monitored metric val_f1 did not improve in the last 3 records. Best score: 0.205. Signaling Trainer to stop.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]

[{'test_loss': 2.7678890228271484,
  'test_acc': 0.20749999582767487,
  'test_f1': 0.20749999582767487}]

In [None]:
# ===== Training: Model_1 (hidden_dim=64, 10 layers) =====
# Build the network and hand it, the tokenizer and index-based slices of each
# split to the Lightning module that drives training.
model_1 = Model_1(num_labels=20, hidden_dim=64, num_layers=10)
lit_module_1 = LitModule(
    model_1,
    tokenizer,
    train_dataset=train_dataset.select(range(5000)),  # indices 0..4999
    val_dataset=val_dataset.select(range(650)),       # indices 0..649
    test_dataset=test_dataset.select(range(800)),     # indices 0..799
)

# Early stopping on validation F1 (higher is better): training halts once
# val_f1 has not improved by more than min_delta for `patience` consecutive
# validation checks.
early_stop_callback = EarlyStopping(
    monitor="val_f1",
    mode="max",
    min_delta=1e-3,
    patience=3,
    verbose=True,  # print improvement / stop decisions in the cell output
)

trainer_1 = pl.Trainer(
    accelerator=device,               # "cuda" or "cpu", chosen in the setup cell
    max_epochs=6,
    gradient_clip_val=1.0,            # gradient clipping threshold
    callbacks=[early_stop_callback],
    logger=False,                     # no experiment logger for this run
)
trainer_1.fit(lit_module_1)

# Last expression of the cell, so the test metrics are shown as its output.
trainer_1.test(lit_module_1)

INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
/usr/local/lib/python3.12/dist-packages/pytorch_lightning/callbacks/model_checkpoint.py:751: Checkpoint directory /content/checkpoints exists and is not empty.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name           | Type               | Params | Mode 
--------------------------------------------------------------
0 | model          | Model_1            | 2.7 M  | train
1 | loss_fn        | CrossEntropyLoss  

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_f1 improved. New best score: 0.205


Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Monitored metric val_f1 did not improve in the last 3 records. Best score: 0.205. Signaling Trainer to stop.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]

[{'test_loss': 2.7745981216430664,
  'test_acc': 0.20749999582767487,
  'test_f1': 0.20749999582767487}]

In [None]:
# ===== Training: Model_2 (hidden_dim=64, 7 layers) =====
# Build the network and hand it, the tokenizer and index-based slices of each
# split to the Lightning module that drives training.
model_2 = Model_2(num_labels=20, hidden_dim=64, num_layers=7)
lit_module_2 = LitModule(
    model_2,
    tokenizer,
    train_dataset=train_dataset.select(range(5000)),  # indices 0..4999
    val_dataset=val_dataset.select(range(650)),       # indices 0..649
    test_dataset=test_dataset.select(range(800)),     # indices 0..799
)

# Early stopping on validation F1 (higher is better): training halts once
# val_f1 has not improved by more than min_delta for `patience` consecutive
# validation checks. This run uses a tighter min_delta (1e-4).
early_stop_callback = EarlyStopping(
    monitor="val_f1",
    mode="max",
    min_delta=1e-4,
    patience=3,
    verbose=True,  # print improvement / stop decisions in the cell output
)

trainer_2 = pl.Trainer(
    accelerator=device,               # "cuda" or "cpu", chosen in the setup cell
    max_epochs=6,
    gradient_clip_val=1.0,            # gradient clipping threshold
    callbacks=[early_stop_callback],
    logger=False,                     # no experiment logger for this run
)
trainer_2.fit(lit_module_2)

# Last expression of the cell, so the test metrics are shown as its output.
trainer_2.test(lit_module_2)

INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
/usr/local/lib/python3.12/dist-packages/pytorch_lightning/callbacks/model_checkpoint.py:751: Checkpoint directory /content/checkpoints exists and is not empty.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name           | Type               | Params | Mode 
--------------------------------------------------------------
0 | model          | Model_2            | 3.0 M  | train
1 | loss_fn        | CrossEntropyLoss  

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_f1 improved. New best score: 0.205


Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Monitored metric val_f1 did not improve in the last 3 records. Best score: 0.205. Signaling Trainer to stop.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]

[{'test_loss': 2.742191791534424,
  'test_acc': 0.20749999582767487,
  'test_f1': 0.20749999582767487}]

In [None]:
# ===== Training: Model_3 (hidden_dim=64, 5 layers) =====
# Build the network and hand it, the tokenizer and index-based slices of each
# split to the Lightning module that drives training.
model_3 = Model_3(num_labels=20, hidden_dim=64, num_layers=5)
lit_module_3 = LitModule(
    model_3,
    tokenizer,
    train_dataset=train_dataset.select(range(5000)),  # indices 0..4999
    val_dataset=val_dataset.select(range(650)),       # indices 0..649
    test_dataset=test_dataset.select(range(800)),     # indices 0..799
)

# Early stopping on validation F1 (higher is better): training halts once
# val_f1 has not improved by more than min_delta for `patience` consecutive
# validation checks. This run uses a looser min_delta (1e-2).
early_stop_callback = EarlyStopping(
    monitor="val_f1",
    mode="max",
    min_delta=1e-2,
    patience=3,
    verbose=True,  # print improvement / stop decisions in the cell output
)

trainer_3 = pl.Trainer(
    accelerator=device,               # "cuda" or "cpu", chosen in the setup cell
    max_epochs=6,
    gradient_clip_val=1.0,            # gradient clipping threshold
    callbacks=[early_stop_callback],
    logger=False,                     # no experiment logger for this run
)
trainer_3.fit(lit_module_3)

# Last expression of the cell, so the test metrics are shown as its output.
trainer_3.test(lit_module_3)

INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name           | Type               | Params | Mode 
--------------------------------------------------------------
0 | model          | Model_3            | 3.1 M  | train
1 | loss_fn        | CrossEntropyLoss   | 0      | train
2 | train_accuracy | MulticlassAccuracy | 0      | train
3 | val_accuracy   | MulticlassAccuracy | 0      | train
4 | train_f1       | Multicl

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_f1 improved. New best score: 0.205


Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Monitored metric val_f1 did not improve in the last 3 records. Best score: 0.205. Signaling Trainer to stop.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]

[{'test_loss': 2.7850182056427,
  'test_acc': 0.20749999582767487,
  'test_f1': 0.20749999582767487}]

##### Lab_5

In [None]:
# ===== Training: Estandard_Model (hidden_dim=8, 4 heads, 15 layers) =====
# Build the network and hand it, the tokenizer and index-based slices of each
# split to the Lightning module that drives training.
est_model = Estandard_Model(
    num_labels=20,
    hidden_dim=8,
    num_heads=4,
    num_layers=15,
)
lit_est_module = LitModule(
    est_model,
    tokenizer,
    train_dataset=train_dataset.select(range(1000)),  # indices 0..999
    val_dataset=val_dataset.select(range(100)),       # indices 0..99
    test_dataset=test_dataset.select(range(200)),     # indices 0..199
)

# The early-stopping callback is still constructed (and bound to the notebook
# global), but it is NOT attached to the trainer below.
early_stop_callback = EarlyStopping(
    monitor="val_f1",
    mode="max",
    min_delta=1e-3,
    patience=3,
    verbose=True,
)

# Early stopping is disabled for this run: no `callbacks` argument is passed,
# so fit() continues until max_epochs is reached.
est_trainer = pl.Trainer(
    accelerator=device,     # "cuda" or "cpu", chosen in the setup cell
    max_epochs=20,
    gradient_clip_val=1.0,  # gradient clipping threshold
    logger=False,           # no experiment logger for this run
)
est_trainer.fit(lit_est_module)

# Last expression of the cell, so the test metrics are shown as its output.
est_trainer.test(lit_est_module)

INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.callbacks.model_summary:
  | Name           | Type               | Params | Mode 
--------------------------------------------------------------
0 | model          | Estandard_Model    | 318 K  | train
1 | loss_fn        | CrossEntropyLoss   | 0      | train
2 | train_accuracy | MulticlassAccuracy | 0      | train
3 | val_accuracy   | MulticlassAccuracy | 0      | train
4 | train_f1       | MulticlassF1Score  | 0      | train
5 | val_f1         | MulticlassF1Score  | 0      | train
6 

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=20` reached.


Testing: |          | 0/? [00:00<?, ?it/s]

[{'test_loss': 2.713329792022705,
  'test_acc': 0.23000000417232513,
  'test_f1': 0.23000000417232513}]

In [None]:
# ===== Training: Model_1 (hidden_dim=8, 15 layers) =====
# Build the network and hand it, the tokenizer and index-based slices of each
# split to the Lightning module that drives training.
model_1 = Model_1(num_labels=20, hidden_dim=8, num_layers=15)
lit_module_1 = LitModule(
    model_1,
    tokenizer,
    train_dataset=train_dataset.select(range(1000)),  # indices 0..999
    val_dataset=val_dataset.select(range(100)),       # indices 0..99
    test_dataset=test_dataset.select(range(200)),     # indices 0..199
)

# The early-stopping callback is still constructed (and bound to the notebook
# global), but it is NOT attached to the trainer below.
early_stop_callback = EarlyStopping(
    monitor="val_f1",
    mode="max",
    min_delta=1e-3,
    patience=3,
    verbose=True,
)

# Early stopping is disabled for this run: no `callbacks` argument is passed,
# so fit() continues until max_epochs is reached.
trainer_1 = pl.Trainer(
    accelerator=device,     # "cuda" or "cpu", chosen in the setup cell
    max_epochs=6,
    gradient_clip_val=1.2,  # gradient clipping threshold
    logger=False,           # no experiment logger for this run
)
trainer_1.fit(lit_module_1)

# Last expression of the cell, so the test metrics are shown as its output.
trainer_1.test(lit_module_1)

INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.callbacks.model_summary:
  | Name           | Type               | Params | Mode 
--------------------------------------------------------------
0 | model          | Model_1            | 265 K  | train
1 | loss_fn        | CrossEntropyLoss   | 0      | train
2 | train_accuracy | MulticlassAccuracy | 0      | train
3 | val_accuracy   | MulticlassAccuracy | 0      | train
4 | train_f1       | MulticlassF1Score  | 0      | train
5 | val_f1         | MulticlassF1Score  | 0      | train
6 

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=6` reached.


Testing: |          | 0/? [00:00<?, ?it/s]

[{'test_loss': 2.7416319847106934,
  'test_acc': 0.23000000417232513,
  'test_f1': 0.23000000417232513}]

In [None]:
# ===== Training: Model_2 (hidden_dim=8, 20 layers) =====
# Build the network and hand it, the tokenizer and index-based slices of each
# split to the Lightning module that drives training.
# NOTE(review): the Estandard_Model and Model_1 runs in this lab evaluate on
# 200 test rows while this one uses 300 -- confirm the scores are meant to be
# compared across different test subsets.
model_2 = Model_2(num_labels=20, hidden_dim=8, num_layers=20)
lit_module_2 = LitModule(
    model_2,
    tokenizer,
    train_dataset=train_dataset.select(range(1000)),  # indices 0..999
    val_dataset=val_dataset.select(range(100)),       # indices 0..99
    test_dataset=test_dataset.select(range(300)),     # indices 0..299
)

# The early-stopping callback is still constructed (and bound to the notebook
# global), but it is NOT attached to the trainer below.
early_stop_callback = EarlyStopping(
    monitor="val_f1",
    mode="max",
    min_delta=1e-4,
    patience=3,
    verbose=True,
)

# Early stopping is disabled for this run: no `callbacks` argument is passed,
# so fit() continues until max_epochs is reached.
trainer_2 = pl.Trainer(
    accelerator=device,     # "cuda" or "cpu", chosen in the setup cell
    max_epochs=8,
    gradient_clip_val=1.3,  # gradient clipping threshold
    logger=False,           # no experiment logger for this run
)
trainer_2.fit(lit_module_2)

# Last expression of the cell, so the test metrics are shown as its output.
trainer_2.test(lit_module_2)

INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.callbacks.model_summary:
  | Name           | Type               | Params | Mode 
--------------------------------------------------------------
0 | model          | Model_2            | 299 K  | train
1 | loss_fn        | CrossEntropyLoss   | 0      | train
2 | train_accuracy | MulticlassAccuracy | 0      | train
3 | val_accuracy   | MulticlassAccuracy | 0      | train
4 | train_f1       | MulticlassF1Score  | 0      | train
5 | val_f1         | MulticlassF1Score  | 0      | train
6 

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=8` reached.


Testing: |          | 0/? [00:00<?, ?it/s]

[{'test_loss': 2.7937281131744385,
  'test_acc': 0.20666666328907013,
  'test_f1': 0.20666666328907013}]

In [None]:
# ===== Training: Model_3 (hidden_dim=8, 15 layers) =====
# Build the network and hand it, the tokenizer and index-based slices of each
# split to the Lightning module that drives training.
# NOTE(review): the Estandard_Model and Model_1 runs in this lab evaluate on
# 200 test rows while this one uses 300 -- confirm the scores are meant to be
# compared across different test subsets.
model_3 = Model_3(num_labels=20, hidden_dim=8, num_layers=15)
lit_module_3 = LitModule(
    model_3,
    tokenizer,
    train_dataset=train_dataset.select(range(1000)),  # indices 0..999
    val_dataset=val_dataset.select(range(100)),       # indices 0..99
    test_dataset=test_dataset.select(range(300)),     # indices 0..299
)

# Early stopping on validation F1 (higher is better) with a longer patience:
# training halts once val_f1 has not improved by more than min_delta for 6
# consecutive validation checks.
early_stop_callback = EarlyStopping(
    monitor="val_f1",
    mode="max",
    min_delta=1e-3,
    patience=6,
    verbose=True,  # print improvement / stop decisions in the cell output
)

trainer_3 = pl.Trainer(
    accelerator=device,               # "cuda" or "cpu", chosen in the setup cell
    max_epochs=8,
    gradient_clip_val=1.5,            # gradient clipping threshold
    callbacks=[early_stop_callback],
    logger=False,                     # no experiment logger for this run
)
trainer_3.fit(lit_module_3)

# Last expression of the cell, so the test metrics are shown as its output.
trainer_3.test(lit_module_3)

INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
/usr/local/lib/python3.12/dist-packages/pytorch_lightning/callbacks/model_checkpoint.py:751: Checkpoint directory /content/checkpoints exists and is not empty.
INFO:pytorch_lightning.callbacks.model_summary:
  | Name           | Type               | Params | Mode 
--------------------------------------------------------------
0 | model          | Model_3            | 304 K  | train
1 | loss_fn        | CrossEntropyLoss   | 0      | train
2 | train_accuracy | MulticlassAccuracy | 0      | train
3 | val_accur

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_f1 improved. New best score: 0.190


Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Monitored metric val_f1 did not improve in the last 6 records. Best score: 0.190. Signaling Trainer to stop.


Testing: |          | 0/? [00:00<?, ?it/s]

[{'test_loss': 2.831317663192749,
  'test_acc': 0.20666666328907013,
  'test_f1': 0.20666666328907013}]

##### Lab_6

In [None]:
# ===== Training: Model_1 (hidden_dim=8, 20 layers) =====
# Build the network and hand it, the tokenizer and index-based slices of each
# split to the Lightning module that drives training.
model_1 = Model_1(num_labels=20, hidden_dim=8, num_layers=20)
lit_module_1 = LitModule(
    model_1,
    tokenizer,
    train_dataset=train_dataset.select(range(1000)),  # indices 0..999
    val_dataset=val_dataset.select(range(100)),       # indices 0..99
    test_dataset=test_dataset.select(range(200)),     # indices 0..199
)

# The early-stopping callback is still constructed (and bound to the notebook
# global), but it is NOT attached to the trainer below.
early_stop_callback = EarlyStopping(
    monitor="val_f1",
    mode="max",
    min_delta=1e-3,
    patience=3,
    verbose=True,
)

# Early stopping is disabled for this run: no `callbacks` argument is passed,
# so fit() continues until max_epochs is reached.
trainer_1 = pl.Trainer(
    accelerator=device,     # "cuda" or "cpu", chosen in the setup cell
    max_epochs=24,
    gradient_clip_val=1.2,  # gradient clipping threshold
    logger=False,           # no experiment logger for this run
)
trainer_1.fit(lit_module_1)

# Last expression of the cell, so the test metrics are shown as its output.
trainer_1.test(lit_module_1)

INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.callbacks.model_summary:
  | Name           | Type               | Params | Mode 
--------------------------------------------------------------
0 | model          | Model_1            | 271 K  | train
1 | loss_fn        | CrossEntropyLoss   | 0      | train
2 | train_accuracy | MulticlassAccuracy | 0      | train
3 | val_accuracy   | MulticlassAccuracy | 0      | train
4 | train_f1       | MulticlassF1Score  | 0      | train
5 | val_f1         | MulticlassF1Score  | 0      | train
6 

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=24` reached.


Testing: |          | 0/? [00:00<?, ?it/s]

[{'test_loss': 2.7519705295562744,
  'test_acc': 0.23000000417232513,
  'test_f1': 0.23000000417232513}]

In [None]:
# ===== Training: Model_2 (hidden_dim=4, 10 layers) =====
# Build the network and hand it, the tokenizer and index-based slices of each
# split to the Lightning module that drives training.
# NOTE(review): the preceding Model_1 run in this lab evaluates on 200 test
# rows while this one uses 300 -- confirm the scores are meant to be compared
# across different test subsets.
model_2 = Model_2(num_labels=20, hidden_dim=4, num_layers=10)
lit_module_2 = LitModule(
    model_2,
    tokenizer,
    train_dataset=train_dataset.select(range(1000)),  # indices 0..999
    val_dataset=val_dataset.select(range(100)),       # indices 0..99
    test_dataset=test_dataset.select(range(300)),     # indices 0..299
)

# The early-stopping callback is still constructed (and bound to the notebook
# global), but it is NOT attached to the trainer below.
early_stop_callback = EarlyStopping(
    monitor="val_f1",
    mode="max",
    min_delta=1e-4,
    patience=3,
    verbose=True,
)

# Early stopping is disabled for this run: no `callbacks` argument is passed,
# so fit() continues until max_epochs is reached.
trainer_2 = pl.Trainer(
    accelerator=device,     # "cuda" or "cpu", chosen in the setup cell
    max_epochs=25,
    gradient_clip_val=1.3,  # gradient clipping threshold
    logger=False,           # no experiment logger for this run
)
trainer_2.fit(lit_module_2)

# Last expression of the cell, so the test metrics are shown as its output.
trainer_2.test(lit_module_2)

INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.callbacks.model_summary:
  | Name           | Type               | Params | Mode 
--------------------------------------------------------------
0 | model          | Model_2            | 131 K  | train
1 | loss_fn        | CrossEntropyLoss   | 0      | train
2 | train_accuracy | MulticlassAccuracy | 0      | train
3 | val_accuracy   | MulticlassAccuracy | 0      | train
4 | train_f1       | MulticlassF1Score  | 0      | train
5 | val_f1         | MulticlassF1Score  | 0      | train
6 

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=25` reached.


Testing: |          | 0/? [00:00<?, ?it/s]

[{'test_loss': 2.782485246658325,
  'test_acc': 0.20666666328907013,
  'test_f1': 0.20666666328907013}]

In [None]:
# ===== Training: Model_3 =====
# Re-seed right before the model is built so this experiment starts from the same
# random state on every run, independent of which cells were executed before it.
# Without this, the metrics reported below cannot be reproduced or compared fairly
# against the other models trained in this notebook.
pl.seed_everything(42, workers=True)

model_3 = Model_3(num_labels=20, hidden_dim=128, num_layers=10)  # dataset has 20 classes

# Quick experiment on the leading rows of each split rather than on the full data.
lit_module_3 = LitModule(
    model_3,
    tokenizer,
    train_dataset=train_dataset.select(range(3000)),
    val_dataset=val_dataset.select(range(500)),
    test_dataset=test_dataset.select(range(600)),
)

# NOTE: this callback is constructed but NOT attached to the trainer below (the
# `callbacks=` argument is commented out), so training always runs for the full
# `max_epochs`. Un-comment that argument to enable early stopping; that presumably
# requires LitModule to log a metric named 'val_f1' -- confirm before enabling.
early_stop_callback = EarlyStopping(
    monitor='val_f1',  # Metric to monitor
    min_delta=1e-4,    # Minimum change to qualify as an improvement
    patience=5,        # Number of epochs with no improvement after which training will be stopped
    verbose=True,      # Log information to the console
    mode='max',        # 'max' mode means we want to maximize the monitored quantity
)

trainer_3 = pl.Trainer(
    max_epochs=15,
    accelerator=device,  # "cuda" when available, otherwise "cpu" (set in the imports cell)
    logger=False,  # Set to True to use a logger for better tracking (e.g., TensorBoard)
    #callbacks=[early_stop_callback], # Pass the early stopping callback
    gradient_clip_val=1.2,  # Gradient clipping threshold
)
trainer_3.fit(lit_module_3)

# Evaluate on the test subset; as the cell's last expression the metrics are displayed.
trainer_3.test(lit_module_3)

INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
/usr/local/lib/python3.12/dist-packages/pytorch_lightning/callbacks/model_checkpoint.py:751: Checkpoint directory /content/checkpoints exists and is not empty.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name           | Type               | Params | Mode 
--------------------------------------------------------------
0 | model          | Model_3            | 12.7 M | train
1 | loss_fn        | CrossEntropyLoss  

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=15` reached.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]

[{'test_loss': 2.7677080631256104,
  'test_acc': 0.20000000298023224,
  'test_f1': 0.20000000298023224}]