In [None]:
!pip install pytorch_lightning  seqeval evaluate

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
import pytorch_lightning as pl
from torch.nn.utils.rnn import pad_sequence
from sklearn.metrics import f1_score
from datasets import load_dataset, Dataset, concatenate_datasets
from evaluate import load as load_metric
from transformers import AutoTokenizer
from pytorch_lightning.callbacks import EarlyStopping
import random
import pandas as pd
from google.colab import files
import builtins
import statistics
import numpy as np

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
# This string is later passed to `pl.Trainer(accelerator=...)` in `train_model`.
device = "cuda" if torch.cuda.is_available() else "cpu"

#### **Load Datasets**

In [3]:
def upload_dataset_from_system():
  """Open the Colab upload dialog and load the first uploaded parquet file.

  Returns:
    A `Dataset` built (via pandas) from the first file in the upload result.

  Raises:
    IndexError: if the upload dialog returned no files.
  """
  uploaded_files = files.upload()
  # Only the first uploaded file is read; any additional files are ignored.
  first_filename = list(uploaded_files)[0]
  frame = pd.read_parquet(first_filename)
  return Dataset.from_pandas(frame)

# --- return dataset classes ---
def dataset_classes(dataset):
  """Return the distinct `label` values of the records in `dataset`.

  The result is a list built from a set, so its order is not guaranteed.
  """
  distinct_labels = {record['label'] for record in dataset}
  return list(distinct_labels)

# --- tokenizer ---
# Pretrained "bert-base-uncased" tokenizer, shared as a module-level global by the
# length statistics, `preproces_data` and `collate_fn` (for its pad token id).
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# Vocabulary size; used as the default `vocab_size` of `Model`'s word embedding.
VOCAB_SIZE = tokenizer.vocab_size

# --- token-length statistics ---
def _token_lengths(dataset, text='text'):
  """Return the tokenized length of every entry in `dataset[text]`.

  Lengths are measured on the `input_ids` produced by the global `tokenizer`
  with its default settings (no truncation or padding is requested).
  """
  texts = list(dataset[text])
  if not texts:
    return []
  # One batched call instead of one tokenizer call per example.
  return [len(ids) for ids in tokenizer(texts)['input_ids']]

# --- max , mean of dataset lengths ---
def get_max_mean_length(dataset,text='text'):
  """Return `{"max": ..., "mean": ...}` of the tokenized lengths of `dataset[text]`.

  Args:
    dataset: mapping-like dataset where `dataset[text]` yields the raw strings.
    text: name of the text column.

  Raises:
    ValueError: if the column is empty (no lengths to summarise).
  """
  lengths = _token_lengths(dataset, text)
  if not lengths:
    raise ValueError(f"column {text!r} is empty; cannot compute max/mean length")
  # `builtins.max` is used explicitly so a shadowed `max` in the notebook cannot interfere.
  return {"max":builtins.max(lengths) , "mean":statistics.mean(lengths)}

# --- percentile ---
def get_percentile(dataset,per_list=(98,99,99.9),text='text'):
  """Return the requested percentiles of the tokenized lengths of `dataset[text]`.

  Args:
    dataset: mapping-like dataset where `dataset[text]` yields the raw strings.
    per_list: percentiles (0-100) to compute; any sequence accepted by `np.percentile`.
    text: name of the text column.

  Returns:
    The array returned by `np.percentile`, one value per entry of `per_list`.
  """
  lengths = np.array(_token_lengths(dataset, text))
  return np.percentile(lengths,per_list)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [32]:
# --- load dataset 20-classes ---
# Three upload prompts in order: train, validation, test.
train_data, val_data, test_data = (upload_dataset_from_system() for _ in range(3))

Saving train-00000-of-00001.parquet to train-00000-of-00001.parquet


Saving validation-00000-of-00001.parquet to validation-00000-of-00001.parquet


Saving test-00000-of-00001.parquet to test-00000-of-00001.parquet


In [None]:
# Token-length statistics (max / mean, then upper percentiles) for the three
# uploaded 20-class splits, used to choose a truncation length.
print("train",get_max_mean_length(train_data))
print("val",get_max_mean_length(val_data))
print("test",get_max_mean_length(test_data))
print("train_percentile",get_percentile(train_data,per_list=[97,98,99,99.8]))
print("val_percentile",get_percentile(val_data,per_list=[97,98,99,99.8]))
# NOTE(review): the note below says 40, but the preprocessing cell maps these
# splits with max_len=32 — confirm which value is intended.
print("test_percentile",get_percentile(test_data,per_list=[97,98,99,99.8]))   # max_length -> 40

train {'max': 75, 'mean': 21.237466666666666}
val {'max': 57, 'mean': 19.7332}
test {'max': 93, 'mean': 21.28922}
train_percentile [34. 35. 37. 42.]
val_percentile [33. 34. 36. 40.]
test_percentile [34. 35. 37. 42.]


In [4]:
# --- load sst5 ---
# Loads the "SetFit/sst5" dataset; its train / validation / test splits are used below.
sst = load_dataset("SetFit/sst5")

README.md:   0%|          | 0.00/421 [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


train.jsonl: 0.00B [00:00, ?B/s]

dev.jsonl: 0.00B [00:00, ?B/s]

test.jsonl: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/8544 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1101 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2210 [00:00<?, ? examples/s]

In [None]:
# Token-length statistics (max / mean, then upper percentiles) for the SST-5 splits.
print("train",get_max_mean_length(sst['train']))
print("val",get_max_mean_length(sst['validation']))
print("test",get_max_mean_length(sst['test']))
print("train_percentile",get_percentile(sst['train'],per_list=[97,98,99,99.8]))
print("val_percentile",get_percentile(sst['validation'],per_list=[97,98,99,99.8]))
# NOTE(review): the note below says 60, but the preprocessing cell maps SST-5
# with max_len=48 — confirm which value is intended.
print("test_percentile",get_percentile(sst['test'],per_list=[97,98,99,99.8]))      # max_length -> 60

train {'max': 80, 'mean': 25.039911048689138}
val {'max': 60, 'mean': 25.236148955495004}
test {'max': 70, 'mean': 25.000452488687785}
train_percentile [49. 52. 55. 61.]
val_percentile [48.  50.  53.  56.6]
test_percentile [48.   50.   53.91 59.  ]


In [4]:
# --- load imdb ---
# Loads the "imdb" dataset; only its train and test splits are used below.
imdb = load_dataset("imdb")

README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

plain_text/unsupervised-00000-of-00001.p(…):   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [None]:
# Token-length statistics (max / mean, then upper percentiles) for the IMDB splits.
print("train",get_max_mean_length(imdb['train']))
print("test",get_max_mean_length(imdb['test']))
print("train_percentile",get_percentile(imdb['train'],per_list=[97,98,99,99.8]))
# NOTE(review): the note below says 1200, but the preprocessing cell maps IMDB
# with max_len=1024 — confirm which value is intended.
print("test_percentile",get_percentile(imdb['test'],per_list=[97,98,99,99.8])) # max_length -> 1200

Token indices sequence length is longer than the specified maximum sequence length for this model (720 > 512). Running this sequence through the model will result in indexing errors


train {'max': 3127, 'mean': 313.87132}
test {'max': 3157, 'mean': 306.771}
train_percentile [ 948.03  1055.    1206.    1420.008]
test_percentile [ 920.03  1032.    1192.01  1362.002]


#### **preprocess & Dataset-utilities**

In [5]:

def preproces_data(example,max_len,text ='text',label = 'label'):
  """Tokenize one example to a fixed length and carry its label along.

  Args:
    example: mapping holding the raw text and the label.
    max_len: sequences are truncated and padded to exactly this many tokens.
    text: key of the raw text in `example`.
    label: key of the label in `example`.

  Returns:
    Dict with `input_ids` and `attention_mask` exactly as returned by the
    tokenizer (no tensor conversion happens here), plus the untouched label
    under `labels`.
  """
  encoding = tokenizer(
      example[text],
      truncation=True,
      max_length=max_len,
      padding="max_length",
  )
  return {
      "input_ids": encoding['input_ids'],
      "attention_mask": encoding['attention_mask'],
      "labels": example[label],
  }

In [33]:
# --- 20 classes dataset ---
# Tokenize every split to 32 tokens and drop the raw columns.
train_dataset, val_dataset, test_dataset = (
    split.map(preproces_data, fn_kwargs={'max_len': 32}, remove_columns=['text', 'label'])
    for split in (train_data, val_data, test_data)
)
train_dataset

Map:   0%|          | 0/45000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 45000
})

In [7]:
# --- sst5 ---
# Tokenize every split to 48 tokens and drop the raw columns.
sst_train, sst_val, sst_test = (
    sst[split_name].map(
        preproces_data,
        fn_kwargs={'max_len': 48},
        remove_columns=['text', 'label', 'label_text'],
    )
    for split_name in ('train', 'validation', 'test')
)
sst_train

Map:   0%|          | 0/8544 [00:00<?, ? examples/s]

Map:   0%|          | 0/1101 [00:00<?, ? examples/s]

Map:   0%|          | 0/2210 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 8544
})

In [6]:
# Tokenize the IMDB train/test splits to 1024 tokens and drop the raw columns.
imdb_train, imdb_test = (
    imdb[split_name].map(
        preproces_data,
        fn_kwargs={'max_len': 1024},
        remove_columns=['text', 'label'],
    )
    for split_name in ('train', 'test')
)
imdb_test

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 25000
})

In [7]:
def collate_fn(batch):
    """Collate a list of example dicts into one dict of batched long tensors.

    Each item must provide `input_ids`, `attention_mask` and `labels`, either
    as tensors or as values `torch.tensor` can convert.

    Returns:
        Dict with
        - `input_ids`: (batch_size, seq_len), right-padded with the tokenizer's pad id,
        - `attention_mask`: (batch_size, seq_len), right-padded with 0,
        - `labels`: the per-item labels stacked along a new leading dimension.
    """
    def _as_long(value):
        # Tensors are cast; anything else is converted to a new long tensor.
        if isinstance(value, torch.Tensor):
            return value.long()
        return torch.tensor(value, dtype=torch.long)

    ids_per_item = [_as_long(item['input_ids']) for item in batch]
    mask_per_item = [_as_long(item['attention_mask']) for item in batch]
    stacked_labels = torch.stack([_as_long(item['labels']) for item in batch])

    # pad_sequence right-pads every sequence to the longest one in the batch.
    padded_ids = pad_sequence(
        ids_per_item, batch_first=True, padding_value=tokenizer.pad_token_id
    )
    padded_mask = pad_sequence(mask_per_item, batch_first=True, padding_value=0)

    return {
        "input_ids": padded_ids,
        "attention_mask": padded_mask,
        "labels": stacked_labels,
    }

##### **LitModule**

In [8]:
import torchmetrics
from torch.optim.lr_scheduler import ReduceLROnPlateau # Import ReduceLROnPlateau

class LitModule(pl.LightningModule):
  """Lightning wrapper that trains and evaluates a sequence classifier.

  The wrapped `model` is called as `model(input_ids)` and must expose
  `model.classifier.out_features`, which is used as the number of classes for
  every metric.

  Args:
    model: the network to train.
    batch_size: batch size used by the train / val / test dataloaders.
    lr: learning rate passed to Adam.
    train_dataset, val_dataset, test_dataset: datasets wrapped by the
      corresponding dataloaders (batched with `collate_fn`).

  Notes:
    * All F1 metrics use macro averaging. The `torchmetrics.F1Score` task
      wrapper defaults to micro averaging, which for single-label multiclass
      data equals plain accuracy.
    * NOTE(review): only `input_ids` is forwarded to the model; the batch's
      `attention_mask` is never read here.
  """
  def __init__(self,model,batch_size=64,lr=1e-3,train_dataset=None,val_dataset=None,test_dataset=None):
    super().__init__()
    self.model = model
    self.batch_size = batch_size
    self.lr = lr
    self.train_dataset = train_dataset
    self.validation_dataset = val_dataset
    self.test_dataset = test_dataset

    self.loss_fn =  nn.CrossEntropyLoss()

    num_classes = self.model.classifier.out_features
    self.train_accuracy = torchmetrics.Accuracy(task="multiclass", num_classes=num_classes)
    self.val_accuracy = torchmetrics.Accuracy(task="multiclass", num_classes=num_classes)
    self.train_f1 = torchmetrics.F1Score(task="multiclass", num_classes=num_classes, average="macro")
    self.val_f1 = torchmetrics.F1Score(task="multiclass", num_classes=num_classes, average="macro")
    self.test_accuracy = torchmetrics.Accuracy(task="multiclass", num_classes=num_classes)
    self.test_f1 = torchmetrics.F1Score(task="multiclass", num_classes=num_classes, average="macro")

    # F1 per-class
    self.test_f1_per_class = torchmetrics.F1Score(
        task="multiclass",
        num_classes=num_classes,
        average=None
    )

  def forward(self,input_ids):
    """Return the wrapped model's logits for `input_ids`."""
    return self.model(input_ids)

  def train_dataloader(self):
    """Shuffled dataloader over the training dataset."""
    return DataLoader(self.train_dataset,batch_size=self.batch_size,shuffle=True,collate_fn=collate_fn)

  def val_dataloader(self):
    """Dataloader over the validation dataset (no shuffling)."""
    return DataLoader(self.validation_dataset,batch_size=self.batch_size, collate_fn=collate_fn)

  def test_dataloader(self):
    """Dataloader over the test dataset (no shuffling)."""
    return DataLoader(self.test_dataset,batch_size=self.batch_size, collate_fn=collate_fn)

  def _compute_step(self, batch):
    """Run the model on one batch and return `(loss, preds, labels)`."""
    labels = batch['labels'].long()
    logits = self.model(batch['input_ids'])
    loss = self.loss_fn(logits, labels)
    preds = torch.argmax(logits, dim=-1)
    return loss, preds, labels

  def training_step(self,batch,batch_idx):
    """Compute the training loss and accumulate the epoch-level train metrics."""
    loss, preds, labels = self._compute_step(batch)
    self.log("train_loss",loss)

    # Log the metric objects (not per-batch values) so Lightning computes them
    # over the whole epoch and resets them afterwards.
    self.train_accuracy.update(preds, labels)
    self.train_f1.update(preds, labels)
    self.log('train_acc', self.train_accuracy, on_step=False, on_epoch=True, prog_bar=True)
    self.log('train_f1', self.train_f1, on_step=False, on_epoch=True, prog_bar=True)

    return loss

  def validation_step(self, batch, batch_idx):
    """Log `val_loss` and accumulate the epoch-level validation metrics."""
    loss, preds, labels = self._compute_step(batch)
    self.log("val_loss", loss, prog_bar=True)

    self.val_accuracy.update(preds, labels)
    self.val_f1.update(preds, labels)

    self.log('val_acc', self.val_accuracy, on_step=False, on_epoch=True, prog_bar=True)
    self.log('val_f1', self.val_f1, on_step=False, on_epoch=True, prog_bar=True)

  def test_step(self, batch, batch_idx):
    """Log the test loss and accumulate the test metrics."""
    loss, preds, labels = self._compute_step(batch)

    # Only update here; the values are computed in `on_test_epoch_end`.
    self.test_accuracy.update(preds, labels)
    self.test_f1.update(preds, labels)
    self.test_f1_per_class.update(preds, labels)

    self.log("-- test_loss --", loss, prog_bar=True)

  def on_test_epoch_end(self):
    """Compute, log and reset the accumulated test metrics."""
    acc = self.test_accuracy.compute()
    f1_macro = self.test_f1.compute()
    f1_per_class = self.test_f1_per_class.compute()

    # Final logging.
    self.log("-- test_acc --", acc)
    self.log("-- test_f1 --", f1_macro)
    for i, score in enumerate(f1_per_class):
        self.log(f"test_f1_class_{i}", score)

    # Reset so a later test run starts from empty metric state.
    self.test_accuracy.reset()
    self.test_f1.reset()
    self.test_f1_per_class.reset()

  def configure_optimizers(self):
    """Adam plus ReduceLROnPlateau driven by `val_loss` (lower is better)."""
    optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=3)  # 'min' because val_loss is monitored
    return {"optimizer": optimizer,"lr_scheduler": {"scheduler": scheduler, "monitor": "val_loss"}}


#### **weight-initialize**

In [9]:
class InitWeight:
    """Callable weight initializer meant to be passed to `nn.Module.apply`.

    Per module type:
    - LSTM / GRU: Xavier-uniform input-to-hidden weights, orthogonal
      hidden-to-hidden weights, zero biases.
    - Conv1d: Kaiming-uniform weights (ReLU gain), zero bias.
    - Linear: N(0, 0.02) weights, zero bias.
    - Embedding: N(0, 0.02) weights.
    Any other module is left untouched.
    """

    @staticmethod
    def _init_recurrent(rnn):
        # Parameters are selected by the substrings in their names.
        with torch.no_grad():
            for pname, tensor in rnn.named_parameters():
                if "weight_ih" in pname:
                    nn.init.xavier_uniform_(tensor.data)
                elif "weight_hh" in pname:
                    nn.init.orthogonal_(tensor.data)
                elif "bias" in pname:
                    tensor.fill_(0)

    @staticmethod
    def _zero_bias(layer):
        if layer.bias is not None:
            nn.init.zeros_(layer.bias)

    def __call__(self, m):
        if isinstance(m, (nn.LSTM, nn.GRU)):
            self._init_recurrent(m)
            return
        if isinstance(m, nn.Conv1d):
            with torch.no_grad():
                nn.init.kaiming_uniform_(m.weight, nonlinearity="relu")
                self._zero_bias(m)
            return
        if isinstance(m, nn.Linear):
            with torch.no_grad():
                nn.init.normal_(m.weight, mean=0.0, std=0.02)
                self._zero_bias(m)
            return
        if isinstance(m, nn.Embedding):
            nn.init.normal_(m.weight, mean=0.0, std=0.02)

#### **Attentions**

In [10]:
class gru_Attention(nn.Module):
    """Attention head whose query / key / value are the final hidden states of three GRUs.

    Args:
        hidden_dim: feature size of the input sequence.
        out_dim: feature size of q / k / v; 0 means "same as `hidden_dim`".

    forward:
        x: (B, T, hidden_dim) -> returns (B, 1, out_dim).

    NOTE(review): each GRU contributes a single vector per sample, so after
    `squeeze(0)` q, k and v are (B, out_dim) and the score matrix is (B, B).
    The softmax therefore mixes information *across the items of a batch*, and
    the output of one sample depends on which other samples share its batch.
    This is left unchanged because the intended behaviour is unclear — confirm.
    """
    def __init__(self, hidden_dim=128,out_dim=0):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.out_dim = out_dim if out_dim !=0 else hidden_dim
        self.key = nn.GRU(hidden_dim, self.out_dim, batch_first=True)
        self.value = nn.GRU(hidden_dim, self.out_dim, batch_first=True)
        self.query = nn.GRU(hidden_dim, self.out_dim, batch_first=True)
        self.attn_drop = nn.Dropout(0.2)
        self.resid_drop = nn.Dropout(0.2)
        self.apply(InitWeight())

    def forward(self, x):
        if x.dim() != 3:
            raise ValueError(f"expected a (B, T, C) input, got shape {tuple(x.shape)}")

        # nn.GRU returns (output, h_n); only the final hidden state h_n is used.
        _, k = self.key(x)
        _, q = self.query(x)
        _, v = self.value(x)

        # h_n is (1, B, out_dim) for these single-layer GRUs -> (B, out_dim).
        k = k.squeeze(0)
        q = q.squeeze(0)
        v = v.squeeze(0)

        # Scale by the q/k feature size (out_dim), not by the input width:
        # the two differ whenever out_dim != hidden_dim.
        att = (q @ k.transpose(-2, -1)) / math.sqrt(q.size(-1))
        att = F.softmax(att, dim=-1)
        att = self.attn_drop(att)

        y = att @ v
        y = self.resid_drop(y)
        # Re-insert a length-1 time axis so the result broadcasts against (B, T, C).
        y = y.unsqueeze(1)
        return y

In [11]:
class conv_Attention(nn.Module):
    """Self-attention head whose query / key / value projections are kernel-3 Conv1d layers.

    Args:
        hidden_dim: feature size of the input sequence.
        out_dim: feature size of q / k / v; 0 means "same as `hidden_dim`".

    forward:
        x: (B, T, hidden_dim) -> returns (B, T, out_dim).
    """
    def __init__(self, hidden_dim=128,out_dim=0):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.out_dim = out_dim if out_dim !=0 else hidden_dim
        # Conv1d expects input (batch_size, channels, sequence_length)
        self.key = nn.Conv1d(hidden_dim, self.out_dim, kernel_size=3,padding='same')
        self.value = nn.Conv1d(hidden_dim, self.out_dim, kernel_size=3,padding='same')
        self.query = nn.Conv1d(hidden_dim, self.out_dim, kernel_size=3,padding='same')
        self.attn_drop = nn.Dropout(0.15)
        self.resid_drop = nn.Dropout(0.15)
        self.apply(InitWeight())

    def forward(self, x):
        if x.dim() != 3:
            raise ValueError(f"expected a (B, T, C) input, got shape {tuple(x.shape)}")

        # Permute dimensions for Conv1d: (B, T, C) -> (B, C, T)
        x_permuted = x.permute(0, 2, 1)

        k = self.key(x_permuted)
        q = self.query(x_permuted)
        v = self.value(x_permuted)

        # Permute back to (B, T, out_dim) for the attention calculation
        k = k.permute(0, 2, 1)
        q = q.permute(0, 2, 1)
        v = v.permute(0, 2, 1)

        # Scale by the q/k feature size (out_dim), not by the input width:
        # the two differ whenever out_dim != hidden_dim.
        att = (q @ k.transpose(-2, -1)) / math.sqrt(q.size(-1))
        att = F.softmax(att, dim=-1)
        att = self.attn_drop(att)

        y = att @ v
        y = self.resid_drop(y)
        return y

In [12]:
class gru_Attention_2(nn.Module):
    """Self-attention head whose query / key / value are the per-step outputs of three GRUs.

    Args:
        hidden_dim: feature size of the input sequence.
        out_dim: feature size of q / k / v; 0 means "same as `hidden_dim`".

    forward:
        x: (B, T, hidden_dim) -> returns (B, T, out_dim).
    """
    def __init__(self, hidden_dim=128,out_dim=0):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.out_dim = out_dim if out_dim !=0 else hidden_dim
        self.key = nn.GRU(hidden_dim, self.out_dim, batch_first=True)
        self.value = nn.GRU(hidden_dim, self.out_dim, batch_first=True)
        self.query = nn.GRU(hidden_dim, self.out_dim, batch_first=True)
        self.attn_drop = nn.Dropout(0.2)
        self.resid_drop = nn.Dropout(0.2)
        self.apply(InitWeight())

    def forward(self, x):
        if x.dim() != 3:
            raise ValueError(f"expected a (B, T, C) input, got shape {tuple(x.shape)}")

        # nn.GRU returns (output, h_n); the full output sequence is used here.
        k, _ = self.key(x)
        q, _ = self.query(x)
        v, _ = self.value(x)

        # Scale by the q/k feature size (out_dim), not by the input width:
        # the two differ whenever out_dim != hidden_dim.
        att = (q @ k.transpose(-2, -1)) / math.sqrt(q.size(-1))
        att = F.softmax(att, dim=-1)
        att = self.attn_drop(att)

        y = att @ v
        y = self.resid_drop(y)
        return y

In [13]:
class _MultiheadAttention(nn.Module):
    """Thin self-attention wrapper around `nn.MultiheadAttention` (batch-first).

    `forward(x)` uses `x` as query, key and value and returns only the
    attention output, discarding the attention weights.
    """
    def __init__(self, hidden_dim=128, num_heads=4):
        super().__init__()
        mha_kwargs = dict(embed_dim=hidden_dim, num_heads=num_heads, batch_first=True)
        self.attn = nn.MultiheadAttention(**mha_kwargs)
        self.apply(InitWeight())

    def forward(self, x):
        attended, _weights = self.attn(x, x, x)
        return attended

#### **FFN**

In [14]:
class conv_lstm_ffn(nn.Module):
  """Feed-forward block: Linear -> two parallel Conv1d (kernel 3 and 5) -> ReLU -> LSTM.

  forward:
    x: (B, T, hidden_dim) -> returns the LSTM output sequence, (B, T, hidden_dim).
  """
  def __init__(self,hidden_dim=128,ff_dim=128):
    super().__init__()
    self.fc = nn.Linear(hidden_dim,ff_dim)
    self.conv1 = nn.Conv1d(ff_dim, ff_dim, kernel_size=3, padding="same")
    self.conv2 = nn.Conv1d(ff_dim, ff_dim, kernel_size=5, padding="same")
    self.lstm = nn.LSTM(ff_dim*2, hidden_dim, batch_first=True, bidirectional=False)
    self.apply(InitWeight())

  def forward(self,x):
      projected = self.fc(x)                                   # (B, T, ff_dim)
      # Conv1d wants channels first, so convolve on (B, ff_dim, T) and permute back.
      channels_first = projected.permute(0, 2, 1)
      branch_k3 = self.conv1(channels_first).permute(0, 2, 1)  # (B, T, ff_dim)
      branch_k5 = self.conv2(channels_first).permute(0, 2, 1)  # (B, T, ff_dim)
      merged = F.relu(torch.cat([branch_k3, branch_k5], dim=-1))  # (B, T, ff_dim*2)
      sequence_out, _ = self.lstm(merged)                      # (B, T, hidden_dim)
      return sequence_out


In [15]:
class conv_lin_ffn(nn.Module):
  """Feed-forward block: Linear -> two parallel Conv1d (kernel 3 and 5) -> ReLU -> Linear.

  forward:
    x: (B, T, hidden_dim) -> returns (B, T, hidden_dim).
  """
  def __init__(self,hidden_dim=128,ff_dim=128):
    super().__init__()
    self.fc = nn.Linear(hidden_dim,ff_dim)
    self.conv1 = nn.Conv1d(ff_dim, ff_dim, kernel_size=3, padding="same")
    self.conv2 = nn.Conv1d(ff_dim, ff_dim, kernel_size=5, padding="same")
    self.proj = nn.Linear(ff_dim*2,hidden_dim)
    self.apply(InitWeight())

  def forward(self,x):
      projected = self.fc(x)                                   # (B, T, ff_dim)
      # Conv1d wants channels first, so convolve on (B, ff_dim, T) and permute back.
      channels_first = projected.permute(0, 2, 1)
      branch_k3 = self.conv1(channels_first).permute(0, 2, 1)  # (B, T, ff_dim)
      branch_k5 = self.conv2(channels_first).permute(0, 2, 1)  # (B, T, ff_dim)
      merged = F.relu(torch.cat([branch_k3, branch_k5], dim=-1))  # (B, T, ff_dim*2)
      return self.proj(merged)                                 # (B, T, hidden_dim)


In [16]:
class lin_relu_ffn(nn.Module):
  """Two-layer MLP: Linear(hidden_dim -> ff_dim), ReLU, Linear(ff_dim -> hidden_dim)."""
  def __init__(self,hidden_dim=128,ff_dim=128):
    super().__init__()
    expand = nn.Linear(hidden_dim, ff_dim)
    contract = nn.Linear(ff_dim, hidden_dim)
    self.ffn = nn.Sequential(expand, nn.ReLU(), contract)
    self.apply(InitWeight())

  def forward(self,x):
    out = self.ffn(x)
    return out

In [17]:
class lin_gelu_ffn(nn.Module):
  """Two-layer MLP: Linear(hidden_dim -> ff_dim), GELU, Linear(ff_dim -> hidden_dim)."""
  def __init__(self,hidden_dim=128,ff_dim=128):
    super().__init__()
    expand = nn.Linear(hidden_dim, ff_dim)
    contract = nn.Linear(ff_dim, hidden_dim)
    self.ffn = nn.Sequential(expand, nn.GELU(), contract)
    self.apply(InitWeight())

  def forward(self,x):
    out = self.ffn(x)
    return out

#### **MultiHeadAttention**

In [18]:
class MultiHeadAttention(nn.Module):
  """Run `num_heads` independent attention heads and merge them with a linear layer.

  Args:
    attention: head class, instantiated as `attention(hidden_dim=..., out_dim=...)`.
    hidden_dim: feature size of the input and of the merged output.
    num_heads: number of heads; each head produces `hidden_dim // num_heads` features.

  Raises:
    ValueError: if `num_heads` is not positive or does not divide `hidden_dim`.
      Without this check the concatenated heads would not match `fc_out`, and a
      head size of 0 would be read by the heads as "use `hidden_dim`".
  """
  def __init__(self,attention,hidden_dim=128,num_heads=4):
    super().__init__()
    if num_heads <= 0 or hidden_dim % num_heads != 0:
      raise ValueError(
          f"hidden_dim ({hidden_dim}) must be divisible by a positive num_heads ({num_heads})"
      )
    self.num_heads = num_heads
    self.hidden_dim = hidden_dim
    self.head_dim = self.hidden_dim // self.num_heads
    self.heads = nn.ModuleList([attention(hidden_dim=self.hidden_dim,out_dim=self.head_dim)
     for _ in range(self.num_heads)])
    self.fc_out = nn.Linear(self.hidden_dim,self.hidden_dim)
    self.apply(InitWeight())

  def forward(self,x):
    # Concatenate the head outputs along the feature axis, then mix them.
    out = torch.cat([head(x) for head in self.heads], dim=-1)
    return self.fc_out(out)

#### **CustomTransformer**

In [19]:
class CustomTransformer(nn.Module):
  """Post-norm transformer block with pluggable attention and feed-forward modules.

  Computes `h = LN1(x + drop(attn(x)))` followed by `LN2(h + drop(ffn(h)))`.

  Args:
    hidden_dim: feature size normalised by both LayerNorms.
    attn: module called as `attn(x)`.
    FFN: module called as `ffn(x)`.
    dropout: dropout probability applied to each sub-layer output.
  """
  def __init__(self, hidden_dim=128, attn=None,FFN=None, dropout=0.1):
      super().__init__()

      # Pluggable sub-layers
      self.attn = attn
      self.ffn = FFN

      # One LayerNorm / Dropout pair per sub-layer
      self.ln1 = nn.LayerNorm(hidden_dim)
      self.ln2 = nn.LayerNorm(hidden_dim)
      self.drop1 = nn.Dropout(dropout)
      self.drop2 = nn.Dropout(dropout)

  def forward(self, x, mask=None):
        # `mask` is accepted but not used by this block.
        attended = self.drop1(self.attn(x))
        hidden = self.ln1(x + attended)

        transformed = self.drop2(self.ffn(hidden))
        return self.ln2(hidden + transformed)

#### **Model**

In [20]:
class Model(nn.Module):
  """Sequence classifier: word + learned position embeddings -> blocks -> mean pool -> linear head.

  Args:
    vocab_size: size of the word-embedding table.
    max_leng: number of learned positions, i.e. the longest accepted sequence.
    hidden_dim: embedding / hidden feature size.
    module_list: blocks applied in order, each called as `block(x)`; `None`
      means no blocks.
    num_labels: number of output classes.
    num_repeats: how many consecutive times each block of `module_list` is applied.

  NOTE(review): every block is inserted `num_repeats` times as the *same*
  module object, so the repeats share one set of parameters. If independent
  layers are intended, each repeat needs its own copy — confirm.
  """
  def __init__(self,vocab_size=VOCAB_SIZE,max_leng=256,hidden_dim=128,module_list=None,num_labels=20,num_repeats=4):
    super().__init__()
    self.word_embed = nn.Embedding(vocab_size, hidden_dim)
    self.pos_embed = nn.Embedding(max_leng, hidden_dim)

    blocks = module_list if module_list is not None else []
    self.layers = nn.ModuleList([item for item in blocks for _ in range(num_repeats)])
    self.classifier = nn.Linear(hidden_dim, num_labels)
    self.apply(InitWeight())

  def forward(self, input_ids):
        """Return logits of shape (B, num_labels) for `input_ids` of shape (B, T).

        Raises:
            ValueError: if T exceeds the number of learned positions (`max_leng`).
        """
        batch_size, seq_len = input_ids.size()

        # Fail early with a readable message instead of an embedding index
        # error (a device-side assert on CUDA).
        max_positions = self.pos_embed.num_embeddings
        if seq_len > max_positions:
            raise ValueError(
                f"sequence length {seq_len} exceeds max_leng={max_positions}"
            )

        positions = torch.arange(seq_len, device=input_ids.device).unsqueeze(0).expand(batch_size, seq_len)

        # word + positional embedding
        x = self.word_embed(input_ids) + self.pos_embed(positions)

        # Transformer layers
        for layer in self.layers:
            x = layer(x)

        # Sequence pooling: plain mean over all T positions (no mask is applied).
        x = x.mean(dim=1)  # (B, H)

        # Classification head
        logits = self.classifier(x)
        return logits

#### **Create_Module_List**

In [21]:
class Create_Module_list(nn.Module):
  """Factory that turns `(attention, ffn[, num_heads])` tuples into `CustomTransformer` blocks.

  Args:
    hidden_dim: hidden size given to every block.
    ff_dim: inner size given to every feed-forward module.
    attn_ffn_tuples: iterable of tuples `(attn_cls, ffn_cls)` or
      `(attn_cls, ffn_cls, num_heads)`. A `None` attention class selects
      `_MultiheadAttention`; a `None` ffn class selects `lin_relu_ffn`.
    num_heads: head count used when a tuple does not carry its own.
    drop: dropout probability given to every block.

  Calling the instance (no arguments) returns the list of built blocks.
  """
  def __init__(self,hidden_dim=128,ff_dim=128,attn_ffn_tuples=None,num_heads=4,drop=0.1):
    super().__init__()
    self.drop=drop
    self.num_heads = num_heads
    self.tuples = attn_ffn_tuples
    self.hidden_dim = hidden_dim
    self.ff_dim = ff_dim

  def forward(self):
    blocks = []
    for spec in self.tuples:
      attn_cls, ffn_cls = spec[0], spec[1]
      # A third tuple element overrides the default number of heads.
      heads = spec[2] if len(spec) == 3 else self.num_heads

      # The attention module is built before the feed-forward module.
      if attn_cls is None:
        attn = _MultiheadAttention(hidden_dim=self.hidden_dim,num_heads=heads)
      else:
        attn = MultiHeadAttention(attention=attn_cls,hidden_dim=self.hidden_dim,num_heads=heads)

      if ffn_cls is None:
        ffn = lin_relu_ffn(hidden_dim=self.hidden_dim,ff_dim=self.ff_dim)
      else:
        ffn = ffn_cls(hidden_dim=self.hidden_dim,ff_dim=self.ff_dim)

      blocks.append(
          CustomTransformer(hidden_dim=self.hidden_dim, attn=attn, FFN=ffn, dropout=self.drop)
      )
    return blocks

#### **Train-Method**

In [22]:
def train_model(attn_ffn_tuples=None,
                num_labels=20,
                hidden_dim=32,
                max_leng=256,
                ffn_dim=64,
                num_heads=4,
                dropout=0.1,
                lr = 1e-3,
                batch_size=32,
                num_repeat_modules=3,
                train_data=None,
                val_data=None,
                test_data=None,
                early_stop=False,
                patience=7,
                max_epochs=5,
                gradient_clip_val=5.0,
                seed=None
                ):
  """Build a `Model` from attention/FFN tuples, then fit and test it with Lightning.

  Args:
    attn_ffn_tuples: block specification passed to `Create_Module_list`.
    num_labels: number of output classes.
    hidden_dim: hidden size of the embeddings and blocks.
    max_leng: number of learned positions in the model.
    ffn_dim: inner size of the feed-forward modules.
    num_heads: default head count for the attention modules.
    dropout: dropout probability of each block.
    lr: learning rate.
    batch_size: batch size of all dataloaders.
    num_repeat_modules: how many times each block is applied in `Model`.
    train_data, val_data, test_data: datasets given to `LitModule`.
    early_stop: when True, stop early on `val_f1` (mode max).
    patience: early-stopping patience in validation checks.
    max_epochs: upper bound on training epochs.
    gradient_clip_val: gradient clipping value passed to the trainer.
    seed: optional integer; when given, `pl.seed_everything` is called before
      the model is built so runs are repeatable. `None` keeps the previous
      (unseeded) behaviour.

  Returns:
    `(model, lit_module)` — the raw network and its Lightning wrapper.
  """
  if seed is not None:
    # Seed before any module is created so weight initialisation is covered too.
    pl.seed_everything(seed, workers=True)

  module_list_creator = Create_Module_list(hidden_dim=hidden_dim,
                                           ff_dim=ffn_dim,
                                           attn_ffn_tuples=attn_ffn_tuples,
                                           num_heads=num_heads,
                                           drop=dropout
                                           )
  module_list = module_list_creator()

  model = Model(hidden_dim = hidden_dim,
                max_leng = max_leng,
                num_labels=num_labels,
                module_list=module_list,
                num_repeats=num_repeat_modules
                )

  lit_module = LitModule(model,
                         batch_size=batch_size,
                         lr=lr,
                         train_dataset=train_data,
                         val_dataset=val_data,
                         test_dataset=test_data
                         )

  # The early-stopping callback is only created when it is actually used.
  callbacks=[]
  if early_stop:
    callbacks.append(
        EarlyStopping(monitor='val_f1',min_delta=1e-4,patience=patience,verbose=True,mode='max')
    )

  trainer = pl.Trainer(
    max_epochs=max_epochs,
    accelerator=device,
    logger=False,
    callbacks=callbacks,
    gradient_clip_val=gradient_clip_val
  )

  trainer.fit(lit_module)
  trainer.test(lit_module)

  return model, lit_module

#### **Labs**

##### sst

In [28]:
# SST-5 baseline: (None, None) selects the default attention and feed-forward modules.
tuples = [(None, None)]
model, _ = train_model(
    attn_ffn_tuples=tuples,
    num_labels=5,
    hidden_dim=16,
    ffn_dim=32,
    num_heads=1,
    max_leng=50,
    lr=5e-4,
    batch_size=64,
    num_repeat_modules=1,
    max_epochs=4,
    train_data=sst_train,
    val_data=sst_val,
    test_data=sst_test,
)

INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
/usr/local/lib/python3.12/dist-packages/pytorch_lightning/callbacks/model_checkpoint.py:751: Checkpoint directory /content/checkpoints exists and is not empty.
INFO:pytorch_lightning.callbacks.model_summary:
  | Name              | Type               | Params | Mode 
-----------------------------------------------------------------
0 | model             | Model              | 491 K  | train
1 | loss_fn           | CrossEntropyLoss   | 0      | train
2 | train_accuracy    | MulticlassAccuracy | 0      | trai

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=4` reached.


Testing: |          | 0/? [00:00<?, ?it/s]

In [29]:
# SST-5 with gru_Attention_2 heads and the default feed-forward module.
tuples = [(gru_Attention_2, None)]
model, _ = train_model(
    attn_ffn_tuples=tuples,
    num_labels=5,
    hidden_dim=16,
    ffn_dim=32,
    num_heads=1,
    max_leng=50,
    lr=5e-4,
    batch_size=64,
    num_repeat_modules=1,
    max_epochs=4,
    train_data=sst_train,
    val_data=sst_val,
    test_data=sst_test,
)

INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
/usr/local/lib/python3.12/dist-packages/pytorch_lightning/callbacks/model_checkpoint.py:751: Checkpoint directory /content/checkpoints exists and is not empty.
INFO:pytorch_lightning.callbacks.model_summary:
  | Name              | Type               | Params | Mode 
-----------------------------------------------------------------
0 | model             | Model              | 495 K  | train
1 | loss_fn           | CrossEntropyLoss   | 0      | train
2 | train_accuracy    | MulticlassAccuracy | 0      | trai

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=4` reached.


Testing: |          | 0/? [00:00<?, ?it/s]

In [30]:
# SST-5 with gru_Attention heads and the default feed-forward module.
tuples = [(gru_Attention, None)]
model, _ = train_model(
    attn_ffn_tuples=tuples,
    num_labels=5,
    hidden_dim=16,
    ffn_dim=32,
    num_heads=1,
    max_leng=50,
    lr=5e-4,
    batch_size=64,
    num_repeat_modules=1,
    max_epochs=4,
    train_data=sst_train,
    val_data=sst_val,
    test_data=sst_test,
)

INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.callbacks.model_summary:
  | Name              | Type               | Params | Mode 
-----------------------------------------------------------------
0 | model             | Model              | 495 K  | train
1 | loss_fn           | CrossEntropyLoss   | 0      | train
2 | train_accuracy    | MulticlassAccuracy | 0      | train
3 | val_accuracy      | MulticlassAccuracy | 0      | train
4 | train_f1          | MulticlassF1Score  | 0      | train
5 | val_f1            | MulticlassF1Sc

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=4` reached.


Testing: |          | 0/? [00:00<?, ?it/s]

In [31]:
# SST-5 with conv_Attention heads and the default feed-forward module.
tuples = [(conv_Attention, None)]
model, _ = train_model(
    attn_ffn_tuples=tuples,
    num_labels=5,
    hidden_dim=16,
    ffn_dim=32,
    num_heads=1,
    max_leng=50,
    lr=5e-4,
    batch_size=64,
    num_repeat_modules=1,
    max_epochs=4,
    train_data=sst_train,
    val_data=sst_val,
    test_data=sst_test,
)

INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
/usr/local/lib/python3.12/dist-packages/pytorch_lightning/callbacks/model_checkpoint.py:751: Checkpoint directory /content/checkpoints exists and is not empty.
INFO:pytorch_lightning.callbacks.model_summary:
  | Name              | Type               | Params | Mode 
-----------------------------------------------------------------
0 | model             | Model              | 492 K  | train
1 | loss_fn           | CrossEntropyLoss   | 0      | train
2 | train_accuracy    | MulticlassAccuracy | 0      | trai

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=4` reached.


Testing: |          | 0/? [00:00<?, ?it/s]

##### 20-class dataset

In [35]:
# 20-class dataset run (num_labels=20): the single attn/ffn tuple has None in
# both slots.
# NOTE(review): presumably None selects train_model's defaults — confirm.
tuples = [(None, None)]

model, _ = train_model(
    # --- data ---
    train_data=train_dataset,
    val_data=val_dataset,
    test_data=test_dataset,
    num_labels=20,
    max_leng=34,
    # --- architecture ---
    attn_ffn_tuples=tuples,
    hidden_dim=16,
    ffn_dim=32,
    num_repeat_modules=1,
    # --- optimisation ---
    batch_size=64,
    lr=1e-3,
    max_epochs=3,
)

INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.callbacks.model_summary:
  | Name              | Type               | Params | Mode 
-----------------------------------------------------------------
0 | model             | Model              | 491 K  | train
1 | loss_fn           | CrossEntropyLoss   | 0      | train
2 | train_accuracy    | MulticlassAccuracy | 0      | train
3 | val_accuracy      | MulticlassAccuracy | 0      | train
4 | train_f1          | MulticlassF1Score  | 0      | train
5 | val_f1            | MulticlassF1Sc

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=3` reached.


Testing: |          | 0/? [00:00<?, ?it/s]

In [36]:
# 20-class dataset run (num_labels=20): one attn/ffn tuple with gru_Attention_2
# in the first slot and None in the second.
# NOTE(review): presumably None selects train_model's default for that slot — confirm.
tuples = [(gru_Attention_2, None)]

model, _ = train_model(
    # --- data ---
    train_data=train_dataset,
    val_data=val_dataset,
    test_data=test_dataset,
    num_labels=20,
    max_leng=34,
    # --- architecture ---
    attn_ffn_tuples=tuples,
    hidden_dim=16,
    ffn_dim=32,
    num_repeat_modules=1,
    # --- optimisation ---
    batch_size=64,
    lr=1e-3,
    max_epochs=3,
)

INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
/usr/local/lib/python3.12/dist-packages/pytorch_lightning/callbacks/model_checkpoint.py:751: Checkpoint directory /content/checkpoints exists and is not empty.
INFO:pytorch_lightning.callbacks.model_summary:
  | Name              | Type               | Params | Mode 
-----------------------------------------------------------------
0 | model             | Model              | 493 K  | train
1 | loss_fn           | CrossEntropyLoss   | 0      | train
2 | train_accuracy    | MulticlassAccuracy | 0      | trai

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=3` reached.


Testing: |          | 0/? [00:00<?, ?it/s]

In [37]:
# 20-class dataset run (num_labels=20): one attn/ffn tuple with gru_Attention
# in the first slot and None in the second.
# NOTE(review): presumably None selects train_model's default for that slot — confirm.
tuples = [(gru_Attention, None)]

model, _ = train_model(
    # --- data ---
    train_data=train_dataset,
    val_data=val_dataset,
    test_data=test_dataset,
    num_labels=20,
    max_leng=34,
    # --- architecture ---
    attn_ffn_tuples=tuples,
    hidden_dim=16,
    ffn_dim=32,
    num_repeat_modules=1,
    # --- optimisation ---
    batch_size=64,
    lr=1e-3,
    max_epochs=3,
)

INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.callbacks.model_summary:
  | Name              | Type               | Params | Mode 
-----------------------------------------------------------------
0 | model             | Model              | 493 K  | train
1 | loss_fn           | CrossEntropyLoss   | 0      | train
2 | train_accuracy    | MulticlassAccuracy | 0      | train
3 | val_accuracy      | MulticlassAccuracy | 0      | train
4 | train_f1          | MulticlassF1Score  | 0      | train
5 | val_f1            | MulticlassF1Sc

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=3` reached.


Testing: |          | 0/? [00:00<?, ?it/s]

In [38]:
# 20-class dataset run (num_labels=20): one attn/ffn tuple with conv_Attention
# in the first slot and None in the second.
# NOTE(review): presumably None selects train_model's default for that slot — confirm.
tuples = [(conv_Attention, None)]

model, _ = train_model(
    # --- data ---
    train_data=train_dataset,
    val_data=val_dataset,
    test_data=test_dataset,
    num_labels=20,
    max_leng=34,
    # --- architecture ---
    attn_ffn_tuples=tuples,
    hidden_dim=16,
    ffn_dim=32,
    num_repeat_modules=1,
    # --- optimisation ---
    batch_size=64,
    lr=1e-3,
    max_epochs=3,
)

INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
/usr/local/lib/python3.12/dist-packages/pytorch_lightning/callbacks/model_checkpoint.py:751: Checkpoint directory /content/checkpoints exists and is not empty.
INFO:pytorch_lightning.callbacks.model_summary:
  | Name              | Type               | Params | Mode 
-----------------------------------------------------------------
0 | model             | Model              | 492 K  | train
1 | loss_fn           | CrossEntropyLoss   | 0      | train
2 | train_accuracy    | MulticlassAccuracy | 0      | trai

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=3` reached.


Testing: |          | 0/? [00:00<?, ?it/s]

##### IMDB

In [25]:
# IMDB run (num_labels=2): the single attn/ffn tuple has None in both slots.
# NOTE(review): presumably None selects train_model's defaults — confirm.
tuples = [(None, None)]

# Shuffle the IMDB test split ONCE with a fixed seed and slice the validation
# and test subsets out of that single permutation. Calling `.shuffle()`
# separately for each subset draws two independent, unseeded permutations, so
# the same rows can end up in both validation and test, and the split changes
# on every execution. A fixed seed also gives every IMDB run the same split.
imdb_eval_shuffled = imdb_test.shuffle(seed=42)
imdb_val_subset = imdb_eval_shuffled.select(range(6000))
imdb_test_subset = imdb_eval_shuffled.select(range(6000, 25000))

model, _ = train_model(num_labels=2,
                       hidden_dim=16,
                       max_leng=1026,
                       attn_ffn_tuples=tuples,
                       ffn_dim=32,
                       batch_size=64,
                       lr=1e-3,
                       num_repeat_modules=1,
                       train_data=imdb_train,
                       val_data=imdb_val_subset,
                       test_data=imdb_test_subset,
                       max_epochs=3)

INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name              | Type               | Params | Mode 
-----------------------------------------------------------------
0 | model             | Model              | 507 K  | train
1 | loss_fn           | CrossEntropyLoss   | 0      | train
2 | train_accuracy    | MulticlassAccuracy | 0      | train
3 | val_accuracy      | MulticlassAccuracy | 0      | train
4 | train_

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=3` reached.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]

In [26]:
# IMDB run (num_labels=2): one attn/ffn tuple with gru_Attention_2 in the first
# slot and None in the second.
# NOTE(review): presumably None selects train_model's default for that slot — confirm.
tuples = [(gru_Attention_2, None)]

# Shuffle the IMDB test split ONCE with a fixed seed and slice the validation
# and test subsets out of that single permutation. Calling `.shuffle()`
# separately for each subset draws two independent, unseeded permutations, so
# the same rows can end up in both validation and test, and the split changes
# on every execution. A fixed seed also gives every IMDB run the same split.
imdb_eval_shuffled = imdb_test.shuffle(seed=42)
imdb_val_subset = imdb_eval_shuffled.select(range(6000))
imdb_test_subset = imdb_eval_shuffled.select(range(6000, 25000))

model, _ = train_model(num_labels=2,
                       hidden_dim=16,
                       max_leng=1026,
                       attn_ffn_tuples=tuples,
                       ffn_dim=32,
                       batch_size=64,
                       lr=1e-3,
                       num_repeat_modules=1,
                       train_data=imdb_train,
                       val_data=imdb_val_subset,
                       test_data=imdb_test_subset,
                       max_epochs=3)

INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
/usr/local/lib/python3.12/dist-packages/pytorch_lightning/callbacks/model_checkpoint.py:751: Checkpoint directory /content/checkpoints exists and is not empty.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name              | Type               | Params | Mode 
-----------------------------------------------------------------
0 | model             | Model              | 509 K  | train
1 | loss_fn           | CrossE

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=3` reached.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]

In [27]:
# IMDB run (num_labels=2): one attn/ffn tuple with gru_Attention in the first
# slot and None in the second.
# NOTE(review): presumably None selects train_model's default for that slot — confirm.
tuples = [(gru_Attention, None)]

# Shuffle the IMDB test split ONCE with a fixed seed and slice the validation
# and test subsets out of that single permutation. Calling `.shuffle()`
# separately for each subset draws two independent, unseeded permutations, so
# the same rows can end up in both validation and test, and the split changes
# on every execution. A fixed seed also gives every IMDB run the same split.
imdb_eval_shuffled = imdb_test.shuffle(seed=42)
imdb_val_subset = imdb_eval_shuffled.select(range(6000))
imdb_test_subset = imdb_eval_shuffled.select(range(6000, 25000))

model, _ = train_model(num_labels=2,
                       hidden_dim=16,
                       max_leng=1026,
                       attn_ffn_tuples=tuples,
                       ffn_dim=32,
                       batch_size=64,
                       lr=1e-3,
                       num_repeat_modules=1,
                       train_data=imdb_train,
                       val_data=imdb_val_subset,
                       test_data=imdb_test_subset,
                       max_epochs=3)

INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name              | Type               | Params | Mode 
-----------------------------------------------------------------
0 | model             | Model              | 509 K  | train
1 | loss_fn           | CrossEntropyLoss   | 0      | train
2 | train_accuracy    | MulticlassAccuracy | 0      | train
3 | val_accuracy      | MulticlassAccuracy | 0      | train
4 | train_

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=3` reached.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]

In [28]:
# IMDB run (num_labels=2): one attn/ffn tuple with conv_Attention in the first
# slot and None in the second.
# NOTE(review): presumably None selects train_model's default for that slot — confirm.
tuples = [(conv_Attention, None)]

# Shuffle the IMDB test split ONCE with a fixed seed and slice the validation
# and test subsets out of that single permutation. Calling `.shuffle()`
# separately for each subset draws two independent, unseeded permutations, so
# the same rows can end up in both validation and test, and the split changes
# on every execution. A fixed seed also gives every IMDB run the same split.
imdb_eval_shuffled = imdb_test.shuffle(seed=42)
imdb_val_subset = imdb_eval_shuffled.select(range(6000))
imdb_test_subset = imdb_eval_shuffled.select(range(6000, 25000))

model, _ = train_model(num_labels=2,
                       hidden_dim=16,
                       max_leng=1026,
                       attn_ffn_tuples=tuples,
                       ffn_dim=32,
                       batch_size=64,
                       lr=1e-3,
                       num_repeat_modules=1,
                       train_data=imdb_train,
                       val_data=imdb_val_subset,
                       test_data=imdb_test_subset,
                       max_epochs=3)

INFO:pytorch_lightning.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
/usr/local/lib/python3.12/dist-packages/pytorch_lightning/callbacks/model_checkpoint.py:751: Checkpoint directory /content/checkpoints exists and is not empty.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name              | Type               | Params | Mode 
-----------------------------------------------------------------
0 | model             | Model              | 508 K  | train
1 | loss_fn           | CrossE

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=3` reached.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]