In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Pip installation

In [None]:
pip install transformers torch pandas scikit-learn

### Bigbird on test dataset as train
(because train dataset requires a lot of computation power)

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BigBirdConfig, BigBirdModel
import pandas as pd
from torch.optim import AdamW
from tqdm import tqdm

class ListOpsDataset(Dataset):
    def __init__(self, file_path, max_length=4096): 
        df = pd.read_csv(file_path, sep='\t')
        
        # Filter sequences by length and keep original spacing
        self.texts = []
        self.labels = []
        for text, label in zip(df['Source'], df['Target']):
            if len(text) < 10000:  # Only keep sequences less than 10000 characters
                self.texts.append(text)
                self.labels.append(label)
        
        self.max_length = max_length
        
        self.vocab = {
            'PAD': 0, '[': 1, ']': 2, 'SM': 3, 'MAX': 4,
            'MIN': 5, 'MED': 6, '0': 7, '1': 8, '2': 9,
            '3': 10, '4': 11, '5': 12, '6': 13, '7': 14,
            '8': 15, '9': 16, '(': 17, ')': 18, ' ': 19, '\t': 20
        }
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        tokens = [self.vocab[char] for char in text if char in self.vocab]
        
        # Create attention mask and truncate/pad
        attention_mask = [1] * min(len(tokens), self.max_length)
        tokens = tokens[:self.max_length]
        
        if len(tokens) < self.max_length:
            padding_length = self.max_length - len(tokens)
            tokens = tokens + [self.vocab['PAD']] * padding_length
            attention_mask = attention_mask + [0] * padding_length
            
        return {
            'input_ids': torch.tensor(tokens, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
            'label': torch.tensor(self.labels[idx], dtype=torch.long)
        }


class BigBirdListOps(nn.Module):
    def __init__(self, num_classes=10):
        super().__init__()
        self.config = BigBirdConfig(
            hidden_size=512,      
            num_attention_heads=8,
            intermediate_size=2048,
            num_hidden_layers=6,  
            vocab_size=21,  
            max_position_embeddings=4096,
            attention_type="block_sparse",
            block_size=64,        
            num_random_blocks=3,   
        )
        self.bigbird = BigBirdModel(self.config)
        self.classifier = nn.Linear(512, num_classes) 
        
    def forward(self, input_ids, attention_mask):
        outputs = self.bigbird(input_ids, attention_mask=attention_mask)
        logits = self.classifier(outputs.last_hidden_state[:, 0, :])
        return logits

def validate_model(model, val_loader, device, criterion):
    """
    Validate the model on a validation dataset
    
    Args:
        model (nn.Module): The model to validate
        val_loader (DataLoader): Validation data loader
        device (torch.device): Device to run validation on
        criterion (nn.Module): Loss function
    
    Returns:
        tuple: Average validation loss and accuracy
    """
    model.eval()
    total_loss = 0
    correct_predictions = 0
    total_predictions = 0
    
    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validation"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            
            total_loss += loss.item()
            
            # Calculate accuracy
            _, predicted = torch.max(outputs, 1)
            total_predictions += labels.size(0)
            correct_predictions += (predicted == labels).sum().item()
    
    avg_val_loss = total_loss / len(val_loader)
    val_accuracy = correct_predictions / total_predictions
    
    return avg_val_loss, val_accuracy

def train_model(train_path, val_path, batch_size=8, num_epochs=10, learning_rate=1e-4, save_interval=5):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")
    
    # Create datasets and dataloaders
    train_dataset = ListOpsDataset(train_path)
    val_dataset = ListOpsDataset(val_path)
    
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    
    model = BigBirdListOps().to(device)
    
    # Use gradient accumulation and mixed precision to reduce memory usage
    scaler = torch.cuda.amp.GradScaler()
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()
    
    best_val_accuracy = 0
    
    for epoch in range(num_epochs):
        model.train()
        total_train_loss = 0
        optimizer.zero_grad()
        
        # Training loop
        for i, batch in enumerate(tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} Training")):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            
            # Use mixed precision training
            with torch.cuda.amp.autocast():
                outputs = model(input_ids, attention_mask)
                loss = criterion(outputs, labels)
                loss = loss / 4  # Gradient accumulation
            
            scaler.scale(loss).backward()
            
            # Gradient accumulation
            if (i + 1) % 4 == 0:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
            
            total_train_loss += loss.item()
        
        avg_train_loss = total_train_loss / len(train_loader)
        
        # Validation
        avg_val_loss, val_accuracy = validate_model(model, val_loader, device, criterion)
        
        print(f'\nEpoch {epoch+1}/{num_epochs}:')
        print(f'Training Loss: {avg_train_loss:.4f}')
        print(f'Validation Loss: {avg_val_loss:.4f}')
        print(f'Validation Accuracy: {val_accuracy:.4f}')
        
        # Save best model based on validation accuracy
        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'best_val_accuracy': best_val_accuracy,
            }, 'best_bigbird_listops.pth')
            print(f'Best model saved with validation accuracy: {best_val_accuracy:.4f}')
        
        # Save periodic checkpoints
        if (epoch + 1) % save_interval == 0:
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'train_loss': avg_train_loss,
                'val_loss': avg_val_loss,
                'val_accuracy': val_accuracy,
            }, f'bigbird_listops_epoch_{epoch+1}.pth')
            print(f'Model checkpoint saved at epoch {epoch+1}')

if __name__ == "__main__":
    # Update paths for train and validation datasets
    train_model(
        train_path='/kaggle/input/lra-listops/basic_test.tsv',
        val_path='/kaggle/input/validation/basic_val.tsv'
    )

Using device: cuda


  scaler = torch.cuda.amp.GradScaler()
  with torch.cuda.amp.autocast():
Attention type 'block_sparse' is not possible if sequence_length: 512 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...
Epoch 1/10 Training: 100%|██████████| 436/436 [00:31<00:00, 13.86it/s]
Validation: 100%|██████████| 432/432 [00:22<00:00, 19.39it/s]



Epoch 1/10:
Training Loss: 0.6003
Validation Loss: 2.2578
Validation Accuracy: 0.1922
Best model saved with validation accuracy: 0.1922


Epoch 2/10 Training: 100%|██████████| 436/436 [00:31<00:00, 14.05it/s]
Validation: 100%|██████████| 432/432 [00:21<00:00, 20.47it/s]



Epoch 2/10:
Training Loss: 0.5728
Validation Loss: 2.2762
Validation Accuracy: 0.1691


Epoch 3/10 Training: 100%|██████████| 436/436 [00:31<00:00, 13.94it/s]
Validation: 100%|██████████| 432/432 [00:21<00:00, 20.45it/s]



Epoch 3/10:
Training Loss: 0.5726
Validation Loss: 2.2552
Validation Accuracy: 0.1922


Epoch 4/10 Training: 100%|██████████| 436/436 [00:31<00:00, 13.99it/s]
Validation: 100%|██████████| 432/432 [00:21<00:00, 20.11it/s]



Epoch 4/10:
Training Loss: 0.5702
Validation Loss: 2.2722
Validation Accuracy: 0.1691


Epoch 5/10 Training: 100%|██████████| 436/436 [00:31<00:00, 13.99it/s]
Validation: 100%|██████████| 432/432 [00:21<00:00, 20.33it/s]



Epoch 5/10:
Training Loss: 0.5685
Validation Loss: 2.2496
Validation Accuracy: 0.1922
Model checkpoint saved at epoch 5


Epoch 6/10 Training: 100%|██████████| 436/436 [00:31<00:00, 14.00it/s]
Validation: 100%|██████████| 432/432 [00:21<00:00, 20.36it/s]



Epoch 6/10:
Training Loss: 0.5673
Validation Loss: 2.2869
Validation Accuracy: 0.1691


Epoch 7/10 Training: 100%|██████████| 436/436 [00:31<00:00, 14.00it/s]
Validation: 100%|██████████| 432/432 [00:21<00:00, 20.32it/s]



Epoch 7/10:
Training Loss: 0.5681
Validation Loss: 2.2576
Validation Accuracy: 0.1691


Epoch 8/10 Training: 100%|██████████| 436/436 [00:31<00:00, 14.00it/s]
Validation: 100%|██████████| 432/432 [00:21<00:00, 20.33it/s]



Epoch 8/10:
Training Loss: 0.5666
Validation Loss: 2.2644
Validation Accuracy: 0.1691


Epoch 9/10 Training: 100%|██████████| 436/436 [00:31<00:00, 13.95it/s]
Validation: 100%|██████████| 432/432 [00:21<00:00, 20.29it/s]



Epoch 9/10:
Training Loss: 0.5688
Validation Loss: 2.2525
Validation Accuracy: 0.1691


Epoch 10/10 Training: 100%|██████████| 436/436 [00:31<00:00, 13.98it/s]
Validation: 100%|██████████| 432/432 [00:21<00:00, 20.29it/s]



Epoch 10/10:
Training Loss: 0.5667
Validation Loss: 2.2587
Validation Accuracy: 0.1922
Model checkpoint saved at epoch 10


### Bigbird model on 10 % of train dataset 

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BigBirdConfig, BigBirdModel, PreTrainedTokenizerFast
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, processors
import pandas as pd
from torch.optim import AdamW
from tqdm import tqdm
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def create_custom_tokenizer(train_path):
    df = pd.read_csv(train_path, sep='\t')
    train_texts = df['Source'].iloc[:len(df)//10].tolist()
    
    tokenizer = Tokenizer(models.BPE())
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
    
    trainer = trainers.BpeTrainer(
        vocab_size=23,
        special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]"]
    )
    
    tokenizer.train_from_iterator(train_texts, trainer)
    wrapped_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer)
    wrapped_tokenizer.pad_token = "[PAD]"
    return wrapped_tokenizer

class ListOpsDataset(Dataset):
    def __init__(self, file_path, tokenizer, max_length=8192):
        df = pd.read_csv(file_path, sep='\t')
        df = df.iloc[:len(df)//10]
        self.texts = df['Source'].tolist()
        self.labels = df['Target'].tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'][0],
            'attention_mask': encoding['attention_mask'][0],
            'label': torch.tensor(self.labels[idx], dtype=torch.long)
        }

class BigBirdListOps(nn.Module):
    def __init__(self, vocab_size, num_classes=10):
        super().__init__()
        self.config = BigBirdConfig(
            hidden_size=8,
            num_attention_heads=2,
            intermediate_size=512,
            num_hidden_layers=2,
            vocab_size=vocab_size,
            max_position_embeddings=8192,
            attention_type="block_sparse",
            block_size=64,
            num_random_blocks=2
        )
        self.bigbird = BigBirdModel(self.config)
        self.classifier = nn.Linear(8, num_classes)
        
    def forward(self, input_ids, attention_mask):
        outputs = self.bigbird(input_ids, attention_mask=attention_mask)
        logits = self.classifier(outputs.last_hidden_state[:, 0, :])
        return logits

def evaluate(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            
            total_loss += loss.item()
            preds = torch.argmax(outputs, dim=1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())
    
    accuracy = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='weighted')
    
    return {
        'loss': total_loss / len(dataloader),
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

def train_model(train_path, val_path, batch_size=2, num_epochs=5, learning_rate=1e-4):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    tokenizer = create_custom_tokenizer(train_path)
    
    train_dataset = ListOpsDataset(train_path, tokenizer)
    val_dataset = ListOpsDataset(val_path, tokenizer)
    
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    
    model = BigBirdListOps(vocab_size=tokenizer.vocab_size).to(device)
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()
    
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        
        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            
            total_loss += loss.item()
        
        train_metrics = evaluate(model, train_loader, criterion, device)
        val_metrics = evaluate(model, val_loader, criterion, device)
        
        print(f'Epoch {epoch+1}:')
        print(f'Train - Loss: {train_metrics["loss"]:.4f}, Accuracy: {train_metrics["accuracy"]:.4f}, F1: {train_metrics["f1"]:.4f}')
        print(f'Val   - Loss: {val_metrics["loss"]:.4f}, Accuracy: {val_metrics["accuracy"]:.4f}, F1: {val_metrics["f1"]:.4f}')
        
        torch.save(model.state_dict(), f'bigbird_listops_epoch_{epoch+1}.pth')

if __name__ == "__main__":
    train_model(
        train_path='/kaggle/input/lra-listops/basic_train.tsv',
        val_path='/kaggle/input/validation/basic_val.tsv'
    )






Epoch 1/5: 100%|██████████| 4800/4800 [08:48<00:00,  9.08it/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 1:
Train - Loss: 2.2567, Accuracy: 0.1635, F1: 0.0460
Val   - Loss: 2.2697, Accuracy: 0.1550, F1: 0.0416


Epoch 2/5: 100%|██████████| 4800/4800 [08:48<00:00,  9.08it/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 2:
Train - Loss: 2.2590, Accuracy: 0.1643, F1: 0.0464
Val   - Loss: 2.2715, Accuracy: 0.1600, F1: 0.0441


Epoch 3/5: 100%|██████████| 4800/4800 [08:48<00:00,  9.08it/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 3:
Train - Loss: 2.2562, Accuracy: 0.1643, F1: 0.0464
Val   - Loss: 2.2655, Accuracy: 0.1600, F1: 0.0441


Epoch 4/5: 100%|██████████| 4800/4800 [08:47<00:00,  9.10it/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 4:
Train - Loss: 2.2570, Accuracy: 0.1635, F1: 0.0460
Val   - Loss: 2.2819, Accuracy: 0.1550, F1: 0.0416


Epoch 5/5: 100%|██████████| 4800/4800 [08:47<00:00,  9.10it/s]


### RNN with attention 

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from torch.nn import functional as F
from tqdm import tqdm

class ListOpsDataset(Dataset):
    def __init__(self, file_path, max_length=4096):
        df = pd.read_csv(file_path, sep='\t')
        self.texts = []
        self.labels = []
        for text, label in zip(df['Source'], df['Target']):
            if len(text) < 10000:
                self.texts.append(text)
                self.labels.append(label)
        
        self.max_length = max_length
        self.vocab = {
            'PAD': 0, '[': 1, ']': 2, 'SM': 3, 'MAX': 4,
            'MIN': 5, 'MED': 6, '0': 7, '1': 8, '2': 9,
            '3': 10, '4': 11, '5': 12, '6': 13, '7': 14,
            '8': 15, '9': 16, '(': 17, ')': 18, ' ': 19, '\t': 20
        }
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        tokens = [self.vocab[char] for char in text if char in self.vocab]
        
        attention_mask = [1] * min(len(tokens), self.max_length)
        tokens = tokens[:self.max_length]
        
        if len(tokens) < self.max_length:
            padding_length = self.max_length - len(tokens)
            tokens = tokens + [self.vocab['PAD']] * padding_length
            attention_mask = attention_mask + [0] * padding_length
            
        return {
            'input_ids': torch.tensor(tokens, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
            'label': torch.tensor(self.labels[idx], dtype=torch.long)
        }

class RNNAttention(nn.Module):
    def __init__(self, vocab_size=21, embedding_dim=128, hidden_size=256, num_layers=2, num_classes=10):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_size, num_layers, batch_first=True, bidirectional=True)
        
        self.attention = nn.Sequential(
            nn.Linear(hidden_size * 2, hidden_size),
            nn.Tanh(),
            nn.Linear(hidden_size, 1, bias=False)
        )
        
        self.fc = nn.Linear(hidden_size * 2, num_classes)
        self.dropout = nn.Dropout(0.3)

    def attention_net(self, lstm_output, mask):
        attention_weights = self.attention(lstm_output).squeeze(-1)
        attention_weights = attention_weights.masked_fill(mask == 0, float('-inf'))
        attention_weights = F.softmax(attention_weights, dim=1)
        context = torch.bmm(attention_weights.unsqueeze(1), lstm_output).squeeze(1)
        return context

    def forward(self, input_ids, attention_mask):
        embedded = self.dropout(self.embedding(input_ids))
        output, (hidden, cell) = self.rnn(embedded)
        
        context = self.attention_net(output, attention_mask)
        output = self.fc(context)
        return output

def evaluate(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    correct = 0
    total = 0
    
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            
            total_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    
    return total_loss / len(dataloader), 100 * correct / total

def train_model(train_path, val_path, batch_size=32, num_epochs=10, learning_rate=1e-3):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")
    
    train_dataset = ListOpsDataset(train_path)
    val_dataset = ListOpsDataset(val_path)
    
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)
    
    model = RNNAttention().to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    scaler = torch.cuda.amp.GradScaler()
    
    best_val_acc = 0
    
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        
        for batch in tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            
            optimizer.zero_grad()
            
            with torch.cuda.amp.autocast():
                outputs = model(input_ids, attention_mask)
                loss = criterion(outputs, labels)
            
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            
            total_loss += loss.item()
        
        train_loss = total_loss / len(train_loader)
        val_loss, val_acc = evaluate(model, val_loader, criterion, device)
        
        print(f'Epoch {epoch+1}:')
        print(f'Training Loss: {train_loss:.4f}')
        print(f'Validation Loss: {val_loss:.4f}')
        print(f'Validation Accuracy: {val_acc:.2f}%')
        
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), 'best_model.pt')
            print(f'New best model saved with validation accuracy: {val_acc:.2f}%')
        
        print()
    
    return model

if __name__ == "__main__":
    train_model(
        train_path='/kaggle/input/lra-listops/basic_test.tsv',
        val_path='/kaggle/input/validation/basic_val.tsv'
    )

Using device: cuda


  scaler = torch.cuda.amp.GradScaler()

  with torch.cuda.amp.autocast():

Epoch 1/10:   2%|▏         | 1/55 [00:00<00:53,  1.01it/s][A
Epoch 1/10:   4%|▎         | 2/55 [00:01<00:49,  1.06it/s][A
Epoch 1/10:   5%|▌         | 3/55 [00:02<00:47,  1.08it/s][A
Epoch 1/10:   7%|▋         | 4/55 [00:03<00:46,  1.09it/s][A
Epoch 1/10:   9%|▉         | 5/55 [00:04<00:45,  1.09it/s][A
Epoch 1/10:  11%|█         | 6/55 [00:05<00:44,  1.10it/s][A
Epoch 1/10:  13%|█▎        | 7/55 [00:06<00:43,  1.10it/s][A
Epoch 1/10:  15%|█▍        | 8/55 [00:07<00:42,  1.10it/s][A
Epoch 1/10:  16%|█▋        | 9/55 [00:08<00:41,  1.10it/s][A
Epoch 1/10:  18%|█▊        | 10/55 [00:09<00:40,  1.10it/s][A
Epoch 1/10:  20%|██        | 11/55 [00:10<00:39,  1.10it/s][A
Epoch 1/10:  22%|██▏       | 12/55 [00:10<00:39,  1.10it/s][A
Epoch 1/10:  24%|██▎       | 13/55 [00:11<00:38,  1.10it/s][A
Epoch 1/10:  25%|██▌       | 14/55 [00:12<00:37,  1.10it/s][A
Epoch 1/10:  27%|██▋       | 15/55 [00:13<00:36,  1.

Epoch 1:
Training Loss: 2.2607
Validation Loss: 2.2361
Validation Accuracy: 19.22%
New best model saved with validation accuracy: 19.22%




Epoch 2/10:   0%|          | 0/55 [00:00<?, ?it/s][A
Epoch 2/10:   2%|▏         | 1/55 [00:00<00:49,  1.08it/s][A
Epoch 2/10:   4%|▎         | 2/55 [00:01<00:48,  1.10it/s][A
Epoch 2/10:   5%|▌         | 3/55 [00:02<00:47,  1.10it/s][A
Epoch 2/10:   7%|▋         | 4/55 [00:03<00:46,  1.10it/s][A
Epoch 2/10:   9%|▉         | 5/55 [00:04<00:45,  1.10it/s][A
Epoch 2/10:  11%|█         | 6/55 [00:05<00:44,  1.10it/s][A
Epoch 2/10:  13%|█▎        | 7/55 [00:06<00:43,  1.11it/s][A
Epoch 2/10:  15%|█▍        | 8/55 [00:07<00:43,  1.09it/s][A
Epoch 2/10:  16%|█▋        | 9/55 [00:08<00:42,  1.09it/s][A
Epoch 2/10:  18%|█▊        | 10/55 [00:09<00:41,  1.10it/s][A
Epoch 2/10:  20%|██        | 11/55 [00:10<00:39,  1.10it/s][A
Epoch 2/10:  22%|██▏       | 12/55 [00:10<00:39,  1.10it/s][A
Epoch 2/10:  24%|██▎       | 13/55 [00:11<00:38,  1.10it/s][A
Epoch 2/10:  25%|██▌       | 14/55 [00:12<00:37,  1.10it/s][A
Epoch 2/10:  27%|██▋       | 15/55 [00:13<00:36,  1.11it/s][A
Epoch 2/1

Epoch 2:
Training Loss: 2.2439
Validation Loss: 2.2393
Validation Accuracy: 16.91%




Epoch 3/10:   0%|          | 0/55 [00:00<?, ?it/s][A
Epoch 3/10:   2%|▏         | 1/55 [00:00<00:49,  1.09it/s][A
Epoch 3/10:   4%|▎         | 2/55 [00:01<00:48,  1.10it/s][A
Epoch 3/10:   5%|▌         | 3/55 [00:02<00:47,  1.10it/s][A
Epoch 3/10:   7%|▋         | 4/55 [00:03<00:46,  1.10it/s][A
Epoch 3/10:   9%|▉         | 5/55 [00:04<00:45,  1.10it/s][A
Epoch 3/10:  11%|█         | 6/55 [00:05<00:44,  1.10it/s][A
Epoch 3/10:  13%|█▎        | 7/55 [00:06<00:43,  1.10it/s][A
Epoch 3/10:  15%|█▍        | 8/55 [00:07<00:42,  1.11it/s][A
Epoch 3/10:  16%|█▋        | 9/55 [00:08<00:41,  1.10it/s][A
Epoch 3/10:  18%|█▊        | 10/55 [00:09<00:40,  1.10it/s][A
Epoch 3/10:  20%|██        | 11/55 [00:09<00:39,  1.11it/s][A
Epoch 3/10:  22%|██▏       | 12/55 [00:10<00:38,  1.10it/s][A
Epoch 3/10:  24%|██▎       | 13/55 [00:11<00:38,  1.10it/s][A
Epoch 3/10:  25%|██▌       | 14/55 [00:12<00:37,  1.10it/s][A
Epoch 3/10:  27%|██▋       | 15/55 [00:13<00:36,  1.10it/s][A
Epoch 3/1

Epoch 3:
Training Loss: 2.2429
Validation Loss: 2.2375
Validation Accuracy: 16.91%




Epoch 4/10:   0%|          | 0/55 [00:00<?, ?it/s][A
Epoch 4/10:   2%|▏         | 1/55 [00:00<00:48,  1.11it/s][A
Epoch 4/10:   4%|▎         | 2/55 [00:01<00:48,  1.10it/s][A
Epoch 4/10:   5%|▌         | 3/55 [00:02<00:47,  1.09it/s][A
Epoch 4/10:   7%|▋         | 4/55 [00:03<00:46,  1.10it/s][A
Epoch 4/10:   9%|▉         | 5/55 [00:04<00:45,  1.09it/s][A
Epoch 4/10:  11%|█         | 6/55 [00:05<00:45,  1.08it/s][A
Epoch 4/10:  13%|█▎        | 7/55 [00:06<00:44,  1.08it/s][A
Epoch 4/10:  15%|█▍        | 8/55 [00:07<00:43,  1.09it/s][A
Epoch 4/10:  16%|█▋        | 9/55 [00:08<00:41,  1.10it/s][A
Epoch 4/10:  18%|█▊        | 10/55 [00:09<00:40,  1.10it/s][A
Epoch 4/10:  20%|██        | 11/55 [00:10<00:39,  1.10it/s][A
Epoch 4/10:  22%|██▏       | 12/55 [00:10<00:39,  1.10it/s][A
Epoch 4/10:  24%|██▎       | 13/55 [00:11<00:38,  1.10it/s][A
Epoch 4/10:  25%|██▌       | 14/55 [00:12<00:37,  1.10it/s][A
Epoch 4/10:  27%|██▋       | 15/55 [00:13<00:36,  1.11it/s][A
Epoch 4/1

Epoch 4:
Training Loss: 2.2422
Validation Loss: 2.2360
Validation Accuracy: 16.91%




Epoch 5/10:   0%|          | 0/55 [00:00<?, ?it/s][A
Epoch 5/10:   2%|▏         | 1/55 [00:00<00:48,  1.11it/s][A
Epoch 5/10:   4%|▎         | 2/55 [00:01<00:48,  1.09it/s][A
Epoch 5/10:   5%|▌         | 3/55 [00:02<00:47,  1.10it/s][A
Epoch 5/10:   7%|▋         | 4/55 [00:03<00:46,  1.10it/s][A
Epoch 5/10:   9%|▉         | 5/55 [00:04<00:45,  1.10it/s][A
Epoch 5/10:  11%|█         | 6/55 [00:05<00:44,  1.10it/s][A
Epoch 5/10:  13%|█▎        | 7/55 [00:06<00:43,  1.10it/s][A
Epoch 5/10:  15%|█▍        | 8/55 [00:07<00:42,  1.10it/s][A
Epoch 5/10:  16%|█▋        | 9/55 [00:08<00:41,  1.10it/s][A
Epoch 5/10:  18%|█▊        | 10/55 [00:09<00:41,  1.10it/s][A
Epoch 5/10:  20%|██        | 11/55 [00:10<00:40,  1.10it/s][A
Epoch 5/10:  22%|██▏       | 12/55 [00:10<00:38,  1.10it/s][A
Epoch 5/10:  24%|██▎       | 13/55 [00:11<00:38,  1.10it/s][A
Epoch 5/10:  25%|██▌       | 14/55 [00:12<00:37,  1.11it/s][A
Epoch 5/10:  27%|██▋       | 15/55 [00:13<00:36,  1.09it/s][A
Epoch 5/1

Epoch 5:
Training Loss: 2.2432
Validation Loss: 2.2356
Validation Accuracy: 19.22%




Epoch 6/10:   0%|          | 0/55 [00:00<?, ?it/s][A
Epoch 6/10:   2%|▏         | 1/55 [00:00<00:48,  1.11it/s][A
Epoch 6/10:   4%|▎         | 2/55 [00:01<00:48,  1.09it/s][A
Epoch 6/10:   5%|▌         | 3/55 [00:02<00:47,  1.10it/s][A
Epoch 6/10:   7%|▋         | 4/55 [00:03<00:46,  1.11it/s][A
Epoch 6/10:   9%|▉         | 5/55 [00:04<00:45,  1.11it/s][A
Epoch 6/10:  11%|█         | 6/55 [00:05<00:44,  1.10it/s][A
Epoch 6/10:  13%|█▎        | 7/55 [00:06<00:43,  1.11it/s][A
Epoch 6/10:  15%|█▍        | 8/55 [00:07<00:42,  1.11it/s][A
Epoch 6/10:  16%|█▋        | 9/55 [00:08<00:41,  1.10it/s][A
Epoch 6/10:  18%|█▊        | 10/55 [00:09<00:40,  1.10it/s][A
Epoch 6/10:  20%|██        | 11/55 [00:09<00:39,  1.10it/s][A
Epoch 6/10:  22%|██▏       | 12/55 [00:10<00:39,  1.10it/s][A
Epoch 6/10:  24%|██▎       | 13/55 [00:11<00:38,  1.09it/s][A
Epoch 6/10:  25%|██▌       | 14/55 [00:12<00:37,  1.10it/s][A
Epoch 6/10:  27%|██▋       | 15/55 [00:13<00:36,  1.10it/s][A
Epoch 6/1

Epoch 6:
Training Loss: 2.2430
Validation Loss: 2.2360
Validation Accuracy: 19.22%




Epoch 7/10:   0%|          | 0/55 [00:00<?, ?it/s][A
Epoch 7/10:   2%|▏         | 1/55 [00:00<00:48,  1.10it/s][A
Epoch 7/10:   4%|▎         | 2/55 [00:01<00:48,  1.10it/s][A
Epoch 7/10:   5%|▌         | 3/55 [00:02<00:47,  1.10it/s][A
Epoch 7/10:   7%|▋         | 4/55 [00:03<00:46,  1.11it/s][A
Epoch 7/10:   9%|▉         | 5/55 [00:04<00:45,  1.10it/s][A
Epoch 7/10:  11%|█         | 6/55 [00:05<00:44,  1.10it/s][A
Epoch 7/10:  13%|█▎        | 7/55 [00:06<00:43,  1.10it/s][A
Epoch 7/10:  15%|█▍        | 8/55 [00:07<00:42,  1.10it/s][A
Epoch 7/10:  16%|█▋        | 9/55 [00:08<00:41,  1.11it/s][A
Epoch 7/10:  18%|█▊        | 10/55 [00:09<00:40,  1.10it/s][A
Epoch 7/10:  20%|██        | 11/55 [00:09<00:39,  1.10it/s][A
Epoch 7/10:  22%|██▏       | 12/55 [00:10<00:38,  1.10it/s][A
Epoch 7/10:  24%|██▎       | 13/55 [00:11<00:37,  1.11it/s][A
Epoch 7/10:  25%|██▌       | 14/55 [00:12<00:37,  1.11it/s][A
Epoch 7/10:  27%|██▋       | 15/55 [00:13<00:36,  1.11it/s][A
Epoch 7/1

Epoch 7:
Training Loss: 2.2426
Validation Loss: 2.2333
Validation Accuracy: 19.22%




Epoch 8/10:   0%|          | 0/55 [00:00<?, ?it/s][A
Epoch 8/10:   2%|▏         | 1/55 [00:00<00:48,  1.12it/s][A
Epoch 8/10:   4%|▎         | 2/55 [00:01<00:47,  1.11it/s][A
Epoch 8/10:   5%|▌         | 3/55 [00:02<00:46,  1.11it/s][A
Epoch 8/10:   7%|▋         | 4/55 [00:03<00:46,  1.11it/s][A
Epoch 8/10:   9%|▉         | 5/55 [00:04<00:45,  1.11it/s][A
Epoch 8/10:  11%|█         | 6/55 [00:05<00:45,  1.08it/s][A
Epoch 8/10:  13%|█▎        | 7/55 [00:06<00:45,  1.06it/s][A
Epoch 8/10:  15%|█▍        | 8/55 [00:07<00:43,  1.07it/s][A
Epoch 8/10:  16%|█▋        | 9/55 [00:08<00:42,  1.07it/s][A
Epoch 8/10:  18%|█▊        | 10/55 [00:09<00:41,  1.08it/s][A
Epoch 8/10:  20%|██        | 11/55 [00:10<00:40,  1.09it/s][A
Epoch 8/10:  22%|██▏       | 12/55 [00:11<00:39,  1.10it/s][A
Epoch 8/10:  24%|██▎       | 13/55 [00:11<00:38,  1.10it/s][A
Epoch 8/10:  25%|██▌       | 14/55 [00:12<00:37,  1.10it/s][A
Epoch 8/10:  27%|██▋       | 15/55 [00:13<00:36,  1.10it/s][A
Epoch 8/1

Epoch 8:
Training Loss: 2.2404
Validation Loss: 2.2347
Validation Accuracy: 19.22%




Epoch 9/10:   0%|          | 0/55 [00:00<?, ?it/s][A
Epoch 9/10:   2%|▏         | 1/55 [00:00<00:48,  1.11it/s][A
Epoch 9/10:   4%|▎         | 2/55 [00:01<00:48,  1.09it/s][A
Epoch 9/10:   5%|▌         | 3/55 [00:02<00:47,  1.09it/s][A
Epoch 9/10:   7%|▋         | 4/55 [00:03<00:46,  1.10it/s][A
Epoch 9/10:   9%|▉         | 5/55 [00:04<00:45,  1.10it/s][A
Epoch 9/10:  11%|█         | 6/55 [00:05<00:44,  1.10it/s][A
Epoch 9/10:  13%|█▎        | 7/55 [00:06<00:43,  1.11it/s][A
Epoch 9/10:  15%|█▍        | 8/55 [00:07<00:42,  1.11it/s][A
Epoch 9/10:  16%|█▋        | 9/55 [00:08<00:41,  1.11it/s][A
Epoch 9/10:  18%|█▊        | 10/55 [00:09<00:40,  1.10it/s][A
Epoch 9/10:  20%|██        | 11/55 [00:09<00:39,  1.11it/s][A
Epoch 9/10:  22%|██▏       | 12/55 [00:10<00:39,  1.10it/s][A
Epoch 9/10:  24%|██▎       | 13/55 [00:11<00:38,  1.09it/s][A
Epoch 9/10:  25%|██▌       | 14/55 [00:12<00:37,  1.10it/s][A
Epoch 9/10:  27%|██▋       | 15/55 [00:13<00:37,  1.07it/s][A
Epoch 9/1

Epoch 9:
Training Loss: 2.2401
Validation Loss: 2.2352
Validation Accuracy: 19.22%




Epoch 10/10:   0%|          | 0/55 [00:00<?, ?it/s][A
Epoch 10/10:   2%|▏         | 1/55 [00:00<00:48,  1.10it/s][A
Epoch 10/10:   4%|▎         | 2/55 [00:01<00:47,  1.11it/s][A
Epoch 10/10:   5%|▌         | 3/55 [00:02<00:47,  1.11it/s][A
Epoch 10/10:   7%|▋         | 4/55 [00:03<00:46,  1.11it/s][A
Epoch 10/10:   9%|▉         | 5/55 [00:04<00:45,  1.11it/s][A
Epoch 10/10:  11%|█         | 6/55 [00:05<00:44,  1.10it/s][A
Epoch 10/10:  13%|█▎        | 7/55 [00:06<00:43,  1.09it/s][A
Epoch 10/10:  15%|█▍        | 8/55 [00:07<00:42,  1.10it/s][A
Epoch 10/10:  16%|█▋        | 9/55 [00:08<00:41,  1.10it/s][A
Epoch 10/10:  18%|█▊        | 10/55 [00:09<00:41,  1.10it/s][A
Epoch 10/10:  20%|██        | 11/55 [00:10<00:40,  1.10it/s][A
Epoch 10/10:  22%|██▏       | 12/55 [00:10<00:39,  1.10it/s][A
Epoch 10/10:  24%|██▎       | 13/55 [00:11<00:38,  1.10it/s][A
Epoch 10/10:  25%|██▌       | 14/55 [00:12<00:37,  1.10it/s][A
Epoch 10/10:  27%|██▋       | 15/55 [00:13<00:36,  1.10it

Epoch 10:
Training Loss: 2.2394
Validation Loss: 2.2345
Validation Accuracy: 19.22%



### Bigbird model with preprocessing and tokenizer using 10 % of test dataset

In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BigBirdConfig, BigBirdModel, PreTrainedTokenizerFast
from tokenizers import Tokenizer, models, pre_tokenizers, processors
import pandas as pd
from torch.optim import AdamW
from tqdm import tqdm
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def load_tokenizer():
    # Create vocabulary
    vocab = {
        "[PAD]": 0, "[UNK]": 1, "[CLS]": 2, "[SEP]": 3, "[MASK]": 4,
        "(": 5, ")": 6, "[": 7, "]": 8, "8": 9, "7": 10, "9": 11,
        "5": 12, "4": 13, "6": 14, "3": 15, "1": 16, "0": 17, "2": 18,
        "MIN": 19, "MED": 20, "MAX": 21, "SM": 22
    }
    
    # Create a WordLevel tokenizer
    tokenizer = Tokenizer(models.WordLevel(vocab, unk_token="[UNK]"))
    
    # Add whitespace pre-tokenizer
    tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()
    
    # Add special tokens
    special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
    
    # Create the wrapped tokenizer
    wrapped_tokenizer = PreTrainedTokenizerFast(
        tokenizer_object=tokenizer,
        unk_token="[UNK]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        sep_token="[SEP]",
        mask_token="[MASK]"
    )
    
    return wrapped_tokenizer

class ListOpsDataset(Dataset):
    def __init__(self, file_path, tokenizer, max_length=8192, use_subset=True):
        df = pd.read_csv(file_path, sep='\t')
        
        # Use only 10% of the data if use_subset is True
        if use_subset:
            df = df.sample(frac=0.1, random_state=42)
        
        self.texts = df['Source'].tolist()
        self.labels = df['Target'].tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'][0],
            'attention_mask': encoding['attention_mask'][0],
            'label': torch.tensor(self.labels[idx], dtype=torch.long)
        }

class BigBirdListOps(nn.Module):
    def __init__(self, vocab_size=23, num_classes=10):
        super().__init__()
        self.config = BigBirdConfig(
            hidden_size=8,
            num_attention_heads=4,
            intermediate_size=512,
            num_hidden_layers=2,
            vocab_size=vocab_size,
            max_position_embeddings=8192,
            attention_type="block_sparse",
            block_size=64,
            num_random_blocks=2
        )
        self.bigbird = BigBirdModel(self.config)
        self.classifier = nn.Linear(8, num_classes)
        
    def forward(self, input_ids, attention_mask):
        outputs = self.bigbird(input_ids, attention_mask=attention_mask)
        logits = self.classifier(outputs.last_hidden_state[:, 0, :])
        return logits

def evaluate(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            
            total_loss += loss.item()
            preds = torch.argmax(outputs, dim=1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())
    
    accuracy = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='weighted')
    
    return {
        'loss': total_loss / len(dataloader),
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

def train_model(train_path, val_path, batch_size=10, num_epochs=5, learning_rate=1e-4):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    tokenizer = load_tokenizer()
    
    # Create datasets with 10% of the data
    train_dataset = ListOpsDataset(train_path, tokenizer, use_subset=True)
    val_dataset = ListOpsDataset(val_path, tokenizer, use_subset=True)
    
    print(f"Training on {len(train_dataset)} samples")
    print(f"Validating on {len(val_dataset)} samples")
    
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    
    model = BigBirdListOps(vocab_size=23).to(device)
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()
    
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        
        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            
            total_loss += loss.item()
        
        train_metrics = evaluate(model, train_loader, criterion, device)
        val_metrics = evaluate(model, val_loader, criterion, device)
        
        print(f'Epoch {epoch+1}:')
        print(f'Train - Loss: {train_metrics["loss"]:.4f}, Accuracy: {train_metrics["accuracy"]:.4f}, F1: {train_metrics["f1"]:.4f}')
        print(f'Val   - Loss: {val_metrics["loss"]:.4f}, Accuracy: {val_metrics["accuracy"]:.4f}, F1: {val_metrics["f1"]:.4f}')
        
        torch.save(model.state_dict(), f'bigbird_listops_epoch_{epoch+1}.pth')

if __name__ == "__main__":
    train_model(
        train_path='/kaggle/input/lra-listops/basic_train.tsv',
        val_path='/kaggle/input/validation/basic_val.tsv'
    )



Training on 9600 samples
Validating on 200 samples


Epoch 1/5: 100%|██████████| 960/960 [10:38<00:00,  1.50it/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 1:
Train - Loss: 2.2711, Accuracy: 0.1654, F1: 0.0470
Val   - Loss: 2.2344, Accuracy: 0.1750, F1: 0.0521


Epoch 2/5: 100%|██████████| 960/960 [10:38<00:00,  1.50it/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 2:
Train - Loss: 2.2604, Accuracy: 0.1627, F1: 0.0455
Val   - Loss: 2.2252, Accuracy: 0.1850, F1: 0.0578


Epoch 3/5: 100%|██████████| 960/960 [10:37<00:00,  1.51it/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 3:
Train - Loss: 2.2585, Accuracy: 0.1654, F1: 0.0470
Val   - Loss: 2.2361, Accuracy: 0.1750, F1: 0.0521


Epoch 4/5: 100%|██████████| 960/960 [10:38<00:00,  1.50it/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 4:
Train - Loss: 2.2570, Accuracy: 0.1654, F1: 0.0470
Val   - Loss: 2.2350, Accuracy: 0.1750, F1: 0.0521


Epoch 5/5: 100%|██████████| 960/960 [10:37<00:00,  1.51it/s]
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 5:
Train - Loss: 2.2558, Accuracy: 0.1654, F1: 0.0470
Val   - Loss: 2.2317, Accuracy: 0.1750, F1: 0.0521


  _warn_prf(average, modifier, msg_start, len(result))


### Bigbird Model using test and train datasets with sequences of 20 

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BigBirdConfig, BigBirdModel, PreTrainedTokenizerFast
from tokenizers import Tokenizer, models, pre_tokenizers, processors
import pandas as pd
from torch.optim import AdamW
from tqdm import tqdm
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def load_tokenizer():
    # Create vocabulary
    vocab = {
        "[PAD]": 0, "[UNK]": 1, "[CLS]": 2, "[SEP]": 3, "[MASK]": 4,
        "(": 5, ")": 6, "[": 7, "]": 8, "8": 9, "7": 10, "9": 11,
        "5": 12, "4": 13, "6": 14, "3": 15, "1": 16, "0": 17, "2": 18,
        "MIN": 19, "MED": 20, "MAX": 21, "SM": 22
    }
    
    # Create a WordLevel tokenizer
    tokenizer = Tokenizer(models.WordLevel(vocab, unk_token="[UNK]"))
    
    # Add whitespace pre-tokenizer
    tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()
    
    # Add special tokens
    special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
    
    # Create the wrapped tokenizer
    wrapped_tokenizer = PreTrainedTokenizerFast(
        tokenizer_object=tokenizer,
        unk_token="[UNK]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        sep_token="[SEP]",
        mask_token="[MASK]"
    )
    
    return wrapped_tokenizer

class ListOpsDataset(Dataset):
    def __init__(self, file_path, tokenizer, max_length=1024, use_subset=True):
        df = pd.read_csv(file_path, sep='\t')
        
        # Use only 10% of the data if use_subset is True
        if use_subset:
            df = df.sample(frac=0.1, random_state=42)
        
        self.texts = df['Source'].tolist()
        self.labels = df['Target'].tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'][0],
            'attention_mask': encoding['attention_mask'][0],
            'label': torch.tensor(self.labels[idx], dtype=torch.long)
        }

class BigBirdListOps(nn.Module):
    def __init__(self, vocab_size=23, num_classes=10):
        super().__init__()
        self.config = BigBirdConfig(
            hidden_size=8,
            num_attention_heads=4,
            intermediate_size=512,
            num_hidden_layers=2,
            vocab_size=vocab_size,
            max_position_embeddings=1024,
            attention_type="block_sparse",
            block_size=64,
            num_random_blocks=2
        )
        self.bigbird = BigBirdModel(self.config)
        self.classifier = nn.Linear(8, num_classes)
        
    def forward(self, input_ids, attention_mask):
        outputs = self.bigbird(input_ids, attention_mask=attention_mask)
        logits = self.classifier(outputs.last_hidden_state[:, 0, :])
        return logits

def evaluate(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            
            total_loss += loss.item()
            preds = torch.argmax(outputs, dim=1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())
    
    accuracy = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='weighted')
    
    return {
        'loss': total_loss / len(dataloader),
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

def train_model(train_path, val_path, batch_size=10, num_epochs=10, learning_rate=1e-3):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    tokenizer = load_tokenizer()
    
    # Create datasets with 10% of the data
    train_dataset = ListOpsDataset(train_path, tokenizer, use_subset=True)
    val_dataset = ListOpsDataset(val_path, tokenizer, use_subset=True)
    
    print(f"Training on {len(train_dataset)} samples")
    print(f"Validating on {len(val_dataset)} samples")
    
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    
    model = BigBirdListOps(vocab_size=23).to(device)
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()
    
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        
        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            
            total_loss += loss.item()
        
        train_metrics = evaluate(model, train_loader, criterion, device)
        val_metrics = evaluate(model, val_loader, criterion, device)
        
        print(f'Epoch {epoch+1}:')
        print(f'Train - Loss: {train_metrics["loss"]:.4f}, Accuracy: {train_metrics["accuracy"]:.4f}, F1: {train_metrics["f1"]:.4f}')
        print(f'Val   - Loss: {val_metrics["loss"]:.4f}, Accuracy: {val_metrics["accuracy"]:.4f}, F1: {val_metrics["f1"]:.4f}')
        
        torch.save(model.state_dict(), f'bigbird_listops_epoch_{epoch+1}.pth')

if __name__ == "__main__":
    train_model(
        train_path='/kaggle/input/listops/train_d20s.tsv',
        val_path='/kaggle/input/listops/test_d20s.tsv'
    )



Training on 9000 samples
Validating on 1000 samples


Epoch 1/10: 100%|██████████| 900/900 [01:07<00:00, 13.37it/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 1:
Train - Loss: 2.3058, Accuracy: 0.1127, F1: 0.0355
Val   - Loss: 2.2934, Accuracy: 0.1160, F1: 0.0357


Epoch 2/10: 100%|██████████| 900/900 [01:07<00:00, 13.32it/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 2:
Train - Loss: 2.2483, Accuracy: 0.1832, F1: 0.1031
Val   - Loss: 2.2694, Accuracy: 0.1630, F1: 0.0899


Epoch 3/10: 100%|██████████| 900/900 [01:07<00:00, 13.30it/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 3:
Train - Loss: 2.2366, Accuracy: 0.1788, F1: 0.1241
Val   - Loss: 2.2625, Accuracy: 0.1680, F1: 0.1180


Epoch 4/10: 100%|██████████| 900/900 [01:07<00:00, 13.32it/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 4:
Train - Loss: 2.2229, Accuracy: 0.1931, F1: 0.1107
Val   - Loss: 2.2496, Accuracy: 0.1770, F1: 0.1050


Epoch 5/10: 100%|██████████| 900/900 [01:07<00:00, 13.31it/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 5:
Train - Loss: 2.2231, Accuracy: 0.1900, F1: 0.1229
Val   - Loss: 2.2530, Accuracy: 0.1740, F1: 0.1087


Epoch 6/10: 100%|██████████| 900/900 [01:07<00:00, 13.31it/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 6:
Train - Loss: 2.2055, Accuracy: 0.2063, F1: 0.1285
Val   - Loss: 2.2540, Accuracy: 0.1850, F1: 0.1177


Epoch 7/10: 100%|██████████| 900/900 [01:07<00:00, 13.28it/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 7:
Train - Loss: 2.2019, Accuracy: 0.2089, F1: 0.1388
Val   - Loss: 2.2615, Accuracy: 0.1820, F1: 0.1211


Epoch 8/10: 100%|██████████| 900/900 [01:07<00:00, 13.27it/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 8:
Train - Loss: 2.1898, Accuracy: 0.2187, F1: 0.1395
Val   - Loss: 2.2481, Accuracy: 0.1840, F1: 0.1171


Epoch 9/10: 100%|██████████| 900/900 [01:07<00:00, 13.27it/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 9:
Train - Loss: 2.1780, Accuracy: 0.2219, F1: 0.1434
Val   - Loss: 2.2489, Accuracy: 0.1910, F1: 0.1214


Epoch 10/10: 100%|██████████| 900/900 [01:07<00:00, 13.26it/s]
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 10:
Train - Loss: 2.1707, Accuracy: 0.2211, F1: 0.1464
Val   - Loss: 2.2458, Accuracy: 0.1850, F1: 0.1259


  _warn_prf(average, modifier, msg_start, len(result))
