## Imports and Configuration

In [7]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl
from torch.utils.data import Dataset, DataLoader
import numpy as np
from typing import Optional, Tuple
from pathlib import Path
import h5py
import math



# # Set up reproducibility and basic configuration
# pl.seed_everything(42)
# EXPERIMENT_NAME = f"sequential_pathfinder_{datetime.now().strftime('%Y%m%d_%H%M')}"

# # Model and training hyperparameters
# HIDDEN_SIZE = 256
# NUM_LAYERS = 4
# NUM_HEADS = 8
# DROPOUT = 0.1
# BATCH_SIZE = 64
# LEARNING_RATE = 1e-4
# MAX_EPOCHS = 35
# WINDOW_SIZE = 5  # Size of local view window (5x5)
# MAX_STEPS = 128  # Maximum sequence length for path following

# Check GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

Using device: cuda
GPU: Tesla P100-PCIE-16GB


##  Sequential Dataset Implementation

In [8]:
class PathfinderH5Dataset(Dataset):
    def __init__(self, h5_path):
        self.h5_path = h5_path
        with h5py.File(h5_path, 'r') as f:
            self.length = len(f['images'])
    
    def __len__(self):
        return self.length
    
    def __getitem__(self, idx):
        with h5py.File(self.h5_path, 'r') as f:
            image = torch.from_numpy(f['images'][idx]).float()
            label = torch.tensor(f['labels'][idx]).long()
        
        image = (image > 127.5).float()
        directions = self.compute_direction_embeddings(image)
        pos_emb = self.get_positional_encoding(1024, 256)
        
        return {
            'image': image.view(-1),
            'directions': directions,
            'pos_emb': pos_emb,
            'label': label
        }
    
    def compute_direction_embeddings(self, image):
        dirs = torch.zeros((32, 32, 8))
        padded = F.pad(image, (1, 1, 1, 1))
        directions = [(-1, -1), (-1, 0), (-1, 1), (0, -1), 
                     (0, 1), (1, -1), (1, 0), (1, 1)]
        
        for i in range(32):
            for j in range(32):
                if image[i, j] > 0:
                    for d, (di, dj) in enumerate(directions):
                        dirs[i, j, d] = padded[i+di+1, j+dj+1]
        return dirs.view(-1, 8)
    
    def get_positional_encoding(self, seq_len, d_model):
        position = torch.arange(seq_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(seq_len, d_model)
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        return pe[:, :128]  # Match d_model dimension

##  Attention Modules


In [9]:
class LocalGlobalAttention(nn.Module):
    def __init__(self, dim: int, window_size: int = 7, num_heads: int = 8):
        super().__init__()
        self.dim = dim
        self.window_size = window_size
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        
        self.qkv = nn.Linear(dim, dim * 3)
        self.proj = nn.Linear(dim, dim)
        
    def forward(self, x: torch.Tensor, mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]
        
        # Compute attention scores
        scale = self.head_dim ** -0.5
        attn = (q @ k.transpose(-2, -1)) * scale
        
        if mask is not None:
            attn = attn.masked_fill(mask == 0, float('-inf'))
        
        attn = F.softmax(attn, dim=-1)
        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        return x

##  Transformer

In [10]:
# Main Model
class PathfinderTransformer(pl.LightningModule):
    def __init__(
        self,
        d_model: int = 128,
        nhead: int = 4,
        num_layers: int = 4,
    ):
        super().__init__()
        
        self.pixel_embed = nn.Linear(1, d_model)
        self.dir_embed = nn.Linear(8, d_model)
        self.pos_embed = nn.Linear(d_model, d_model)
        
        self.layers = nn.ModuleList([
            LocalGlobalAttention(d_model, window_size=7, num_heads=nhead)
            for _ in range(num_layers)
        ])
        
        self.norm = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(d_model, 2)
        
    def forward(self, batch):
        x = batch['image'].unsqueeze(-1)
        dirs = batch['directions']
        pos = batch['pos_emb']
        
        # Embeddings
        x = self.pixel_embed(x)
        dirs = self.dir_embed(dirs)
        pos = self.pos_embed(pos)
        
        # Combine embeddings
        x = x + dirs + pos
        x = self.dropout(x)
        
        # Process through transformer layers
        for layer in self.layers:
            x = layer(x)
        
        x = self.norm(x)
        x = x.mean(dim=1)  # Global pooling
        x = self.classifier(x)
        
        return x
    
    def training_step(self, batch, batch_idx):
        y_hat = self(batch)
        loss = F.cross_entropy(y_hat, batch['label'])
        self.log('train_loss', loss)
        return loss
    
    def validation_step(self, batch, batch_idx):
        y_hat = self(batch)
        loss = F.cross_entropy(y_hat, batch['label'])
        acc = (y_hat.argmax(dim=1) == batch['label']).float().mean()
        self.log('val_loss', loss)
        self.log('val_acc', acc)
        
    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(
            self.parameters(),
            lr=1e-4,
            weight_decay=0.01
        )
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer,
            T_max=200,
            eta_min=1e-6
        )
        return {
            "optimizer": optimizer,
            "lr_scheduler": {
                "scheduler": scheduler,
                "monitor": "val_loss"
            }
        }

## Training Setup

In [11]:
def train_pathfinder_model():
    """
    Trains the Sequential Longformer model on the pathfinder task,
    implementing curriculum learning across different difficulty levels.
    """
    # Initialize data module with all difficulty levels
    data_module = PathfinderDataModule('/kaggle/input/dataset-longformer')
    data_module.setup()
    
    model = SequentialLongformerModel()
    dummy_input = torch.randn(BATCH_SIZE, MAX_STEPS, model.input_size)
    dummy_mask = torch.ones(BATCH_SIZE, MAX_STEPS)
    output = model(dummy_input, dummy_mask)
    print(output.shape)  # Should match (BATCH_SIZE, 2)

    
    # Create callbacks for monitoring and saving
    callbacks = [
        pl.callbacks.ModelCheckpoint(
            dirpath='checkpoints',
            filename='pathfinder-{epoch:02d}-{val_acc:.2f}',
            monitor='val_acc',
            mode='max',
            save_top_k=3,
            verbose=True
        ),
        pl.callbacks.EarlyStopping(
            monitor='val_acc',
            mode='max',
            patience=7,    # More patience for complex learning
            min_delta=0.01,
            verbose=True
        ),
        pl.callbacks.LearningRateMonitor(
            logging_interval='step'
        )
    ]
    
    # Set up logger for detailed training monitoring
    logger = pl.loggers.TensorBoardLogger(
        save_dir='logs',
        name=EXPERIMENT_NAME,
        version=datetime.now().strftime('%Y%m%d_%H%M')
    )
    
    # Initialize trainer with our configurations
    trainer = pl.Trainer(
        max_epochs=5,
        accelerator='gpu',
        devices=1,
        precision='16-mixed',
        callbacks=callbacks,
        logger=logger,
        gradient_clip_val=0.5,
        accumulate_grad_batches=2,  # Effective batch size doubling
        log_every_n_steps=10
    )
    
    # Train progressively on each difficulty level
    difficulties = ['easy', 'medium', 'hard']
    results = {}
    
    for difficulty in difficulties:
        print(f"\nTraining on {difficulty.upper()} dataset:")
        print("=" * 50)
        
        # Train model
        trainer.fit(
            model,
            train_dataloaders=data_module.train_dataloader(difficulty),
            val_dataloaders=data_module.val_dataloader(difficulty)
        )
        
        # Test performance
        test_results = trainer.test(
            model,
            dataloaders=data_module.test_dataloader(difficulty)
        )
        
        results[difficulty] = test_results[0]
        
        print(f"\nResults for {difficulty}:")
        print(f"Test Accuracy: {results[difficulty]['test_acc']*100:.2f}%")
        print(f"Test Loss: {results[difficulty]['test_loss']:.4f}")
        
        # Save model state for this difficulty level
        torch.save(
            model.state_dict(),
            f'model_state_{difficulty}.pt'
        )
    
    return model, results

## Progressive Training function 

In [12]:
def train_progressive(max_epochs: int = 5):
    from torch.utils.tensorboard import SummaryWriter
    writer = SummaryWriter('runs/pathfinder')
    histories = {'medium': [], 'hard': []}
    
    difficulties = ['medium', 'hard']
    model = None
    
    for difficulty in difficulties:
        print(f"\nTraining on {difficulty} dataset...")
        file_path = f'/kaggle/input/dataset-longformer/merged_data_{difficulty}.h5'
        
        with h5py.File(file_path, 'r') as f:
            total_samples = len(f['images'])
            print(f"\nDataset size: {total_samples}")
            print(f"Connected paths: {(f['labels'][:] == 1).sum()}")
            print(f"Disconnected paths: {(f['labels'][:] == 0).sum()}")
            total_samples = len(f['images'])
            indices = torch.randperm(total_samples)
            train_split = int(0.8 * total_samples)
            val_split = int(0.9 * total_samples)
            
            train_dataset = PathfinderH5Dataset(file_path)
            val_dataset = PathfinderH5Dataset(file_path)
            test_dataset = PathfinderH5Dataset(file_path)
            
            train_dataset.indices = indices[:train_split]
            val_dataset.indices = indices[train_split:val_split]
            test_dataset.indices = indices[val_split:]
        
        train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=0)
        val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False, num_workers=0)
        test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, num_workers=0)
        
        if model is None:
            model = PathfinderTransformer(d_model=128, nhead=4, num_layers=4)
        
        trainer = pl.Trainer(
            max_epochs=max_epochs,
            accelerator='gpu',
            devices=1,
            precision=16,
            limit_train_batches=0.3,
            limit_val_batches=0.2,
            callbacks=[
               pl.callbacks.EarlyStopping(monitor='val_loss', patience=5),
               pl.callbacks.RichProgressBar(),
               pl.callbacks.LearningRateMonitor()
           ],
            enable_progress_bar=True,
            log_every_n_steps=1,
            logger=pl.loggers.TensorBoardLogger('runs', name=f'pathfinder_{difficulty}')

        )
        class MetricsCallback(pl.Callback):
           def on_train_epoch_end(self, trainer, pl_module):
               metrics = trainer.callback_metrics
               histories[difficulty].append({
                   'epoch': trainer.current_epoch,
                   'train_loss': metrics['train_loss'].item(),
                   'train_acc': metrics['train_acc'].item(),
                   'val_loss': metrics['val_loss'].item(),
                   'val_acc': metrics['val_acc'].item()
               })
               print(f"\nEpoch {trainer.current_epoch}")
               print(f"Train Loss: {metrics['train_loss']:.4f}, Acc: {metrics['train_acc']:.4f}")
               print(f"Val Loss: {metrics['val_loss']:.4f}, Acc: {metrics['val_acc']:.4f}")
               
               writer.add_scalar(f'{difficulty}/train_loss', metrics['train_loss'], trainer.current_epoch)
               writer.add_scalar(f'{difficulty}/train_acc', metrics['train_acc'], trainer.current_epoch)
               writer.add_scalar(f'{difficulty}/val_loss', metrics['val_loss'], trainer.current_epoch)
               writer.add_scalar(f'{difficulty}/val_acc', metrics['val_acc'], trainer.current_epoch)
       
        trainer.callbacks.append(MetricsCallback())
        trainer.fit(model, train_loader, val_loader)
        test_result = trainer.test(model, test_loader)
        print(f"\nTest results for {difficulty}: {test_result}")
        
    plot_histories(histories)
    writer.close()
    return model

def plot_histories(histories):
   fig, axes = plt.subplots(2, 2, figsize=(15, 10))
   
   for difficulty in histories:
       data = pd.DataFrame(histories[difficulty])
       data.plot(x='epoch', y='train_loss', ax=axes[0,0], label=f'{difficulty}_train')
       data.plot(x='epoch', y='val_loss', ax=axes[0,1], label=f'{difficulty}_val')
       data.plot(x='epoch', y='train_acc', ax=axes[1,0], label=f'{difficulty}_train')
       data.plot(x='epoch', y='val_acc', ax=axes[1,1], label=f'{difficulty}_val')
   
   axes[0,0].set_title('Training Loss')
   axes[0,1].set_title('Validation Loss')
   axes[1,0].set_title('Training Accuracy')
   axes[1,1].set_title('Validation Accuracy')
   
   plt.tight_layout()
   plt.savefig('training_history.png')
   plt.show()

In [None]:
model = train_progressive()


Training on medium dataset...

Dataset size: 200000
Connected paths: 100222
Disconnected paths: 99778


/opt/conda/lib/python3.10/site-packages/lightning_fabric/connector.py:571: `precision=16` is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!


Output()

In [None]:
# In separate cell after training starts:
%load_ext tensorboard
%tensorboard --logdir runs/