In [1]:
import numpy as np
import random
import torch
from torch.utils.data import DataLoader, Dataset, Subset

import os

# Set the CUBLAS_WORKSPACE_CONFIG environment variable
# ':4096:8' or ':16:8' can be used. Here, we use ':4096:8' as an example.
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"

SEED = 75485

random.seed(SEED)
np.random.seed(SEED)
torch.use_deterministic_algorithms(True)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
def seed_worker(worker_id):
    np.random.seed(torch.initial_seed() % 2**32)    
    random.seed(torch.initial_seed() % 2**32)

In [2]:
import os
import subprocess
import importlib  # Import importlib

required_packages = ['torch', 'torchvision', 'segmentation-models-pytorch', 'efficientnet_pytorch']

def install_packages(packages):
    for package in packages:
        try:
            # Check if the package is already installed (Python 3.7+)
            importlib.import_module(package)
            print(f"{package} already installed.")
        except ImportError:
            print(f"Installing {package}...")
            subprocess.check_call(['pip', 'install', package])
            print(f"{package} installed successfully.")

install_packages(required_packages)

torch already installed.
torchvision already installed.
Installing segmentation-models-pytorch...
Collecting segmentation-models-pytorch
  Downloading segmentation_models_pytorch-0.3.4-py3-none-any.whl.metadata (30 kB)
Collecting efficientnet-pytorch==0.7.1 (from segmentation-models-pytorch)
  Downloading efficientnet_pytorch-0.7.1.tar.gz (21 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting pretrainedmodels==0.7.4 (from segmentation-models-pytorch)
  Downloading pretrainedmodels-0.7.4.tar.gz (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.8/58.8 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting timm==0.9.7 (from segmentation-models-pytorch)
  Downloading timm-0.9.7-py3-none-any.whl.metadata (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.8/58.8 kB[0m 

In [3]:
train_csv = '/kaggle/input/2024-flame-ai-challenge/dataset/train.csv'
train_data_dir = '/kaggle/input/2024-flame-ai-challenge/dataset/train'
test_csv = '/kaggle/input/2024-flame-ai-challenge/dataset/test.csv'
test_data_dir = '/kaggle/input/2024-flame-ai-challenge/dataset/test'

SEQ_LEN = 5
seq_len = SEQ_LEN
batch_size = 4

# Configuration Parameters
use_early_stopping = False  # Set to False to disable early stopping
patience = 5      # Number of epochs to wait for improvement
delta = 0     # Minimum change to qualify as improvement

# Define Model Names
model_names = [
#    'unet',
#    'deeplabtemporal',
#   'resnet50',
#    'convlstm',
#   'residualcnn',
    'residualcnnsa',
#    'residualdensecnn',
#    'dconvlstmsac',
#    'selfattconvlstm',
#    'bidirectconvlstmunet',
#    #'pinn-convlstm',
#     '3dconvlstm',
#    'convlstm_autoencoder',
#    'stconvlstm_autoencoder',
#    'spatiotemporal_transformer',
#    'attentionunet',
#    'tcnmodel',
#    'hybridcnntransformer',
#    'multiscaleconvlstm',
#    'multiscaleselfattconvlstm',
#    'multiscaleresidualcnn',
#    'residualconvlstm',
#    'residualunet',
#    'biconvlstm'
#    'spatiotemporalcnnmdn',
]

# Define default number of epochs
num_epochs_default = 1

# Update model_epochs to include all models or rely on the default
model_epochs = {
#     'unet': 100,
#     'deeplabtemporal': 5,
#      'resnet50': 50,
#      'convlstm': 50,
#      'residualcnn': 50,
     'residualcnnsa': 50,
#      'dconvlstmsac': 30,
#      'selfattconvlstm': 40,
#      'bidirectconvlstmunet': 50,
#      'pinn-convlstm': 30,
#      '3dconvlstm': 30,
#     'convlstm_autoencoder': 30,
#     'stconvlstm_autoencoder': 30,
#     'spatiotemporal_transformer': 30  
#     'attentionunet': 50,
#     'tcnmodel': 50,
#     'hybridcnntransformer': 50,
#     'residualdensecnn': 50,
#     'multiscaleconvlstm': 40,
#     'multiscaleselfattconvlstm': 50,
#     'multiscaleresidualcnn': 50,
#     'residualconvlstm': 50,
#     'residualunet': 50,
#     'biconvlstm': 50,
#     'spatiotemporalcnnmdn': 50,
}

In [4]:
# 1. Import Necessary Libraries
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision.models import (
    resnet18, resnet34, resnet50, resnet101, resnet152,
    ResNet18_Weights, ResNet34_Weights, ResNet50_Weights, ResNet101_Weights, ResNet152_Weights,
    densenet121, DenseNet121_Weights,
    mobilenet_v2, MobileNet_V2_Weights
)
from efficientnet_pytorch import EfficientNet  # Ensure this is installed
import pandas as pd
import numpy as np
import os
import torchvision.ops as ops  # For DeformConv2d
import segmentation_models_pytorch as smp

# Try importing MobileNetV3 from timm
try:
    import timm
    has_timm = True
except ImportError:
    has_timm = False
    print("timm library not found. MobileNetV3 will not be available. Please install it using `pip install timm`.")

In [5]:
import torch

class EarlyStopping:
    """
    Early stops the training if validation loss doesn't improve after a given patience.
    """
    def __init__(self, patience=5, verbose=False, delta=0, path='checkpoint.pth'):
        """
        Args:
            patience (int): How many epochs to wait after last time validation loss improved.
                            Default: 5
            verbose (bool): If True, prints a message for each validation loss improvement.
                            Default: False
            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                           Default: 0
            path (str): Path for the checkpoint to be saved to.
                        Default: 'checkpoint.pth'
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = np.Inf
        self.delta = delta
        self.path = path
        
    def __call__(self, val_loss, model):
        score = -val_loss  # We want to maximize the score (minimize loss)
        
        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            self.counter += 1
            if self.verbose:
                print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0
            
    def save_checkpoint(self, val_loss, model):
        '''Saves model when validation loss decreases.'''
        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss


In [6]:
class FireDataset(Dataset):
    def __init__(self, csv_data, data_dir, seq_len=5, is_train=True):
        """
        Args:
            csv_data (str or DataFrame): Path to the csv file with annotations, or the DataFrame itself.
            data_dir (str): Directory with all the .dat files.
            seq_len (int): Number of past timesteps to use as input.
            is_train (bool): Flag indicating training or testing mode.
        """
        if isinstance(csv_data, str):
            self.data_info = pd.read_csv(csv_data)
        else:
            self.data_info = csv_data  # Accept DataFrame directly
        
        self.data_dir = data_dir
        self.seq_len = seq_len
        self.is_train = is_train
        self.samples = self._create_samples()

    def _create_samples(self):
        samples = []
        for _, row in self.data_info.iterrows():
            id = row['id']
            u = row['u']
            alpha = row['alpha']
            Nt = row['Nt']
            # Load filenames
            theta_path = os.path.join(self.data_dir, row['theta_filename'])
            ustar_path = os.path.join(self.data_dir, row['ustar_filename'])
            xi_path = os.path.join(self.data_dir, row['xi_filename'])

            # Check if files exist
            if not os.path.exists(theta_path) or not os.path.exists(ustar_path) or not os.path.exists(xi_path):
                print(f"Missing files for ID {id}. Skipping.")
                continue

            # Load .dat files (assuming binary format)
            try:
                theta = np.fromfile(theta_path, dtype=np.float32).reshape(Nt, 113, 32)
                ustar = np.fromfile(ustar_path, dtype=np.float32).reshape(Nt, 113, 32)
                xi = np.fromfile(xi_path, dtype=np.float32).reshape(Nt, 113, 32)
            except ValueError as e:
                print(f"Error reshaping files for ID {id}: {e}")
                continue

            if self.is_train:
                # Ensure there are enough timesteps to create at least one sample
                if Nt < self.seq_len + 1:
                    print(f"Not enough timesteps for ID {id}. Required: {self.seq_len + 1}, Available: {Nt}")
                    continue

                # Create multiple samples using sliding window
                for t in range(Nt - self.seq_len):
                    # Input sequences: theta, ustar, xi for seq_len timesteps
                    theta_seq = theta[t:t+self.seq_len]  # Shape: [seq_len, 113, 32]
                    ustar_seq = ustar[t:t+self.seq_len]
                    xi_seq = xi[t:t+self.seq_len]

                    # Stack features per time step: [seq_len, 5, 113, 32]
                    # Each time step has channels: [ustar, theta, xi, u, alpha]
                    # u and alpha are scalar features, tiled to [1, 113, 32] each
                    features = []
                    for i in range(self.seq_len):
                        theta_i = theta_seq[i]  # [113,32]
                        ustar_i = ustar_seq[i]
                        xi_i = xi_seq[i]
                        u_i = u
                        alpha_i = alpha

                        # Convert scalar features to tensors and tile
                        u_tensor = torch.tensor(u_i, dtype=torch.float32).unsqueeze(0).unsqueeze(1).repeat(1, 113, 32)  # [1,113,32]
                        alpha_tensor = torch.tensor(alpha_i, dtype=torch.float32).unsqueeze(0).unsqueeze(1).repeat(1, 113, 32)  # [1,113,32]

                        # Stack all features
                        feature = torch.stack([
                            torch.tensor(ustar_i, dtype=torch.float32),  # [113,32]
                            torch.tensor(theta_i, dtype=torch.float32),  # [113,32]
                            torch.tensor(xi_i, dtype=torch.float32),     # [113,32]
                            u_tensor.squeeze(0),                        # [113,32]
                            alpha_tensor.squeeze(0)                     # [113,32]
                        ], dim=0)  # [5,113,32]

                        features.append(feature)
                    
                    feature_sequence = torch.stack(features, dim=0)  # [seq_len, 5, 113,32]

                    # Target is the next xi timestep
                    target = torch.tensor(xi[t + self.seq_len], dtype=torch.float32).unsqueeze(0)  # [1,113,32]

                    samples.append({
                        'id': id,
                        'input': feature_sequence,  # [3,5,113,32]
                        'target': target  # [1,113,32]
                    })
            else:
                # For test set, create a single sample with the initial seq_len timesteps
                if Nt < self.seq_len:
                    print(f"Not enough timesteps for ID {id} in test set. Required: {self.seq_len}, Available: {Nt}")
                    continue

                theta_seq = theta[:self.seq_len]
                ustar_seq = ustar[:self.seq_len]
                xi_seq = xi[:self.seq_len]

                # Stack features per time step: [seq_len, 5, 113, 32]
                features = []
                for i in range(self.seq_len):
                    theta_i = theta_seq[i]  # [113,32]
                    ustar_i = ustar_seq[i]
                    xi_i = xi_seq[i]
                    u_i = u
                    alpha_i = alpha

                    # Convert scalar features to tensors and tile
                    u_tensor = torch.tensor(u_i, dtype=torch.float32).unsqueeze(0).unsqueeze(1).repeat(1, 113, 32)  # [1,113,32]
                    alpha_tensor = torch.tensor(alpha_i, dtype=torch.float32).unsqueeze(0).unsqueeze(1).repeat(1, 113, 32)  # [1,113,32]

                    # Stack all features
                    feature = torch.stack([
                        torch.tensor(ustar_i, dtype=torch.float32),  # [113,32]
                        torch.tensor(theta_i, dtype=torch.float32),  # [113,32]
                        torch.tensor(xi_i, dtype=torch.float32),     # [113,32]
                        u_tensor.squeeze(0),                        # [113,32]
                        alpha_tensor.squeeze(0)                     # [113,32]
                    ], dim=0)  # [5,113,32]

                    features.append(feature)
                
                feature_sequence = torch.stack(features, dim=0)  # [seq_len, 5, 113,32]

                samples.append({
                    'id': id,
                    'input': feature_sequence,  # [3,5,113,32]
                    'target': None  # No target for test set
                })

        return samples

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        sample = self.samples[idx]
        if self.is_train:
            return sample['input'], sample['target']  # [3,5,113,32], [1,113,32]
        else:
            return sample['input'], None  # [3,5,113,32], None

In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class ResidualBlock(nn.Module):
    """
    Simplified Residual Block with a single convolution layer and skip connection.
    """
    def __init__(self, channels, kernel_size=3, padding=1, activation=nn.ReLU(inplace=True)):
        super(ResidualBlock, self).__init__()
        self.activation = activation
        self.conv = nn.Conv2d(channels, channels, kernel_size=kernel_size, padding=padding, stride=1, bias=False)
        self.bn = nn.BatchNorm2d(channels)

    def forward(self, x):
        identity = x  # Skip connection
        out = self.conv(x)
        out = self.bn(out)
        out += identity
        out = self.activation(out)
        return out

class ResidualCNN(nn.Module):
    """
    Simplified Residual CNN with fewer residual blocks and reduced width.
    """
    def __init__(self, in_channels=15, num_residual_blocks=3, out_channels=1, kernel_size=3, padding=1, activation=nn.ReLU(inplace=True)):
        super(ResidualCNN, self).__init__()
        self.activation = activation
        self.initial_conv = nn.Conv2d(in_channels, in_channels, kernel_size=kernel_size, stride=1, padding=padding, bias=False)
        self.bn = nn.BatchNorm2d(in_channels)
        
        # Residual blocks reduced to 3
        self.residual_blocks = nn.Sequential(*[
            ResidualBlock(in_channels, kernel_size=kernel_size, padding=padding, activation=activation)
            for _ in range(num_residual_blocks)
        ])
        
        # Output convolution layer
        self.output_conv = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0, bias=True)

    def forward(self, x):
        # Reshape input to match expected size
        batch_size, seq_len, channels, height, width = x.size()
        x = x.view(batch_size, seq_len * channels, height, width)

        out = self.initial_conv(x)
        out = self.bn(out)
        out = self.activation(out)

        out = self.residual_blocks(out)
        out = self.output_conv(out)

        return out


In [8]:
class ResidualBlock_20(nn.Module):
    """
    Simplified Residual Block with a single convolution layer and skip connection.
    """
    def __init__(self, channels, kernel_size=3, padding=1, activation=nn.ReLU(inplace=True)):
        super(ResidualBlock_20, self).__init__()
        self.activation = activation
        self.conv = nn.Conv2d(channels, channels, kernel_size=kernel_size, padding=padding, stride=1, bias=False)
        self.bn = nn.BatchNorm2d(channels)

    def forward(self, x):
        identity = x  # Skip connection
        out = self.conv(x)
        out = self.bn(out)
        out += identity  # Add skip connection
        out = self.activation(out)
        return out

class ResidualCNN_20(nn.Module):
    """
    Residual CNN tailored for multi-step forecasting.
    """
    def __init__(self, in_channels=25, num_residual_blocks=4, out_channels=1, num_steps=20, kernel_size=3, padding=1, activation=nn.ReLU(inplace=True)):
        super(ResidualCNN_20, self).__init__()
        self.num_steps = num_steps
        self.activation = activation
        self.initial_conv = nn.Conv2d(in_channels, in_channels, kernel_size=kernel_size, stride=1, padding=padding, bias=False)
        self.bn = nn.BatchNorm2d(in_channels)

        # Residual blocks
        self.residual_blocks = nn.Sequential(*[
            ResidualBlock_20(in_channels, kernel_size=kernel_size, padding=padding, activation=activation)
            for _ in range(num_residual_blocks)
        ])

        # Output convolution layer modified to output num_steps
        self.output_conv = nn.Conv2d(in_channels, out_channels * num_steps, kernel_size=1, stride=1, padding=0, bias=True)

    def forward(self, x):
        """
        Forward pass for ResidualCNN.

        Args:
            x (Tensor): Input tensor of shape [batch_size, seq_len, 5, 113, 32].

        Returns:
            Tensor: Output tensor of shape [batch_size, num_steps, out_channels, 113, 32].
        """
        batch_size, seq_len, channels, height, width = x.size()
        x = x.view(batch_size, seq_len * channels, height, width)  # [batch_size, 25, 113, 32]

        out = self.initial_conv(x)  # [batch_size, 25, 113, 32]
        out = self.bn(out)
        out = self.activation(out)

        out = self.residual_blocks(out)  # [batch_size, 25, 113, 32]
        out = self.output_conv(out)      # [batch_size, 20, 113, 32] assuming num_steps=20

        # Reshape to [batch_size, num_steps, out_channels, 113, 32]
        out = out.view(batch_size, self.num_steps, -1, height, width)

        return out

In [9]:
class SelfAttention(nn.Module):
    def __init__(self, in_dim):
        super(SelfAttention, self).__init__()
        self.query_conv = nn.Conv2d(in_dim, in_dim // 8, kernel_size=1)
        self.key_conv   = nn.Conv2d(in_dim, in_dim // 8, kernel_size=1)
        self.value_conv = nn.Conv2d(in_dim, in_dim, kernel_size=1)
        self.gamma      = nn.Parameter(torch.zeros(1))
        self.softmax    = nn.Softmax(dim=-1)
    
    def forward(self, x):
        """
            inputs :
                x : input feature maps (B X C X H X W)
            returns :
                out : self attention value + input feature 
                attention: B X (H*W) X (H*W)
        """
        m_batchsize, C, width, height = x.size()
        proj_query  = self.query_conv(x).view(m_batchsize, -1, width*height).permute(0, 2, 1)  # B X N X C'
        proj_key    = self.key_conv(x).view(m_batchsize, -1, width*height)  # B X C' X N
        energy      = torch.bmm(proj_query, proj_key)  # batch matrix-matrix product: B X N X N
        attention   = self.softmax(energy)  # B X N X N
        proj_value  = self.value_conv(x).view(m_batchsize, -1, width*height)  # B X C X N
        
        out = torch.bmm(proj_value, attention.permute(0, 2, 1))  # B X C X N
        out = out.view(m_batchsize, C, width, height)
        
        out = self.gamma * out + x
        return out

# 5. Define the Self-Attention ConvLSTM Classes
class SelfAttentionConvLSTMCell(nn.Module):
    def __init__(self, input_dim, hidden_dim, kernel_size, bias=True):
        super(SelfAttentionConvLSTMCell, self).__init__()
        padding = kernel_size // 2
        # Standard ConvLSTMCell
        self.conv = nn.Conv2d(input_dim + hidden_dim, 4 * hidden_dim, kernel_size, padding=padding, bias=bias)
        # Self-Attention
        self.attention = SelfAttention(hidden_dim)
        self.hidden_dim = hidden_dim

    def forward(self, x, h_prev, c_prev):
        combined = torch.cat([x, h_prev], dim=1)  # [batch, input_dim + hidden_dim, H, W]
        conv_output = self.conv(combined)
        cc_i, cc_f, cc_o, cc_g = torch.split(conv_output, self.hidden_dim, dim=1)
        
        i = torch.sigmoid(cc_i)
        f = torch.sigmoid(cc_f)
        o = torch.sigmoid(cc_o)
        g = torch.tanh(cc_g)
        
        c_next = f * c_prev + i * g
        h_next = o * torch.tanh(c_next)
        
        # Apply Self-Attention
        h_next = self.attention(h_next)
        
        return h_next, c_next

class SelfAttentionConvLSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, kernel_size, num_layers):
        super(SelfAttentionConvLSTM, self).__init__()
        self.num_layers = num_layers
        self.hidden_dim = hidden_dim
        
        self.cell_list = nn.ModuleList([
            SelfAttentionConvLSTMCell(input_dim if i == 0 else hidden_dim, hidden_dim, kernel_size)
            for i in range(num_layers)
        ])
    
    def forward(self, x):
        # x shape: [batch_size, seq_len, channels, height, width]
        b, seq_len, _, h, w = x.size()
        h_t = [torch.zeros(b, self.hidden_dim, h, w, device=x.device) for _ in range(self.num_layers)]
        c_t = [torch.zeros(b, self.hidden_dim, h, w, device=x.device) for _ in range(self.num_layers)]
        
        outputs = []
        for t in range(seq_len):
            x_t = x[:, t, :, :, :]  # [batch, channels, H, W]
            h_t[0], c_t[0] = self.cell_list[0](x_t, h_t[0], c_t[0])
            for layer in range(1, self.num_layers):
                h_t[layer], c_t[layer] = self.cell_list[layer](h_t[layer - 1], h_t[layer], c_t[layer])
            outputs.append(h_t[-1])  # [batch, hidden_dim, H, W]
        
        outputs = torch.stack(outputs, dim=1)  # [batch, seq_len, hidden_dim, H, W]
        return outputs, (h_t, c_t)

class SelfAttentionConvLSTM_Model(nn.Module):
    def __init__(self, input_dim, hidden_dim, kernel_size, num_layers):
        super(SelfAttentionConvLSTM_Model, self).__init__()
        self.selfatt_convlstm = SelfAttentionConvLSTM(input_dim, hidden_dim, kernel_size, num_layers)
        self.conv_out = nn.Conv2d(hidden_dim, 1, kernel_size=1)
    
    def forward(self, x):
        # x shape: [batch_size, seq_len, channels, height, width]
        outputs, _ = self.selfatt_convlstm(x)
        # Get the output from the last time step
        last_output = outputs[:, -1, :, :, :]  # [batch, hidden_dim, H, W]
        out = self.conv_out(last_output)  # [batch, 1, H, W]
        return out

In [10]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class ResidualBlock1(nn.Module):
    """
    Simplified Residual Block with a single convolution layer and skip connection.
    """
    def __init__(self, channels, kernel_size=3, padding=1, activation=nn.ReLU(inplace=True)):
        super(ResidualBlock1, self).__init__()
        self.activation = activation
        self.conv = nn.Conv2d(channels, channels, kernel_size=kernel_size, padding=padding, stride=1, bias=False)
        self.bn = nn.BatchNorm2d(channels)

    def forward(self, x):
        identity = x  # Skip connection
        out = self.conv(x)
        out = self.bn(out)
        out += identity
        out = self.activation(out)
        return out

class SelfAttention1(nn.Module):
    """
    Simple Self-Attention Module.
    """
    def __init__(self, in_dim):
        super(SelfAttention1, self).__init__()
        self.chanel_in = in_dim

        # Query, Key, Value transformations
        self.query_conv = nn.Conv2d(in_channels=in_dim, out_channels=in_dim // 8, kernel_size=1)
        self.key_conv = nn.Conv2d(in_channels=in_dim, out_channels=in_dim // 8, kernel_size=1)
        self.value_conv = nn.Conv2d(in_channels=in_dim, out_channels=in_dim, kernel_size=1)

        # Softmax for attention weights
        self.softmax = nn.Softmax(dim=-1)

        # Learnable scaling parameter
        self.gamma = nn.Parameter(torch.zeros(1))

    def forward(self, x):
        """
        Forward pass for self-attention.

        Args:
            x: Input feature maps (B x C x H x W)
        
        Returns:
            out: Self-attended feature maps
            attention: Attention map
        """
        m_batchsize, C, width, height = x.size()

        # Generate Query, Key, and Value matrices
        proj_query = self.query_conv(x).view(m_batchsize, -1, width * height).permute(0, 2, 1)  # B x (W*H) x C'
        proj_key = self.key_conv(x).view(m_batchsize, -1, width * height)  # B x C' x (W*H)
        energy = torch.bmm(proj_query, proj_key)  # Batch matrix multiplication: B x (W*H) x (W*H)
        attention = self.softmax(energy)  # Apply softmax to get attention weights

        proj_value = self.value_conv(x).view(m_batchsize, -1, width * height)  # B x C x (W*H)

        out = torch.bmm(proj_value, attention.permute(0, 2, 1))  # B x C x (W*H)
        out = out.view(m_batchsize, C, width, height)  # Reshape to original dimensions

        out = self.gamma * out + x  # Weighted sum with input (residual connection)
        return out, attention

class ResidualCNNWithSelfAttention(nn.Module):
    """
    Residual CNN with Self-Attention mechanism.
    """
    def __init__(self, in_channels=25, num_residual_blocks=3, out_channels=1, kernel_size=3, padding=1, activation=nn.ReLU(inplace=True)):
        super(ResidualCNNWithSelfAttention, self).__init__()
        self.activation = activation
        self.initial_conv = nn.Conv2d(in_channels, in_channels, kernel_size=kernel_size, stride=1, padding=padding, bias=False)
        self.bn = nn.BatchNorm2d(in_channels)
        
        # Residual blocks reduced to 3
        self.residual_blocks = nn.Sequential(*[
            ResidualBlock1(in_channels, kernel_size=kernel_size, padding=padding, activation=activation)
            for _ in range(num_residual_blocks)
        ])
        
        # Self-Attention module
        self.self_attention = SelfAttention1(in_dim=in_channels)
        
        # Output convolution layer
        self.output_conv = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0, bias=True)

    def forward(self, x):
        # Reshape input to match expected size
        batch_size, seq_len, channels, height, width = x.size()
        x = x.view(batch_size, seq_len * channels, height, width)  # [batch, 25, 113, 32]

        out = self.initial_conv(x)
        out = self.bn(out)
        out = self.activation(out)

        out = self.residual_blocks(out)
        
        # Apply Self-Attention
        out, attention = self.self_attention(out)
        
        out = self.output_conv(out)

        return out


In [11]:
def train_model(model, encoder_name, train_loader, val_loader, criterion, optimizer, num_epochs, device, model_save_path, patience=5, delta=0, lambda_phy=1.0):
    """
    Trains the model with Early Stopping and Physics-Informed Loss.

    Args:
        model (nn.Module): The model to train.
        train_loader (DataLoader): DataLoader for training data.
        val_loader (DataLoader): DataLoader for validation data.
        criterion (nn.Module): Loss function for data loss.
        optimizer (torch.optim.Optimizer): Optimizer.
        num_epochs (int): Maximum number of epochs to train.
        device (torch.device): Device to train on.
        model_save_path (str): Path to save the best model.
        patience (int): Number of epochs with no improvement after which training will be stopped.
        delta (float): Minimum change in the monitored quantity to qualify as an improvement.
        lambda_phy (float): Weight for the physics-informed loss term.

    Returns:
        model (nn.Module): The trained model loaded with the best weights.
        best_val_loss (float): The best validation loss achieved.
    """
    # Initialize EarlyStopping object if enabled
    if use_early_stopping:
        early_stopping = EarlyStopping(patience=patience, verbose=True, delta=delta, path=model_save_path)
        print("Early Stopping is ENABLED.")
    else:
        early_stopping = None
        print("Early Stopping is DISABLED.")
        
    best_val_loss = float('inf')  # Initialize best validation loss
    
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        
        for inputs, targets in train_loader:
            inputs = inputs.to(device)  # [batch, seq_len, 5, 113,32]           
            targets = targets.to(device)  # [batch, 1, 113,32] or appropriate shape          
            optimizer.zero_grad()           
            outputs = model(inputs)      # [batch,1,113,32]   
            data_loss = criterion(outputs, targets)
            loss = data_loss    
            loss.backward()
            optimizer.step()
            running_loss += loss.item() * inputs.size(0)
        
        epoch_train_loss = running_loss / len(train_loader.dataset)
        print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {epoch_train_loss:.4f}')
        
        # Validation phase
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for inputs, targets in val_loader:
                inputs = inputs.to(device)
                targets = targets.to(device)
                outputs = model(inputs)      # [batch,1,113,32]   
                loss = criterion(outputs, targets)
                val_loss += loss.item() * inputs.size(0)
        
        epoch_val_loss = val_loss / len(val_loader.dataset)
        print(f'Epoch {epoch+1}/{num_epochs}, Val Loss: {epoch_val_loss:.4f}')

        if use_early_stopping:
            early_stopping(epoch_val_loss, model)
            
            if early_stopping.early_stop:
                print("Early stopping triggered. Stopping training.")
                break
        else:
            if epoch_val_loss < best_val_loss:
                best_val_loss = epoch_val_loss
                torch.save(model.state_dict(), model_save_path)
                print(f"Model saved with val loss: {best_val_loss:.4f}")
    
    if use_early_stopping:
        model.load_state_dict(torch.load(model_save_path))
        print(f'Best Val Loss: {early_stopping.val_loss_min:.4f}')
    else:
        print(f'Final Val Loss: {best_val_loss:.4f}')
    
    return model, best_val_loss

In [12]:
# Parameters for Self-Attention ConvLSTM
input_dim_selfatt = 5
hidden_dim_selfatt = 64  # Number of filters in Self-Attention ConvLSTM
kernel_size_selfatt = 3
num_layers_selfatt = 5

channels_per_timestep = 5

# Create Training Dataset and Define Folds
df_train = pd.read_csv(train_csv)
simulations = df_train['id'].unique()
num_folds = len(simulations)  # 9 folds

# Shuffle simulations for randomness
shuffled_simulations = np.random.permutation(simulations)

# Assign each simulation to a fold (one simulation per fold)
folds = []
for sim in shuffled_simulations:
    folds.append([sim])  # Each fold has one simulation

criterion = nn.MSELoss()
saved_model_paths = []
fold_val_losses = {}
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [13]:
for fold_idx, val_simulations in enumerate(folds):
    print(f'\nFold {fold_idx+1}/{num_folds}')
    
    val_mask = df_train['id'].isin(val_simulations)
    train_mask = ~val_mask
    
    print(f"\nFold {fold_idx + 1} - Validation 'id's:")
    print(df_train.loc[val_mask, 'id'].unique())  # Print unique IDs in validation set

    print(f"\nFold {fold_idx + 1} - Train 'id's:")
    print(df_train.loc[train_mask, 'id'].unique())  # Print unique IDs in training set

    # Create training and validation subsets
    train_subset = FireDataset(df_train[train_mask], train_data_dir, seq_len=seq_len, is_train=True)
    val_subset = FireDataset(df_train[val_mask], train_data_dir, seq_len=seq_len, is_train=True)

    print(f'Training samples: {len(train_subset)}')
    print(f'Validation samples: {len(val_subset)}')

    # Create DataLoaders
    train_loader = DataLoader(train_subset, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True, worker_init_fn=seed_worker)
    val_loader = DataLoader(val_subset, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=True, worker_init_fn=seed_worker)

    # For each model
    for encoder_name in model_names:
        print(f'\nTraining model with encoder {encoder_name}')
        if encoder_name == 'residualcnn':
            model = ResidualCNN(in_channels=seq_len * 5, num_residual_blocks=4).to(device)  # 5 channels per time step
        elif encoder_name == 'selfattconvlstm':
            model = SelfAttentionConvLSTM_Model(input_dim=input_dim_selfatt, hidden_dim=hidden_dim_selfatt, kernel_size=kernel_size_selfatt, num_layers=num_layers_selfatt).to(device)
        elif encoder_name == 'residualcnnsa':
            model = ResidualCNNWithSelfAttention(
                in_channels=25,
                num_residual_blocks=4,
                out_channels=1,
                kernel_size=3,
                padding=1,
                activation=nn.ReLU(inplace=True)
            ).to(device)
            
        model_save_path = f'model_{encoder_name}_fold{fold_idx+1}.pth'
        
        optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
        
        # Retrieve num_epochs for the current model
        current_num_epochs = model_epochs.get(encoder_name, num_epochs_default)  # Fallback to default num_epochs if not specified
        print(f"Training {encoder_name} for {current_num_epochs}")

        # Train the model with Early Stopping
        model, best_val_loss = train_model(
            model=model,
            encoder_name=encoder_name,
            train_loader=train_loader,
            val_loader=val_loader,
            criterion=criterion,
            optimizer=optimizer,
            num_epochs=current_num_epochs,
            device=device,
            model_save_path=model_save_path,
            patience=patience,
            delta=delta,
            lambda_phy=1.0  # Weight for physics loss; adjust as needed
        )
        
        fold_val_losses[(fold_idx + 1, encoder_name)] = best_val_loss
        
        # Append the model save path
        saved_model_paths.append(model_save_path)

# Print average score for each fold and across all models
for fold_idx in range(num_folds):
    fold_losses = [loss for (fold, model), loss in fold_val_losses.items() if fold == fold_idx + 1]
    avg_fold_loss = np.mean(fold_losses) if fold_losses else 0.0  # Handle potential empty list
    print(f"Fold {fold_idx+1} Average Validation Loss: {avg_fold_loss:.4f}")

# Calculate and print overall average validation loss across all folds and models
overall_avg_loss = np.mean(list(fold_val_losses.values()))
print(f"Overall Average Validation Loss: {overall_avg_loss:.4f}")



Fold 1/9

Fold 1 - Validation 'id's:
[633229]

Fold 1 - Train 'id's:
[804025 875935 930086 661713 868570  16525 808631 220212]
Training samples: 1160
Validation samples: 145

Training model with encoder residualcnnsa
Training residualcnnsa for 50
Early Stopping is DISABLED.
Epoch 1/50, Train Loss: 0.0451
Epoch 1/50, Val Loss: 0.0233
Model saved with val loss: 0.0233
Epoch 2/50, Train Loss: 0.0230
Epoch 2/50, Val Loss: 0.0188
Model saved with val loss: 0.0188
Epoch 3/50, Train Loss: 0.0196
Epoch 3/50, Val Loss: 0.0164
Model saved with val loss: 0.0164
Epoch 4/50, Train Loss: 0.0148
Epoch 4/50, Val Loss: 0.0144
Model saved with val loss: 0.0144
Epoch 5/50, Train Loss: 0.0123
Epoch 5/50, Val Loss: 0.0124
Model saved with val loss: 0.0124
Epoch 6/50, Train Loss: 0.0111
Epoch 6/50, Val Loss: 0.0115
Model saved with val loss: 0.0115
Epoch 7/50, Train Loss: 0.0104
Epoch 7/50, Val Loss: 0.0112
Model saved with val loss: 0.0112
Epoch 8/50, Train Loss: 0.0100
Epoch 8/50, Val Loss: 0.0111
Model 

In [14]:
!pip freeze > requirements.txt

In [15]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import pandas as pd
import numpy as np
import os

# Function to parse the model_path and extract encoder_name, fold, and seq_len.
def parse_model_path(model_path):
    """
    Parses the model path to extract encoder_name and fold.
    Assumes the model_path format: 'model_<encoder_name>_fold<fold>.pth'
    Example: 'model_residualcnn_fold1.pth'
    """
    filename = os.path.basename(model_path)
    parts = filename.split('_')

    if len(parts) < 3:
        raise ValueError(f"Unexpected model filename format: {filename}")

    encoder_name = parts[1]

    # Extract fold
    fold_part = parts[2]  # e.g., 'fold1.pth'
    fold_str = fold_part.split('.')[0]  # 'fold1'
    fold = int(fold_str.replace('fold', ''))

    # Define seq_len based on encoder_name using a predefined mapping
    encoder_seq_len_mapping = {
        'residualcnn': SEQ_LEN,
        'residualcnnsa': SEQ_LEN,
        'convlstm': SEQ_LEN,
        'selfattconvlstm': SEQ_LEN,
        '3dconvlstm': SEQ_LEN,
        'dconvlstmsac': SEQ_LEN,
        'bidirectconvlstmunet': SEQ_LEN,
        'attentionunet': SEQ_LEN,
        'deeplabtemporal': SEQ_LEN,
        'unet': SEQ_LEN,
        'residualdensecnn': SEQ_LEN,
        'multiscaleconvlstm': SEQ_LEN,
        'biconvlstm': SEQ_LEN,
        'spatiotemporalcnnmdn': SEQ_LEN,
    }

    if encoder_name not in encoder_seq_len_mapping:
        raise ValueError(f"Unknown encoder_name '{encoder_name}', cannot determine seq_len.")

    seq_len = encoder_seq_len_mapping[encoder_name]

    return encoder_name, fold, seq_len


# Define the maximum sequence length required for your models
test_seq_len = max([parse_model_path(model_path)[2] for model_path in saved_model_paths])  # e.g., 5
test_dataset = FireDataset(test_csv, test_data_dir, seq_len=test_seq_len, is_train=False)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, num_workers=4, pin_memory=True, worker_init_fn=seed_worker)

# Extract IDs
ids = [sample['id'] for sample in test_dataset.samples]

predictions = []

models = []
model_seq_lens = []  # To store seq_len for each model

for model_path in saved_model_paths:
    try:
        encoder_name, fold, seq_len = parse_model_path(model_path)
    except ValueError as e:
        print(f"Skipping model {model_path}: {e}")
        continue

    print(f"Loading model: {model_path}, Encoder: {encoder_name}, Fold: {fold}, Seq_len: {seq_len}")

    # Initialize the model with the correct in_channels based on seq_len
    in_channels = seq_len * 5  # 5 channels per time step

    if encoder_name == 'residualcnn':
        model = ResidualCNN(in_channels=in_channels, num_residual_blocks=4).to(device)
    elif encoder_name == 'selfattconvlstm':
          model = SelfAttentionConvLSTM_Model(input_dim=input_dim_selfatt, hidden_dim=hidden_dim_selfatt, kernel_size=kernel_size_selfatt, num_layers=num_layers_selfatt).to(device)
    elif encoder_name == 'residualcnnsa':
            model = ResidualCNNWithSelfAttention(
                in_channels=25,
                num_residual_blocks=4,
                out_channels=1,
                kernel_size=3,
                padding=1,
                activation=nn.ReLU(inplace=True)
            ).to(device)
            
    # Load state_dict
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()
    models.append(model)
    model_seq_lens.append(seq_len)  # Store the seq_len for this model


# Prediction Loop
num_timesteps = 20  # Number of future timesteps to predict

for idx in range(len(test_dataset)):
    # Extract the sample
    input_seq, _ = test_dataset[idx]  # FireDataset returns (input_seq, None)
    input_seq = input_seq.unsqueeze(0).to(device)  # Shape: [1, seq_len_max, 5, 113,32]

    preds_per_id = []

    for t in range(num_timesteps):
        ensemble_output = np.zeros((113,32), dtype=np.float32)
        model_count = 0  # To count valid models for averaging

        for model, seq_len in zip(models, model_seq_lens):
            with torch.no_grad():
                # Prepare input based on the model's seq_len
                if input_seq.size(1) < seq_len:
                    raise ValueError(f"Input sequence length {input_seq.size(1)} is less than model's seq_len {seq_len}.")

                # Select the last 'seq_len' time steps
                selected_seq = input_seq[:, -seq_len:, :, :, :]  # [1, seq_len, 5,113,32]
                
                output = model(selected_seq)  # [1,1,113,32]

                # Ensure output shape consistency
                if output.shape != (1, 1, 113, 32):
                    raise ValueError(f"Unexpected output shape: {output.shape}")

                ensemble_output += output.squeeze(0).squeeze(0).cpu().numpy()  # Shape: [113,32]
                model_count += 1

        if model_count == 0:
            raise ValueError("No valid models were processed for averaging.")

        ensemble_output /= model_count  # Averaging over ensemble

        preds_per_id.append(ensemble_output.flatten())  # Flattened prediction: [H*W]

        # Prepare the new prediction to append to the sequence
        new_pred = torch.from_numpy(ensemble_output).unsqueeze(0).unsqueeze(0).to(device)  # [1,1,113,32]

        # Update the input_seq by shifting the sequence
        shifted_seq = input_seq[:, 1:, :, :, :]  # [1, seq_len_max -1, 5,113,32]
        new_time_step = input_seq[:, -1, :, :, :].clone()  # [1,5,113,32]
        new_time_step[:, 2, :, :] = new_pred.squeeze(0).squeeze(0)  # Replace 'xi' channel
        input_seq = torch.cat([shifted_seq, new_time_step.unsqueeze(1)], dim=1)  # [1, seq_len_max,5,113,32]

    # Concatenate all predictions for this ID
    preds_flat = np.concatenate(preds_per_id)  # Shape: [num_timesteps * H * W]
    predictions.append(preds_flat)

# Prepare Submission
submission = pd.DataFrame(predictions)
submission.insert(0, 'id', ids)
expected_pixels = 113 * 32 * num_timesteps  # 72,320 for 20 timesteps
submission.columns = ['id'] + [f'pixel_{i}' for i in range(1, submission.shape[1])]
assert submission.shape == (len(test_dataset), 1 + expected_pixels), f"Expected shape ({len(test_dataset)}, {1 + expected_pixels}), got {submission.shape}"
submission.to_csv('submission.csv', index=False)
print('Submission file saved as submission.csv')


Loading model: model_residualcnnsa_fold1.pth, Encoder: residualcnnsa, Fold: 1, Seq_len: 5
Loading model: model_residualcnnsa_fold2.pth, Encoder: residualcnnsa, Fold: 2, Seq_len: 5
Loading model: model_residualcnnsa_fold3.pth, Encoder: residualcnnsa, Fold: 3, Seq_len: 5
Loading model: model_residualcnnsa_fold4.pth, Encoder: residualcnnsa, Fold: 4, Seq_len: 5
Loading model: model_residualcnnsa_fold5.pth, Encoder: residualcnnsa, Fold: 5, Seq_len: 5
Loading model: model_residualcnnsa_fold6.pth, Encoder: residualcnnsa, Fold: 6, Seq_len: 5
Loading model: model_residualcnnsa_fold7.pth, Encoder: residualcnnsa, Fold: 7, Seq_len: 5
Loading model: model_residualcnnsa_fold8.pth, Encoder: residualcnnsa, Fold: 8, Seq_len: 5
Loading model: model_residualcnnsa_fold9.pth, Encoder: residualcnnsa, Fold: 9, Seq_len: 5


  model.load_state_dict(torch.load(model_path, map_location=device))


Submission file saved as submission.csv
