In [27]:
import xarray as xr
import os
import torch

# Pick the GPU when CUDA is available and fall back to the CPU otherwise.
# The result is bound to a name so it can be passed to `.to(...)` or to the
# training helpers; a bare `torch.device(...)` expression builds the object
# and immediately discards it.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Relative paths to the sample folders and to the grid-info NetCDF file.
LOW_RES_SAMPLE_PATH = "ClimSim_low-res/train/"
LOW_RES_GRID_PATH = "ClimSim_low-res/ClimSim_low-res_grid-info.nc"

In [21]:
def get_data_folders(path):
    """Collect the ``mli`` and ``mlo`` sample files found one level below ``path``.

    Every sub-directory of ``path`` is scanned (in sorted order) and each file
    is classified by the second dot-separated component of its name:
    ``<prefix>.mli.<rest>`` goes to the first list, ``<prefix>.mlo.<rest>`` to
    the second. Anything else is ignored.

    Args:
        path (str): Directory whose sub-directories hold the sample files.

    Returns:
        tuple[list[str], list[str]]: ``(mli_samples, mlo_samples)``, each a
        list of paths built as ``os.path.join(path, folder, file)``.

    Raises:
        OSError: If ``path`` (or one of its sub-directories) cannot be listed.
    """
    mli_samples = []
    mlo_samples = []

    for dir_name in sorted(os.listdir(path)):
        dir_path = os.path.join(path, dir_name)
        # Stray plain files next to the data folders (e.g. OS metadata files)
        # are skipped instead of crashing os.listdir.
        if not os.path.isdir(dir_path):
            continue

        # os.listdir returns entries in arbitrary order; sorting keeps both
        # lists deterministic. When an mli/mlo pair differs only in that one
        # name component, the two lists also line up index by index.
        for file_name in sorted(os.listdir(dir_path)):
            parts = file_name.split('.')
            if len(parts) < 2:
                # No dot in the name, so there is no kind component to inspect.
                continue

            kind = parts[1]
            if kind == 'mli':
                mli_samples.append(os.path.join(path, dir_name, file_name))
            elif kind == 'mlo':
                mlo_samples.append(os.path.join(path, dir_name, file_name))

    return mli_samples, mlo_samples

def read_sample(file_path):
    """Open ``file_path`` with ``xr.open_dataset`` and hand back the result."""
    dataset = xr.open_dataset(file_path)
    return dataset

In [37]:
# List every mli / mlo sample file found under the sample directory.
mli_samples, mlo_samples = get_data_folders(LOW_RES_SAMPLE_PATH)

# Report both counts; the recorded run shows the same number for each.
print(f"Number of MLI samples: {len(mli_samples)}")
print(f"Number of MLO samples: {len(mlo_samples)}")

# Grid-info dataset; a later cell reads grid.sizes["lev"] from it.
grid = xr.open_dataset(LOW_RES_GRID_PATH)

Number of MLI samples: 10872
Number of MLO samples: 10872


In [103]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm

from sklearn.model_selection import train_test_split

class ClimSimMLP(nn.Module):
    """Fully connected network with a shared trunk and two output heads.

    The trunk is five linear layers (768, 640, 512, 640, 640 units) followed
    by a 128-unit layer, all with LeakyReLU(0.15). Two heads read the 128-unit
    features: a linear head with 120 outputs and a ReLU head with 8 outputs.
    Their results are concatenated into a single 128-wide output.

    Args:
        input_dim (int): Size of the last dimension of the input tensor.
    """

    def __init__(self, input_dim=556):
        super().__init__()

        # Hidden Layers: [768, 640, 512, 640, 640]
        self.layer1 = nn.Linear(input_dim, 768)
        self.layer2 = nn.Linear(768, 640)
        self.layer3 = nn.Linear(640, 512)
        self.layer4 = nn.Linear(512, 640)
        self.layer5 = nn.Linear(640, 640)

        # Fixed-width layer feeding both output heads.
        self.last_hidden = nn.Linear(640, 128)

        # --- Output Heads ---
        # 120 tendencies (Linear) + 8 surface variables (ReLU)
        self.head_tendencies = nn.Linear(128, 120)
        self.head_surface = nn.Linear(128, 8)

        # LeakyReLU alpha=0.15
        self.activation = nn.LeakyReLU(0.15)

    def forward(self, x):
        """Run the network on ``x``.

        Args:
            x (torch.Tensor): Tensor whose last dimension equals ``input_dim``.
                Any number of leading dimensions is accepted, including none
                (a single unbatched sample).

        Returns:
            torch.Tensor: Same leading dimensions as ``x`` with a last
            dimension of 128: the 120 linear outputs followed by the 8
            ReLU (non-negative) outputs.
        """
        # Pass through the 5 main hidden layers
        x = self.activation(self.layer1(x))
        x = self.activation(self.layer2(x))
        x = self.activation(self.layer3(x))
        x = self.activation(self.layer4(x))
        x = self.activation(self.layer5(x))

        # Pass through the fixed 128 layer
        x = self.activation(self.last_hidden(x))

        # Output 1: Tendencies (Linear activation)
        out_linear = self.head_tendencies(x)

        # Output 2: Surface variables (ReLU activation)
        out_relu = F.relu(self.head_surface(x))

        # Concatenate along the feature (last) dimension. Using -1 instead of 1
        # gives the same result for (batch, features) input and also works for
        # unbatched 1-D input or input with extra leading dimensions.
        return torch.cat([out_linear, out_relu], dim=-1)

@torch.no_grad()
def evaluate_model(model, dataloader, criterion, device):
    """Compute the sample-weighted average loss of ``model`` over ``dataloader``.

    The model is switched to eval mode and is left in that mode on return.
    Gradients are disabled for the whole call by the decorator.

    Args:
        model: Module called as ``model(inputs)``. It is not moved to
            ``device`` here; the caller is responsible for that.
        dataloader: Iterable yielding ``(inputs, targets)`` tensor pairs.
        criterion: Callable ``criterion(outputs, targets)`` returning a scalar
            tensor. The weighting below assumes it is a per-batch mean.
        device: Device that each batch is moved to before the forward pass.

    Returns:
        float: Sum of ``loss * batch_size`` over all batches divided by the
        total number of samples.

    Raises:
        ValueError: If the dataloader yields no samples (previously this
            surfaced as a bare ZeroDivisionError).
    """
    model.eval()
    total_loss = 0.0
    total_samples = 0

    for inputs, targets in dataloader:
        inputs, targets = inputs.to(device), targets.to(device)
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        # Weight each batch by its size so a smaller final batch does not
        # skew the average.
        batch_size = inputs.size(0)
        total_loss += loss.item() * batch_size
        total_samples += batch_size

    if total_samples == 0:
        raise ValueError("evaluate_model received a dataloader that yielded no samples")

    return total_loss / total_samples

def train_one_epoch(model, dataloader, optimizer, criterion, device):
    """Run one optimisation pass over ``dataloader`` and return the average loss.

    The model is switched to train mode. For every batch the gradients are
    reset, the loss is back-propagated and the optimizer takes one step. A
    tqdm progress bar shows the most recent batch loss.

    Args:
        model: Module called as ``model(inputs)``. It is not moved to
            ``device`` here; the caller is responsible for that.
        dataloader: Iterable yielding ``(inputs, targets)`` tensor pairs.
        optimizer: Optimizer driving the model's parameters.
        criterion: Callable ``criterion(outputs, targets)`` returning a scalar
            tensor. The weighting below assumes it is a per-batch mean.
        device: Device that each batch is moved to before the forward pass.

    Returns:
        float: Sum of ``loss * batch_size`` over all batches divided by the
        total number of samples seen.

    Raises:
        ValueError: If the dataloader yields no samples (previously this
            surfaced as a bare ZeroDivisionError).
    """
    model.train()
    total_loss = 0.0
    total_samples = 0

    pbar = tqdm(dataloader, desc="Training", unit="batch")

    for inputs, targets in pbar:
        inputs, targets = inputs.to(device), targets.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        # Read the scalar once; it is used for both the running total and
        # the progress bar.
        batch_loss = loss.item()
        batch_size = inputs.size(0)
        total_loss += batch_loss * batch_size
        total_samples += batch_size

        # Update progress bar description with current loss
        pbar.set_postfix({"loss": f"{batch_loss:.6f}"})

    if total_samples == 0:
        raise ValueError("train_one_epoch received a dataloader that yielded no samples")

    return total_loss / total_samples

In [None]:
import torch
from torch.utils.data import Dataset
import numpy as np
import os

class ClimSimMultiShardDataset(Dataset):
    """Map-style dataset spanning several ``X_shard_<i>.npy`` / ``Y_shard_<i>.npy`` pairs.

    The shards are opened as read-only memory maps and presented as one
    contiguous index range: global index 0 is the first row of the first
    listed shard, and the rows of each following shard continue from there.
    """

    def __init__(self, shard_dir, shard_indices, transform=None):
        """
        Args:
            shard_dir (str): Folder where .npy files are stored.
            shard_indices (list): List of integers [0, 1, 2...] identifying shards.
            transform (callable): Normalizer/Standardizer applied to ``x`` only.

        Raises:
            ValueError: If an X shard and its Y shard differ in their number of rows.
        """
        self.shard_dir = shard_dir
        self.shard_indices = shard_indices
        self.transform = transform

        self.x_files = [os.path.join(shard_dir, f"X_shard_{i}.npy") for i in shard_indices]
        self.y_files = [os.path.join(shard_dir, f"Y_shard_{i}.npy") for i in shard_indices]

        # 1. Open every shard once as a read-only memory map and keep the
        #    handle; the data itself is only read when a row is indexed.
        self.x_shards = [np.load(f, mmap_mode='r') for f in self.x_files]
        self.y_shards = [np.load(f, mmap_mode='r') for f in self.y_files]

        # 2. An X/Y pair with different row counts would pair inputs with the
        #    wrong targets (or fail late, mid-epoch), so reject it up front.
        for x_file, y_file, x_shard, y_shard in zip(
            self.x_files, self.y_files, self.x_shards, self.y_shards
        ):
            if x_shard.shape[0] != y_shard.shape[0]:
                raise ValueError(
                    f"Shard length mismatch: {x_file} has {x_shard.shape[0]} rows "
                    f"but {y_file} has {y_shard.shape[0]}"
                )

        # 3. Map global indices to shards via the running total of shard sizes.
        self.shard_lengths = [shard.shape[0] for shard in self.x_shards]
        self.cumulative_lengths = np.cumsum(self.shard_lengths)
        # Plain int (not np.int64); an empty shard list gives an empty dataset.
        self.total_size = int(self.cumulative_lengths[-1]) if self.shard_lengths else 0

    def __len__(self):
        """Return the total number of rows across all shards."""
        return self.total_size

    def __getitem__(self, idx):
        """Return the ``(x, y)`` float32 tensors stored at global index ``idx``.

        Negative indices count from the end, as with a list.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        requested = idx
        # Without this normalisation a negative index would always be
        # resolved inside the first shard.
        if idx < 0:
            idx = idx + self.total_size
        if not 0 <= idx < self.total_size:
            raise IndexError(
                f"index {requested} is out of range for dataset of size {self.total_size}"
            )

        # Figure out which shard 'idx' belongs to: the first shard whose
        # cumulative end is strictly greater than idx.
        shard_idx = int(np.searchsorted(self.cumulative_lengths, idx, side='right'))

        # Calculate the local index within that specific shard
        if shard_idx == 0:
            local_idx = idx
        else:
            local_idx = idx - int(self.cumulative_lengths[shard_idx - 1])

        # Extract and copy to memory (the memory maps are read-only)
        x = torch.from_numpy(self.x_shards[shard_idx][local_idx].copy()).float()
        y = torch.from_numpy(self.y_shards[shard_idx][local_idx].copy()).float()

        if self.transform:
            x = self.transform(x)

        return x, y

In [100]:
# Training hyperparameters shared by the data loaders and the training loop.
BATCH_SIZE = 2880
N_EPOCHS = 10
# Network input width: number of selected features times the size of the grid's "lev" dimension.
# NOTE(review): `selected_levelized_features` is not defined in any cell visible here — confirm it
# is created before this cell when the notebook is run top to bottom on a fresh kernel.
INPUT_DIM = len(selected_levelized_features) * grid.sizes["lev"]

# Model, optimizer and loss used by the training loop.
model = ClimSimMLP(input_dim=INPUT_DIM)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()

In [107]:
# Re-list the sample files (same call as the earlier listing cell).
mli_files, mlo_files = get_data_folders(LOW_RES_SAMPLE_PATH)

# Keep only the first three files of each kind.
# NOTE(review): presumably a quick smoke-test subset — confirm before a full training run.
mli_files = mli_files[:3]
mlo_files = mlo_files[:3]

# NOTE(review): `ClimSimDataset` and `target` are not defined in any cell visible here — confirm
# they exist on a fresh kernel.
(train_mli_files, train_mlo_files), (test_mli_files, test_mlo_files) = ClimSimDataset.train_test_split(mli_files, mlo_files)

train = ClimSimDataset(train_mli_files, train_mlo_files, selected_levelized_features, target)
test = ClimSimDataset(test_mli_files, test_mlo_files, selected_levelized_features, target)

# NOTE(review): this lowercase `batch_size` is not read in this cell; the loaders below use BATCH_SIZE.
batch_size = 2880 # It's dataset_size/8 which is 48 columns

# Shuffle only the training data; keep the evaluation order fixed.
train_loader = torch.utils.data.DataLoader(train, batch_size=BATCH_SIZE, shuffle=True)
test_loader = torch.utils.data.DataLoader(test, batch_size=BATCH_SIZE, shuffle=False)

In [108]:
# Train for N_EPOCHS, evaluating on the held-out loader after every epoch.
# The device is hard-coded to "cpu" for both calls.
# NOTE(review): the recorded output of this cell is a RuntimeError (input of shape 2x138240 against
# a first layer expecting 360 features) — presumably the dataset yields unflattened samples; verify
# the sample shape produced by ClimSimDataset against INPUT_DIM.
for epoch in range(N_EPOCHS):
    train_loss = train_one_epoch(model, train_loader, optimizer, criterion, device="cpu")
    val_loss = evaluate_model(model, test_loader, criterion, device="cpu")
    
    print(f"Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

Training:   0%|          | 0/1 [00:00<?, ?batch/s]


RuntimeError: mat1 and mat2 shapes cannot be multiplied (2x138240 and 360x768)

In [1]:
import numpy as np

# Load one X shard fully into memory (no mmap_mode is passed).
# NOTE(review): the name `df` suggests a DataFrame, but np.load returns a NumPy array for .npy files.
df = np.load("ClimSimLowResShards/X_shard_0.npy")

Found 0 data folders.
