# Imports

In [44]:
import sys
print('System Version:', sys.version)

System Version: 3.10.18 | packaged by conda-forge | (main, Jun  4 2025, 14:45:41) [GCC 13.3.0]


In [45]:
print(sys.executable)

/global/homes/b/brelypo/.conda/envs/sic_sie_env/bin/python


In [46]:
import numpy as np
print('Numpy version', np.__version__)

Numpy version 2.2.6


In [47]:
import pandas as pd
print('Pandas version', pd.__version__)

Pandas version 2.3.0


In [48]:
import xarray as xr
print('Xarray version', xr.__version__)

Xarray version 2025.6.0


In [49]:
import matplotlib
import matplotlib.pyplot as plt
print('Matplotlib version', matplotlib.__version__)

Matplotlib version 3.10.3


In [50]:
import torch
from torch.utils.data import Dataset, DataLoader

print('PyTorch version', torch.__version__)

PyTorch version 2.5.1


# Example of one netCDF file with xarray

In [51]:
# Open a single monthly file of daily statistics to inspect its layout.
# NOTE(review): the path is relative to the notebook's working directory.
ds = xr.open_dataset("train/v3.LR.DTESTM.pm-cpu-10yr.mpassi.hist.am.timeSeriesStatsDaily.0010-01-01.nc")

In [52]:
# List the variables stored in the file together with their dimensions and sizes.
ds.data_vars

Data variables:
    timeDaily_counter             (Time) int32 124B ...
    xtime_startDaily              (Time) |S64 2kB ...
    xtime_endDaily                (Time) |S64 2kB ...
    timeDaily_avg_iceAreaCell     (Time, nCells) float32 58MB ...
    timeDaily_avg_iceVolumeCell   (Time, nCells) float32 58MB ...
    timeDaily_avg_snowVolumeCell  (Time, nCells) float32 58MB ...
    timeDaily_avg_uVelocityGeo    (Time, nVertices) float32 117MB ...
    timeDaily_avg_vVelocityGeo    (Time, nVertices) float32 117MB ...

In [53]:
# `timeDaily_counter` is dimensioned by Time only, so its shape gives the
# number of time steps (days) contained in this file.
day_counter = ds["timeDaily_counter"]
day_counter.shape

(31,)

In [54]:
# Printing the DataArray shows only a summary (dims, dtype), not the values.
print(ds["xtime_startDaily"])

<xarray.DataArray 'xtime_startDaily' (Time: 31)> Size: 2kB
[31 values with dtype=|S64]
Dimensions without coordinates: Time


In [55]:
# `.values` returns the underlying array: byte strings formatted as
# YYYY-MM-DD_HH:MM:SS, one per day.
print(ds["xtime_startDaily"].values)

[b'0010-01-01_00:00:00' b'0010-01-02_00:00:00' b'0010-01-03_00:00:00'
 b'0010-01-04_00:00:00' b'0010-01-05_00:00:00' b'0010-01-06_00:00:00'
 b'0010-01-07_00:00:00' b'0010-01-08_00:00:00' b'0010-01-09_00:00:00'
 b'0010-01-10_00:00:00' b'0010-01-11_00:00:00' b'0010-01-12_00:00:00'
 b'0010-01-13_00:00:00' b'0010-01-14_00:00:00' b'0010-01-15_00:00:00'
 b'0010-01-16_00:00:00' b'0010-01-17_00:00:00' b'0010-01-18_00:00:00'
 b'0010-01-19_00:00:00' b'0010-01-20_00:00:00' b'0010-01-21_00:00:00'
 b'0010-01-22_00:00:00' b'0010-01-23_00:00:00' b'0010-01-24_00:00:00'
 b'0010-01-25_00:00:00' b'0010-01-26_00:00:00' b'0010-01-27_00:00:00'
 b'0010-01-28_00:00:00' b'0010-01-29_00:00:00' b'0010-01-30_00:00:00'
 b'0010-01-31_00:00:00']


In [56]:
# Daily-averaged ice area per cell, dimensioned (Time, nCells).
ice_area = ds["timeDaily_avg_iceAreaCell"]
ice_area.shape

(31, 465044)

In [57]:
# Materialize the variable as a NumPy array (float32, one row per day).
ice_area.values

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], shape=(31, 465044), dtype=float32)

In [58]:
# Inspect coordinates and dimensions: this file defines no coordinate
# variables, so Time / nCells / nVertices are plain positional dimensions.
print(ds.coords)
print(ds.dims)

Coordinates:
    *empty*


In [59]:
# Full dataset summary: dimensions, data variables and global attributes.
print(ds)

<xarray.Dataset> Size: 407MB
Dimensions:                       (Time: 31, nCells: 465044, nVertices: 942873)
Dimensions without coordinates: Time, nCells, nVertices
Data variables:
    timeDaily_counter             (Time) int32 124B ...
    xtime_startDaily              (Time) |S64 2kB b'0010-01-01_00:00:00' ... ...
    xtime_endDaily                (Time) |S64 2kB ...
    timeDaily_avg_iceAreaCell     (Time, nCells) float32 58MB 0.0 0.0 ... 0.0
    timeDaily_avg_iceVolumeCell   (Time, nCells) float32 58MB ...
    timeDaily_avg_snowVolumeCell  (Time, nCells) float32 58MB ...
    timeDaily_avg_uVelocityGeo    (Time, nVertices) float32 117MB ...
    timeDaily_avg_vVelocityGeo    (Time, nVertices) float32 117MB ...
Attributes: (12/490)
    case:                                                         v3.LR.DTEST...
    source_id:                                                    9741e0bba2
    realm:                                                        seaIce
    product:              

# Freeboard calculation functions

In [60]:
import numpy as np

# Physical constants (adjust if you use different units)
D_WATER = 1023  # Density of seawater (kg/m^3)
D_ICE = 917     # Density of sea ice (kg/m^3)
D_SNOW = 330    # Density of snow (kg/m^3)

# Cells whose ice area is at or below this threshold are treated as ice-free,
# which avoids dividing by zero or by vanishingly small areas.
MIN_AREA = 1e-6

def compute_freeboard(area: np.ndarray,
                      ice_volume: np.ndarray,
                      snow_volume: np.ndarray) -> np.ndarray:
    """
    Compute sea ice freeboard from ice and snow volume and area.

    Ice and snow thickness are obtained as ``volume / area`` wherever
    ``area > MIN_AREA`` and are set to zero elsewhere. The freeboard is then

        h_ice * (D_WATER - D_ICE) / D_WATER + h_snow * (D_WATER - D_SNOW) / D_WATER

    i.e. the height of the snow surface above the waterline under
    hydrostatic balance.

    Parameters
    ----------
    area : array-like
        Sea ice concentration / area per grid cell.
    ice_volume : array-like
        Sea ice volume per grid cell.
    snow_volume : array-like
        Snow volume per grid cell.

        The three inputs must share a shape or be broadcastable to a common
        shape. Lists and integer arrays are accepted.

    Returns
    -------
    freeboard : np.ndarray
        Freeboard for each cell, in the broadcast shape of the inputs.
        The dtype is the common floating dtype of the inputs (float32 inputs
        give a float32 result); non-floating inputs are computed in float64
        so the thickness is never truncated to an integer.
    """
    area = np.asarray(area)
    ice_volume = np.asarray(ice_volume)
    snow_volume = np.asarray(snow_volume)

    # Always accumulate into a floating buffer: an integer buffer would
    # silently truncate volume / area.
    out_dtype = np.result_type(area, ice_volume, snow_volume)
    if not np.issubdtype(out_dtype, np.floating):
        out_dtype = np.dtype(np.float64)
    out_shape = np.broadcast_shapes(area.shape, ice_volume.shape, snow_volume.shape)

    # Valid mask: avoid dividing by very small or zero area (NaN compares False)
    valid = area > MIN_AREA

    # Divide only where valid; everything else keeps the zero it was initialised with
    height_ice = np.divide(
        ice_volume, area, out=np.zeros(out_shape, dtype=out_dtype), where=valid
    )
    height_snow = np.divide(
        snow_volume, area, out=np.zeros(out_shape, dtype=out_dtype), where=valid
    )

    # Compute freeboard using the physical formula
    freeboard = (
        height_ice * (D_WATER - D_ICE) / D_WATER +
        height_snow * (D_WATER - D_SNOW) / D_WATER
    )

    return freeboard


# Custom PyTorch Dataset

In [61]:
import os
from torch.utils.data import Dataset
from datetime import timedelta
from typing import List, Union, Callable, Tuple
from NC_FILE_PROCESSING.nc_utility_functions import *
from perlmutterpath import *

# `__init__` - masks and loads the data into tensors

In [62]:
class DailyNetCDFDataset(Dataset):
    """
    PyTorch Dataset that reads a directory of month-wise NetCDF files,
    stacks them along their 'Time' dimension and yields daily data *plus*
    its timestamp.

    Each item is ``(features, timestamp)`` where ``features`` is a float32
    tensor of shape ``(2, nCells)`` holding ``[freeboard, ice_area]`` for one
    day and ``timestamp`` is the matching ``np.datetime64`` value.

    Parameters
    ----------
    data_dir : str
        Directory containing NetCDF files (e.g. 202501.nc, 202502.nc, …).
        Files are processed in sorted filename order.
    transform : Callable | None
        Optional transform applied to the data tensor *only*.
    decode_time : bool
        Accepted for interface compatibility; not used by the current
        implementation (timestamps are parsed from ``xtime_startDaily``).
    drop_missing : bool
        Accepted for interface compatibility; not used by the current
        implementation.
    cell_mask : index array | boolean array | None
        Optional selector applied along the cell axis (``array[:, cell_mask]``)
        before the freeboard is computed.
    verbose : bool
        If True (default) progress/debug messages are printed, including one
        message per ``__len__`` / ``__getitem__`` call. Pass False to silence
        them, e.g. when iterating with a DataLoader.

    Attributes
    ----------
    file_paths : list of str
        Sorted paths of the ``*.nc`` files that were read.
    times : np.ndarray
        ``datetime64[s]`` array with one entry per day.
    freeboard, ice_area : np.ndarray
        Arrays of shape ``(T, nCells)`` concatenated over all files.
    """

    # Class-level default so instances created without running __init__
    # keep the original (printing) behaviour.
    verbose = True

    def __init__(
        self,
        data_dir: str,
        transform: Callable = None,
        decode_time: bool = True,
        drop_missing: bool = True,
        cell_mask=None,
        verbose: bool = True,
    ):
        """
        Load every NetCDF file in ``data_dir`` into memory.

        Steps:
        1) Gather the sorted daily data from each netCDF file (1 file = 1 month of daily data)
        2) Store the datetime information for each day
        3) Apply the optional mask to nCells
        4) Store patch_ids so the data loader can use them (TODO: IMPLEMENT THIS)
        5) Perform pre-processing (calculate Freeboard)
        6) Keep the result as NumPy arrays that __getitem__ converts to tensors

        Raises
        ------
        FileNotFoundError
            If ``data_dir`` contains no ``*.nc`` files.
        """
        # Imported locally: the notebook's import cell only brings in
        # `timedelta`, so `datetime` is not guaranteed to be in scope.
        from datetime import datetime

        self.transform = transform
        self.verbose = verbose

        # --- 1. Gather files (sorted for deterministic order) ---------
        self.data_dir = data_dir
        self.file_paths = sorted(
            os.path.join(data_dir, f)
            for f in os.listdir(data_dir)
            if f.endswith(".nc")
        )
        self._log(f"Found {len(self.file_paths)} NetCDF files:")
        for f in self.file_paths:
            self._log(f"  - {f}")

        if not self.file_paths:
            raise FileNotFoundError(f"No *.nc files found in {data_dir!r}")

        # Open all the netCDF files lazily, concatenated along Time, only to
        # report the combined layout; the handle is closed right afterwards.
        self._log("Loading datasets with xarray.open_mfdataset...")
        with xr.open_mfdataset(
            self.file_paths,
            combine="nested",
            concat_dim="Time",  # Use the NetCDF's Time dimension for concatenation
            decode_times=False,
            parallel=False,
        ) as combined:
            self._log("Finished loading full dataset into a local variable.")
            self._log(f"Dataset dimensions: {combined.dims}")
            self._log(f"Dataset variables: {list(combined.data_vars)}")

        # --- 2-5. One pass per file: timestamps, mask, freeboard --------
        # Each file is opened once and closed as soon as its arrays have been
        # read into memory, so no file handles are leaked.
        all_times = []
        freeboard_parts = []
        ice_area_parts = []

        for path in self.file_paths:
            with xr.open_dataset(path) as ds:
                # Byte strings such as b"0010-01-01_00:00:00"
                xtime_strs = ds["xtime_startDaily"].str.decode("utf-8").values

                # `.values` loads the data, so the arrays outlive the file handle
                area = ds["timeDaily_avg_iceAreaCell"].values
                ice_volume = ds["timeDaily_avg_iceVolumeCell"].values
                snow_volume = ds["timeDaily_avg_snowVolumeCell"].values

            # "0010-01-01_00:00:00" → "0010-01-01 00:00:00" → datetime
            all_times.extend(
                datetime.strptime(s.replace("_", " "), "%Y-%m-%d %H:%M:%S")
                for s in xtime_strs
            )

            # Optional mask along the cell axis
            if cell_mask is not None:
                area = area[:, cell_mask]
                ice_volume = ice_volume[:, cell_mask]
                snow_volume = snow_volume[:, cell_mask]

            freeboard_parts.append(compute_freeboard(area, ice_volume, snow_volume))
            ice_area_parts.append(area)

        self.times = np.array(all_times, dtype='datetime64[s]')

        # Checking the dates
        self._log(f"Parsed {len(self.times)} total dates")
        self._log("First few:", self.times[:5])

        self._log(f"Total days collected: {len(self.times)}")
        self._log(f"Unique days: {len(np.unique(self.times))}")
        self._log(f"First 35 days: {self.times[:35]}")
        self._log(f"First days 360 to 400 days: {self.times[360:401]}")

        # --- 4. Get patch IDs (TODO: implement this)

        # Concatenate across time
        self.freeboard = np.concatenate(freeboard_parts, axis=0)  # (T, nCells)
        self.ice_area = np.concatenate(ice_area_parts, axis=0)    # (T, nCells)

        self._log("Freeboard", self.freeboard.shape)
        self._log("Ice Area", self.ice_area.shape)

        self._log("End of __init__")

    def _log(self, *args) -> None:
        """Print ``args`` when verbose output is enabled."""
        if self.verbose:
            print(*args)

    def __len__(self) -> int:
        """Return the number of time steps (days for daily data)."""
        self._log("Calling __len__")
        return len(self.times)

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, np.datetime64]:
        """
        Return ``(features, timestamp)`` for a single time step (day).

        ``features`` stacks ``[freeboard, ice_area]`` over the (masked) cells
        into a float32 tensor of shape ``(2, nCells)``; ``transform`` is
        applied to it when one was supplied.

        TODO: Return a set of patches for one time step.

        NOTE(review): the timestamp is an ``np.datetime64`` scalar; confirm
        that the collate function used by your DataLoader can batch it.
        """
        self._log("Calling __getitem__")

        freeboard_day = self.freeboard[idx]  # shape: (nCells,)
        ice_area_day = self.ice_area[idx]    # shape: (nCells,)
        self._log(freeboard_day.shape)
        self._log(ice_area_day.shape)

        features = np.stack([freeboard_day, ice_area_day], axis=0)  # shape: (2, nCells)
        data_tensor = torch.as_tensor(features, dtype=torch.float32)

        if self.transform:
            data_tensor = self.transform(data_tensor)

        self._log(f"Fetched index {idx}: Time={self.times[idx]}, shape={data_tensor.shape}")
        return data_tensor, self.times[idx]

    def __repr__(self):
        # Read the cell count from the array shape so that a dataset with
        # zero time steps can still be printed.
        return (
            f"<DailyNetCDFDataset: {len(self)} days, "
            f"{self.freeboard.shape[1]} cells/day, "
            f"{len(self.file_paths)} files loaded>"
        )

    def time_to_dataframe(self) -> pd.DataFrame:
        """Return a DataFrame of time features you can merge with predictions.

        Columns: ``time``, ``year``, ``month``, ``day`` and ``doy``
        (day of year), one row per entry of ``self.times``.
        """
        t = pd.to_datetime(self.times)            # pandas DatetimeIndex
        return pd.DataFrame(
            {
                "time": t,
                "year": t.year,
                "month": t.month,
                "day": t.day,
                "doy": t.dayofyear,
            }
        )

In [63]:
from torch.utils.data import DataLoader

# `data_dir` is not included here for privacy, but is included in the actual code.
# NOTE(review): it must be defined before this cell runs, otherwise the
# constructor call below raises NameError on a fresh kernel.

print("===== Making the Dataset Class ===== ")
dataset = DailyNetCDFDataset(data_dir)

print("===== Printing Dataset ===== ")
print(dataset)                 # → see how many files & days loaded
sample, ts = dataset[0]        # sample is a (2, nCells) tensor, ts is np.datetime64

# Wrap in a DataLoader as usual (sequential order, 8 days per batch).
# NOTE(review): each item carries an np.datetime64 timestamp; confirm the
# default collate function can batch it before iterating `loader`.
loader = DataLoader(dataset, batch_size=8, shuffle=False)

# Quickly get engineered time features (year, month, day, day-of-year) as a DataFrame
df_time = dataset.time_to_dataframe()


===== Making the Dataset Class ===== 
Found 12 NetCDF files:
  - /global/u2/b/brelypo/python_model_visualization/Predicting_SIC_SIE/train/v3.LR.DTESTM.pm-cpu-10yr.mpassi.hist.am.timeSeriesStatsDaily.0010-01-01.nc
  - /global/u2/b/brelypo/python_model_visualization/Predicting_SIC_SIE/train/v3.LR.DTESTM.pm-cpu-10yr.mpassi.hist.am.timeSeriesStatsDaily.0010-02-01.nc
  - /global/u2/b/brelypo/python_model_visualization/Predicting_SIC_SIE/train/v3.LR.DTESTM.pm-cpu-10yr.mpassi.hist.am.timeSeriesStatsDaily.0010-03-01.nc
  - /global/u2/b/brelypo/python_model_visualization/Predicting_SIC_SIE/train/v3.LR.DTESTM.pm-cpu-10yr.mpassi.hist.am.timeSeriesStatsDaily.0010-04-01.nc
  - /global/u2/b/brelypo/python_model_visualization/Predicting_SIC_SIE/train/v3.LR.DTESTM.pm-cpu-10yr.mpassi.hist.am.timeSeriesStatsDaily.0010-05-01.nc
  - /global/u2/b/brelypo/python_model_visualization/Predicting_SIC_SIE/train/v3.LR.DTESTM.pm-cpu-10yr.mpassi.hist.am.timeSeriesStatsDaily.0010-06-01.nc
  - /global/u2/b/brelypo/py