In [None]:
import xarray as xr
import numpy as np
import os
import json
import torch
from tqdm import tqdm

class ClimSimNumpySharder:
    def __init__(self):
        # Configuration of Variables (The 557/128 architecture)
        self.input_profiles = ['state_t', 'state_q0001', 'state_q0002', 'state_q0003', 'state_u', 'state_v', 'pbuf_ozone', 'pbuf_CH4', 'pbuf_N2O']
        self.input_scalars = ['state_ps', 'pbuf_SOLIN', 'pbuf_LHFLX', 'pbuf_SHFLX', 'pbuf_TAUX', 'pbuf_TAUY', 'cam_in_ALDIF', 'cam_in_ALDIR', 'cam_in_ASDIF', 'cam_in_ASDIR', 'cam_in_ICEFRAC', 'cam_in_LANDFRAC', 'cam_in_LWUP', 'cam_in_OCNFRAC', 'cam_in_SNOWHICE', 'cam_in_SNOWHLAND', 'pbuf_COSZRS']
        
        self.target_profiles = ['state_t', 'state_q0001'] 
        self.target_scalars = ['cam_out_NETSW', 'cam_out_FLWDS', 'cam_out_PRECC', 'cam_out_PRECSC', 'cam_out_SOLL', 'cam_out_SOLLD', 'cam_out_SOLS', 'cam_out_SOLSD']

        # Build Internal Mappings
        self.input_indices, self.total_input_dim = self._build_index_map(self.input_profiles, self.input_scalars)
        self.target_indices, self.total_target_dim = self._build_index_map(self.target_profiles, self.target_scalars)

    def _build_index_map(self, profiles, scalars):
        mapping = {}
        curr = 0
        for p in profiles:
            mapping[p] = {"start": curr, "end": curr + 60}
            curr += 60
        for s in scalars:
            mapping[s] = {"start": curr, "end": curr + 1}
            curr += 1
        return mapping, curr

    def create_shards(self, mli_paths, mlo_paths, output_dir, shard_size=100):
        os.makedirs(output_dir, exist_ok=True)
        all_input_means, all_input_sq_means = [], []
        num_files = len(mli_paths)

        for shard_idx, start_i in enumerate(range(0, num_files, shard_size)):
            X_shard, Y_shard = [], []
            end_i = min(start_i + shard_size, num_files)
            
            for mli, mlo in tqdm(zip(mli_paths[start_i:end_i], mlo_paths[start_i:end_i]), 
                                 total=end_i-start_i, desc=f"Shard {shard_idx}"):
                try:
                    with xr.open_dataset(mli) as ds_in, xr.open_dataset(mlo) as ds_out:
                        # Vectorized stacking
                        X_file = np.hstack([ds_in[v].values.T if v in self.input_profiles else ds_in[v].values.reshape(-1, 1) for v in self.input_profiles + self.input_scalars])
                        Y_file = np.hstack([ds_out[v].values.T if v in self.target_profiles else ds_out[v].values.reshape(-1, 1) for v in self.target_profiles + self.target_scalars])
                        X_shard.append(X_file)
                        Y_shard.append(Y_file)
                except Exception as e:
                    print(f"Error processing {mli}: {e}")

            # vstack rows (384 * shard_size, features)
            X_final = np.vstack(X_shard).astype(np.float32)
            Y_final = np.vstack(Y_shard).astype(np.float32)

            # Accumulate statistics
            all_input_means.append(np.mean(X_final, axis=0))
            all_input_sq_means.append(np.mean(X_final**2, axis=0))

            np.save(os.path.join(output_dir, f"X_shard_{shard_idx}.npy"), X_final)
            np.save(os.path.join(output_dir, f"Y_shard_{shard_idx}.npy"), Y_final)

        # Save Final Metadata
        final_mean = np.mean(all_input_means, axis=0)
        final_std = np.sqrt(np.mean(all_input_sq_means, axis=0) - final_mean**2)
        
        metadata = {
            "input_indices": self.input_indices,
            "target_indices": self.target_indices,
            "input_mean": final_mean.tolist(),
            "input_std": final_std.tolist(),
            "total_input_dim": self.total_input_dim,
            "total_target_dim": self.total_target_dim
        }
        
        with open(os.path.join(output_dir, "metadata.json"), "w") as f:
            json.dump(metadata, f, indent=4)
        print(f"Done! Metadata saved to {output_dir}/metadata.json")

    @staticmethod
    def get_variable(data, var_name, mapping):
        """Helper to extract a variable from a loaded numpy shard or torch tensor."""
        if var_name not in mapping:
            raise ValueError(f"Variable {var_name} not found in mapping.")
        start = mapping[var_name]['start']
        end = mapping[var_name]['end']
        return data[..., start:end]
    
    def get_input_indices(self):
        idx_map = {}
        current_idx = 0
        
        # 1. Map Profiles (Typically 60 levels each)
        for var in self.input_profiles:
            # ClimSim low-res usually has 60 vertical levels per profile
            num_levels = 60 
            idx_map[var] = list(range(current_idx, current_idx + num_levels))
            current_idx += num_levels
            
        # 2. Map Scalars (1 level each)
        for var in self.input_scalars:
            idx_map[var] = [current_idx]
            current_idx += 1
            
        return idx_map

In [2]:
def get_data_folders(path):
    data_folders = os.listdir(path)
    data_folders.sort()

    print(f"Found {len(data_folders)} data folders.")

    mli_samples = []
    mlo_samples = []
    for dir_name in data_folders:
        files = os.listdir(os.path.join(path, dir_name))
        for f in files:
            if f.split('.')[1] == 'mli':
                mli_samples.append(os.path.join(path, dir_name, f))
            elif f.split('.')[1] == 'mlo':
                mlo_samples.append(os.path.join(path, dir_name, f))
    
    return mli_samples, mlo_samples

def read_sample(file_path):
    return xr.open_dataset(file_path)

In [None]:
import xarray as xr
import os
import shutil
from tqdm.auto import tqdm

zarr_path = "/media/alexandre-tonon/UBUNTU 24_0/articleDL/data/ClimSim_low-res.zarr"
train_root = "/media/alexandre-tonon/UBUNTU 24_0/articleDL/data/ClimSim_low-res/train/"

# --- √âTAPE CRUCIALE : RESET ---
if os.path.exists(zarr_path):
    print("üßπ Nettoyage du Zarr corrompu...")
    shutil.rmtree(zarr_path)

mli_samples, mlo_samples = get_data_folders(train_root)
samples = list(zip(mli_samples, mlo_samples))

chunk_size = 100
chunk_number = (len(samples) + chunk_size - 1) // chunk_size

for i in tqdm(range(chunk_number), desc="Progression totale"):
    buffer = []
    current_samples = samples[i*chunk_size : (i+1)*chunk_size]
    
    for mli_path, mlo_path in tqdm(current_samples, desc=f"Batch {i+1}", leave=False):
        # Utilisation de engine='netcdf4' ou 'h5netcdf'
        with xr.open_dataset(mli_path, engine='netcdf4') as mli_ds, \
             xr.open_dataset(mlo_path, engine='netcdf4') as mlo_ds:

            # On load en RAM pour casser le lien avec le fichier NetCDF
            ds_i = mli_ds.rename({v: f"in_{v}" for v in mli_ds.data_vars}).load()
            ds_o = mlo_ds.rename({v: f"out_{v}" for v in mlo_ds.data_vars}).load()

            # Fusion et ajout manuel de la dimension 'sample'
            merged = xr.merge([ds_i, ds_o], compat="override").expand_dims("sample")
            buffer.append(merged)

    if buffer:
        ds_batch = xr.concat(buffer, dim="sample")
        
        # On v√©rifie si c'est le TOUT PREMIER bloc √©crit dans cette session
        if i == 0:
            # Cr√©ation initiale (mode 'w')
            ds_batch.to_zarr(zarr_path, mode="w", consolidated=True)
        else:
            # Ajout (mode 'a')
            ds_batch.to_zarr(zarr_path, mode="a", append_dim="sample", consolidated=True)
        
        # Nettoyage pour le prochain chunk
        del ds_batch
        import gc; gc.collect()

Found 5 data folders.


Progression totale:   0%|          | 0/109 [00:00<?, ?it/s]

Batch 1:   0%|          | 0/100 [00:00<?, ?it/s]



Batch 2:   0%|          | 0/100 [00:00<?, ?it/s]



Batch 3:   0%|          | 0/100 [00:00<?, ?it/s]



Batch 4:   0%|          | 0/100 [00:00<?, ?it/s]



Batch 5:   0%|          | 0/100 [00:00<?, ?it/s]



Batch 6:   0%|          | 0/100 [00:00<?, ?it/s]



Batch 7:   0%|          | 0/100 [00:00<?, ?it/s]



Batch 8:   0%|          | 0/100 [00:00<?, ?it/s]



Batch 9:   0%|          | 0/100 [00:00<?, ?it/s]



Batch 10:   0%|          | 0/100 [00:00<?, ?it/s]



Batch 11:   0%|          | 0/100 [00:00<?, ?it/s]



Batch 12:   0%|          | 0/100 [00:00<?, ?it/s]



Batch 13:   0%|          | 0/100 [00:00<?, ?it/s]



Batch 14:   0%|          | 0/100 [00:00<?, ?it/s]



Batch 15:   0%|          | 0/100 [00:00<?, ?it/s]



Batch 16:   0%|          | 0/100 [00:00<?, ?it/s]



Batch 17:   0%|          | 0/100 [00:00<?, ?it/s]



Batch 18:   0%|          | 0/100 [00:00<?, ?it/s]



Batch 19:   0%|          | 0/100 [00:00<?, ?it/s]



Batch 20:   0%|          | 0/100 [00:00<?, ?it/s]



Batch 21:   0%|          | 0/100 [00:00<?, ?it/s]



Batch 22:   0%|          | 0/100 [00:00<?, ?it/s]



Batch 23:   0%|          | 0/100 [00:00<?, ?it/s]



Batch 24:   0%|          | 0/100 [00:00<?, ?it/s]



Batch 25:   0%|          | 0/100 [00:00<?, ?it/s]



Batch 26:   0%|          | 0/100 [00:00<?, ?it/s]



Batch 27:   0%|          | 0/100 [00:00<?, ?it/s]



Batch 28:   0%|          | 0/100 [00:00<?, ?it/s]



Batch 29:   0%|          | 0/100 [00:00<?, ?it/s]



Batch 30:   0%|          | 0/100 [00:00<?, ?it/s]

OSError: [Errno -51] NetCDF: Unknown file format: '/media/alexandre-tonon/UBUNTU 24_0/articleDL/data/ClimSim_low-res/train/0002-02/E3SM-MMF.mli.0002-02-11-33600.nc'

In [26]:
import os
import xarray as xr

file_path = '/media/alexandre-tonon/UBUNTU 24_0/articleDL/data/ClimSim_low-res/train/0002-02/E3SM-MMF.mli.0002-02-11-33600.nc'

print(f"--- Diagnostic du fichier ---")
if os.path.exists(file_path):
    size_mb = os.path.getsize(file_path) / (1024 * 1024)
    print(f"Taille du fichier : {size_mb:.2f} MB")
    
    # Test de lecture du header (les 4 premiers octets)
    with open(file_path, 'rb') as f:
        header = f.read(4)
        print(f"Signature du header (HEX) : {header.hex()}")
        print(f"Signature du header (ASCII) : {header}")
else:
    print("ERREUR : Le fichier n'existe pas √† cet emplacement.")

print(f"\n--- Test de lecture Xarray ---")
for engine in ['netcdf4', 'h5netcdf', 'scipy']:
    try:
        ds = xr.open_dataset(file_path, engine=engine)
        print(f"‚úÖ Succ√®s avec le moteur : {engine}")
        ds.close()
    except Exception as e:
        print(f"‚ùå √âchec avec le moteur {engine} : {e}")

--- Diagnostic du fichier ---
ERREUR : Le fichier n'existe pas √† cet emplacement.

--- Test de lecture Xarray ---
‚ùå √âchec avec le moteur netcdf4 : [Errno 2] No such file or directory: '/media/alexandre-tonon/UBUNTU 24_0/articleDL/data/ClimSim_low-res/train/0002-02/E3SM-MMF.mli.0002-02-11-33600.nc'
‚ùå √âchec avec le moteur h5netcdf : [Errno 2] Unable to synchronously open file (unable to open file: name = '/media/alexandre-tonon/UBUNTU 24_0/articleDL/data/ClimSim_low-res/train/0002-02/E3SM-MMF.mli.0002-02-11-33600.nc', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)
‚ùå √âchec avec le moteur scipy : [Errno 2] No such file or directory: '/media/alexandre-tonon/UBUNTU 24_0/articleDL/data/ClimSim_low-res/train/0002-02/E3SM-MMF.mli.0002-02-11-33600.nc'


In [None]:
path = '/media/alexandre-tonon/UBUNTU 24_0/articleDL/data/ClimSim_low-res/train/0002-01/E3SM-MMF.mli.0002-01-01-00000.nc'

xr.open_dataset(path, engine='netcdf4')