In [2]:
import gc
import os
import shutil

import xarray as xr
from tqdm.auto import tqdm

# Destination Zarr store and the directory holding the source NetCDF files.
zarr_path = "/media/alexandre-tonon/UBUNTU 24_0/articleDL/data/ClimSim_one_sample"
train_root = "/media/alexandre-tonon/UBUNTU 24_0/articleDL/data/ClimSim_low-res/train/custom/"

# --- CRUCIAL STEP: RESET ---
# Remove any store left over from a previous run so the first write below
# starts from a clean directory.
if os.path.exists(zarr_path):
    print("üßπ Nettoyage du Zarr corrompu...")
    shutil.rmtree(zarr_path)

# Input ("mli") and output ("mlo") files, each list sorted by file name.
list_files = sorted(os.listdir(train_root))
mli_samples = [os.path.join(train_root, f) for f in list_files if "mli" in f]
mlo_samples = [os.path.join(train_root, f) for f in list_files if "mlo" in f]

# zip() silently stops at the shorter list and pairs purely by position, so a
# single missing or extra file would merge inputs with the wrong outputs.
# Fail loudly before writing anything instead.
if len(mli_samples) != len(mlo_samples):
    raise ValueError(
        f"Found {len(mli_samples)} 'mli' files but {len(mlo_samples)} 'mlo' files "
        f"in {train_root!r}; every input file needs exactly one output file."
    )
for candidate_in, candidate_out in zip(mli_samples, mlo_samples):
    # Assumes the two files of a pair differ only by the mli/mlo token in
    # their name -- TODO confirm if the naming scheme of the directory differs.
    expected_out = os.path.basename(candidate_in).replace("mli", "mlo")
    if os.path.basename(candidate_out) != expected_out:
        raise ValueError(
            f"Mismatched pair: {candidate_in!r} is aligned with {candidate_out!r} "
            f"(expected a file named {expected_out!r})."
        )

samples = list(zip(mli_samples, mlo_samples))

# Number of samples concatenated in memory before each write to the store.
chunk_size = 19
# Ceiling division: the last chunk may hold fewer than chunk_size samples.
chunk_number = (len(samples) + chunk_size - 1) // chunk_size

for i in tqdm(range(chunk_number), desc="Progression totale"):
    buffer = []
    current_samples = samples[i*chunk_size : (i+1)*chunk_size]

    for mli_path, mlo_path in tqdm(current_samples, desc=f"Batch {i+1}", leave=False):
        # Opened with engine='netcdf4' ('h5netcdf' is the alternative).
        with xr.open_dataset(mli_path, engine='netcdf4') as mli_ds, \
             xr.open_dataset(mlo_path, engine='netcdf4') as mlo_ds:

            # Load into RAM so the data no longer depends on the NetCDF file
            # handles, which are closed when the `with` block exits.
            ds_i = mli_ds.rename({v: f"in_{v}" for v in mli_ds.data_vars}).load()
            ds_o = mlo_ds.rename({v: f"out_{v}" for v in mlo_ds.data_vars}).load()

            # Merge inputs and outputs, then add a leading 'sample' dimension
            # of length 1 so samples can be concatenated along it.
            merged = xr.merge([ds_i, ds_o], compat="override").expand_dims("sample")
            buffer.append(merged)

    if buffer:
        ds_batch = xr.concat(buffer, dim="sample")

        # The very first chunk creates the store; later chunks are appended.
        if i == 0:
            # Initial creation (mode 'w')
            ds_batch.to_zarr(zarr_path, mode="w", consolidated=True)
        else:
            # Append along 'sample' (mode 'a')
            ds_batch.to_zarr(zarr_path, mode="a", append_dim="sample", consolidated=True)

        # Drop every reference to this chunk's data before collecting;
        # clearing only ds_batch would leave the per-sample datasets alive
        # in `buffer`.
        del ds_batch
        buffer.clear()
        gc.collect()

Progression totale:   0%|          | 0/1 [00:00<?, ?it/s]

Batch 1:   0%|          | 0/19 [00:00<?, ?it/s]



In [26]:
import os
import xarray as xr

# NetCDF file whose presence and readability are being diagnosed.
file_path = '/media/alexandre-tonon/UBUNTU 24_0/articleDL/data/ClimSim_low-res/train/0002-02/E3SM-MMF.mli.0002-02-11-33600.nc'

print("--- Diagnostic du fichier ---")
if not os.path.exists(file_path):
    print("ERREUR : Le fichier n'existe pas √† cet emplacement.")
else:
    # Size on disk, converted from bytes to mebibytes.
    size_mb = os.path.getsize(file_path) / 2**20
    print(f"Taille du fichier : {size_mb:.2f} MB")

    # Read the first 4 bytes of the file and show them in hex and as raw bytes.
    with open(file_path, 'rb') as fh:
        header = fh.read(4)
    print(f"Signature du header (HEX) : {header.hex()}")
    print(f"Signature du header (ASCII) : {header}")

print("\n--- Test de lecture Xarray ---")
# Try each xarray backend in turn and report which ones can open the file.
for engine in ('netcdf4', 'h5netcdf', 'scipy'):
    try:
        # The context manager closes the dataset once the success line is printed.
        with xr.open_dataset(file_path, engine=engine) as ds:
            print(f"‚úÖ Succ√®s avec le moteur : {engine}")
    except Exception as exc:
        print(f"‚ùå √âchec avec le moteur {engine} : {exc}")

--- Diagnostic du fichier ---
ERREUR : Le fichier n'existe pas √† cet emplacement.

--- Test de lecture Xarray ---
‚ùå √âchec avec le moteur netcdf4 : [Errno 2] No such file or directory: '/media/alexandre-tonon/UBUNTU 24_0/articleDL/data/ClimSim_low-res/train/0002-02/E3SM-MMF.mli.0002-02-11-33600.nc'
‚ùå √âchec avec le moteur h5netcdf : [Errno 2] Unable to synchronously open file (unable to open file: name = '/media/alexandre-tonon/UBUNTU 24_0/articleDL/data/ClimSim_low-res/train/0002-02/E3SM-MMF.mli.0002-02-11-33600.nc', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)
‚ùå √âchec avec le moteur scipy : [Errno 2] No such file or directory: '/media/alexandre-tonon/UBUNTU 24_0/articleDL/data/ClimSim_low-res/train/0002-02/E3SM-MMF.mli.0002-02-11-33600.nc'


In [None]:
# Single NetCDF file to inspect.
path = "/media/alexandre-tonon/UBUNTU 24_0/articleDL/data/ClimSim_low-res/train/0002-01/E3SM-MMF.mli.0002-01-01-00000.nc"

# Kept as the cell's last expression so the notebook renders the dataset.
xr.open_dataset(filename_or_obj=path, engine="netcdf4")

In [None]:
import xarray as xr
import numpy as np

# Open the Zarr store written by the conversion cell (relies on `zarr_path`
# and `mli_samples` defined there). chunks=None asks xarray not to wrap the
# arrays in dask chunks.
ds = xr.open_zarr(zarr_path, chunks=None)

# Dataset.sizes is the supported mapping from dimension name to length;
# indexing Dataset.dims like a mapping is deprecated in recent xarray.
n_samples = ds.sizes['sample']

print("=== STRUCTURE DU ZARR ===")
print(f"Nombre total de samples: {n_samples}")
print(f"Variables disponibles: {list(ds.data_vars)[:10]}...")

# Spot-check a few sample indices; indices beyond the store length are skipped.
print("\n=== TEST DE CONTINUIT√â ===")
for i in [0, 10, 100, 500]:
    if i < n_samples:
        state_t = ds['in_state_t'].isel(sample=i, ncol=0, lev=0).values
        print(f"Sample {i}, state_t[0,0]: {state_t:.6f}")

# Compare the first sample in the store with the first source NetCDF file.
print("\n=== COMPARAISON AVEC SOURCES ===")
zarr_sample_0 = ds['in_state_t'].isel(sample=0).values
# Read the values inside the context manager so the source file is closed
# afterwards instead of staying open for the rest of the kernel session.
with xr.open_dataset(mli_samples[0], engine='netcdf4') as first_nc:
    nc_sample_0 = first_nc['state_t'].values

print(f"Zarr sample 0 range: [{zarr_sample_0.min():.2f}, {zarr_sample_0.max():.2f}]")
print(f"NetCDF source range: [{nc_sample_0.min():.2f}, {nc_sample_0.max():.2f}]")
# NOTE(review): np.allclose treats NaN as unequal by default, so this prints
# False if the field contains NaNs -- confirm that is the intended check.
print(f"Identiques ? {np.allclose(zarr_sample_0, nc_sample_0, atol=1e-5)}")