## Validation dataset downloading

The datasets are hosted on Hugging Face.

As the models were not trained on the last years, it was mandatory to download some samples from them to do a proper evaluation. However, as the full dataset is too heavy, I __downloaded 10% of the dataset__, making sure to download __pairs__, i.e. each `mli` file together with its matching `mlo` file (I spent 10h debugging after forgetting that...)

<span style="color:red">Do not launch this script: it takes about 100 minutes to run</span>

### Downloading

In [None]:
import os
import random
from huggingface_hub import HfFileSystem, hf_hub_download

# Remote filesystem handle, source dataset repository and local target folder.
fs = HfFileSystem()
repo_id = "LEAP/ClimSim_low-res"
local_dir = "./public_data/ClimSim_low-res/"
os.makedirs(local_dir, exist_ok=True)

# 1. List every "mli" file stored under the train/0008-* folders of the repo.
print("Recherche des fichiers mli...")
all_files = fs.glob(f"datasets/{repo_id}/train/0008-*/*.mli.*.nc")
print(f"{len(all_files)} fichiers mli trouv√©s.")

# 2. Draw a random subset of the mli files (no seed is set, so the selection
#    differs from one run to the next).
# NOTE(review): 0.01 keeps 1 % of the files, whereas the notes above this
# cell mention 10 % -- confirm which fraction is intended.
sample_size = int(0.01 * len(all_files))
sampled_mli = random.sample(all_files, sample_size)

# 3. Build the final download list: every sampled mli file is immediately
#    followed by its mlo counterpart, whose path is derived by swapping the
#    ".mli." marker for ".mlo.".
final_list = [
    remote_path
    for mli_path in sampled_mli
    for remote_path in (mli_path, mli_path.replace(".mli.", ".mlo."))
]
total_files = len(final_list)
print(f"Pr√™t √† t√©l√©charger {total_files} fichiers.")

# 4. Download the files one by one. hf_hub_download expects a path relative
#    to the repository root, so the "datasets/<repo_id>/" prefix is removed.
repo_prefix = f"datasets/{repo_id}/"
for position, hf_path in enumerate(final_list, start=1):
    filename = hf_path.replace(repo_prefix, "")
    print(f"[{position}/{total_files}] -> {filename}")

    # NOTE(review): recent huggingface_hub releases are believed to deprecate
    # `local_dir_use_symlinks` -- confirm against the installed version.
    hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        repo_type="dataset",
        local_dir=local_dir,
        local_dir_use_symlinks=False
    )

### .zarr conversion

In [None]:
import xarray as xr
import os
import shutil
from tqdm.auto import tqdm

def get_paired_samples(path):
    """Collect the (mli, mlo) file pairs found one level below ``path``.

    Every sub-directory of ``path`` is scanned. File names are split on "."
    and read as ``<base>.<kind>.<timestamp>[.<rest>]``, where ``kind`` is
    ``mli`` or ``mlo``. Two files belong to the same pair when they share the
    same ``<base>.<timestamp>`` key.

    Entries of ``path`` that are not directories are ignored, and so are
    files whose name has fewer than three dot-separated components or whose
    kind is neither ``mli`` nor ``mlo`` (for example a stray ``README`` or
    ``.DS_Store``), instead of aborting the whole scan.

    Args:
        path: Directory containing one sub-directory per group of samples.

    Returns:
        A list of ``(mli_path, mlo_path)`` tuples sorted by pairing key.
        Files without a counterpart are left out; their number (and up to
        five example keys) is printed.
    """
    data_folders = sorted(os.listdir(path))
    print(f"Found {len(data_folders)} data folders.")

    mli_dict = {}
    mlo_dict = {}
    by_kind = {"mli": mli_dict, "mlo": mlo_dict}

    for dir_name in data_folders:
        dir_path = os.path.join(path, dir_name)
        if not os.path.isdir(dir_path):
            continue

        for fname in os.listdir(dir_path):
            parts = fname.split('.')
            if len(parts) < 3:
                # Not a "<base>.<kind>.<timestamp>..." name: skip it.
                continue

            base, kind, timestamp = parts[:3]
            target = by_kind.get(kind)
            if target is None:
                continue

            # Key shared by the mli and mlo files of the same sample.
            target[f"{base}.{timestamp}"] = os.path.join(dir_path, fname)

    # --- Build the pairs from keys present on both sides only ---
    common_keys = sorted(set(mli_dict) & set(mlo_dict))
    missing_mli = sorted(set(mlo_dict) - set(mli_dict))
    missing_mlo = sorted(set(mli_dict) - set(mlo_dict))

    print(f"✅ Paired samples: {len(common_keys)}")
    print(f"❌ Missing MLI: {len(missing_mli)}")
    print(f"❌ Missing MLO: {len(missing_mlo)}")

    if missing_mli:
        print("Example missing MLI:", missing_mli[:5])
    if missing_mlo:
        print("Example missing MLO:", missing_mlo[:5])

    return [(mli_dict[key], mlo_dict[key]) for key in common_keys]


def read_sample(file_path):
    """Open the file at ``file_path`` with xarray and return the dataset.

    The dataset is not closed here; that is left to the caller.
    """
    dataset = xr.open_dataset(file_path)
    return dataset

import gc

zarr_path = "/home/alexandre-tonon/test/test_data/ClimSim_low-res_validation.zarr"
train_root = "/home/alexandre-tonon/test/test_data/train"

# --- Crucial step: reset ---
# Any existing store at zarr_path is deleted so the run starts from scratch.
if os.path.exists(zarr_path):
    print("üßπ Nettoyage du Zarr corrompu...")
    shutil.rmtree(zarr_path)

samples = get_paired_samples(train_root)

chunk_size = 100
# Ceiling division: number of chunks needed to cover every sample.
chunk_number = (len(samples) + chunk_size - 1) // chunk_size


def _load_merged_pair(mli_path, mlo_path):
    """Return one in-memory dataset holding a single mli/mlo pair.

    Variables from the mli file get an ``in_`` prefix and those from the mlo
    file an ``out_`` prefix; the result carries a leading ``sample``
    dimension of length one.
    """
    # engine='netcdf4' is used here ('h5netcdf' being the alternative).
    with xr.open_dataset(mli_path, engine='netcdf4') as mli_ds, \
         xr.open_dataset(mlo_path, engine='netcdf4') as mlo_ds:

        # Load into RAM so the data no longer depends on the NetCDF files.
        inputs = mli_ds.rename(
            {name: f"in_{name}" for name in mli_ds.data_vars}
        ).load()
        outputs = mlo_ds.rename(
            {name: f"out_{name}" for name in mlo_ds.data_vars}
        ).load()

        # Merge both sides and add the 'sample' dimension by hand.
        combined = xr.merge([inputs, outputs], compat="override")
        return combined.expand_dims("sample")


for chunk_idx in tqdm(range(chunk_number), desc="Progression totale"):
    start = chunk_idx * chunk_size
    current_samples = samples[start:start + chunk_size]

    buffer = [
        _load_merged_pair(mli_path, mlo_path)
        for mli_path, mlo_path in tqdm(
            current_samples, desc=f"Batch {chunk_idx + 1}", leave=False
        )
    ]
    if not buffer:
        continue

    ds_batch = xr.concat(buffer, dim="sample")

    if chunk_idx == 0:
        # Very first chunk of this run: create the store (mode 'w').
        ds_batch.to_zarr(zarr_path, mode="w", consolidated=True)
    else:
        # Later chunks: append along 'sample' (mode 'a').
        ds_batch.to_zarr(
            zarr_path, mode="a", append_dim="sample", consolidated=True
        )

    # Free the batch before the next chunk is loaded.
    del ds_batch
    gc.collect()