In [12]:
import os
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xarray as xr

from src.utils import (
    NetCDFToZarrConverter
)
# Relative path to the raw-data folder (note the trailing slash).
raw_data_path: str = "../data/raw/"

In [6]:
# Chunk sizing: only the time dimension is scaled,
# aiming for roughly 20 MB per chunk.

converter = NetCDFToZarrConverter()

# Inspect every monthly SEVIRI HRV file matching the pattern and keep the
# full analysis result; later cells read its "recommended_chunks" entry.
chunk_analysis = converter.analyze_netcdf_files(
    "../data/raw/seviri/hrv_lr2*.nc"
)

# Last expression of the cell: display the suggested chunk layout.
chunk_analysis["recommended_chunks"]

2026-02-01 19:37:57,178 - INFO - Analyzing NetCDF files matching: ../data/raw/seviri/hrv_lr2*.nc


2026-02-01 19:37:57,228 - INFO - Found 56 files
  dims = dict(ds.dims)
2026-02-01 19:38:04,896 - INFO - Calculated chunk size: 19.97 MB | chunks={'time': 261, 'y': 92, 'x': 109}
2026-02-01 19:38:04,898 - INFO - Analysis complete: {'num_files': 56, 'files': ['hrv_lr200401.nc', 'hrv_lr200411.nc', 'hrv_lr200412.nc', 'hrv_lr200501.nc', 'hrv_lr200511.nc', 'hrv_lr200512.nc', 'hrv_lr200601.nc', 'hrv_lr200611.nc', 'hrv_lr200612.nc', 'hrv_lr200701.nc', 'hrv_lr200711.nc', 'hrv_lr200712.nc', 'hrv_lr200801.nc', 'hrv_lr200811.nc', 'hrv_lr200812.nc', 'hrv_lr200901.nc', 'hrv_lr200911.nc', 'hrv_lr200912.nc', 'hrv_lr201001.nc', 'hrv_lr201011.nc', 'hrv_lr201012.nc', 'hrv_lr201101.nc', 'hrv_lr201111.nc', 'hrv_lr201112.nc', 'hrv_lr201201.nc', 'hrv_lr201202.nc', 'hrv_lr201211.nc', 'hrv_lr201212.nc', 'hrv_lr201301.nc', 'hrv_lr201302.nc', 'hrv_lr201311.nc', 'hrv_lr201312.nc', 'hrv_lr201401.nc', 'hrv_lr201402.nc', 'hrv_lr201411.nc', 'hrv_lr201412.nc', 'hrv_lr201501.nc', 'hrv_lr201502.nc', 'hrv_lr201511.nc', '

{'time': 261, 'y': 92, 'x': 109}

In [7]:
# Glob pattern for the SEVIRI HRV NetCDF inputs; the conversion cell reuses it.
# (`Path` comes from the notebook's top import cell.)
input_file_pattern = "../data/raw/seviri/hrv_lr2*.nc"

# Sorted so that files[0] / files[-1] are the lexicographically first / last
# matches; later cells slice the year out of exactly those two names.
files = sorted(Path().glob(input_file_pattern))

In [8]:
"""
- Issue:Segmentation fault (core dumped)
- Reason: parallel=True, chunks='auto' on xr.open_mfdataset
- Solution: set parallel false and chunks to None
"""

converter.convert_multiple_files_to_single_zarr(
    file_pattern = input_file_pattern,
    output_path = f"../data/processed/seviri/hrv_lr{files[0].as_posix()[-9:-5]}_{files[-1].as_posix()[-9:-5]}.zarr",
    custom_chunks = chunk_analysis["recommended_chunks"]
)

2026-02-01 19:38:04,933 - INFO - Converting multiple files to single Zarr: ../data/raw/seviri/hrv_lr2*.nc
2026-02-01 19:38:04,936 - INFO - Found 56 files to convert
2026-02-01 19:38:06,209 - INFO - Writing consolidated Zarr with chunks: {'time': 261, 'y': 92, 'x': 109}


3.955028712749481 GB dataset
Frozen({'time': (261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261, 261,

2026-02-01 19:38:46,669 - INFO - Successfully created consolidated Zarr at ../data/processed/seviri/hrv_lr2004_2019.zarr
2026-02-01 19:38:46,671 - INFO - 
2026-02-01 19:38:46,673 - INFO - Zarr Store Information: ../data/processed/seviri/hrv_lr2004_2019.zarr
2026-02-01 19:38:46,675 - INFO - /
 ├── hrv (52928, 92, 109) float64
 ├── lat (92, 109) float64
 ├── lon (92, 109) float64
 └── time (52928,) int64



In [14]:

from src.data.datasets import CloudHoleDataset

# Year strings sliced from the first and last input file paths
# ([-9:-5] is the four characters before the trailing "MM.nc").
first_year = files[0].as_posix()[-9:-5]
last_year = files[-1].as_posix()[-9:-5]

processed_data_path = "../data/processed"

# Build the dataset from the label CSV and the consolidated Zarr store,
# covering every year from first_year to last_year inclusive.
dataset = CloudHoleDataset(
    labels=f"{processed_data_path}/julia_labels.csv",
    data_dir=f"{processed_data_path}/seviri/hrv_lr{first_year}_{last_year}.zarr",
    years=range(int(first_year), 1 + int(last_year)),
)

# Read the statistics exposed by the dataset object and report them.
mean = dataset.mean
std = dataset.std

print(f"Mean: {mean}, Std: {std}")

Mean: 12.788631439208984, Std: 7.1760993003845215
