## Instalar librerias

In [None]:
%%capture
!pip install luigi pyyaml netCDF4 h5netcdf scipy boto3 s3fs numpy tqdm

In [None]:
%%capture
!pip install dask[complete] zarr xarray[io]

In [None]:
%%capture
!pip install --upgrade s3fs

In [None]:
%%capture
!pip install "xarray[complete]"==2023.8.0 s3fs --user

In [None]:
%%capture
!pip install --upgrade dask

## Cargar librerias

In [32]:
import xarray as xr
import pandas as pd
import numpy as np
import s3fs
import os
import re
from collections import defaultdict
import zarr
from numcodecs import blosc
from glob import glob
import dask
from dask import delayed, compute
from tqdm import tqdm
from dask.distributed import Client
from dask.diagnostics import ProgressBar
from datetime import datetime

## Funciones

In [33]:
def kelvin_a_celsius(temp_k):
    return xr.where(np.isnan(temp_k), temp_k, temp_k - 273.15)

def extraer_fecha_y_tipo(nombre_archivo):
    match = re.search(r'Temperature-Air-2m-(Max|Mean|Min)-24h.*_(\d{8})_', nombre_archivo)
    if match:
        tipo, fecha = match.groups()
        return fecha, tipo
    return '00000000', ''

In [34]:
#dataframe de archivos

def archivos_mes(directorio):
    archivos = [f for f in os.listdir(directorio) if f.endswith('.nc')]
    data = []

    for archivo in archivos:
        fecha, tipo = extraer_fecha_y_tipo(archivo)
        if fecha != '00000000':
            data.append({
                'año': int(fecha[:4]),
                'mes': int(fecha[4:6]),
                'fecha': pd.to_datetime(fecha, format='%Y%m%d'),
                'tipo': tipo,
                'path': os.path.join(directorio, archivo)
            })

    df = pd.DataFrame(data)
    df.sort_values(by=['fecha', 'tipo'], inplace=True)
    df['tipo'] = pd.Categorical(df['tipo'], categories=['Max', 'Mean', 'Min'], ordered=True)
    df.sort_values(by=['fecha', 'tipo'], inplace=True)
    df.reset_index(drop=True, inplace=True)
    
    return df

In [35]:
directorio = '/home/ec2-user/SageMaker/datalake/data/agera/temperature/landing'
df = archivos_mes(directorio)
df.head()

Unnamed: 0,año,mes,fecha,tipo,path
0,1980,1,1980-01-01,Max,/home/ec2-user/SageMaker/datalake/data/agera/t...
1,1980,1,1980-01-01,Mean,/home/ec2-user/SageMaker/datalake/data/agera/t...
2,1980,1,1980-01-01,Min,/home/ec2-user/SageMaker/datalake/data/agera/t...
3,1980,1,1980-01-02,Max,/home/ec2-user/SageMaker/datalake/data/agera/t...
4,1980,1,1980-01-02,Mean,/home/ec2-user/SageMaker/datalake/data/agera/t...


### Prueba para ejecutar mes

In [5]:
año = 1980
mes = 1

In [6]:
start_date = pd.Timestamp(f"{año}-{mes:02d}-01")
end_date = start_date + pd.offsets.MonthEnd(0)

In [7]:
df_archivos = df
df_mes = df_archivos[(df_archivos['fecha'] >= start_date) & (df_archivos['fecha'] <= end_date)]
#df_mes

In [8]:
datasets = xr.open_mfdataset(df_mes['path'].tolist())
datasets

Unnamed: 0,Array,Chunk
Bytes,766.72 MiB,24.73 MiB
Shape,"(31, 1801, 3600)","(1, 1801, 3600)"
Dask graph,31 chunks in 63 graph layers,31 chunks in 63 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 766.72 MiB 24.73 MiB Shape (31, 1801, 3600) (1, 1801, 3600) Dask graph 31 chunks in 63 graph layers Data type float32 numpy.ndarray",3600  1801  31,

Unnamed: 0,Array,Chunk
Bytes,766.72 MiB,24.73 MiB
Shape,"(31, 1801, 3600)","(1, 1801, 3600)"
Dask graph,31 chunks in 63 graph layers,31 chunks in 63 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,766.72 MiB,24.73 MiB
Shape,"(31, 1801, 3600)","(1, 1801, 3600)"
Dask graph,31 chunks in 63 graph layers,31 chunks in 63 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 766.72 MiB 24.73 MiB Shape (31, 1801, 3600) (1, 1801, 3600) Dask graph 31 chunks in 63 graph layers Data type float32 numpy.ndarray",3600  1801  31,

Unnamed: 0,Array,Chunk
Bytes,766.72 MiB,24.73 MiB
Shape,"(31, 1801, 3600)","(1, 1801, 3600)"
Dask graph,31 chunks in 63 graph layers,31 chunks in 63 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,766.72 MiB,24.73 MiB
Shape,"(31, 1801, 3600)","(1, 1801, 3600)"
Dask graph,31 chunks in 63 graph layers,31 chunks in 63 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 766.72 MiB 24.73 MiB Shape (31, 1801, 3600) (1, 1801, 3600) Dask graph 31 chunks in 63 graph layers Data type float32 numpy.ndarray",3600  1801  31,

Unnamed: 0,Array,Chunk
Bytes,766.72 MiB,24.73 MiB
Shape,"(31, 1801, 3600)","(1, 1801, 3600)"
Dask graph,31 chunks in 63 graph layers,31 chunks in 63 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [9]:
# Convertir de Kelvin a Celsius
for var_name in datasets.data_vars:
    datasets[var_name] = kelvin_a_celsius(datasets[var_name])
    datasets[var_name].attrs['units'] = 'C'

In [10]:
bucket_salida = 'climate-action-datalake'
ruta_salida_s3 = 'zone=raw/source=agera5-v1-1/variable=temperature.zarr'

s3 = s3fs.S3FileSystem(anon=False)
zarr_store = s3fs.S3Map(root=f's3://{bucket_salida}/{ruta_salida_s3}', s3=s3)

In [11]:
# Configurar compresión y encoding
blosc.set_nthreads(8)
compressor = zarr.Blosc(cname='lz4', clevel=1, shuffle=False)
encoding = {vname: {'compressor': compressor, 'chunks': (1, 1801, 3600)} for vname in datasets.data_vars}

In [12]:
with tqdm(total=100, desc=f"Procesando {año}-{mes:02d}") as pbar:
    try:
        datasets.to_zarr(zarr_store, mode='a', append_dim='time', consolidated=True)
    except ValueError as e:
        # Si el archivo no existe, crear un nuevo archivo
        print(f"Error al abrir el archivo en modo append: {e}. Creando archivo nuevo.")
        datasets.to_zarr(zarr_store, mode='w', consolidated=True, encoding=encoding)
    pbar.update(100)

Procesando 1980-01:   0%|          | 0/100 [00:00<?, ?it/s]

Error al abrir el archivo en modo append: append_dim='time' does not match any existing dataset dimensions {}. Creando archivo nuevo.


Procesando 1980-01: 100%|██████████| 100/100 [00:13<00:00,  7.17it/s]


In [13]:
# Conexión a S3
s3 = s3fs.S3FileSystem(anon=False)
# Ubicación del archivo Zarr en S3
bucket_name = 'climate-action-datalake'
zarr_path = 'zone=raw/source=agera5-v1-1/variable=temperature.zarr'
s3_url_1 = f's3://{bucket_name}/{zarr_path}'
# Abrir el archivo Zarr
ds1 = xr.open_zarr(s3fs.S3Map(s3_url_1, s3=s3))
ds1

Unnamed: 0,Array,Chunk
Bytes,766.72 MiB,24.73 MiB
Shape,"(31, 1801, 3600)","(1, 1801, 3600)"
Dask graph,31 chunks in 2 graph layers,31 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 766.72 MiB 24.73 MiB Shape (31, 1801, 3600) (1, 1801, 3600) Dask graph 31 chunks in 2 graph layers Data type float32 numpy.ndarray",3600  1801  31,

Unnamed: 0,Array,Chunk
Bytes,766.72 MiB,24.73 MiB
Shape,"(31, 1801, 3600)","(1, 1801, 3600)"
Dask graph,31 chunks in 2 graph layers,31 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,766.72 MiB,24.73 MiB
Shape,"(31, 1801, 3600)","(1, 1801, 3600)"
Dask graph,31 chunks in 2 graph layers,31 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 766.72 MiB 24.73 MiB Shape (31, 1801, 3600) (1, 1801, 3600) Dask graph 31 chunks in 2 graph layers Data type float32 numpy.ndarray",3600  1801  31,

Unnamed: 0,Array,Chunk
Bytes,766.72 MiB,24.73 MiB
Shape,"(31, 1801, 3600)","(1, 1801, 3600)"
Dask graph,31 chunks in 2 graph layers,31 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,766.72 MiB,24.73 MiB
Shape,"(31, 1801, 3600)","(1, 1801, 3600)"
Dask graph,31 chunks in 2 graph layers,31 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 766.72 MiB 24.73 MiB Shape (31, 1801, 3600) (1, 1801, 3600) Dask graph 31 chunks in 2 graph layers Data type float32 numpy.ndarray",3600  1801  31,

Unnamed: 0,Array,Chunk
Bytes,766.72 MiB,24.73 MiB
Shape,"(31, 1801, 3600)","(1, 1801, 3600)"
Dask graph,31 chunks in 2 graph layers,31 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [14]:
año = 1980
mes = 2
start_date = pd.Timestamp(f"{año}-{mes:02d}-01")
end_date = start_date + pd.offsets.MonthEnd(0)
df_archivos = df
df_mes = df_archivos[(df_archivos['fecha'] >= start_date) & (df_archivos['fecha'] <= end_date)]
datasets = xr.open_mfdataset(df_mes['path'].tolist())
# Convertir de Kelvin a Celsius
for var_name in datasets.data_vars:
    datasets[var_name] = kelvin_a_celsius(datasets[var_name])
    datasets[var_name].attrs['units'] = 'C'
#datasets

Unnamed: 0,Array,Chunk
Bytes,717.26 MiB,24.73 MiB
Shape,"(29, 1801, 3600)","(1, 1801, 3600)"
Dask graph,29 chunks in 62 graph layers,29 chunks in 62 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 717.26 MiB 24.73 MiB Shape (29, 1801, 3600) (1, 1801, 3600) Dask graph 29 chunks in 62 graph layers Data type float32 numpy.ndarray",3600  1801  29,

Unnamed: 0,Array,Chunk
Bytes,717.26 MiB,24.73 MiB
Shape,"(29, 1801, 3600)","(1, 1801, 3600)"
Dask graph,29 chunks in 62 graph layers,29 chunks in 62 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,717.26 MiB,24.73 MiB
Shape,"(29, 1801, 3600)","(1, 1801, 3600)"
Dask graph,29 chunks in 62 graph layers,29 chunks in 62 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 717.26 MiB 24.73 MiB Shape (29, 1801, 3600) (1, 1801, 3600) Dask graph 29 chunks in 62 graph layers Data type float32 numpy.ndarray",3600  1801  29,

Unnamed: 0,Array,Chunk
Bytes,717.26 MiB,24.73 MiB
Shape,"(29, 1801, 3600)","(1, 1801, 3600)"
Dask graph,29 chunks in 62 graph layers,29 chunks in 62 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,717.26 MiB,24.73 MiB
Shape,"(29, 1801, 3600)","(1, 1801, 3600)"
Dask graph,29 chunks in 62 graph layers,29 chunks in 62 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 717.26 MiB 24.73 MiB Shape (29, 1801, 3600) (1, 1801, 3600) Dask graph 29 chunks in 62 graph layers Data type float32 numpy.ndarray",3600  1801  29,

Unnamed: 0,Array,Chunk
Bytes,717.26 MiB,24.73 MiB
Shape,"(29, 1801, 3600)","(1, 1801, 3600)"
Dask graph,29 chunks in 62 graph layers,29 chunks in 62 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [15]:
with tqdm(total=100, desc=f"Procesando {año}-{mes:02d}") as pbar:
    try:
        datasets.to_zarr(zarr_store, mode='a', append_dim='time', consolidated=True)
    except ValueError as e:
        # Si el archivo no existe, crear un nuevo archivo
        print(f"Error al abrir el archivo en modo append: {e}. Creando archivo nuevo.")
        datasets.to_zarr(zarr_store, mode='w', consolidated=True, encoding=encoding)
    pbar.update(100)

Procesando 1980-02: 100%|██████████| 100/100 [00:11<00:00,  8.71it/s]


In [16]:
# Conexión a S3
s3 = s3fs.S3FileSystem(anon=False)
# Ubicación del archivo Zarr en S3
bucket_name = 'climate-action-datalake'
zarr_path = 'zone=raw/source=agera5-v1-1/variable=temperature.zarr'
s3_url_1 = f's3://{bucket_name}/{zarr_path}'
# Abrir el archivo Zarr
ds1 = xr.open_zarr(s3fs.S3Map(s3_url_1, s3=s3))
ds1

Unnamed: 0,Array,Chunk
Bytes,1.45 GiB,24.73 MiB
Shape,"(60, 1801, 3600)","(1, 1801, 3600)"
Dask graph,60 chunks in 2 graph layers,60 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 1.45 GiB 24.73 MiB Shape (60, 1801, 3600) (1, 1801, 3600) Dask graph 60 chunks in 2 graph layers Data type float32 numpy.ndarray",3600  1801  60,

Unnamed: 0,Array,Chunk
Bytes,1.45 GiB,24.73 MiB
Shape,"(60, 1801, 3600)","(1, 1801, 3600)"
Dask graph,60 chunks in 2 graph layers,60 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.45 GiB,24.73 MiB
Shape,"(60, 1801, 3600)","(1, 1801, 3600)"
Dask graph,60 chunks in 2 graph layers,60 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 1.45 GiB 24.73 MiB Shape (60, 1801, 3600) (1, 1801, 3600) Dask graph 60 chunks in 2 graph layers Data type float32 numpy.ndarray",3600  1801  60,

Unnamed: 0,Array,Chunk
Bytes,1.45 GiB,24.73 MiB
Shape,"(60, 1801, 3600)","(1, 1801, 3600)"
Dask graph,60 chunks in 2 graph layers,60 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.45 GiB,24.73 MiB
Shape,"(60, 1801, 3600)","(1, 1801, 3600)"
Dask graph,60 chunks in 2 graph layers,60 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 1.45 GiB 24.73 MiB Shape (60, 1801, 3600) (1, 1801, 3600) Dask graph 60 chunks in 2 graph layers Data type float32 numpy.ndarray",3600  1801  60,

Unnamed: 0,Array,Chunk
Bytes,1.45 GiB,24.73 MiB
Shape,"(60, 1801, 3600)","(1, 1801, 3600)"
Dask graph,60 chunks in 2 graph layers,60 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [17]:
año = 1980
mes = 3
start_date = pd.Timestamp(f"{año}-{mes:02d}-01")
end_date = start_date + pd.offsets.MonthEnd(0)
df_archivos = df
df_mes = df_archivos[(df_archivos['fecha'] >= start_date) & (df_archivos['fecha'] <= end_date)]
datasets = xr.open_mfdataset(df_mes['path'].tolist())
# Convertir de Kelvin a Celsius
for var_name in datasets.data_vars:
    datasets[var_name] = kelvin_a_celsius(datasets[var_name])
    datasets[var_name].attrs['units'] = 'C'

In [18]:
with tqdm(total=100, desc=f"Procesando {año}-{mes:02d}") as pbar:
    try:
        datasets.to_zarr(zarr_store, mode='a', append_dim='time', consolidated=True)
    except ValueError as e:
        # Si el archivo no existe, crear un nuevo archivo
        print(f"Error al abrir el archivo en modo append: {e}. Creando archivo nuevo.")
        datasets.to_zarr(zarr_store, mode='w', consolidated=True, encoding=encoding)
    pbar.update(100)

Procesando 1980-03: 100%|██████████| 100/100 [00:26<00:00,  3.83it/s]


In [19]:
# Conexión a S3
s3 = s3fs.S3FileSystem(anon=False)
# Ubicación del archivo Zarr en S3
bucket_name = 'climate-action-datalake'
zarr_path = 'zone=raw/source=agera5-v1-1/variable=temperature.zarr'
s3_url_1 = f's3://{bucket_name}/{zarr_path}'
# Abrir el archivo Zarr
ds1 = xr.open_zarr(s3fs.S3Map(s3_url_1, s3=s3))
ds1

Unnamed: 0,Array,Chunk
Bytes,2.20 GiB,24.73 MiB
Shape,"(91, 1801, 3600)","(1, 1801, 3600)"
Dask graph,91 chunks in 2 graph layers,91 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 2.20 GiB 24.73 MiB Shape (91, 1801, 3600) (1, 1801, 3600) Dask graph 91 chunks in 2 graph layers Data type float32 numpy.ndarray",3600  1801  91,

Unnamed: 0,Array,Chunk
Bytes,2.20 GiB,24.73 MiB
Shape,"(91, 1801, 3600)","(1, 1801, 3600)"
Dask graph,91 chunks in 2 graph layers,91 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,2.20 GiB,24.73 MiB
Shape,"(91, 1801, 3600)","(1, 1801, 3600)"
Dask graph,91 chunks in 2 graph layers,91 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 2.20 GiB 24.73 MiB Shape (91, 1801, 3600) (1, 1801, 3600) Dask graph 91 chunks in 2 graph layers Data type float32 numpy.ndarray",3600  1801  91,

Unnamed: 0,Array,Chunk
Bytes,2.20 GiB,24.73 MiB
Shape,"(91, 1801, 3600)","(1, 1801, 3600)"
Dask graph,91 chunks in 2 graph layers,91 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,2.20 GiB,24.73 MiB
Shape,"(91, 1801, 3600)","(1, 1801, 3600)"
Dask graph,91 chunks in 2 graph layers,91 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 2.20 GiB 24.73 MiB Shape (91, 1801, 3600) (1, 1801, 3600) Dask graph 91 chunks in 2 graph layers Data type float32 numpy.ndarray",3600  1801  91,

Unnamed: 0,Array,Chunk
Bytes,2.20 GiB,24.73 MiB
Shape,"(91, 1801, 3600)","(1, 1801, 3600)"
Dask graph,91 chunks in 2 graph layers,91 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


### Función ejecución mes

In [36]:
def consolidate_nc(df, año, mes):
    start_date = pd.Timestamp(f"{año}-{mes:02d}-01")
    end_date = start_date + pd.offsets.MonthEnd(0)
    df_archivos = df
    df_mes = df_archivos[(df_archivos['fecha'] >= start_date) & (df_archivos['fecha'] <= end_date)]
    datasets = xr.open_mfdataset(df_mes['path'].tolist())
    # Convertir de Kelvin a Celsius
    for var_name in datasets.data_vars:
        datasets[var_name] = kelvin_a_celsius(datasets[var_name])
        datasets[var_name].attrs['units'] = 'C'
    return datasets

In [25]:
año = 1980
mes = 4
ds = consolidate_nc(df, año, mes)
ds

Unnamed: 0,Array,Chunk
Bytes,741.99 MiB,24.73 MiB
Shape,"(30, 1801, 3600)","(1, 1801, 3600)"
Dask graph,30 chunks in 64 graph layers,30 chunks in 64 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 741.99 MiB 24.73 MiB Shape (30, 1801, 3600) (1, 1801, 3600) Dask graph 30 chunks in 64 graph layers Data type float32 numpy.ndarray",3600  1801  30,

Unnamed: 0,Array,Chunk
Bytes,741.99 MiB,24.73 MiB
Shape,"(30, 1801, 3600)","(1, 1801, 3600)"
Dask graph,30 chunks in 64 graph layers,30 chunks in 64 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,741.99 MiB,24.73 MiB
Shape,"(30, 1801, 3600)","(1, 1801, 3600)"
Dask graph,30 chunks in 64 graph layers,30 chunks in 64 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 741.99 MiB 24.73 MiB Shape (30, 1801, 3600) (1, 1801, 3600) Dask graph 30 chunks in 64 graph layers Data type float32 numpy.ndarray",3600  1801  30,

Unnamed: 0,Array,Chunk
Bytes,741.99 MiB,24.73 MiB
Shape,"(30, 1801, 3600)","(1, 1801, 3600)"
Dask graph,30 chunks in 64 graph layers,30 chunks in 64 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,741.99 MiB,24.73 MiB
Shape,"(30, 1801, 3600)","(1, 1801, 3600)"
Dask graph,30 chunks in 64 graph layers,30 chunks in 64 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 741.99 MiB 24.73 MiB Shape (30, 1801, 3600) (1, 1801, 3600) Dask graph 30 chunks in 64 graph layers Data type float32 numpy.ndarray",3600  1801  30,

Unnamed: 0,Array,Chunk
Bytes,741.99 MiB,24.73 MiB
Shape,"(30, 1801, 3600)","(1, 1801, 3600)"
Dask graph,30 chunks in 64 graph layers,30 chunks in 64 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [37]:
def append_zarr(datasets, año, mes, bucket_salida, ruta_salida_s3):
    s3 = s3fs.S3FileSystem(anon=False)
    zarr_store = s3fs.S3Map(root=f's3://{bucket_salida}/{ruta_salida_s3}', s3=s3)

    # Configurar compresión y encoding
    blosc.set_nthreads(8)
    compressor = zarr.Blosc(cname='lz4', clevel=1, shuffle=False)
    encoding = {vname: {'compressor': compressor, 'chunks': (1, 1801, 3600)} for vname in datasets.data_vars}
    with tqdm(total=100, desc=f"Procesando {año}-{mes:02d}") as pbar:
        try:
            datasets.to_zarr(zarr_store, mode='a', append_dim='time', consolidated=True)
        except ValueError as e:
            # Si el archivo no existe, crear un nuevo archivo
            print(f"Error al abrir el archivo en modo append: {e}. Creando archivo nuevo.")
            datasets.to_zarr(zarr_store, mode='w', consolidated=True, encoding=encoding)
        pbar.update(100)
    print(f"Datos del mes {año}-{mes:02d} procesados y guardados en S3")

In [28]:
datasets = ds
año = 1980
mes = 4
bucket_name = 'climate-action-datalake'
ruta_salida_s3 = 'zone=raw/source=agera5-v1-1/variable=temperature.zarr'
append_zarr(datasets, año, mes, bucket_salida, ruta_salida_s3)

Procesando 1980-04: 100%|██████████| 100/100 [00:24<00:00,  4.03it/s]

Datos del mes 1980-04 procesados y guardados en S3





In [29]:
# Conexión a S3
s3 = s3fs.S3FileSystem(anon=False)
# Ubicación del archivo Zarr en S3
bucket_name = 'climate-action-datalake'
zarr_path = 'zone=raw/source=agera5-v1-1/variable=temperature.zarr'
s3_url_1 = f's3://{bucket_name}/{zarr_path}'
# Abrir el archivo Zarr
ds1 = xr.open_zarr(s3fs.S3Map(s3_url_1, s3=s3))
ds1

Unnamed: 0,Array,Chunk
Bytes,2.92 GiB,24.73 MiB
Shape,"(121, 1801, 3600)","(1, 1801, 3600)"
Dask graph,121 chunks in 2 graph layers,121 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 2.92 GiB 24.73 MiB Shape (121, 1801, 3600) (1, 1801, 3600) Dask graph 121 chunks in 2 graph layers Data type float32 numpy.ndarray",3600  1801  121,

Unnamed: 0,Array,Chunk
Bytes,2.92 GiB,24.73 MiB
Shape,"(121, 1801, 3600)","(1, 1801, 3600)"
Dask graph,121 chunks in 2 graph layers,121 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,2.92 GiB,24.73 MiB
Shape,"(121, 1801, 3600)","(1, 1801, 3600)"
Dask graph,121 chunks in 2 graph layers,121 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 2.92 GiB 24.73 MiB Shape (121, 1801, 3600) (1, 1801, 3600) Dask graph 121 chunks in 2 graph layers Data type float32 numpy.ndarray",3600  1801  121,

Unnamed: 0,Array,Chunk
Bytes,2.92 GiB,24.73 MiB
Shape,"(121, 1801, 3600)","(1, 1801, 3600)"
Dask graph,121 chunks in 2 graph layers,121 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,2.92 GiB,24.73 MiB
Shape,"(121, 1801, 3600)","(1, 1801, 3600)"
Dask graph,121 chunks in 2 graph layers,121 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 2.92 GiB 24.73 MiB Shape (121, 1801, 3600) (1, 1801, 3600) Dask graph 121 chunks in 2 graph layers Data type float32 numpy.ndarray",3600  1801  121,

Unnamed: 0,Array,Chunk
Bytes,2.92 GiB,24.73 MiB
Shape,"(121, 1801, 3600)","(1, 1801, 3600)"
Dask graph,121 chunks in 2 graph layers,121 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [30]:
año = 1980
mes = 5
datasets = consolidate_nc(df, año, mes)
bucket_name = 'climate-action-datalake'
ruta_salida_s3 = 'zone=raw/source=agera5-v1-1/variable=temperature.zarr'
#append_zarr(datasets, año, mes, bucket_salida, ruta_salida_s3)

Procesando 1980-05: 100%|██████████| 100/100 [00:26<00:00,  3.79it/s]

Datos del mes 1980-05 procesados y guardados en S3





In [31]:
# Conexión a S3
s3 = s3fs.S3FileSystem(anon=False)
# Ubicación del archivo Zarr en S3
bucket_name = 'climate-action-datalake'
zarr_path = 'zone=raw/source=agera5-v1-1/variable=temperature.zarr'
s3_url_1 = f's3://{bucket_name}/{zarr_path}'
# Abrir el archivo Zarr
#ds1 = xr.open_zarr(s3fs.S3Map(s3_url_1, s3=s3))
#ds1

Unnamed: 0,Array,Chunk
Bytes,3.67 GiB,24.73 MiB
Shape,"(152, 1801, 3600)","(1, 1801, 3600)"
Dask graph,152 chunks in 2 graph layers,152 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 3.67 GiB 24.73 MiB Shape (152, 1801, 3600) (1, 1801, 3600) Dask graph 152 chunks in 2 graph layers Data type float32 numpy.ndarray",3600  1801  152,

Unnamed: 0,Array,Chunk
Bytes,3.67 GiB,24.73 MiB
Shape,"(152, 1801, 3600)","(1, 1801, 3600)"
Dask graph,152 chunks in 2 graph layers,152 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,3.67 GiB,24.73 MiB
Shape,"(152, 1801, 3600)","(1, 1801, 3600)"
Dask graph,152 chunks in 2 graph layers,152 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 3.67 GiB 24.73 MiB Shape (152, 1801, 3600) (1, 1801, 3600) Dask graph 152 chunks in 2 graph layers Data type float32 numpy.ndarray",3600  1801  152,

Unnamed: 0,Array,Chunk
Bytes,3.67 GiB,24.73 MiB
Shape,"(152, 1801, 3600)","(1, 1801, 3600)"
Dask graph,152 chunks in 2 graph layers,152 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,3.67 GiB,24.73 MiB
Shape,"(152, 1801, 3600)","(1, 1801, 3600)"
Dask graph,152 chunks in 2 graph layers,152 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 3.67 GiB 24.73 MiB Shape (152, 1801, 3600) (1, 1801, 3600) Dask graph 152 chunks in 2 graph layers Data type float32 numpy.ndarray",3600  1801  152,

Unnamed: 0,Array,Chunk
Bytes,3.67 GiB,24.73 MiB
Shape,"(152, 1801, 3600)","(1, 1801, 3600)"
Dask graph,152 chunks in 2 graph layers,152 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [None]:
año = 1980
mes = 5
datasets = consolidate_nc(df, año, mes)
bucket_name = 'climate-action-datalake'
ruta_salida_s3 = 'zone=raw/source=agera5-v1-1/variable=temperature.zarr'
append_zarr(datasets, año, mes, bucket_salida, ruta_salida_s3)

### Dask

In [None]:
def procesar_rango_fechas(fecha_inicio, fecha_fin, ruta_local_entrada, bucket_salida, ruta_salida_s3):
    archivos_por_dia = ordenar_y_agrupar_archivos(ruta_local_entrada)
    
    tareas = []
    fecha_actual = fecha_inicio
    
    while fecha_actual <= fecha_fin:
        fecha_str = fecha_actual.strftime('%Y%m%d')
        if fecha_str in archivos_por_dia:
            tarea = dask.delayed(procesar_rango_fechas_individual)(fecha_str, archivos_por_dia[fecha_str], bucket_salida, ruta_salida_s3)
            tareas.append(tarea)
        else:
            print(f"No se encontraron archivos para la fecha: {fecha_str}")
        
        fecha_actual += pd.Timedelta(days=1)
    
    # Iniciar un cliente Dask
    client = Client()

    # Ejecutar todas las tareas en paralelo
    with ProgressBar():
        results = dask.compute(*tareas, scheduler='processes')
    
    # Consolidar metadatos al final del procesamiento
    s3 = s3fs.S3FileSystem(anon=False)
    zarr_store = s3fs.S3Map(root=f's3://{bucket_salida}/{ruta_salida_s3}', s3=s3)
    zarr.consolidate_metadata(zarr_store)
    print("Procesamiento completado y metadatos consolidados.")
    
    # Cerrar el cliente Dask
    client.close()

## Ejecución

In [None]:
ruta_local_entrada = '/home/ec2-user/SageMaker/datalake/data/agera/temperature/landing'
bucket_salida = 'climate-action-datalake'
ruta_salida_s3 = 'zone=raw/source=agera5-v1-1/variable=TemperatureAir_v2.zarr'
fecha_inicio = pd.Timestamp('1980-01-01')
fecha_fin = pd.Timestamp('1980-01-31')

procesar_rango_fechas(fecha_inicio, fecha_fin, ruta_local_entrada, bucket_salida, ruta_salida_s3)

## Validar zarr

In [None]:
# Conexión a S3
s3 = s3fs.S3FileSystem(anon=False)
# Ubicación del archivo Zarr en S3
bucket_name = 'climate-action-datalake'
zarr_path = 'zone=raw/source=agera5-v1-1/variable=TemperatureAir_v2.zarr'
s3_url_1 = f's3://{bucket_name}/{zarr_path}'
# Abrir el archivo Zarr
ds1 = xr.open_zarr(s3fs.S3Map(s3_url_1, s3=s3))
ds1