## Import packages

In [None]:
!pip install rioxarray "xarray[complete]" 

In [2]:
# Libraries import
import xarray as xr
import s3fs
import pandas as pd
import re
import os
import multiprocessing
import fsspec
import boto3
import numpy as np
import rioxarray

## General functions (they can be executed for every variable)

In [9]:
def upload_files(files, s3, file_name, bucket_name, s3_file_key):
    """
    Uploads a NetCDF dataset to an AWS S3 bucket.

    This function saves a given xarray Dataset to a local file in NetCDF format,
    uploads the file to the specified S3 bucket, and then removes the local file.
    The local file is removed even when writing or uploading fails, so a failed
    run does not leave temporary files behind.

    Parameters:
    files (xarray.Dataset): The dataset to be saved and uploaded.
    s3 (boto3.session.Session.client): The S3 client used for uploading files.
    file_name (str): The path where the NetCDF file will be saved locally.
    bucket_name (str): The name of the S3 bucket to upload the file to.
    s3_file_key (str): The S3 path and name under which the file will be stored.

    Returns:
    None

    Raises:
    Any exception raised while writing the file or uploading it is propagated
    to the caller after the local file has been cleaned up.
    """
    try:
        files.to_netcdf(file_name, engine='h5netcdf', format='NETCDF4')
        s3.upload_file(file_name, bucket_name, s3_file_key)
    finally:
        # The write may have failed before creating the file, so only remove
        # it when it is actually there.
        if os.path.exists(file_name):
            os.remove(file_name)
    print(f'File {file_name} uploaded')

In [10]:
def file_exists_in_bucket(bucket_name, folder_path, file_name, s3_client):
    """
    Checks if a file exists in an S3 bucket.

    This function attempts to retrieve metadata from the specified file
    in the given S3 bucket and returns True if the request succeeds.
    Any error response from S3 (missing key, access denied, ...) is reported
    as False. Other exceptions (interrupts, programming errors, connection
    problems) are not swallowed and propagate to the caller.

    Parameters:
    bucket_name (str): The name of the S3 bucket.
    folder_path (str): The path to the folder within the S3 bucket.
    file_name (str): The name of the file to check for existence (including extension).
    s3_client (boto3.session.Session.client): The S3 client used for accessing the bucket.

    Returns:
    bool: True if the file exists, False otherwise.
    """
    # S3 keys always use "/" as separator, whatever the local OS is.
    import posixpath

    object_key = posixpath.join(folder_path, file_name)
    try:
        s3_client.head_object(Bucket=bucket_name, Key=object_key)
    except s3_client.exceptions.ClientError:
        # S3 answered with an error status: treat the object as not present.
        return False
    return True

In [None]:
def resample_data(nc, res, bbox, file_name, s3_file_key, bucket_name, s3_client,
                  resampling=1):
    """
    Resamples and crops a NetCDF dataset and uploads the processed file to an S3 bucket.

    This function resamples a NetCDF to a specified resolution (bilinear
    interpolation by default), crops it to a bounding box, renames the spatial
    coordinates to 'lon'/'lat' and uploads the processed data to an S3 bucket.

    Parameters:
    nc (xarray.Dataset): The NetCDF to be processed. Its CRS is written in place.
    res (float): The target resolution.
    bbox (tuple): The bounding box to crop the data to as (min_lon, min_lat, max_lon, max_lat).
    file_name (str): The name of the NetCDF file.
    s3_file_key (str): The S3 key under which the file will be stored in the bucket (path and name).
    bucket_name (str): The name of the S3 bucket to upload the file to.
    s3_client (boto3.session.Session.client): The S3 client used for uploading files.
    resampling (int): Resampling method, as a rasterio ``Resampling`` value
        (0 = nearest, 1 = bilinear, 2 = cubic). Defaults to 1 (bilinear).

    Returns:
    None
    """

    # Assign CRS to the data (modifies the dataset passed by the caller)
    nc.rio.write_crs("EPSG:4326", inplace=True)

    # Resample the data to the new resolution. The previous hard-coded value
    # (2) selected cubic convolution although bilinear was intended; pass
    # resampling=2 explicitly to reproduce the old output.
    resampled_data = nc.rio.reproject("EPSG:4326", resolution=res, resampling=resampling)

    # Crop the resampled data to the bounding box
    cropped_data = resampled_data.rio.clip_box(*bbox)

    # Rename coordinates
    cropped_data = cropped_data.rename({'x': 'lon', 'y': 'lat'})

    # Upload the processed data to the bucket
    upload_files(files=cropped_data,
                 s3=s3_client,
                 file_name=file_name,
                 bucket_name=bucket_name,
                 s3_file_key=s3_file_key)

## To process temperatures

In [12]:
def convert_kelvin_to_celsius(nc_files, bucket_name, folder_path, s3):
    """
    This function opens a NetCDF temperature file from an S3 bucket, converts the temperature data
    from Kelvin to Celsius (excluding NaN values), does the resample and clip process and uploads the
    processed data to another S3 location. Files already present at the destination are skipped.

    AWS credentials for the upload client are resolved through the default boto3
    credential chain (environment variables, shared credentials file, instance
    role, ...); they must not be written in the source code.

    Parameters:
    nc_files (str): The S3 path to the NetCDF file (bucket and key, without the "s3://" prefix).
    bucket_name (str): The name of the S3 bucket to upload the file to.
    folder_path (str): The folder path within the S3 bucket to upload the data.
    s3 (s3fs.S3FileSystem): The S3 file system object used for accessing the S3 bucket.

    Returns:
    None
    """
    # S3 keys always use "/" as separator, whatever the local OS is.
    import posixpath

    # Set up the paths
    s3_path = "s3://" + nc_files
    file_name = posixpath.basename(nc_files)
    s3_file_key = posixpath.join(folder_path, file_name)

    # Initialize S3 boto client; credentials come from the default chain.
    # NOTE(review): the key pair that used to be embedded here has been
    # exposed in source control and should be rotated.
    s3_client = boto3.client('s3', region_name='us-west-2')

    # Check if the file already exists in the bucket
    if file_exists_in_bucket(bucket_name, folder_path, file_name, s3_client):
        print(f"File {nc_files} already exists in the bucket. Skipping process.")
        return None

    print(f"File {nc_files} do not exists in the bucket. Starting process.")

    # Both the remote file and the dataset are closed on exit, even on error.
    with s3.open(s3_path) as fileObj, \
            xr.open_dataset(fileObj, engine='h5netcdf') as nc:
        # Convert from Kelvin to Celsius only where values are not NaN
        temp = nc['Temperature_Air_2m_Min_24h']
        values = temp.values
        non_na_mask = ~np.isnan(values)
        values[non_na_mask] = values[non_na_mask] - 273.15
        # Write the array back explicitly so the dataset is guaranteed to
        # hold the converted data (attributes and encoding are untouched).
        # NOTE(review): any 'units' attribute on the variable is left as is —
        # confirm whether it should be updated to Celsius.
        temp.values = values

        # Regrid and clip the nc to a certain resolution and bounding box
        resample_data(nc=nc,
                      res=0.05,
                      bbox=(-180, -50, 180, 50),
                      file_name=file_name,
                      s3_file_key=s3_file_key,
                      bucket_name=bucket_name,
                      s3_client=s3_client)
    return None
        

In [None]:
###### Test ###############

# Credentials are resolved through the default AWS credential chain
# (environment variables, shared credentials file, instance role, ...).
# NOTE(review): the key pair that used to be embedded here has been exposed
# in source control and should be rotated.
s3 = s3fs.S3FileSystem()

# Temperature folder in the bucket
folder = "s3://climate-action-datalake/zone=landing/source=agera5/variable=2mTemperature/"

# List files
files = s3.ls(folder)

# Define the pattern to match (daily minimum temperature files)
pattern = re.compile(r'.*-Min-.*\.nc')

# Find files with the matching pattern
matching_files = [file for file in files if pattern.match(file)]

bucket_name = 'climate-action-datalake'
folder_path = 'zone=temporal/source=agera5/variable=airTemperatureMin/'

# One tuple per file, in the positional order expected by
# convert_kelvin_to_celsius(nc_files, bucket_name, folder_path, s3).
# Resolution and bounding box are fixed inside that function.
arguments = [(nc_files, bucket_name, folder_path, s3) for nc_files in matching_files]

with multiprocessing.Pool(20) as pool:
    results = pool.starmap(convert_kelvin_to_celsius, arguments)
print('Finished')