In [1]:
 # !pip install rioxarray "xarray[complete]"

In [2]:
import glob
import os
import re
import shutil

import boto3
import pandas as pd
import rioxarray as rx
import xarray as xr

In [3]:

continentalBox=[
    ("Africa", -20.0, -35.0, 55.0, 40.0),
    ("Antarctica", -180.0, -90.0, 180.0, -60.0),
    ("Asia", 25.0, -12.0, 180.0, 75.0),
    ("Europe", -25.0, 35.0, 45.0, 75.0),
    ("North_America", -180.0, 10.0, -35.0, 75.0),
    ("Oceania", 100.0, -50.0, 180.0, -10.0),
    ("South_America", -90.0, -60.0, -30.0, 15.0)
]

continentalDf = pd.DataFrame(
    continentalBox,
    columns=["continent", "xmin", "ymin", "xmax", "ymax"]
)

bucketName = "climate-action-datalake"
s3LandingPath = "zone=landing/source=agera5/variable=2mTemperature/"
zone = "raw"
source = "agera5"
variable = "2mTemperature"


In [4]:

def listS3Files(bucketName: str, path: str):
    objectArr = []
    s3 = boto3.resource("s3")
    s3Bucket = s3.Bucket(bucketName)
    for blob in s3Bucket.objects.filter(Prefix=path):
        objectArr.append(blob)
    return objectArr

def convertKelvinCelsius(k):
    c = k - 273.15
    return c

def filesToProcess(s3Files: list, year: int=None):
    filesToProcess = pd.DataFrame(columns=['s3_path', 'file_name'])
    s3Path = []
    fileName = []
    if year:
        for blob in s3Files:
            if str(year) in blob.key:
                file = blob.key
                s3Path.append(file)
                fileName.append(file.split('/')[3].split('.')[0])
    else:
        for blob in s3Files:
            file = blob.key
            s3Path.append(file)
            fileName.append(file.split('/')[3].split('.')[0])
    filesToProcess['s3_path'] = s3Path
    filesToProcess['file_name'] = fileName
    return filesToProcess

def downloadData(bucketName: str, s3Path: str, outputName: str, s3):
    s3.download_file(bucketName, s3Path, outputName)

def cutGeoTiff(continentsDf, geoTiff, fileName: str, outputPath: str):
    for index, row in continentsDf.iterrows():
        continentName = row['continent']
        xmin = row['xmin']
        ymin = row['ymin']
        xmax = row['xmax']
        ymax = row['ymax']
        _geoTiff = geoTiff.rio.clip_box(xmin, ymin, xmax, ymax)
        _geoTiff.rio.to_raster(f"{outputPath}/{fileName}{continentName}.tiff")

def uploadData(s3_client, bucket, filename, uploadPath):
    s3_client.upload_file(Filename=filename, Bucket=bucket, Key=uploadPath)

In [5]:
s3Files =  listS3Files(bucketName = bucketName
                       ,path = s3LandingPath)

In [None]:
for _year in range(1986,2024):
    print(_year)
    df = filesToProcess(s3Files=s3Files,
                        year=_year)
    s3 = boto3.client("s3")
    outputPath = "data"
    originaExtension = "nc"
    for index, row in df.iterrows():
        _file_name = row['file_name']
        _s3_path = row['s3_path']
        downloadFile = outputPath+"/"+_file_name+"."+originaExtension
        downloadData(bucketName=bucketName
                , s3Path=_s3_path
                , outputName=downloadFile
                , s3=s3
                    )
        geoFile= xr.open_dataset(downloadFile)
        geoFileC = geoFile.apply(convertKelvinCelsius)
        variable = list(geoFileC.keys())[0]
        #print(variable)
        variableValues = geoFileC[variable]
        variableValues = variableValues.rio.set_spatial_dims(x_dim='lon', y_dim='lat')
        variableValues.rio.write_crs("epsg:4326", inplace=True)
        variableValues.rio.to_raster(r"data/dev.tiff")
        rxDf = rx.open_rasterio("data/dev.tiff")
        cutGeoTiff(continentsDf = continentalDf
                       , geoTiff =  rxDf
                       ,fileName = _file_name
                       ,outputPath = outputPath
                      )
        uploadFilesList = glob.glob(outputPath+"/*")
        uploadFilesList = [f  for f in uploadFilesList  if variable.replace('_','-') in f and ".tiff" in f ]
        for up in uploadFilesList:
            filename = up.split("/")[1]
            continent = up.split("v1")[1].split(".")[0]
            uploadLandingPath = f"zone={zone}/source={source}/continent={continent}/variable={variable}/{filename}"
            # s3_client, bucket,filename, uploadPath
            uploadData(s3_client=s3
                       ,filename=up
                       ,bucket=bucketName
                       ,uploadPath=uploadLandingPath)
            #print(filename,continent)
        shutil.rmtree("data")
        os.makedirs("data")

1986
1987
1988


In [None]:
geoFile= xr.open_dataset("data/Temperature-Air-2m-Min-24h_C3S-glob-agric_AgERA5_19791029_final-v1.nc")
geoFileC = geoFile.apply(convertKelvinCelsious)
variable = list(geoFileC.keys())[0]

In [None]:
# Display the current value of `variable` (the previous cell sets it to the
# name of the first data variable in the opened dataset).
variable

In [None]:
df[df.file_name.str.contains("Temperature-Air-2m-Min")]