## Notebook Overview

This notebook processes oceanographic data for historical and future climate scenarios (SSP585 and SSP370) within the 30S-50S latitude band, focusing on the depth interval from 300m to 1500m.

For each scenario and model, the following steps are performed:

1.  **Data Loading**: NetCDF files from the 'Medias_nc\\50S_20S' folder are opened using xarray.
2.  **Variable Conversion**:
    *   Depth ('lev') is converted to pressure ('p') using `gsw.p_from_z`.
    *   Preformed salinity ('so') is converted to absolute salinity using `gsw.SA_from_Sstar`.
3.  **Depth Selection**: The data is sliced to retain only the depth range between 300m and 1500m.
4.  **Volume Calculation**: A 'vol' variable is created, representing the volume for every 5m depth interval, based on the 'area' variable.
5.  **Interval Binning**:
    *   Temperature ('thetao') data is binned into 0.2°C intervals from -2°C to 20°C.
    *   Salinity ('so') data is binned into 0.02 intervals from 33 to 37.
6.  **Data Aggregation**: The total volume ('vol') is summed for each unique combination of binned temperature and salinity.
7.  **Mean Calculation**: The midpoint of each temperature and salinity bin is calculated and added as a new column.
8.  **DataFrame Restructuring**: The resulting aggregated data is formatted into a pandas DataFrame with 'thetao', 'so', and 'vol' columns.
9.  **Data Saving**: Each processed DataFrame is saved as a Parquet file in the 'Dataframes_50S_20S' folder, with a filename derived from the original NetCDF file.

In [None]:
import pygmt
import pandas as pd
import numpy as np
import xarray as xr
import matplotlib.pyplot as plt
import os
import gsw

### Creating a historical dataframe


In [None]:
import os

# Build a temperature/salinity volume census for every "historical" file:
# each NetCDF file in `pasta` is binned in (thetao, so) space and the summed
# volume per bin is written to a parquet file in `saves`.

# Folder where the files are located.  os.path.join keeps the path usable on
# systems whose separator is not a backslash.
pasta = os.path.join('Medias_nc', '50S_20S')

# Folder to save the files; created up front so to_parquet cannot fail on a
# missing directory.
saves = 'Dataframes_50S_20S'
os.makedirs(saves, exist_ok=True)

# Bin edges are the same for every file, so they are built once.
# linspace avoids the drift of repeatedly adding 0.2 / 0.02; rounding keeps
# the edges at exactly two / three decimal places.
lista_temp = np.round(np.linspace(-2, 20, 111), 2)   # -2.0, -1.8, ..., 20.0
lista_sal = np.round(np.linspace(33, 37, 201), 3)    # 33.00, 33.02, ..., 37.00

# List the files in the folder
arquivos = os.listdir(pasta)

# Loop through the files
for arquivo in arquivos:
    if "historical" not in arquivo:
        continue

    caminho_arquivo = os.path.join(pasta, arquivo)

    # The context manager closes the dataset even if a step below raises.
    with xr.open_dataset(caminho_arquivo) as ds:
        # Convert depth to pressure ('lev' is negated before the call;
        # presumably it is stored as positive-down depth -- verify).
        ds['p'] = gsw.p_from_z(-ds['lev'], ds['lat'])

        # Convert preformed salinity to absolute salinity.
        ds['so'] = gsw.SA_from_Sstar(ds['so'], ds['p'], ds['lon'], ds['lat'])

        # Select the depth interval (300 to 1500 on the 'lev' coordinate)
        ds_lev = ds.sel(lev=slice(300, 1500))

        # Helper variables are not needed in the table.  drop_vars replaces
        # the deprecated Dataset.drop; errors='ignore' covers files that do
        # not carry 'lev_bnds'.
        ds_lev = ds_lev.drop_vars(['lev_bnds', 'p'], errors='ignore')

        # Create the volume variable: 'area' times a 5 m layer thickness
        ds_lev["vol"] = 5 * ds_lev["area"]

        # Flatten to one row per grid cell, discarding rows with missing data
        df = ds_lev.to_dataframe().dropna().reset_index()

    # Assign every row to a temperature bin (0.2 wide) and a salinity bin
    # (0.02 wide).  Values outside the edge ranges get no bin and are
    # therefore left out of the grouped sum below.
    cut_temp = pd.cut(df['thetao'], bins=lista_temp, include_lowest=True)
    cut_salin = pd.cut(df['so'], bins=lista_sal, include_lowest=True)

    # Sum the volume for each (temperature bin, salinity bin) pair.
    # observed=False keeps every bin combination, including empty ones, and
    # makes the behaviour explicit instead of relying on the pandas default.
    agrupado = df.groupby([cut_temp, cut_salin], observed=False)['vol'].sum().reset_index()

    # Replace each interval by its midpoint.  Because of include_lowest=True
    # the first interval's left edge is widened slightly by pandas, so its
    # midpoint sits marginally below the nominal bin centre.
    df_final = pd.DataFrame({
        "thetao": [intervalo.mid for intervalo in agrupado["thetao"]],
        "so": [intervalo.mid for intervalo in agrupado["so"]],
        "vol": agrupado["vol"].to_numpy(),
    })

    # Use the first four dot-separated parts of the file name as output name
    nome_dataframe = '.'.join(os.path.basename(arquivo).split('.')[0:4])

    # Define the output path for the parquet file based on the DataFrame name
    caminho_saida = os.path.join(saves, f"{nome_dataframe}.parquet")

    # Save the DataFrame in parquet format
    df_final.to_parquet(caminho_saida)

    print('Processing Completed: {}'.format(nome_dataframe))

  ds_lev = ds_lev.drop('lev_bnds')
  ds_lev = ds_lev.drop('p')
  agrupado = df.groupby([cut_temp, cut_salin])['vol'].sum().reset_index()


Processamento Concluído: CMIP.CAMS.CAMS-CSM1-0.historical


  ds_lev = ds_lev.drop('p')
  agrupado = df.groupby([cut_temp, cut_salin])['vol'].sum().reset_index()


Processamento Concluído: CMIP.CNRM-CERFACS.CNRM-ESM2-1.historical


  ds_lev = ds_lev.drop('lev_bnds')
  ds_lev = ds_lev.drop('p')
  agrupado = df.groupby([cut_temp, cut_salin])['vol'].sum().reset_index()


Processamento Concluído: CMIP.IPSL.IPSL-CM6A-LR.historical


  ds_lev = ds_lev.drop('lev_bnds')
  ds_lev = ds_lev.drop('p')
  agrupado = df.groupby([cut_temp, cut_salin])['vol'].sum().reset_index()


Processamento Concluído: CMIP.MIROC.MIROC6.historical


  ds_lev = ds_lev.drop('lev_bnds')
  ds_lev = ds_lev.drop('p')
  agrupado = df.groupby([cut_temp, cut_salin])['vol'].sum().reset_index()


Processamento Concluído: CMIP.NCAR.CESM2.historical


  ds_lev = ds_lev.drop('lev_bnds')
  ds_lev = ds_lev.drop('p')
  agrupado = df.groupby([cut_temp, cut_salin])['vol'].sum().reset_index()


Processamento Concluído: CMIP.NOAA-GFDL.GFDL-ESM4.historical


## Creating an SSP585 dataframe


In [None]:
import os

# Build a temperature/salinity volume census for every "ssp585" file:
# each NetCDF file in `pasta` is binned in (thetao, so) space and the summed
# volume per bin is written to a parquet file in `saves`.

# Folder where the files are located.  os.path.join keeps the path usable on
# systems whose separator is not a backslash.
pasta = os.path.join('Medias_nc', '50S_20S')

# Folder to save the files; created up front so to_parquet cannot fail on a
# missing directory.
saves = 'Dataframes_50S_20S'
os.makedirs(saves, exist_ok=True)

# Bin edges are the same for every file, so they are built once.
# linspace avoids the drift of repeatedly adding 0.2 / 0.02; rounding keeps
# the edges at exactly two / three decimal places.
lista_temp = np.round(np.linspace(-2, 20, 111), 2)   # -2.0, -1.8, ..., 20.0
lista_sal = np.round(np.linspace(33, 37, 201), 3)    # 33.00, 33.02, ..., 37.00

# List the files in the folder
arquivos = os.listdir(pasta)

# Loop through the files
for arquivo in arquivos:
    if "ssp585" not in arquivo:
        continue

    caminho_arquivo = os.path.join(pasta, arquivo)

    # The context manager closes the dataset even if a step below raises.
    with xr.open_dataset(caminho_arquivo) as ds:
        # Convert depth to pressure ('lev' is negated before the call;
        # presumably it is stored as positive-down depth -- verify).
        ds['p'] = gsw.p_from_z(-ds['lev'], ds['lat'])

        # Convert preformed salinity to absolute salinity.
        ds['so'] = gsw.SA_from_Sstar(ds['so'], ds['p'], ds['lon'], ds['lat'])

        # Select the depth interval (300 to 1500 on the 'lev' coordinate)
        ds_lev = ds.sel(lev=slice(300, 1500))

        # Create the volume variable: 'area' times a 5 m layer thickness
        ds_lev["vol"] = 5 * ds_lev["area"]

        # Helper variables are not needed in the table.  drop_vars replaces
        # the deprecated Dataset.drop; errors='ignore' covers files that do
        # not carry 'lev_bnds'.
        ds_lev = ds_lev.drop_vars(['lev_bnds', 'p'], errors='ignore')

        # Flatten to one row per grid cell, discarding rows with missing data
        df = ds_lev.to_dataframe().dropna().reset_index()

    # Assign every row to a temperature bin (0.2 wide) and a salinity bin
    # (0.02 wide).  Values outside the edge ranges get no bin and are
    # therefore left out of the grouped sum below.
    cut_temp = pd.cut(df['thetao'], bins=lista_temp, include_lowest=True)
    cut_salin = pd.cut(df['so'], bins=lista_sal, include_lowest=True)

    # Sum the volume for each (temperature bin, salinity bin) pair.
    # observed=False keeps every bin combination, including empty ones, and
    # makes the behaviour explicit instead of relying on the pandas default.
    agrupado = df.groupby([cut_temp, cut_salin], observed=False)['vol'].sum().reset_index()

    # Replace each interval by its midpoint.  Because of include_lowest=True
    # the first interval's left edge is widened slightly by pandas, so its
    # midpoint sits marginally below the nominal bin centre.
    df_final = pd.DataFrame({
        "thetao": [intervalo.mid for intervalo in agrupado["thetao"]],
        "so": [intervalo.mid for intervalo in agrupado["so"]],
        "vol": agrupado["vol"].to_numpy(),
    })

    # Use the first four dot-separated parts of the file name as output name
    nome_dataframe = '.'.join(os.path.basename(arquivo).split('.')[0:4])

    # Define the output path for the parquet file based on the DataFrame name
    caminho_saida = os.path.join(saves, f"{nome_dataframe}.parquet")

    # Save the DataFrame in parquet format
    df_final.to_parquet(caminho_saida)

    print('Processing Completed: {}'.format(nome_dataframe))

  ds_lev = ds_lev.drop('lev_bnds')
  ds_lev = ds_lev.drop('p')
  agrupado = df.groupby([cut_temp, cut_salin])['vol'].sum().reset_index()


Processamento Concluído: ScenarioMIP.CAMS.CAMS-CSM1-0.ssp585


  ds_lev = ds_lev.drop('p')
  agrupado = df.groupby([cut_temp, cut_salin])['vol'].sum().reset_index()


Processamento Concluído: ScenarioMIP.CNRM-CERFACS.CNRM-ESM2-1.ssp585


  ds_lev = ds_lev.drop('lev_bnds')
  ds_lev = ds_lev.drop('p')
  agrupado = df.groupby([cut_temp, cut_salin])['vol'].sum().reset_index()


Processamento Concluído: ScenarioMIP.IPSL.IPSL-CM6A-LR.ssp585


  ds_lev = ds_lev.drop('lev_bnds')
  ds_lev = ds_lev.drop('p')
  agrupado = df.groupby([cut_temp, cut_salin])['vol'].sum().reset_index()


Processamento Concluído: ScenarioMIP.MIROC.MIROC6.ssp585


  ds_lev = ds_lev.drop('lev_bnds')
  ds_lev = ds_lev.drop('p')
  agrupado = df.groupby([cut_temp, cut_salin])['vol'].sum().reset_index()


Processamento Concluído: ScenarioMIP.NCAR.CESM2.ssp585


  ds_lev = ds_lev.drop('lev_bnds')
  ds_lev = ds_lev.drop('p')
  agrupado = df.groupby([cut_temp, cut_salin])['vol'].sum().reset_index()


Processamento Concluído: ScenarioMIP.NOAA-GFDL.GFDL-ESM4.ssp585


## Creating an SSP370 dataframe


In [None]:
import os

# Build a temperature/salinity volume census for every "ssp370" file:
# each NetCDF file in `pasta` is binned in (thetao, so) space and the summed
# volume per bin is written to a parquet file in `saves`.

# Folder where the files are located.  os.path.join keeps the path usable on
# systems whose separator is not a backslash.
pasta = os.path.join('Medias_nc', '50S_20S')

# Folder to save the files; created up front so to_parquet cannot fail on a
# missing directory.
saves = 'Dataframes_50S_20S'
os.makedirs(saves, exist_ok=True)

# Bin edges are the same for every file, so they are built once.
# linspace avoids the drift of repeatedly adding 0.2 / 0.02; rounding keeps
# the edges at exactly two / three decimal places.
lista_temp = np.round(np.linspace(-2, 20, 111), 2)   # -2.0, -1.8, ..., 20.0
lista_sal = np.round(np.linspace(33, 37, 201), 3)    # 33.00, 33.02, ..., 37.00

# List the files in the folder
arquivos = os.listdir(pasta)

# Loop through the files
for arquivo in arquivos:
    if "ssp370" not in arquivo:
        continue

    caminho_arquivo = os.path.join(pasta, arquivo)

    # The context manager closes the dataset even if a step below raises.
    with xr.open_dataset(caminho_arquivo) as ds:
        # Convert depth to pressure ('lev' is negated before the call;
        # presumably it is stored as positive-down depth -- verify).
        ds['p'] = gsw.p_from_z(-ds['lev'], ds['lat'])

        # Convert preformed salinity to absolute salinity.
        ds['so'] = gsw.SA_from_Sstar(ds['so'], ds['p'], ds['lon'], ds['lat'])

        # Select the depth interval (300 to 1500 on the 'lev' coordinate)
        ds_lev = ds.sel(lev=slice(300, 1500))

        # Create the volume variable: 'area' times a 5 m layer thickness
        ds_lev["vol"] = 5 * ds_lev["area"]

        # Helper variables are not needed in the table.  drop_vars replaces
        # the deprecated Dataset.drop; errors='ignore' covers files that do
        # not carry 'lev_bnds'.
        ds_lev = ds_lev.drop_vars(['lev_bnds', 'p'], errors='ignore')

        # Flatten to one row per grid cell, discarding rows with missing data
        df = ds_lev.to_dataframe().dropna().reset_index()

    # Assign every row to a temperature bin (0.2 wide) and a salinity bin
    # (0.02 wide).  Values outside the edge ranges get no bin and are
    # therefore left out of the grouped sum below.
    cut_temp = pd.cut(df['thetao'], bins=lista_temp, include_lowest=True)
    cut_salin = pd.cut(df['so'], bins=lista_sal, include_lowest=True)

    # Sum the volume for each (temperature bin, salinity bin) pair.
    # observed=False keeps every bin combination, including empty ones, and
    # makes the behaviour explicit instead of relying on the pandas default.
    agrupado = df.groupby([cut_temp, cut_salin], observed=False)['vol'].sum().reset_index()

    # Replace each interval by its midpoint.  Because of include_lowest=True
    # the first interval's left edge is widened slightly by pandas, so its
    # midpoint sits marginally below the nominal bin centre.
    df_final = pd.DataFrame({
        "thetao": [intervalo.mid for intervalo in agrupado["thetao"]],
        "so": [intervalo.mid for intervalo in agrupado["so"]],
        "vol": agrupado["vol"].to_numpy(),
    })

    # Use the first four dot-separated parts of the file name as output name
    nome_dataframe = '.'.join(os.path.basename(arquivo).split('.')[0:4])

    # Define the output path for the parquet file based on the DataFrame name
    caminho_saida = os.path.join(saves, f"{nome_dataframe}.parquet")

    # Save the DataFrame in parquet format
    df_final.to_parquet(caminho_saida)

    print('Processing Completed: {}'.format(nome_dataframe))

  ds_lev = ds_lev.drop('lev_bnds')
  ds_lev = ds_lev.drop('p')


Processamento Concluído: ScenarioMIP.CAMS.CAMS-CSM1-0.ssp370


  ds_lev = ds_lev.drop('p')


Processamento Concluído: ScenarioMIP.CNRM-CERFACS.CNRM-ESM2-1.ssp370


  ds_lev = ds_lev.drop('lev_bnds')
  ds_lev = ds_lev.drop('p')


Processamento Concluído: ScenarioMIP.IPSL.IPSL-CM6A-LR.ssp370


  ds_lev = ds_lev.drop('lev_bnds')
  ds_lev = ds_lev.drop('p')


Processamento Concluído: ScenarioMIP.MIROC.MIROC6.ssp370


  ds_lev = ds_lev.drop('lev_bnds')
  ds_lev = ds_lev.drop('p')


Processamento Concluído: ScenarioMIP.NCAR.CESM2.ssp370


  ds_lev = ds_lev.drop('lev_bnds')
  ds_lev = ds_lev.drop('p')


Processamento Concluído: ScenarioMIP.NOAA-GFDL.GFDL-ESM4.ssp370
