# Prepare ARGO

Converts ARGO netcdfs (previously downloaded through FTP) to Feather storing only the required variables and adding additional data to compare with models in the future. Additional data is CT and depth information computed using gsw package. ARGO data can be stored in daily or monthly files.

In [5]:
import warnings; warnings.simplefilter('ignore')
import os
import glob
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
import xarray as xr
import os
import gsw
import numpy as np
import matplotlib.pyplot as plt

root_folder = 'db'
outdir = 'argos_collocated'
coords = 'coords025.nc'
year = '2000'

def format_date(row):
    """
    Converts the Julian Date column to datetime
    """
    return row['JULD'].strftime("%Y%m%dT%H%M%S")


def compute_AS(row):
    return gsw.SA_from_SP(row['PSAL_ADJUSTED'], row['PRES_ADJUSTED'], row['LONGITUDE'], row['LATITUDE'])


def compute_CT(row):
    """
    Computes Conservative temperature from In-situ temperature, absolute salinity and pressure
    """
    return gsw.CT_from_t(row['ASAL'], row['TEMP_ADJUSTED'], row['PRES_ADJUSTED'])


def compute_Z(row):
    """
    Computes Height from sea pressure and latitude
    """
    return gsw.z_from_p(row['PRES_ADJUSTED'], row['LATITUDE'], 0, 0)


def process_folder(folder_path, variables, outdir, name_offset, coords):
    for filename in os.listdir(folder_path):
        process_argo_profile(filename,
                             variables, coords)


def process_argo_profile(filename, variables, coords):

    try:
        ds = xr.open_dataset(filename)
        coordinates = xr.open_dataset(coords)
        # subset a les variables que me interesan
        ds = ds[variables]
        df = ds.to_dataframe().reset_index()
        df = df.drop(['N_PROF', 'N_LEVELS'], axis=1)
        df = df.dropna(axis=0)
        
        if not df.empty and df['PRES_ADJUSTED'].max() > 700:
            # Convierto las variables guardadas como 'bytes' a string o a int
            df['PSAL_ADJUSTED_QC'] = df['PSAL_ADJUSTED_QC'].str.decode(
                'utf-8').astype(int)
            df['TEMP_ADJUSTED_QC'] = df['TEMP_ADJUSTED_QC'].str.decode(
                'utf-8').astype(int)
            df['PRES_ADJUSTED_QC'] = df['PRES_ADJUSTED_QC'].str.decode(
                'utf-8').astype(int)

            df['DIRECTION'] = df['DIRECTION'].str.decode('utf-8')

            df = df[df['PSAL_ADJUSTED_QC'] == 1]
            if (df.empty):
                return pd.DataFrame()
            
            df = df[df['TEMP_ADJUSTED_QC'] == 1]
            if (df.empty):
                return pd.DataFrame()
            
            df = df[df['PRES_ADJUSTED_QC'] == 1]
            if (df.empty):
                return pd.DataFrame()
            
            # Apply the custom formatting function to the 'JULD' column
            df['JULD'] = df.apply(format_date, axis=1)
            
            # Compute AS (absolute salinity)
            df['ASAL'] = df.apply(compute_AS, axis=1)
            # Compute CT
            df['CTEMP'] = df.apply(compute_CT, axis=1)
            # Compute Z
            df['HEIGHT'] = df.apply(compute_Z, axis=1)
            df = df.sort_values(by='HEIGHT')
            
            # Remove ghost measures, keep only one
            df['spacing_diff'] = df['HEIGHT'].diff()
            
            threshold = 1.5 # miden cada 2m, a veces mas. Cuando hay ghosts, lo hacen sobre 20cm
            
            mask = (df['spacing_diff'] >= threshold) & ~df['HEIGHT'].isna()
            df[df['spacing_diff'].isna()]['HEIGHT'] = 2
            filtered_df = df[df['spacing_diff'] >= threshold]
      
            df = filtered_df.drop(columns=['spacing_diff'])
            
            

            # latitude bining in the CMEMS grid
            lat = coordinates['latitude'].values
            digitized = np.digitize(df['LATITUDE'], lat, right=True)
            df['LATITUDE_MODEL'] = [lat[i - 1] if i > 0 else lat[0]
                              for i in digitized]
            df['ILAT_MODEL'] = digitized

            # longitude bining in the CMEMS grid
            lon = coordinates['longitude'].values
            digitized = np.digitize(df['LONGITUDE'], lon, right=True)
            df['LONGITUDE_MODEL'] = [lon[i - 1] if i > 0 else lon[0]
                               for i in digitized]
            df['ILON_MODEL'] = digitized

            # longitude bining in the CMEMS grid
            depth = coordinates['depth'].values * -1
            digitized = np.digitize(df['HEIGHT'], depth, right=True)
            df['HEIGHT_MODEL'] = [depth[i - 1] if i > 0 else depth[0] for i in digitized]
            df['IHEIGHT_MODEL'] = digitized

            
            file = os.path.splitext(filename)[0]
            df['ORIGINAL'] = file
            df = df.drop(['POSITION_QC', 'PSAL_ADJUSTED', 'PSAL_ADJUSTED_QC', 'TEMP_ADJUSTED',
                         'TEMP_ADJUSTED_QC', 'PRES_ADJUSTED', 'PRES_ADJUSTED_QC'], axis=1)
            return df
        else:
            return pd.DataFrame()

    except Exception as error:
        print("err: ", error, "in file ", filename)
        return pd.DataFrame()

# Function to process files for a given date
def process_files(outdir, date, files, coords):
    # List of variables that we want to store from ARGO files
    variables = [
        'LATITUDE',
        'LONGITUDE',
        'POSITION_QC',
        'PSAL_ADJUSTED',
        'PSAL_ADJUSTED_QC',
        'TEMP_ADJUSTED',
        'TEMP_ADJUSTED_QC',
        'PRES_ADJUSTED',
        'PRES_ADJUSTED_QC',
        'DIRECTION',
        'JULD'
    ]

    print(f"Processing files for date {date}:")
    datasets = []
    for file_path in files:
        dataset = process_argo_profile(file_path, variables, coords)
        if len(dataset.columns) > 0:
            datasets.append(dataset)

    if (len(datasets) > 0):
        daily_ds = pd.concat(datasets, axis=0).reset_index()
        daily_ds.to_feather(f'{outdir}/{date}.feather')


def get_unique_dates(root_folder):
    """
    Returns a list of all unique dates in the directory structure.
    
    Parameters:
        root_folder (str): The root folder path.
        
    Returns:
        list: A list of unique dates.
    """
    unique_dates = set()

    for provider_folder in os.listdir(root_folder):
        provider_path = os.path.join(root_folder, provider_folder)

        if os.path.isdir(provider_path):
            for date_folder in os.listdir(provider_path):
                unique_dates.add(date_folder)

    return list(unique_dates)

def get_files_for_date(root_folder, target_date):
    """
    Returns a list of all files for a given date in the directory structure.
    
    Parameters:
        root_folder (str): The root folder path.
        target_date (str): The date for which to retrieve files.
        
    Returns:
        list: A list of file paths for the given date.
    """
    files_for_date = []

    for provider_folder in os.listdir(root_folder):
        provider_path = os.path.join(root_folder, provider_folder)

        if os.path.isdir(provider_path):
            date_folder_path = os.path.join(provider_path, target_date)

            if os.path.isdir(date_folder_path):
                files_for_date.extend([os.path.join(date_folder_path, file) for file in os.listdir(date_folder_path)])

    return files_for_date

dates = get_unique_dates(root_folder)
dates_year = [s for s in dates if s.startswith(year)]
for date in dates_year:
    process_files(outdir, date, get_files_for_date(root_folder, date), coords) 


Processing files for date 20000326:
Processing files for date 20000214:
Processing files for date 20000811:
Processing files for date 20000517:
Processing files for date 20000125:
Processing files for date 20000713:
Processing files for date 20000803:
Processing files for date 20000405:
Processing files for date 20001010:
Processing files for date 20000406:
Processing files for date 20001030:
Processing files for date 20000217:
Processing files for date 20000615:
Processing files for date 20001210:
Processing files for date 20000510:
Processing files for date 20000427:
Processing files for date 20001111:
Processing files for date 20000126:
Processing files for date 20000127:
Processing files for date 20000216:
Processing files for date 20000528:
Processing files for date 20001204:
Processing files for date 20000722:
Processing files for date 20001114:
Processing files for date 20000613:
Processing files for date 20000402:
Processing files for date 20000120:
Processing files for date 20

Processing files for date 20000603:
Processing files for date 20000223:
Processing files for date 20000709:
Processing files for date 20001121:
Processing files for date 20000410:
Processing files for date 20000715:
Processing files for date 20000802:
Processing files for date 20000903:
Processing files for date 20000526:
Processing files for date 20000305:
Processing files for date 20000208:
Processing files for date 20000516:
Processing files for date 20000129:
Processing files for date 20000222:
Processing files for date 20000218:
Processing files for date 20001017:
Processing files for date 20001028:
Processing files for date 20000130:
Processing files for date 20001027:
Processing files for date 20000908:
Processing files for date 20000325:
Processing files for date 20000302:
Processing files for date 20000311:
Processing files for date 20000901:
Processing files for date 20000706:
Processing files for date 20000115:
Processing files for date 20001005:
Processing files for date 20