In [1]:
import os 
import xarray as xr
import pandas as pd 
from datetime import datetime, timedelta

# Loads one day of data
def add_surface_data(data_path, data_surface_path, date, time_window):    

    sst_file_path = os.path.join(data_surface_path, f'{date}_sst.nc')
    sst = xr.open_dataset(sst_file_path)

    sss_file_path = os.path.join(data_surface_path, f'{date}_sss.nc')
    sss = xr.open_dataset(sss_file_path)

    uo_file_path = os.path.join(data_surface_path, f'{date}_uo.nc')
    uo = xr.open_dataset(uo_file_path)

    vo_file_path = os.path.join(data_surface_path, f'{date}_vo.nc')
    vo = xr.open_dataset(vo_file_path)

    mlt_file_path = os.path.join(data_surface_path, f'{date}_mlt.nc')
    mlt = xr.open_dataset(mlt_file_path)

    ssh_file_path = os.path.join(data_surface_path, f'{date}_ssh.nc')
    ssh = xr.open_dataset(ssh_file_path)
    
    parsed_date = datetime.strptime(date, "%Y%m%d")

    start_date = parsed_date - timedelta(days=time_window)
    end_date = parsed_date + timedelta(days=time_window)

    # load all files in +-window
    dates = pd.date_range(start=start_date, end=end_date).strftime("%Y%m%d")

    dfs = []
    
    for date in dates:
        file_path = f"{data_path}/{date}.feather"
        if os.path.exists(file_path):
            df = pd.read_feather(file_path)
            dfs.append(df)
        else:
            print(f"File not found for date: {date}")

    if dfs:
        profiles = pd.concat(dfs, ignore_index=True)
    else:
        return None
        
    # get unique lat-lon pairs to boost efficiency
    unique_coordinates_df = profiles[['LATITUDE', 'LONGITUDE']].drop_duplicates(subset=['LATITUDE', 'LONGITUDE'])
    
    # i get the sss and assign it. Same with sst
    unique_coordinates_df['SSS'] = unique_coordinates_df.apply(
        lambda row: sss.so_oras.sel(latitude=row['LATITUDE'], longitude=row['LONGITUDE'], method='nearest').values,
        axis=1
    )
    
    unique_coordinates_df['SST'] = unique_coordinates_df.apply(
        lambda row: sst.thetao_oras.sel(latitude=row['LATITUDE'], longitude=row['LONGITUDE'], method='nearest').values,
        axis=1
    )

    unique_coordinates_df['VO'] = unique_coordinates_df.apply(
        lambda row: vo.vo_oras.sel(latitude=row['LATITUDE'], longitude=row['LONGITUDE'], method='nearest').values,
        axis=1
    )  

    unique_coordinates_df['UO'] = unique_coordinates_df.apply(
        lambda row: uo.uo_oras.sel(latitude=row['LATITUDE'], longitude=row['LONGITUDE'], method='nearest').values,
        axis=1
    )  
    
    unique_coordinates_df['SSH'] = unique_coordinates_df.apply(
        lambda row: ssh.zos_oras.sel(latitude=row['LATITUDE'], longitude=row['LONGITUDE'], method='nearest').values,
        axis=1
    )  

    unique_coordinates_df['MLD'] = unique_coordinates_df.apply(
        lambda row: mlt.mlotst_oras.sel(latitude=row['LATITUDE'], longitude=row['LONGITUDE'], method='nearest').values,
        axis=1
    )  
    
    unique_coordinates_df['SSS'] = unique_coordinates_df['SSS'].astype(float)
    unique_coordinates_df['SST'] = unique_coordinates_df['SST'].astype(float)    
    unique_coordinates_df['VO'] = unique_coordinates_df['VO'].astype(float)
    unique_coordinates_df['UO'] = unique_coordinates_df['UO'].astype(float)    
    unique_coordinates_df['SSH'] = unique_coordinates_df['SSH'].astype(float)
    unique_coordinates_df['MLD'] = unique_coordinates_df['MLD'].astype(float)
    
    profiles = pd.merge(profiles, unique_coordinates_df, on=['LATITUDE', 'LONGITUDE'], how='left')
    profiles.drop(columns=['index', 'PRES'], inplace=True)
    profiles['DATE'] = pd.to_datetime(date)
    profiles['DATE'] = profiles['DATE'].astype(int) // 10**9
    
    return profiles

input_data = '/storage/model_collocated_simple'
input_data_surface = '/storage/model_surface'
output_data = '/storage/model_collocated_10d'
start_date = '20221015'
end_date = '20221231'

# Get all feather files (named by date)
files = [f for f in os.listdir(input_data) if f.endswith('.feather') and os.path.isfile(os.path.join(input_data, f))]

file_names = [os.path.splitext(f)[0] for f in files]
for file in file_names:
    if ((file >= start_date) and (file <= end_date)):
        print(file)
        ds = add_surface_data(input_data, input_data_surface, file, 5) 
        ds.to_feather(f'{output_data}/{file}.feather')

20221015
20221016
20221017
20221018
20221019
20221020
20221021
20221022
20221023
20221024
20221025
20221026
20221027
20221028
20221029
20221030
20221031
20221101
20221102
20221103
20221104
20221105
20221106
20221107
20221108
20221109
20221110
20221111
20221112
20221113
20221114
20221115
20221116
20221117
20221118
20221119
20221120
20221121
20221122
20221123
20221124
20221125
20221126
20221127
20221128
20221129
20221130
20221201
20221202
20221203
20221204
20221205
20221206
20221207
20221208
20221209
20221210
20221211
20221212
20221213
20221214
20221215
20221216
20221217
20221218
20221219
20221220
20221221
20221222
20221223
20221224
20221225
20221226
20221227
File not found for date: 20230101
20221228
File not found for date: 20230101
File not found for date: 20230102
20221229
File not found for date: 20230101
File not found for date: 20230102
File not found for date: 20230103
20221230
File not found for date: 20230101
File not found for date: 20230102
File not found for date: 20230103
F

In [3]:
import pandas as pd

pd.read_feather('/storage/model_collocated_10d/2010/20100101.feather')

Unnamed: 0,LATITUDE,LONGITUDE,HEIGHT,HEIGHT_MODEL,ASAL,CTEMP,SSS,SST,VO,UO,SSH,MLD,DATE,DENSITY
0,17.510,129.752,-1386.093031,-1265.861450,34.755959,2.587137,34.674809,25.869041,-0.191176,-0.201085,0.779364,64.492256,1262736000,1033.504338
1,17.510,129.752,-1188.516056,-1151.991211,34.743303,2.816395,34.674809,25.869041,-0.191176,-0.201085,0.779364,64.492256,1262736000,1032.941071
2,17.510,129.752,-1090.053401,-1045.854248,34.725934,3.068060,34.674809,25.869041,-0.191176,-0.201085,0.779364,64.492256,1262736000,1032.406851
3,17.510,129.752,-1040.211256,-947.447876,34.700409,3.355790,34.674809,25.869041,-0.191176,-0.201085,0.779364,64.492256,1262736000,1031.897356
4,17.510,129.752,-941.480907,-856.678955,34.660553,3.710914,34.674809,25.869041,-0.191176,-0.201085,0.779364,64.492256,1262736000,1031.403416
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62121,62.427,-32.797,-24.064951,-22.757616,35.171554,5.454432,35.005314,5.495102,-0.027868,0.020059,-0.854847,433.800446,1262736000,1027.731818
62122,62.427,-32.797,-18.717429,-16.525322,35.171599,5.455117,35.005314,5.495102,-0.027868,0.020059,-0.854847,433.800446,1262736000,1027.702759
62123,62.427,-32.797,-13.864928,-11.773680,35.171644,5.455679,35.005314,5.495102,-0.027868,0.020059,-0.854847,433.800446,1262736000,1027.680605
62124,62.427,-32.797,-9.012310,-8.092519,35.171689,5.456238,35.005314,5.495102,-0.027868,0.020059,-0.854847,433.800446,1262736000,1027.663435
