In [2]:
import numpy as np
import argopy
from argopy import DataFetcher
import pandas as pd
import xarray as xr
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
import cartopy.feature as cfeature
import gsw
from datetime import datetime
from cartopy.geodesic import Geodesic
from scipy import linalg
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel, ConstantKernel as C
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import haversine_distances
from matplotlib.patches import Circle

import ArgoHeatContentDataCollater as ahdc
import ArgoGPR as ag

In [3]:
# 1. LOAD DATA
# Read the Argo observations for the chosen time window and lat/lon box.
# NOTE: nc_dir is machine-specific -- point it at your own data folder.
df_argo = ahdc.load_argo_data_advanced(
    nc_dir='/home/avik2007/ArgoEBUSAnalysis/ArgoData/',
    lat_bounds=[30, 40],
    lon_bounds=[-126, -119],
    start_date='2018-01-01',
    end_date='2018-03-01',  # a few months are enough for testing
)


📂 FOUND LOCAL DATASET: /home/avik2007/ArgoEBUSAnalysis/ArgoData/argo_2018-01-01_to_2018-03-01_lat30_40_lon-126_-119_z0_200.pkl
   Loading processed DataFrame...
   ✅ Loaded 2127 observations from disk.


In [4]:
# Calendar-month numbers used to split the loaded observations by month.
jan_index = 1
feb_index = 2

# Month of every observation, computed once in vectorised form.
# Element-wise access via df_argo['date'][i] is a *label* lookup, so looping
# over range(len(df_argo)) raises KeyError (or silently picks the wrong rows)
# whenever the index is not exactly 0..n-1, and it is slow on large frames.
obs_month = pd.to_datetime(df_argo['date']).dt.month

# Positional boolean masks (NumPy arrays): row selection below does not depend
# on the DataFrame's index labels.
jan_mask = (obs_month == jan_index).to_numpy()
df_argo_january = df_argo[jan_mask]
feb_mask = (obs_month == feb_index).to_numpy()
df_argo_february = df_argo[feb_mask]

# TODO (ask Gemini): add a convergence warning. If an optimized length scale ends up near its upper limit, the fit has likely diverged (the length scale is not actually constrained by the data).

In [5]:
# 2. BASELINE CHECK ("LOOO", run here as K-fold CV) -- JANUARY
# Checks that the model works mathematically.
# k_fold_data_percent=2 tests 2% of the data per fold.
# NOTE: in practice this needs a loop that checks whether the resulting STD is
# too large or too small and then adjusts; the autotune is pretty good, but
# that may not always be true.
z_looo = ag.generalized_cross_validation(
    df_argo_january,
    method='KFold',
    k_fold_data_percent=2,
    feature_cols=['lat', 'lon'],  # 2D spatial only for now
)



🚀 STARTING GLOBAL VALIDATION: KFold
   🤖 AutoTuning: Running 5 iterations on 100 points (5.0%) to estimate correlation lengths/times...
      (Constraint: Length Scale capped at 7.12 standard deviations)




      ✅ LEARNED HYPERPARAMETERS (Avg of 5 runs):
         Noise (Uncertainty): ±2.369 °C
         Correlation Lengths:
           - lat: 2.074°  (~230 km)
           - lon: 4.748°  (~429 km at 35.5N)
   ⚡ Strategy: 50-Fold CV (Testing 2% per fold)




   Processed fold 26...

KeyboardInterrupt: 

In [14]:
# 2. BASELINE CHECK ("LOOO", run here as K-fold CV) -- FEBRUARY (2nd iteration)
# Checks that the model works mathematically.
# k_fold_data_percent=5 tests 5% of the data per fold.
# NOTE: in practice this needs a loop that checks whether the resulting STD is
# too large or too small and then adjusts; the autotune is pretty good, but
# that may not always be true.
z_looo = ag.generalized_cross_validation(
    df_argo_february,
    method='KFold',
    k_fold_data_percent=5,
    feature_cols=['lat', 'lon'],  # 2D spatial only for now
)


🚀 STARTING GLOBAL VALIDATION: KFold
   🤖 AutoTuning: Running 5 iterations on 100 points (5.0%) to estimate correlation lengths/times...
      (Constraint: Length Scale capped at 7.22 standard deviations)




      ✅ LEARNED HYPERPARAMETERS (Avg of 5 runs):
         Noise (Uncertainty): ±2.089 °C
         Correlation Lengths:
           - lat: 3.440°  (~382 km)
           - lon: 6.198°  (~557 km at 35.9N)
   ⚡ Strategy: 20-Fold CV (Testing 5% per fold)
   Processed fold 19...
✅ RESULTS (KFold):
   RMSE:                2.108 °C
   Rel. Error (RMSRE):  0.1900 (dimensionless)
   Mean Z:              0.001
   Std Z:               1.006 (Ideal: 1.0)


In [17]:


# 3. RUN LOFO (the "hard" test) -- JANUARY
# Checks that the model works SCIENTIFICALLY.
z_lofo = ag.generalized_cross_validation(
    df_argo_january,
    method='LOFO',
    feature_cols=['lat', 'lon'],
)


🚀 STARTING GLOBAL VALIDATION: LOFO
   🤖 AutoTuning: Running 5 iterations on 100 points (5.0%) to estimate correlation lengths/times...
      (Constraint: Length Scale capped at 7.12 standard deviations)




      ✅ LEARNED HYPERPARAMETERS (Avg of 5 runs):
         Noise (Uncertainty): ±2.352 °C
         Correlation Lengths:
           - lat: 2.805°  (~311 km)
           - lon: 4.534°  (~410 km at 35.5N)
   Processed float 1...
✅ RESULTS (LOFO):
   RMSE:                3.157 °C
   Rel. Error (RMSRE):  0.2588 (dimensionless)
   Mean Z:              0.172
   Std Z:               1.106 (Ideal: 1.0)


In [19]:

# 3. RUN LOFO (the "hard" test) -- FEBRUARY
# Checks that the model works SCIENTIFICALLY.
z_lofo = ag.generalized_cross_validation(
    df_argo_february,
    method='LOFO',
    feature_cols=['lat', 'lon'],
)


🚀 STARTING GLOBAL VALIDATION: LOFO
   🤖 AutoTuning: Running 5 iterations on 100 points (5.0%) to estimate correlation lengths/times...
      (Constraint: Length Scale capped at 7.22 standard deviations)




      ✅ LEARNED HYPERPARAMETERS (Avg of 5 runs):
         Noise (Uncertainty): ±2.104 °C
         Correlation Lengths:
           - lat: 7.268°  (~807 km)
           - lon: 4.659°  (~419 km at 35.9N)
   Processed float 1...
✅ RESULTS (LOFO):
   RMSE:                2.195 °C
   Rel. Error (RMSRE):  0.1945 (dimensionless)
   Mean Z:              -0.016
   Std Z:               0.991 (Ideal: 1.0)


# NOTE: a moving window with variable parameters requires parallelization!

# 4. RUN LOOO WITH A MOVING WINDOW (baseline check)
# Checks that the model works mathematically while properly accounting for how
# variable the physics is at different points in space. The effects of the
# moving window itself also have to be considered here.
z_mv_LOOO = ag.validate_moving_window(
    df_argo_january,
    feature_cols=['lat', 'lon'],
    target_col='temp',
    method='KFold',
    radius_km=300,
    tune_iterations=10,
)

# 5. RUNNING IN TIME AND ACTUAL KRIGING
## We plan to estimate OHC at each Argo point and then krige the resulting 2D fields. This way, each separate layer gets its own correlation lengths (which would be key
## to seeing any differences between the separate profiles).