In [1]:
# CV Strategy
# Batch normalization
import os
import xarray as xr
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
import random
from tensorflow.keras import layers, models, Input
import tensorflow as tf
import time
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # 0=all, 1=info, 2=warning, 3=error

print("Num GPUs Available:", len(tf.config.list_physical_devices('GPU')))
print("Built with CUDA:", tf.test.is_built_with_cuda())
print("GPU available to TF:", tf.test.is_gpu_available())
start = time.time()
folder = "../../data/model_data/mini_data_set"
full_paths = [os.path.join(folder, f) for f in os.listdir(folder) if f.endswith(".nc")]
random.shuffle(full_paths)

# sensor_df = pd.read_csv("../../data/sensor_data/final_sensor_withgrid.csv")
# sensor_df = sensor_df[['lat','lon', 'state']]
# sensor_df_la = sensor_df[sensor_df['state']=='Louisiana'].reset_index(drop=True)
# sensor_df_la = sensor_df_la[['lat','lon']]
# sensor_df_la = sensor_df_la.drop_duplicates().reset_index(drop=True)
# sensor_df_la['cv_group'] = 'test'
# sensor_df_tx = sensor_df[sensor_df['state']=='Texas'].reset_index(drop=True)
# sensor_df_tx = sensor_df_tx[['lat','lon']]
# sensor_df_tx = sensor_df_tx.drop_duplicates().reset_index(drop=True)

# # # Shuffle the indices
# sensor_df_tx = sensor_df_tx.sample(frac=1, random_state=42).reset_index(drop=True)

# # UPDATE: 10% Test set, 5 folds 
# # Assign groups evenly
# sensor_df_tx['cv_group'] = sensor_df_tx.index % 5 + 1  # Groups: 0 to 4
# sensor_df = pd.concat([sensor_df_la, sensor_df_tx], axis=0)
# sensor_df.to_csv("../../data/sensor_data/final_sensor_cvgroups.csv")

sensor_df = pd.read_csv("../../data/sensor_data/final_sensor_cvgroups.csv", index_col=0)


2025-09-14 00:53:50.377257: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-09-14 00:53:51.298409: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl2/lib:/usr/local/cuda/extras/CUPTI/lib64:/usr/lib/x86_64-linux-gnu/:/opt/conda/lib
2025-09-14 00:53:51.298498: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local

Num GPUs Available: 0
Built with CUDA: True
Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
GPU available to TF: False


In [2]:
class ncDataLoader(keras.utils.Sequence):
    """Helper to iterate over the data (as Numpy arrays)."""

    def __init__(self, batch_size, img_size, img_paths, drop_sensor_locs_df):
        self.batch_size = batch_size
        self.img_size = img_size
        self.img_paths = img_paths
        self.drop_sensor_locs_df = drop_sensor_locs_df

    def __len__(self):
        return len(self.img_paths) // self.batch_size

    def __getitem__(self, idx):
        """Returns tuple (input, target) correspond to batch #idx."""
        i = idx * self.batch_size
        batch_input_img_paths = self.img_paths[i : i + self.batch_size]

        x = np.zeros((self.batch_size,) + self.img_size + (22,), dtype="float32")
        y = np.zeros((self.batch_size,) + self.img_size + (1,), dtype="float32")

        for j, path in enumerate(batch_input_img_paths):
            # print(path)
            ds = xr.open_dataset(path, engine='netcdf4')
            input_ds = ds.copy()
            input_ds = input_ds.drop_vars("y_sensor_no2")
            input_array = input_ds.to_array().values
            x[j] = np.transpose(input_array, (1, 2, 0))
            
            target_ds = ds['y_sensor_no2']
            
            # Loop over rows to mask each (lat, lon) pair
            for _, row in self.drop_sensor_locs_df.iterrows():
                target_ds.loc[dict(lat=row['lat'], lon=row['lon'])] = np.nan
            
            target_ds = target_ds.values
            # target_ds = np.where(np.isnan(target_ds), -999, target_ds)
            y[j, :, :, 0] = target_ds
 
        return x, y






def build_baseline_model(
    input_shape=(479, 1059, 22)
):
    """
    Build a convolution-free, per-pixel linear regression model.
    Each pixel's 22 input features are combined linearly to produce 1 output value.
    
    Args:
        input_shape (tuple): Input shape (H, W, C)

    Returns:
        Keras Model
    """
    inputs = Input(shape=input_shape)
    x = inputs

    # Per-pixel linear regression (no hidden layer)
    outputs = layers.Dense(1, activation='linear', use_bias=True)(x)

    model = models.Model(inputs=inputs, outputs=outputs)
    return model


def masked_mse(y_true, y_pred):
    # Create mask: True where y_true is not NaN
    mask = tf.math.logical_not(tf.math.is_nan(y_true))
    # tf.print("Valid pixels:", tf.reduce_sum(tf.cast(mask, tf.int32)))

    # Apply mask to both y_true and y_pred
    y_true_masked = tf.boolean_mask(y_true, mask)
    y_pred_masked = tf.boolean_mask(y_pred, mask)
    
    # Compute MSE only on valid values
    diff_squared = tf.square(y_true_masked - y_pred_masked)
    
    # Avoid division by zero
    denom = tf.cast(tf.size(diff_squared), tf.float32)
    denom = tf.maximum(denom, 1e-6)

    return tf.reduce_sum(diff_squared) / denom


In [3]:
################################################
##      Cross Validation and Save Results     ##
################################################

# Parameters
batch_size = 4
img_size = (479, 1059)
img_paths = full_paths
kernel_sizes_param = [3, 5, 3]
CNN_filters_param = 64
activation_param = 'relu'
epochs_param = 4

cv_results = []

for cv_group in range(1, 6):  # cv_group 1 to 5
    print(f"Running CV group {cv_group}/5")
    
    drop_locs_train_set = sensor_df[sensor_df['cv_group'] == cv_group][['lat', 'lon']]
    drop_locs_val_set = sensor_df[sensor_df['cv_group'] != cv_group][['lat', 'lon']]

    train_loader = ncDataLoader(
        batch_size=batch_size,
        img_size=img_size,
        img_paths=full_paths,
        drop_sensor_locs_df=drop_locs_train_set
    )

    val_loader = ncDataLoader(
        batch_size=batch_size,
        img_size=img_size,
        img_paths=full_paths,
        drop_sensor_locs_df=drop_locs_val_set
    )

    # Reset model for each fold
    model = build_baseline_model(input_shape=(479, 1059, 22))
    model.compile(optimizer='adam', loss=masked_mse)

    model.fit(
        train_loader,
        validation_data=val_loader,
        epochs=epochs_param,
        verbose=1
    )

    # Predict and collect results
    for i, (batch_x, batch_y_true) in enumerate(val_loader):
        batch_y_pred = model.predict(batch_x)

        for j in range(len(batch_y_true)):
            y_true = batch_y_true[j]
            y_pred = batch_y_pred[j]

            mask = ~np.isnan(y_true)
            rows, cols, _ = np.where(mask)

            true_vals = y_true[rows, cols]
            pred_vals = y_pred[rows, cols]

            # Optional: use corresponding file name for traceability
            file_idx = i * batch_size + j
            file_name = os.path.basename(full_paths[file_idx]) if file_idx < len(full_paths) else None

            for r, c, t, p in zip(rows, cols, true_vals, pred_vals):
                cv_results.append({
                    'cv_group': cv_group,
                    'file': file_name,
                    'row': r,
                    'col': c,
                    'true': t[0],
                    'predicted': p[0],
                    'lat': r * 0.01 + 28.605,
                    'lon': c * 0.01 -98.895
                })

df_cv = pd.DataFrame(cv_results)

Running CV group 1/5


getfattr: /home/jupyter/data/model_data/mini_data_set/time_20230804_180000.nc: Operation not supported
getfattr: /home/jupyter/data/model_data/mini_data_set/time_20230812_130000.nc: Operation not supported
getfattr: /home/jupyter/data/model_data/mini_data_set/time_20230804_090000.nc: Operation not supported
getfattr: /home/jupyter/data/model_data/mini_data_set/time_20230816_080000.nc: Operation not supported


Epoch 1/4


getfattr: /home/jupyter/data/model_data/mini_data_set/time_20230815_170000.nc: Operation not supported
getfattr: /home/jupyter/data/model_data/mini_data_set/time_20230811_090000.nc: Operation not supported
getfattr: /home/jupyter/data/model_data/mini_data_set/time_20230811_080000.nc: Operation not supported
getfattr: /home/jupyter/data/model_data/mini_data_set/time_20230807_100000.nc: Operation not supported
getfattr: /home/jupyter/data/model_data/mini_data_set/time_20230807_080000.nc: Operation not supported


 1/25 [>.............................] - ETA: 6:08 - loss: 11.9419

getfattr: /home/jupyter/data/model_data/mini_data_set/time_20230805_070000.nc: Operation not supported
getfattr: /home/jupyter/data/model_data/mini_data_set/time_20230802_140000.nc: Operation not supported
getfattr: /home/jupyter/data/model_data/mini_data_set/time_20230812_170000.nc: Operation not supported


 2/25 [=>............................] - ETA: 4:46 - loss: 13.5787

getfattr: /home/jupyter/data/model_data/mini_data_set/time_20230806_140000.nc: Operation not supported
getfattr: /home/jupyter/data/model_data/mini_data_set/time_20230805_180000.nc: Operation not supported
getfattr: /home/jupyter/data/model_data/mini_data_set/time_20230815_150000.nc: Operation not supported
getfattr: /home/jupyter/data/model_data/mini_data_set/time_20230802_150000.nc: Operation not supported


 3/25 [==>...........................] - ETA: 4:39 - loss: 14.2067

getfattr: /home/jupyter/data/model_data/mini_data_set/time_20230806_180000.nc: Operation not supported
getfattr: /home/jupyter/data/model_data/mini_data_set/time_20230809_140000.nc: Operation not supported
getfattr: /home/jupyter/data/model_data/mini_data_set/time_20230816_110000.nc: Operation not supported
getfattr: /home/jupyter/data/model_data/mini_data_set/time_20230812_080000.nc: Operation not supported


 4/25 [===>..........................] - ETA: 4:29 - loss: 13.1528

getfattr: /home/jupyter/data/model_data/mini_data_set/time_20230809_100000.nc: Operation not supported
getfattr: /home/jupyter/data/model_data/mini_data_set/time_20230811_070000.nc: Operation not supported
getfattr: /home/jupyter/data/model_data/mini_data_set/time_20230809_150000.nc: Operation not supported
getfattr: /home/jupyter/data/model_data/mini_data_set/time_20230805_090000.nc: Operation not supported


 5/25 [=====>........................] - ETA: 4:17 - loss: 12.3266

getfattr: /home/jupyter/data/model_data/mini_data_set/time_20230806_150000.nc: Operation not supported
getfattr: /home/jupyter/data/model_data/mini_data_set/time_20230809_130000.nc: Operation not supported
getfattr: /home/jupyter/data/model_data/mini_data_set/time_20230809_110000.nc: Operation not supported
getfattr: /home/jupyter/data/model_data/mini_data_set/time_20230816_070000.nc: Operation not supported




getfattr: /home/jupyter/data/model_data/mini_data_set/time_20230804_120000.nc: Operation not supported
getfattr: /home/jupyter/data/model_data/mini_data_set/time_20230807_130000.nc: Operation not supported
getfattr: /home/jupyter/data/model_data/mini_data_set/time_20230807_160000.nc: Operation not supported
getfattr: /home/jupyter/data/model_data/mini_data_set/time_20230804_130000.nc: Operation not supported




getfattr: /home/jupyter/data/model_data/mini_data_set/time_20230807_090000.nc: Operation not supported
getfattr: /home/jupyter/data/model_data/mini_data_set/time_20230804_100000.nc: Operation not supported
getfattr: /home/jupyter/data/model_data/mini_data_set/time_20230808_180000.nc: Operation not supported
getfattr: /home/jupyter/data/model_data/mini_data_set/time_20230807_070000.nc: Operation not supported




getfattr: /home/jupyter/data/model_data/mini_data_set/time_20230804_180000.nc: Operation not supported
getfattr: /home/jupyter/data/model_data/mini_data_set/time_20230812_130000.nc: Operation not supported
getfattr: /home/jupyter/data/model_data/mini_data_set/time_20230804_090000.nc: Operation not supported
getfattr: /home/jupyter/data/model_data/mini_data_set/time_20230816_080000.nc: Operation not supported
getfattr: /home/jupyter/data/model_data/mini_data_set/time_20230806_070000.nc: Operation not supported


KeyboardInterrupt: 

In [None]:
df_cv.to_csv('test_model_results_baseline.csv')

In [None]:
end = time.time()
print(f"Elapsed time: {end - start:.2f} seconds")