In [1]:
import os
import xarray as xr
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
import random
from tensorflow.keras import layers, models, Input
import tensorflow as tf
import time


2025-09-24 22:43:42.980726: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-09-24 22:43:51.911927: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl2/lib:/usr/local/cuda/extras/CUPTI/lib64:/usr/lib/x86_64-linux-gnu/:/opt/conda/lib
2025-09-24 22:43:51.912100: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local

In [2]:
class PreloadedData:
    def __init__(self, img_paths, img_size=(479, 1059), n_channels=22):
        self.img_paths = img_paths
        self.img_size = img_size
        self.n_channels = n_channels

        # Preallocate arrays
        self.x = np.zeros((len(img_paths),) + img_size + (n_channels,), dtype="float32")

        # We'll build y as an xarray Dataset
        y_list = []

        for j, path in enumerate(img_paths):
            ds = xr.open_dataset(path, engine="netcdf4")

            # Input tensor
            input_ds = ds.drop_vars("y_sensor_no2")
            input_array = input_ds.to_array().values
            self.x[j] = np.transpose(input_array, (1, 2, 0))

            # Target tensor
            target_ds = ds["y_sensor_no2"].copy()
            y_list.append(target_ds)

        # Combine all targets into a single Dataset
        self.y = xr.concat(y_list, dim="sample")

    def get_targets_with_mask(self, drop_sensor_locs_df):
        """
        Returns a *new copy* of targets with NaNs applied at drop_sensor_locs.
        """
        target_ds = self.y.copy()

        # Loop over rows to mask each (lat, lon) pair
        for _, row in drop_sensor_locs_df.iterrows():
            target_ds.loc[dict(lat=row['lat'], lon=row['lon'])] = np.nan

        y_masked = target_ds.values
        y_masked = np.expand_dims(y_masked, axis=-1)

        return y_masked


class CrossValidator:
    @staticmethod
    def masked_mse(y_true, y_pred):
        mask = tf.math.logical_not(tf.math.is_nan(y_true))
        y_true_masked = tf.boolean_mask(y_true, mask)
        y_pred_masked = tf.boolean_mask(y_pred, mask)

        diff_squared = tf.square(y_true_masked - y_pred_masked)
        denom = tf.cast(tf.size(diff_squared), tf.float32)
        denom = tf.maximum(denom, 1e-6)
        return tf.reduce_sum(diff_squared) / denom

    @staticmethod
    def build_cnn_model(input_shape, kernel_sizes, filters, activation,
                    use_maxpool=True, pool_size=(2, 2), upsample_method="nearest"):
        inputs = Input(shape=input_shape)
        x = inputs
        for k in kernel_sizes:
            x = layers.Conv2D(filters, (k, k), padding="same")(x)
            x = layers.Activation(activation)(x)
            if use_maxpool:
                x = layers.MaxPooling2D(pool_size=pool_size)(x)
                x = layers.UpSampling2D(size=pool_size, interpolation=upsample_method)(x)
        outputs = layers.Conv2D(1, (kernel_sizes[-1], kernel_sizes[-1]), activation="linear", padding="same")(x)
        return models.Model(inputs=inputs, outputs=outputs)

    @staticmethod
    def build_baseline_model(input_shape=(479, 1059, 22)):
        """
        Build a convolution-free, per-pixel linear regression model.
        Each pixel's 22 input features are combined linearly to produce 1 output value.

        Args:
            input_shape (tuple): Input shape (H, W, C)

        Returns:
            Keras Model
        """
        inputs = Input(shape=input_shape)
        x = inputs

        # Per-pixel linear regression (no hidden layer)
        outputs = layers.Dense(1, activation='linear', use_bias=True)(x)
        model = models.Model(inputs=inputs, outputs=outputs)
        return model
    
    def __init__(self, sensor_df, full_paths, output_dir, trial_name, batch_size, epochs, 
                 kernel_sizes=[3, 5, 3], filters=64, use_maxpool=False, pool_size=None,
                 activation="relu", upsample_method="nearest", img_size=(479, 1059)):  
        self.trial_name = trial_name
        self.sensor_df = sensor_df
        self.full_paths = full_paths
        self.batch_size = batch_size
        self.img_size = img_size
        self.kernel_sizes = kernel_sizes
        self.filters = filters
        self.activation = activation
        self.epochs = epochs
        self.use_maxpool = use_maxpool
        self.pool_size = pool_size
        self.upsample_method = upsample_method
        self.results = []
        self.output_dir = output_dir  # store output directory

        # Preload all data
        print("Preloading all data...")
        self.data = PreloadedData(full_paths, img_size=img_size, n_channels=22)
        print("Done preloading.")

    def run_model(self, CNN=True):
        for cv_group in range(1, 6):
            print(f"Running CV group {cv_group}/6")

            drop_locs_train_set = self.sensor_df[self.sensor_df["cv_group"].isin([f"{cv_group}", "test"])][["lat", "lon"]]
            drop_locs_val_set = self.sensor_df[~self.sensor_df["cv_group"].isin([f"{cv_group}"])][["lat", "lon"]]
            
            # Generate masked targets
            Y_train = self.data.get_targets_with_mask(drop_locs_train_set)
            Y_val = self.data.get_targets_with_mask(drop_locs_val_set)

            # Wrap in Keras Sequence for batching
            train_loader = tf.data.Dataset.from_tensor_slices((self.data.x, Y_train)).batch(self.batch_size)
            val_loader = tf.data.Dataset.from_tensor_slices((self.data.x, Y_val)).batch(self.batch_size)

            with tf.device("/GPU:0"):
                if CNN==True:
                    model = self.build_cnn_model(
                        input_shape=self.img_size + (22,),
                        kernel_sizes=self.kernel_sizes,
                        filters=self.filters,
                        activation=self.activation,
                        use_maxpool=self.use_maxpool,
                        pool_size=self.pool_size,
                        upsample_method=self.upsample_method,
                    )
                else:
                    model = self.build_baseline_model(input_shape=(479, 1059, 22))

                    model.compile(optimizer="adam", loss=self.masked_mse)

                    model.fit(train_loader, validation_data=val_loader, epochs=self.epochs, verbose=1)

            # Predict on full dataset
            y_pred_full = model.predict(self.data.x)
            
            # Collect results
            for j in range(len(self.data.x)):
                y_true = Y_val[j, :, :, 0]  # original target
                y_pred = y_pred_full[j, :, :, 0]  # model predictions

                mask = ~np.isnan(y_true)
                rows, cols = np.where(mask)
                true_vals = y_true[rows, cols]
                pred_vals = y_pred[rows, cols]

                file_name = os.path.basename(self.full_paths[j])

                for r, c, t, p in zip(rows, cols, true_vals, pred_vals):
                    self.results.append({
                        "file": file_name,
                        "row": r,
                        "col": c,
                        "true": t,
                        "predicted": p,
                        "lat": round(r * 0.01 + 28.605, 3),
                        "lon": round(c * 0.01 - 98.895, 3),
                    })

        final_results_all = pd.DataFrame(self.results)
        final_results_all.to_csv(os.path.join(self.output_dir, f'all_val_sensors_{self.trial_name}.csv'))
        
        y_true = final_results_all['true'].values
        y_pred = final_results_all['predicted'].values
        # Metrics
        rmse = np.sqrt(np.mean((y_true - y_pred)**2))
        mae = np.mean(np.abs(y_true - y_pred))
        # R-squared
        ss_res = np.sum((y_true - y_pred)**2)
        ss_tot = np.sum((y_true - np.mean(y_true))**2)
        r2 = 1 - ss_res / ss_tot
        bias = np.mean(y_pred - y_true)
        max_error = np.max(np.abs(y_pred - y_true))
        min_error = np.min(np.abs(y_pred - y_true))
        
        metrics_df = pd.DataFrame({
            "Trial": [self.trial_name],
            "Model": 'CNN',
            "Batch_Size": [self.batch_size],
            "Kernel_Sizes": [str(self.kernel_sizes)],
            "Filters": [self.filters],
            "Activation": [self.activation],
            "Epochs": [self.epochs],
            "Use_MaxPool": [self.use_maxpool],
            "Pool_Size": [str(self.pool_size)],
            "Upsample_Method": [self.upsample_method],
            "RMSE": [rmse],
            "MAE": [mae],
            "R2": [r2],
            "Bias": [bias],
            "Max_Error": [max_error],
            "Min_Error": [min_error]
        })

        metrics_df.to_csv(os.path.join(self.output_dir, f'metrics_{self.trial_name}.csv'))

        return pd.DataFrame(self.results)


In [4]:
if __name__ == "__main__":
    
    folder = "../../data/model_data/mini_data_set"
    full_paths = [os.path.join(folder, f) for f in os.listdir(folder) if f.endswith(".nc")][:4]
    sensor_df = pd.read_csv("../../data/sensor_data/final_sensor_cvgroups.csv", index_col=0)
    # # Test 1
    cv1 = CrossValidator(
        # Required
        sensor_df=sensor_df,
        full_paths=full_paths,
        output_dir = ".",
        trial_name='CNN_batch4_nopool',
        batch_size=4,
        epochs=5,
        # Rest are optional
        kernel_sizes=[4,4,4],
        filters=64,
        use_maxpool=False,
        pool_size=(4,4),
    )

    cv1.run_model(CNN=True)

    # # Test 2
    cv2 = CrossValidator(
        # Required
        sensor_df=sensor_df,
        output_dir = ".",
        full_paths=full_paths,
        trial_name='CNN_batch4_pool',
        batch_size=4,
        epochs=5,
        # Rest are optional
        kernel_sizes=[4,4,4],
        filters=64,
        use_maxpool=True,
        pool_size=(4,4)
    )
    cv2.run_model(CNN=True)

    # Test 3
    cv3 = CrossValidator(
        # Required
        sensor_df=sensor_df,
        output_dir = ".",
        full_paths=full_paths,
        trial_name='baseline_batch4_pool',
        batch_size=4,
        epochs=5,
        # Rest are optional
    )
    cv3.run_model(CNN=False)

Preloading all data...


getfattr: /home/jupyter/data/model_data/mini_data_set/time_20230802_100000.nc: Operation not supported
getfattr: /home/jupyter/data/model_data/mini_data_set/time_20230802_110000.nc: Operation not supported
getfattr: /home/jupyter/data/model_data/mini_data_set/time_20230802_120000.nc: Operation not supported
getfattr: /home/jupyter/data/model_data/mini_data_set/time_20230802_130000.nc: Operation not supported


Done preloading.
Running CV group 1/6


2025-09-24 22:44:31.224539: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2025-09-24 22:44:33.462301: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2025-09-24 22:44:33.466145: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2025-09-24 22:44:33.584995: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorF

Running CV group 2/6
Running CV group 3/6
Running CV group 4/6
Running CV group 5/6
Preloading all data...


getfattr: /home/jupyter/data/model_data/mini_data_set/time_20230802_100000.nc: Operation not supported
getfattr: /home/jupyter/data/model_data/mini_data_set/time_20230802_110000.nc: Operation not supported
getfattr: /home/jupyter/data/model_data/mini_data_set/time_20230802_120000.nc: Operation not supported
getfattr: /home/jupyter/data/model_data/mini_data_set/time_20230802_130000.nc: Operation not supported


Done preloading.
Running CV group 1/6
Running CV group 2/6
Running CV group 3/6
Running CV group 4/6
Running CV group 5/6
Preloading all data...


getfattr: /home/jupyter/data/model_data/mini_data_set/time_20230802_100000.nc: Operation not supported
getfattr: /home/jupyter/data/model_data/mini_data_set/time_20230802_110000.nc: Operation not supported
getfattr: /home/jupyter/data/model_data/mini_data_set/time_20230802_120000.nc: Operation not supported
getfattr: /home/jupyter/data/model_data/mini_data_set/time_20230802_130000.nc: Operation not supported


Done preloading.
Running CV group 1/6
Epoch 1/5


2025-09-24 22:44:57.199010: I tensorflow/compiler/xla/service/service.cc:173] XLA service 0x7f8b99245e60 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2025-09-24 22:44:57.199058: I tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
2025-09-24 22:44:57.689880: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.




2025-09-24 22:44:59.934045: I tensorflow/compiler/jit/xla_compilation_cache.cc:477] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Running CV group 2/6
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Running CV group 3/6
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Running CV group 4/6
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Running CV group 5/6
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [93]:
sensor_df

Unnamed: 0,lat,lon,cv_group
0,30.225,-90.965,test
1,30.265,-93.285,test
2,30.505,-91.215,test
3,30.465,-91.175,test
4,30.705,-91.055,test
5,30.045,-90.275,test
6,30.225,-91.315,test
7,29.995,-90.115,test
8,30.315,-90.815,test
0,28.885,-97.885,1


In [89]:
len([(29.045, -95.475), (29.135, -98.145), (29.255, -94.865), (29.275, -98.315), (29.515, -98.625), (29.525, -98.395), (29.525, -95.395), (29.585, -95.015), (29.675, -95.125), (29.685, -95.295), (29.735, -95.255), (29.755, -95.075), (29.765, -95.225), (29.775, -95.395), (29.805, -95.125), (29.815, -95.385), (29.835, -95.485), (29.865, -94.315), (29.975, -94.005), (30.035, -94.075), (30.085, -93.765), (30.355, -97.765), (30.355, -95.425), (30.375, -88.535), (32.035, -96.395), (32.345, -95.415), (32.375, -94.715), (32.565, -96.315), (32.665, -97.335), (32.675, -96.875), (32.805, -97.355), (32.825, -96.865), (32.915, -96.805), (32.925, -97.285), (32.985, -97.065)])

35

In [90]:
len([(29.635, -98.565), (29.995, -90.115), (30.045, -90.275), (30.225, -91.315), (30.225, -90.965), (30.265, -93.285), (30.315, -90.815), (30.355, -97.695), (30.465, -91.175), (30.505, -91.215), (30.705, -91.055), (31.085, -97.675), (32.655, -97.085), (32.665, -94.165), (32.985, -97.475), (33.155, -96.115)])

16

In [91]:
len([(29.135, -98.145), (29.515, -98.625), (29.525, -98.395), (29.585, -95.015), (29.635, -98.565), (29.685, -95.295), (29.735, -95.255), (29.755, -95.075), (29.765, -95.225), (29.775, -95.395), (29.805, -95.125), (29.835, -95.485), (29.865, -94.315), (30.035, -94.075), (30.085, -93.765), (30.355, -97.765), (30.355, -97.695), (30.355, -95.425), (31.085, -97.675), (32.035, -96.395), (32.345, -95.415), (32.375, -94.715), (32.565, -96.315), (32.655, -97.085), (32.665, -97.335), (32.665, -94.165), (32.675, -96.875), (32.825, -96.865), (32.915, -96.805), (32.925, -97.285), (32.985, -97.475), (32.985, -97.065), (33.155, -96.115)])


33

In [92]:
len([(29.045, -95.475), (29.255, -94.865), (29.275, -98.315), (29.525, -95.395), (29.675, -95.125), (29.815, -95.385), (29.975, -94.005), (29.995, -90.115), (30.045, -90.275), (30.225, -91.315), (30.225, -90.965), (30.265, -93.285), (30.315, -90.815), (30.375, -88.535), (30.465, -91.175), (30.505, -91.215), (30.705, -91.055), (32.805, -97.355)])

18

In [81]:
check1 = [(29.045, -95.475), (29.135, -98.145), (29.255, -94.865), (29.275, -98.315), (29.515, -98.625), (29.525, -98.395), (29.525, -95.395), (29.585, -95.015), (29.635, -98.565), (29.685, -95.295), (29.775, -95.395), (29.805, -95.125), (29.815, -95.385), (29.975, -94.005), (30.035, -94.075), (30.085, -93.765), (30.355, -97.765), (30.355, -97.695), (30.355, -95.425), (30.375, -88.535), (31.085, -97.675), (32.035, -96.395), (32.345, -95.415), (32.565, -96.315), (32.655, -97.085), (32.665, -97.335), (32.675, -96.875), (32.805, -97.355), (32.825, -96.865), (32.925, -97.285), (32.985, -97.475), (32.985, -97.065), (33.155, -96.115)]
check1 = set(check1)

In [82]:
check2=[(29.675, -95.125), (29.735, -95.255), (29.755, -95.075), (29.765, -95.225), (29.835, -95.485), (29.865, -94.315), (29.995, -90.115), (30.045, -90.275), (30.225, -91.315), (30.225, -90.965), (30.265, -93.285), (30.315, -90.815), (30.375, -88.535), (30.465, -91.175), (30.505, -91.215), (30.705, -91.055), (32.375, -94.715), (32.665, -94.165), (32.915, -96.805)]
check2=set(check2)

In [83]:
common_locs = check1 & check2

In [84]:
common_locs

{(30.375, -88.535)}

In [85]:
sensor_df

Unnamed: 0,lat,lon,cv_group
0,30.225,-90.965,test
1,30.265,-93.285,test
2,30.505,-91.215,test
3,30.465,-91.175,test
4,30.705,-91.055,test
5,30.045,-90.275,test
6,30.225,-91.315,test
7,29.995,-90.115,test
8,30.315,-90.815,test
0,29.865,-94.315,1


In [78]:
test = pd.read_csv('all_val_sensors_CNN_batch4_pool.csv')
test

Unnamed: 0.1,Unnamed: 0,cv_group,file,row,col,true,predicted,lat,lon
0,0,1,time_20230802_100000.nc,44,342,0.4,0.698850,29.045,-95.475
1,1,1,time_20230802_100000.nc,53,75,1.6,0.335348,29.135,-98.145
2,2,1,time_20230802_100000.nc,65,403,0.9,1.727070,29.255,-94.865
3,3,1,time_20230802_100000.nc,67,58,2.2,3.463570,29.275,-98.315
4,4,1,time_20230802_100000.nc,91,27,2.8,2.098141,29.515,-98.625
...,...,...,...,...,...,...,...,...,...
990,990,5,time_20230802_130000.nc,431,209,-0.1,1.105108,32.915,-96.805
991,991,5,time_20230802_130000.nc,432,161,1.8,0.669457,32.925,-97.285
992,992,5,time_20230802_130000.nc,438,142,0.9,0.296530,32.985,-97.475
993,993,5,time_20230802_130000.nc,438,183,0.3,0.442912,32.985,-97.065


In [62]:
sensor_df[~sensor_df["cv_group"].isin([f"1", "test"])][["lat", "lon"]]

Unnamed: 0,lat,lon
1,32.665,-97.335
2,33.155,-96.115
3,29.525,-95.395
4,30.355,-95.425
6,32.345,-95.415
7,30.035,-94.075
8,29.045,-95.475
9,32.655,-97.085
11,29.805,-95.125
12,32.925,-96.755
