In [1]:
from netCDF4 import Dataset
import numpy as np
import warnings

In [2]:
def collect_data(file_name):
    """
    Read the global attributes and selected variables of a netCDF file.

    Args:
        file_name (str): The path to the file from which to extract data.
                         It must end with ``.nc``.

    Returns:
        dict: Keys are, in order, the global attributes ``title``,
              ``institution``, ``source`` and ``date_created``, followed by
              the variables listed below for the ``instrument``, ``meteo``,
              ``target_product``, ``side_product`` and ``diagnostics``
              groups. Each variable value is ``variable[:].data``, i.e. the
              whole variable read into memory.

    Raises:
        ValueError: If the file does not have the expected extension.

    Example:
        >>> data = collect_data('path/to/file.nc')
    """
    # Check the extension first so a wrong path never reaches the netCDF layer.
    if not file_name.endswith('.nc'):
        raise ValueError("File must have a .nc extension")

    # Global (file-level) attributes copied verbatim into the result.
    global_attributes = ('title', 'institution', 'source', 'date_created')

    # Variables to read, by group. The insertion order here defines the key
    # order of the returned dictionary.
    variables_by_group = {
        'instrument': (
            'l1b_file', 'pixel_id', 'scanline', 'ground_pixel', 'time',
            'solar_zenith_angle', 'viewing_zenith_angle',
            'relative_azimuth_angle', 'latitude_center', 'longitude_center',
            'latitude_corners', 'longitude_corners', 'glintflag',
        ),
        'meteo': (
            'altitude_levels', 'surface_altitude', 'surface_altitude_stdv',
            'dp', 'surface_pressure', 'dry_air_subcolumns', 'landflag',
            'u10', 'v10', 'fluorescence_apriori', 'cloud_fraction',
            'weak_h2o_column', 'strong_h2o_column', 'weak_ch4_column',
            'strong_ch4_column', 'cirrus_reflectance', 'stdv_h2o_ratio',
            'stdv_ch4_ratio',
        ),
        'target_product': (
            'xch4', 'xch4_precision', 'xch4_column_averaging_kernel',
            'ch4_profile_apriori', 'xch4_apriori', 'xch4_corrected',
        ),
        'side_product': (
            'fluorescence', 'co_column', 'co_column_precision', 'h2o_column',
            'h2o_column_precision', 'spectral_shift', 'aerosol_size',
            'aerosol_size_precision', 'aerosol_column',
            'aerosol_column_precision', 'aerosol_altitude',
            'aerosol_altitude_precision', 'aerosol_optical_thickness',
            'surface_albedo', 'surface_albedo_precision', 'reflectance_max',
        ),
        'diagnostics': (
            'processing_quality_flags', 'convergence', 'error_id',
            'iterations', 'chi_squared', 'chi_squared_band',
            'number_of_spectral_points_in_retrieval', 'degrees_of_freedom',
            'degrees_of_freedom_ch4', 'degrees_of_freedom_aerosol',
            'signal_to_noise_ratio', 'rms', 'qa_value',
        ),
    }

    # The context manager closes the file even when an attribute, group or
    # variable is missing and the lookup raises part-way through.
    with Dataset(file_name, 'r') as nc:
        data = {name: getattr(nc, name) for name in global_attributes}
        for group_name, variable_names in variables_by_group.items():
            group = nc.groups[group_name]
            for variable_name in variable_names:
                # NOTE(review): `.data` presumably discards the mask that
                # netCDF4 attaches to the read array, leaving fill values in
                # place of missing cells - confirm downstream code expects it.
                data[variable_name] = group[variable_name][:].data

    return data


In [3]:
def proccessed_data(data):
    """
    Select the pixels that pass the quality filter and return their fields.

    The SRON RemoTeC-S5P XCH4 product gives recommendations for the filter
    parameters; the thresholds applied here are instead the ones the author
    took from the research paper.

    Args:
        data (dict): Maps field names to arrays indexed by pixel along the
            first axis. Besides the returned fields it must contain
            ``pixel_id`` and ``qa_value``.

    Returns:
        dict: The fields named in ``output_fields`` below, each restricted to
            the pixels that pass every filter condition.
    """
    # Indices of the entries that are considered to hold data.
    # NOTE(review): if pixel_id is an integer array, `~` is a bitwise NOT, so
    # this keeps every index whose pixel_id is not -1 - confirm that this is
    # the intended "no data" test.
    candidates = np.where(~data['pixel_id'])[0]

    # Column 0 is used as the NIR band and column 1 as the SWIR band.
    optical_thickness = data['aerosol_optical_thickness']
    aot_swir = optical_thickness[:, 1]
    aot_nir = optical_thickness[:, 0]

    albedo = data['surface_albedo']
    albedo_swir = albedo[:, 1]
    albedo_nir = albedo[:, 0]

    # Mixed albedo
    blended_albedo = 2.4*albedo_nir[candidates] - 1.13*albedo_swir[candidates]

    # A pixel is kept only when all of these conditions hold.
    passes = (
        (data['qa_value'][candidates] >= 0.4)
        & (data['xch4_precision'][candidates] < 10)
        & (aot_swir[candidates] < 0.13)
        & (aot_nir[candidates] < 0.30)
        & (albedo_swir[candidates] > 0.02)
        & (blended_albedo < 0.95)
        & (data['cloud_fraction'][candidates][:, 0] < 0.02)
    )
    selected = candidates[passes]

    # Fields copied to the result, in output order.
    output_fields = (
        'scanline', 'ground_pixel', 'time', 'solar_zenith_angle',
        'viewing_zenith_angle', 'relative_azimuth_angle', 'latitude_center',
        'longitude_center', 'latitude_corners', 'longitude_corners',
        'altitude_levels', 'surface_altitude', 'surface_altitude_stdv', 'dp',
        'surface_pressure', 'dry_air_subcolumns', 'u10', 'v10',
        'fluorescence_apriori', 'cloud_fraction', 'weak_h2o_column',
        'strong_h2o_column', 'weak_ch4_column', 'strong_ch4_column',
        'cirrus_reflectance', 'stdv_h2o_ratio', 'stdv_ch4_ratio', 'xch4',
        'xch4_precision', 'xch4_column_averaging_kernel',
        'ch4_profile_apriori', 'xch4_apriori', 'xch4_corrected',
        'fluorescence', 'co_column', 'co_column_precision', 'h2o_column',
        'h2o_column_precision', 'spectral_shift', 'aerosol_size',
        'aerosol_size_precision', 'aerosol_column',
        'aerosol_column_precision', 'aerosol_altitude',
        'aerosol_altitude_precision', 'aerosol_optical_thickness',
        'surface_albedo', 'surface_albedo_precision', 'reflectance_max',
        'convergence', 'iterations', 'chi_squared', 'chi_squared_band',
        'number_of_spectral_points_in_retrieval', 'degrees_of_freedom',
        'degrees_of_freedom_ch4', 'degrees_of_freedom_aerosol',
        'signal_to_noise_ratio', 'rms',
    )

    return {field: data[field][selected] for field in output_fields}

In [4]:
def process_data_to_scan_line_lists(data_):
    """
    Place per-pixel records on a dense (scanline, ground_pixel) grid.

    Args:
        data_ (dict): Maps every field name listed in ``dtype`` below to a
            sequence with one entry per pixel. ``scanline`` and
            ``ground_pixel`` give the grid position of each pixel; the number
            of pixels placed is ``len(data_['xch4_corrected'])``.

    Returns:
        dict: ``'scan_line_lists'`` is a structured array of shape
            ``(num_scan_lines, num_ground_pixels)`` initialised with NaN and
            filled at the position of every pixel; ``'num_scan_lines'`` is
            the number of grid rows; ``'data_type'`` is the dtype
            specification used for the grid.

    Raises:
        ValueError: If ``scanline`` or ``ground_pixel`` is empty.
    """
    # Suppress RuntimeWarnings. NOTE: this installs a process-wide filter
    # that stays active after the function returns; it is kept because later
    # steps may rely on it.
    warnings.filterwarnings("ignore", category=RuntimeWarning)

    # Define the dtype for the structured array
    dtype = [
        ('xch4_corrected', 'f4'),
        ('latitude_corners', 'f4', (4,)),
        ('longitude_corners', 'f4', (4,)),
        ('u10', 'f4'),
        ('v10', 'f4'),
        ('latitude_center', 'f4'),
        ('longitude_center', 'f4'),
        ('scanline', 'i4'),
        ('ground_pixel', 'i4'),
        ('time', 'i4', (7,)),
        ('solar_zenith_angle', 'f4'),
        ('viewing_zenith_angle', 'f4'),
        ('relative_azimuth_angle', 'f4'),
        ('altitude_levels', 'f4', (13,)),
        ('surface_altitude', 'f4'),
        ('surface_altitude_stdv', 'f4'),
        ('dp', 'f4'),
        ('surface_pressure', 'f4'),
        ('dry_air_subcolumns', 'f4', (12,)),
        ('fluorescence_apriori', 'f4'),
        ('cloud_fraction', 'f4', (4,)),
        ('weak_h2o_column', 'f4'),
        ('strong_h2o_column', 'f4'),
        ('weak_ch4_column', 'f4'),
        ('strong_ch4_column', 'f4'),
        ('cirrus_reflectance', 'f4'),
        ('stdv_h2o_ratio', 'f4'),
        ('stdv_ch4_ratio', 'f4'),
        ('xch4', 'f4'),
        ('xch4_precision', 'f4'),
        ('xch4_column_averaging_kernel', 'f4', (12,)),
        ('ch4_profile_apriori', 'f4', (12,)),
        ('xch4_apriori', 'f4'),
        ('fluorescence', 'f4'),
        ('co_column', 'f4'),
        ('co_column_precision', 'f4'),
        ('h2o_column', 'f4'),
        ('h2o_column_precision', 'f4'),
        ('spectral_shift', 'f4', (2,)),
        ('aerosol_size', 'f4'),
        ('aerosol_size_precision', 'f4'),
        ('aerosol_column', 'f4'),
        ('aerosol_column_precision', 'f4'),
        ('aerosol_altitude', 'f4'),
        ('aerosol_altitude_precision', 'f4'),
        ('aerosol_optical_thickness', 'f4', (2,)),
        ('surface_albedo', 'f4', (2,)),
        ('surface_albedo_precision', 'f4', (2,)),
        ('reflectance_max', 'f4', (2,)),
        ('convergence', 'i4'),
        ('iterations', 'i4'),
        ('chi_squared', 'f4'),
        ('chi_squared_band', 'f4', (2,)),
        ('number_of_spectral_points_in_retrieval', 'i4', (2,)),
        ('degrees_of_freedom', 'f4'),
        ('degrees_of_freedom_ch4', 'f4'),
        ('degrees_of_freedom_aerosol', 'f4'),
        ('signal_to_noise_ratio', 'f4', (2,)),
        ('rms', 'f4'),
    ]

    scanlines = np.asarray(data_['scanline'])
    ground_pixels = np.asarray(data_['ground_pixel'])
    if scanlines.size == 0 or ground_pixels.size == 0:
        raise ValueError("data_ contains no pixels to place on the grid")

    # The grid spans the full range of scan lines and ground pixels present.
    min_scan_line = scanlines.min()
    min_ground_pixel = ground_pixels.min()
    num_scan_lines = int(scanlines.max() - min_scan_line + 1)
    num_ground_pixels = int(ground_pixels.max() - min_ground_pixel + 1)

    # Initialize the structured array; every field of every cell starts as
    # the result of casting NaN to that field's type.
    scan_line_lists = np.full((num_scan_lines, num_ground_pixels), np.nan, dtype=dtype)

    # Grid position of each pixel, relative to the smallest scan line and
    # ground pixel (so both offsets are always within bounds).
    count = len(data_['xch4_corrected'])
    rows = (scanlines[:count] - min_scan_line).astype(np.intp)
    cols = (ground_pixels[:count] - min_ground_pixel).astype(np.intp)

    # When several pixels map to the same cell the last one wins. Selecting
    # the last occurrence explicitly keeps that rule independent of the
    # order in which NumPy performs an indexed assignment.
    flat_positions = rows * num_ground_pixels + cols
    _, first_in_reversed = np.unique(flat_positions[::-1], return_index=True)
    keep = flat_positions.size - 1 - first_in_reversed
    rows = rows[keep]
    cols = cols[keep]

    # Populate the array one field at a time; the field list is the single
    # source of truth, so the record layout cannot drift out of sync with it.
    for field in dtype:
        name = field[0]
        values = np.asarray(data_[name])[:count]
        scan_line_lists[name][rows, cols] = values[keep]

    data_for_processing = {
        'scan_line_lists': scan_line_lists,
        'num_scan_lines': num_scan_lines,
        'data_type': dtype
    }
    return data_for_processing

In [5]:
def create_matrices(scan_line_lists, num_scan_lines, dtype, bbox=False, *,
                    block_size=32, stride=16, min_valid=220):
    """
    Cut the scan-line grid into overlapping square blocks and stack the usable ones.

    Args:
        scan_line_lists: 2-D structured array with at least the fields
            ``xch4_corrected``, ``latitude_corners`` and
            ``longitude_corners``.
        num_scan_lines (int): Number of grid rows to tile over.
        dtype: Structured dtype specification of the returned array.
        bbox: Optional region of interest given as
            ``[min_lat, max_lat, min_lon, max_lon]``. When truthy, a block is
            kept only if all of its non-NaN corner latitudes and longitudes
            lie strictly inside the box. Defaults to ``False`` (no region
            filter).
        block_size (int): Edge length of each square block. Defaults to 32.
        stride (int): Step between block origins along both axes. The default
            of 16 gives 50% overlap with the default block size.
        min_valid (int): A block is kept only when it has MORE than this many
            non-NaN ``xch4_corrected`` cells. The default of 220 is about
            21.5% of a 32x32 block.

    Returns:
        numpy.ndarray: Structured array of shape
            ``(n_blocks, block_size, block_size)``; ``n_blocks`` is 0 when no
            block qualifies.
    """
    num_columns = np.shape(scan_line_lists)[1]

    matrices = []

    # The range bounds only yield origins whose block fits entirely inside
    # the grid, so partial blocks at the edges are never produced.
    for start_row in range(0, num_scan_lines - block_size + 1, stride):
        for start_col in range(0, num_columns - block_size + 1, stride):
            block = scan_line_lists[start_row:start_row + block_size,
                                    start_col:start_col + block_size]

            # Skip blocks that are too sparsely populated.
            valid_cells = np.count_nonzero(~np.isnan(block['xch4_corrected']))
            if valid_cells <= min_valid:
                continue

            # Filter for region of interest
            if bbox:
                latitudes = block['latitude_corners']
                longitudes = block['longitude_corners']
                inside = (np.nanmin(latitudes) > bbox[0]
                          and np.nanmax(latitudes) < bbox[1]
                          and np.nanmin(longitudes) > bbox[2]
                          and np.nanmax(longitudes) < bbox[3])
                if not inside:
                    continue

            matrices.append(block)

    # Keep the result three-dimensional even when nothing qualified, so
    # callers can index it the same way in both cases.
    if not matrices:
        return np.empty((0, block_size, block_size), dtype=dtype)

    # Convert to a structured array
    return np.array(matrices, dtype=dtype)

In [6]:
def normalize_matrix(matrix):
    """
    Scale a matrix to the range [0, 1] using bounds derived from its own statistics.

    The lower bound is the NaN-ignoring mean minus the NaN-ignoring standard
    deviation; the upper bound lies 100 above the lower bound. NaN entries
    are replaced with 0 before scaling. Values below the lower bound map to
    0, values above the upper bound map to 1, and values in between are
    scaled linearly.

    Args:
        matrix: Array of values, possibly containing NaN.

    Returns:
        numpy.ndarray: New array with the same shape as ``matrix``.
    """
    center = np.nanmean(matrix)
    spread = np.nanstd(matrix)
    floor = center - spread

    # 100 ppb added to mean, then subtract std
    ceiling = center + 100 - spread

    # Replace NaN values with 0
    filled = np.where(np.isnan(matrix), 0, matrix)

    too_low = filled < floor
    too_high = filled > ceiling
    within = (filled >= floor) & (filled <= ceiling)

    # Saturate the out-of-range values first, then scale the rest linearly.
    scaled = np.where(too_high, 1, np.where(too_low, 0, filled))
    scaled[within] = (filled[within] - floor) / (ceiling - floor)

    return scaled