<center>
<table>
  <tr>
    <td><img src="https://portal.nccs.nasa.gov/datashare/astg/training/python/logos/nasa-logo.svg" width="100"/> </td>
     <td><img src="https://portal.nccs.nasa.gov/datashare/astg/training/python/logos/ASTG_logo.png?raw=true" width="80"/> </td>
     <td> <img src="https://www.nccs.nasa.gov/sites/default/files/NCCS_Logo_0.png" width="130"/> </td>
    </tr>
</table>
</center>

        
<center>
<h2><font color= "blue" size="+3">PyCon 2024 Tutorial</font></h2>
</center>

---

<center>
    <h3>Python Workflows to Extract and Plot Satellite Data Products along Tracks</h3>
    <h2><font color="red" size="+3">Aura Satellite - Option 3</font></h2>
</center>

_______

# <font color="red"> Objectives</font>

Use a collection of OMI data files to:
- Gather timeseries data (time, location, value) of cloud pressure and single-scattering albedo
- Plot the data on a map

---

----

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
import ast
import datetime as dt
import os
from pathlib import Path

In [None]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker

In [None]:
import numpy as np
import h5py
import pandas as pd
import geopandas as gpd

In [None]:
from shapely import geometry as shpgeom
from shapely import wkt as shpwkt

In [None]:
import movingpandas as mpd

In [None]:
import holoviews as hv

In [None]:
import hvplot.pandas 

In [None]:
# Keyword sets kept for later plotting calls (presumably static plots and
# hvplot calls respectively -- confirm against the cells that use them).
plot_defaults = {
    'linewidth': 5,
    'capstyle': 'round',
    'figsize': (9, 3),
    'legend': True,
}
hvplot_defaults = {
    'tiles': None,
    'cmap': 'Viridis',
    'colorbar': True,
}

# Register session-wide HoloViews defaults for Overlay elements:
# wheel zoom as the active tool and a fixed 500 x 400 frame.
hv.opts.defaults(
    hv.opts.Overlay(
        active_tools=['wheel_zoom'],
        frame_width=500,
        frame_height=400,
    )
)

In [None]:
# Print the MovingPandas version report so the environment used to run
# this notebook is recorded alongside the results.
mpd.show_versions()

### Data Files

- The [TROPESS OMI-Aura L2 Ozone for Forward Stream](https://disc.gsfc.nasa.gov/datasets/TRPSDL2O3OMIFS_1/summary?keywords=AURA), Standard Product contains the vertical distribution of the retrieved atmospheric state of ozone (O3), formal uncertainties, and diagnostic information measured by the OMI instrument on the EOS Aura satellite.
- The forward stream standard product is global for the time period from 2021-02-01 to present. 
- The data files are written in the netCDF version 4 file format, and each file contains one day of data.
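- **Note:** the files listed below are OMI/Aura Level-2 `OMAERO` files stored in HDF-EOS5 (`.he5`) format.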

In [None]:
# Directory that holds the OMI data files.
# Set the OMI_DATA_DIR environment variable to point at your own copy of the
# data; when it is not set, the original hard-coded location is used.
data_dir = os.environ.get(
    "OMI_DATA_DIR",
    "/Users/jkouatch/myTasks/PythonTraining/ASTG606/Materials/sat_data/OMI_Data/",
)
#data_dir = "/tljh-data/sat_data/OMI_Data"

In [None]:
# Names of the data files read by this notebook (relative to data_dir).
# All ten names share the "OMI-Aura_L2-OMAERO_2024m0223" prefix and the ".he5"
# extension; they differ in the time stamp that follows and in the "o<number>"
# field, which increases by one from o104295 to o104304.
list_files =[
    "OMI-Aura_L2-OMAERO_2024m0223t0007-o104295_v003-2024m0223t055537.he5",
    "OMI-Aura_L2-OMAERO_2024m0223t0146-o104296_v003-2024m0223t072107.he5",
    "OMI-Aura_L2-OMAERO_2024m0223t0325-o104297_v003-2024m0223t092257.he5",
    "OMI-Aura_L2-OMAERO_2024m0223t0503-o104298_v003-2024m0223t110728.he5",
    "OMI-Aura_L2-OMAERO_2024m0223t0642-o104299_v003-2024m0223t142711.he5",
    "OMI-Aura_L2-OMAERO_2024m0223t0821-o104300_v003-2024m0223t160655.he5",
    "OMI-Aura_L2-OMAERO_2024m0223t1000-o104301_v003-2024m0223t161921.he5",
    "OMI-Aura_L2-OMAERO_2024m0223t1139-o104302_v003-2024m0223t174944.he5",
    "OMI-Aura_L2-OMAERO_2024m0223t1318-o104303_v003-2024m0223t190943.he5",
    "OMI-Aura_L2-OMAERO_2024m0223t1456-o104304_v003-2024m0223t224147.he5"
]

## Step 1: <font color="green"> Understand the structure of one data file</font>

In [None]:
# Full path of the first file in list_files; used below to explore the file layout.
fname = Path(data_dir) / list_files[0]

In [None]:
def print_attrs(name, obj):
    """
    Print one entry of an HDF5 file tree together with its attributes.

    Written to be passed to ``visititems``, which calls it once per object
    with the object's path and the object itself. The output is indented by
    one level per '/' found in the path.

    Parameters
    ----------
    name : str
         Path of the object inside the file
    obj : h5py group or dataset
         The object found at that path

    Returns
    -------
    None
    """
    indent = name.count('/') * '    '
    print(indent + name)
    if isinstance(obj, h5py.Dataset):
        # The shape is part of the dataset metadata: reading it does not
        # require loading the (possibly large) data array into memory.
        print(indent + '    ' + f"Shape: {obj.shape}")
    for key, val in obj.attrs.items():
        print(indent + '    ' + f"{key}: {val}")


# Walk through every group and dataset of the sample file.
with h5py.File(fname, mode='r') as fid:
    fid.visititems(print_attrs)

Please identify the datasets:

- `Time`
- `SpacecraftLatitude`
- `SpacecraftLongitude`
- `SingleScatteringAlbedoMW`
- `CloudPressure`

Please pay attention to the attributes of `SingleScatteringAlbedoMW` and `CloudPressure`, and the multi-wavelengths of `SingleScatteringAlbedoMW` (we will only use the first one here).

## Step 2: <font color="green">Write a simple code to get the `Time`, `SpacecraftLatitude`, `SpacecraftLongitude`, `SingleScatteringAlbedoMW` and `CloudPressure` data arrays from one file</font>

Write a function that takes as arguments a file name and a ground pixel identifier, and returns the values of the SingleScatteringAlbedoMW (first wavelength), the CloudPressure, the time, the latitude and the longitude:

```python

cloud_name = 'CloudPressure'
alb_name = 'SingleScatteringAlbedoMW'

def get_arrays(fname, ipxl):
    with h5py.File(fname, 'r') as fid:
        ...
    return scatt_alb, cloud_press, time, lats, lons
   ...
```
Test the function using any of the above files.

In [None]:
cloud_name = 'CloudPressure'
alb_name = 'SingleScatteringAlbedoMW'

def get_arrays(fname, ipxl):
    """
    Read the albedo, cloud pressure, time, latitude and longitude arrays
    from one file.

    Parameters
    ----------
    fname : str or Path
         Name of the file to open
    ipxl : int
         Index applied to the last axis of the albedo array and to the
         second axis of the latitude and longitude arrays

    Returns
    -------
    scatt_alb, cloud_pres, time, lats, lons : numpy arrays

    NOTE(review): the cloud pressure is returned whole (no `ipxl` selection)
    and the albedo keeps its first two axes, so these two arrays are not
    one-dimensional like lats and lons. The DataFrame built later needs one
    value per time step -- confirm the axis order from the Step 1 output and
    select a single ground pixel / wavelength accordingly.
    """
    with h5py.File(fname, 'r') as fid:
        fields = fid['HDFEOS/SWATHS/ColumnAmountAerosol/Data Fields']
        geoloc = fid['HDFEOS/SWATHS/ColumnAmountAerosol/Geolocation Fields']

        # Time and position.
        time = geoloc['Time'][()]
        lats = geoloc['Latitude'][()][:, ipxl]
        lons = geoloc['Longitude'][()][:, ipxl]

        # Data fields.
        cloud_pres = fields[cloud_name][()]
        scatt_alb = fields[alb_name][:, :, ipxl]

    return scatt_alb, cloud_pres, time, lats, lons

<details><summary><b><font color="purple">Click here to access the solution</font></b></summary>
<p>

```python
cloud_name = 'CloudPressure'
alb_name = 'SingleScatteringAlbedoMW'

def get_arrays(fname):
    with h5py.File(fname, 'r') as h5f:
        geol_group = h5f['HDFEOS/SWATHS/ColumnAmountAerosol/Geolocation Fields']
        data_group = h5f['HDFEOS/SWATHS/ColumnAmountAerosol/Data Fields']
        scatt_alb = data_group[alb_name][:,:,0]
        cloud_press = data_group[cloud_name][()]
        time = geol_group['Time'][()]
        lats = geol_group['SpacecraftLatitude'][()]
        lons = geol_group['SpacecraftLongitude'][()]
    return scatt_alb, cloud_press, time, lats, lons
```
</details>

#### Test your function here

In [None]:
# Read the arrays of the sample file for ground pixel index 0.
ipxl = 0
AL, CP, T,LA, LO = get_arrays(fname, ipxl)

# Show the shape of each returned array to check what was read.
print(f"Shape of {alb_name}: {AL.shape}")
print(f"Shape of {cloud_name}: {CP.shape}")
print(f"Shape of time:      {T.shape}")
print(f"Shape of latitude:  {LA.shape}")
print(f"Shape of longitude: {LO.shape}")

# Rebind the temporary names so they no longer reference the arrays.
AL, CP, T, LA, LO = None, None, None, None, None

## Step 3: <font color="green">Read the data files</font>

Write a script that loops over the data files and reads each of them to:
- Gather the time, latitude, longitude, cloud pressure and scattering albedo
   - For the scattering albedo, keep only the first wavelength
- Use the cloud pressure and scattering albedo attributes to restore the values
- Load the data in a Pandas DataFrame


In [None]:
def convert_dict_dtype(sample_dict):
    '''
    Converts attribute dictionary from NumPy data types
    to general Python data types

    The dictionary is updated in place and is also returned.

    - A NumPy array becomes a list; a one-element array becomes that element.
    - A NumPy byte string becomes a Python string. If the text starts with
      '(' or '{' and is a valid Python literal, it is parsed into the
      corresponding object (tuple, dict, ...); otherwise the string is kept.
    - Any other value is left unchanged.

    Parameters
    ----------
    sample_dict : dict
         A dictionary of attributes

    Returns
    -------
    sample_dict : dict
         The same dictionary, with converted values
    '''
    for key, item in sample_dict.items():
        if isinstance(item, np.ndarray):
            item = list(item)
            if len(item) == 1:
                item = item[0]
        elif isinstance(item, np.bytes_):
            # Undecodable bytes are replaced rather than aborting the conversion.
            item = item.decode('utf-8', errors='replace')

            # item[:1] is '' for an empty string, so empty attributes are safe.
            if item[:1] in ('(', '{'):
                # Attribute text comes from a data file: parse it as a literal
                # only. Unlike eval(), ast.literal_eval never executes code.
                try:
                    item = ast.literal_eval(item)
                except (ValueError, SyntaxError, TypeError):
                    pass   # Not a valid literal: keep the plain string

        # Only existing keys are reassigned, so the dictionary size does not
        # change while it is being iterated.
        sample_dict[key] = item

    return sample_dict

In [None]:
def get_ds_attrs(ds):
    """
       Collect the attributes of a dataset in a dictionary.

       The attributes are copied into a new dictionary, which is then
       passed through convert_dict_dtype before being returned.

       Input Parameters:
          - ds: dataset identifier
       Returned value:
          - ds_attrs: a dictionary
    """
    return convert_dict_dtype(dict(ds.attrs))

In [None]:
def get_ds_attribute_value(ds_attrs, attr_name):
    '''
    Obtain the value of a specified attribute in a dataset.

    Parameters
    ----------
    ds_attrs : dict
         A dictionary of dataset attributes
    attr_name : str
         Attribute name

    Returns
    -------
    value : float, int, str, list
         Value of the attribute. If attribute not available, None.
    '''
    # dict.get does the lookup directly and returns None for a missing key.
    return ds_attrs.get(attr_name)

In [None]:
def restore_data(ds):
    '''
    Restore the dataset data using the dataset attributes.

    Entries equal to the missing-value or the fill-value attribute are
    replaced with NaN. The offset and the scale factor, when present and
    non-zero, are then applied as ``(stored - offset) * scale``.

    Each attribute is searched under two spellings:
    ``MissingValue`` / ``missing_value``, ``scale_factor`` / ``ScaleFactor``
    and ``add_offset`` / ``Offset``. HDF-EOS Aura products are documented to
    use the capitalised names (NOTE(review): confirm with the attributes
    printed in Step 1).

    NOTE(review): ``(stored - offset) * scale`` is kept from the original
    code. Files that follow the CF convention define unpacking as
    ``stored * scale_factor + add_offset`` -- confirm the formula before
    using this function on such files.

    Parameters
    ----------
    ds : h5py dataset identifier

    Returns
    -------
    data : numpy array
         Floating-point copy of the dataset values
    '''
    ds_attrs = get_ds_attrs(ds)

    def lookup(*names):
        # Return the first attribute found among the alternative names.
        for name in names:
            value = get_ds_attribute_value(ds_attrs, name)
            if value is not None:
                return value
        return None

    fill_value = lookup('_FillValue')
    missing_value = lookup('MissingValue', 'missing_value')
    scale_factor = lookup('scale_factor', 'ScaleFactor')
    add_offset = lookup('add_offset', 'Offset')

    raw = ds[()]

    # Flag the sentinels on the stored values, before any type conversion,
    # so the comparison is made in the dataset's own data type.
    invalid = np.zeros(np.shape(raw), dtype=bool)
    for sentinel in (missing_value, fill_value):
        if sentinel is not None:
            invalid |= (raw == sentinel)

    # NaN needs a floating-point array, whatever the stored type is.
    data = np.where(invalid, np.nan, np.asarray(raw, dtype=float))

    if add_offset:
        data = data - add_offset
    if scale_factor:
        data = data * scale_factor

    return data

__Note that `SingleScatteringAlbedoMW` and `CloudPressure` have `MissingValue` and `_FillValue` attributes. Any entry equal to one of those values must be replaced with `NaN`.__

<font color="green">Rewrite the function in Step 2 to restore the `SingleScatteringAlbedoMW` and `CloudPressure` data</font>

<details><summary><b><font color="purple">Click here to access the solution</font></b></summary>
<p>

```python
def get_arrays(fname, ipxl):
    with h5py.File(fname, 'r') as h5f:
        geol_group = h5f['HDFEOS/SWATHS/ColumnAmountAerosol/Geolocation Fields']
        data_group = h5f['HDFEOS/SWATHS/ColumnAmountAerosol/Data Fields']
        scatt_alb = restore_data(data_group[alb_name])[:,0,ipxl]
        cloud_pres = restore_data(data_group[cloud_name])[:,ipxl]
        time = geol_group['Time'][()]
        lats = geol_group['SpacecraftLatitude'][()][:,ipxl]
        lons = geol_group['SpacecraftLongitude'][()][:,ipxl]
    return scatt_alb, cloud_pres, time, lats, lons
```
</details>

#### Test the function here

In [None]:
# Read the arrays of the sample file for ground pixel index 0.
# This calls whichever get_arrays definition was executed last.
ipxl = 0
AL, CP, T,LA, LO = get_arrays(fname, ipxl)

# Show the shape of each returned array to check what was read.
print(f"Shape of {alb_name}: {AL.shape}")
print(f"Shape of {cloud_name}: {CP.shape}")
print(f"Shape of time:      {T.shape}")
print(f"Shape of latitude:  {LA.shape}")
print(f"Shape of longitude: {LO.shape}")

# Rebind the temporary names so they no longer reference the arrays.
AL, CP, T, LA, LO = None, None, None, None, None

<font color="green">Write your script here:</font>

```python
first_iter = True
for i in range(len(list_files)):
    fname = Path(data_dir) / list_files[i]
    print(f"Reading: {fname}")
    ...
```

In [None]:
num_files = len(list_files)
ipxl = 0

# Collect the arrays of every file in lists and join each list once at the
# end. Concatenating inside the loop would copy everything accumulated so
# far at every iteration.
alb_parts, pres_parts, time_parts = [], [], []
lat_parts, lon_parts, traj_parts = [], [], []

for i in range(num_files):
    fname = Path(data_dir) / list_files[i]
    print(f"Reading: {fname}")
    AL, CP, T, LA, LO = get_arrays(fname, ipxl)
    traj = np.full_like(T, i+1, dtype=int) # Index to distinguish files.

    alb_parts.append(AL)
    pres_parts.append(CP)
    time_parts.append(T)
    lat_parts.append(LA)
    lon_parts.append(LO)
    traj_parts.append(traj)

scatt_alb = np.concatenate(alb_parts, axis=0)
cloud_pres = np.concatenate(pres_parts, axis=0)
time = np.concatenate(time_parts, axis=0)
lats = np.concatenate(lat_parts, axis=0)
lons = np.concatenate(lon_parts, axis=0)
traj_id = np.concatenate(traj_parts, axis=0)

# Drop the per-file temporaries; only the combined arrays are used afterwards.
AL, CP, T, LA, LO, traj = None, None, None, None, None, None
del alb_parts, pres_parts, time_parts, lat_parts, lon_parts, traj_parts

<details><summary><b><font color="purple">Click here to access the solution</font></b></summary>
<p>

```python
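num_files = len(list_files)
ipxl = 0
first_iter = True
for i in range(num_files):
    fname = Path(data_dir) / list_files[i]
    print(f"Reading: {fname}")
    AL, CP, T, LA, LO = get_arrays(fname, ipxl)
    traj = np.full_like(T, i+1, dtype=int) # Index to distinguish files.
    if first_iter:
        first_iter = False
        scatt_alb, cloud_pres, time, lats, lons = AL, CP, T, LA, LO
        traj_id = traj
    else:
        scatt_alb = np.concatenate((scatt_alb, AL), axis=0)
        cloud_pres = np.concatenate((cloud_pres, CP), axis=0)
        time = np.concatenate((time, T), axis=0)
        lats = np.concatenate((lats, LA), axis=0)
        lons = np.concatenate((lons, LO), axis=0)
        traj_id = np.concatenate((traj_id, traj), axis=0)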
```
</details>

## Step 4: <font color="green">Create the Pandas DataFrame</font>


Convert the time (GPS unit) to a datetime object:

In [None]:
# The `Time` field of OMI swath files is documented as TAI93: seconds elapsed
# since 1993-01-01 00:00:00, counted without leap-second adjustment.
# NOTE(review): confirm with the attributes of `Time` printed in Step 1.
tai93_epoch = dt.datetime(1993, 1, 1)

# Leap seconds inserted since the TAI93 epoch: TAI-UTC is 37 s now and was
# 27 s on 1993-01-01. This value holds for dates from 2017-01-01 onwards.
leap_seconds = 37 - 27

# One formatted UTC time stamp (string) per time value.
Times = np.zeros_like(time, object)
for j, seconds in enumerate(time):
    utc = tai93_epoch + dt.timedelta(seconds=float(seconds) - leap_seconds)
    Times[j] = utc.strftime("%Y-%m-%d %H:%M:%S.%f")

In [None]:
# One row per time value; the columns are listed in display order.
df_omi = pd.DataFrame(
    dict(
        [
            ("latitude", lats),
            ("longitude", lons),
            (alb_name, scatt_alb),
            (cloud_name, cloud_pres),
            ("t", Times),
            ("traj_id", traj_id),
        ]
    )
)

df_omi

In [None]:
# Summarise the columns of the frame: names, non-null counts and data types.
df_omi.info()

In [None]:
#df_omi['longitude'] = df_omi['longitude']%360

## Step 5: <font color="green">Create the MovingPandas trajectory</font>

In [None]:
# Build one trajectory per value of the "traj_id" column (one value per file).
# `traj_id_col`, `x`, `y` and `t` all take column *names* of df_omi, so the
# name "traj_id" is passed here rather than the traj_id array.
traj_omi = mpd.TrajectoryCollection(
    df_omi,
    traj_id_col="traj_id",
    x="longitude",
    y="latitude",
    t="t",
)

In [None]:
# Display the representation of the trajectory collection.
traj_omi

## Step 6: <font color="green">Perform analyses and visualization</font>