<center>
<table>
  <tr>
    <td><img src="https://portal.nccs.nasa.gov/datashare/astg/training/python/logos/nasa-logo.svg" width="100"/> </td>
     <td><img src="https://portal.nccs.nasa.gov/datashare/astg/training/python/logos/ASTG_logo.png?raw=true" width="80"/> </td>
     <td> <img src="https://www.nccs.nasa.gov/sites/default/files/NCCS_Logo_0.png" width="130"/> </td>
    </tr>
</table>
</center>

        
<center>
<h2><font color= "blue" size="+3">PyCon 2024 Tutorial</font></h2>
</center>

---

<center>
    <h3>Python Workflows to Extract and Plot Satellite Data Products along Tracks</h3>
    <h2><font color="red" size="+3">Tracking the Movement of the Aura Satellite - Exercise</font></h2>
</center>

_______

# <font color="red"> Objectives</font>

Use a collection of files to:
- Gather timeseries data (time, location, value) of surface temperature
- Plot the data on a map

---

----

In [None]:
import warnings
# Ignore every warning for the rest of the session. This shortens the cell
# output but also hides deprecation notices from the libraries used below.
warnings.filterwarnings("ignore")

In [None]:
import ast
import datetime as dt
import os
from pathlib import Path

In [None]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker

In [None]:
import numpy as np
import h5py
import pandas as pd
import geopandas as gpd

In [None]:
from shapely import geometry as shpgeom
from shapely import wkt as shpwkt

In [None]:
import movingpandas as mpd

In [None]:
import holoviews as hv

In [None]:
import hvplot.pandas 

In [None]:
# Keyword defaults for static plots (not referenced by the cells shown in this notebook).
plot_defaults = {'linewidth':5, 'capstyle':'round', 'figsize':(9,3), 'legend':True}
# Default options for every HoloViews Overlay: wheel zoom active, 500 x 400 frame.
hv.opts.defaults(hv.opts.Overlay(active_tools=['wheel_zoom'], 
                              frame_width=500, frame_height=400))
# Keyword defaults for hvplot calls (not referenced by the cells shown in this notebook).
hvplot_defaults = {'tiles':None, 'cmap':'Viridis', 'colorbar':True}

In [None]:
# Report the MovingPandas version information for this environment.
mpd.show_versions()

### Data Files

In [None]:
# Directory that holds the OMI data files.
# Set the OMI_DATA_DIR environment variable to point at your own copy of the
# data; when it is not set, the original tutorial location is used.
data_dir = os.environ.get(
    "OMI_DATA_DIR",
    "/Users/jkouatch/myTasks/PythonTraining/ASTG606/Materials/sat_data/OMI_Data/",
)
# Alternative location:
#data_dir = "/tljh-data/sat_data/OMI_Data"

In [None]:
# Daily files dated 2024-01-01 through 2024-01-10; the names differ only in
# the two-digit day of the month.
list_files = [
    f"TROPESS_OMI-Aura_L2_Standard_O3_202401{day:02d}_MUSES_R1p22_FS_F0p9_J1.nc"
    for day in range(1, 11)
]

## Step 1: <font color="green"> Understand the structure of one data file</font>

In [None]:
# Full path of the first file in list_files.
fname = Path(data_dir) / list_files[0]

In [None]:
def print_attrs(name, obj):
    """
    Print the path of an HDF5 object, its shape (datasets only) and its attributes.

    The output is indented by four spaces per '/' in `name`, so nested
    objects appear under their parent group. The function is passed below
    to `visititems`, which supplies both arguments.

    Parameters
    ----------
    name : str
         Path of the object inside the file
    obj : h5py object
         The object found at `name`
    """
    indent = name.count('/') * '    '
    detail_indent = indent + '    '
    print(indent + name)
    if isinstance(obj, h5py.Dataset):
        # Ask the dataset for its shape instead of reading it with obj[()]:
        # reading loads every value just to measure it, and fails when the
        # value read back has no `.shape` attribute.
        print(detail_indent + f"Shape: {obj.shape}")
    for key, val in obj.attrs.items():
        print(detail_indent + f"{key}: {val}")
        
# Open the file read-only and call print_attrs on every object that
# visititems reaches.
with h5py.File(fname, mode='r') as fid:
    fid.visititems(print_attrs)  

Please identify the datasets:

- `latitude`
- `longitude`
- `time`
- `surface temperature`

Please pay attention to the unit of `time` (seconds since 1993-01-01) and the attributes of the surface temperature.

## Step 2: <font color="green">Write a simple code to get the `time`, `latitude`, `longitude` and `surface temperature` data arrays from one file</font>

Write a function that takes a file name as its argument and returns the surface temperature, the time, the latitude and the longitude:

```python
def get_arrays(fname):
    with h5py.File(fname, 'r') as fid:
        ...
    return surf_temp, time, lats, lons
   ...
```
Test the function using any of the files listed above.

In [None]:
def get_arrays(fname):
    """
    Read the surface temperature, time, latitude and longitude from one file.

    The values are returned exactly as stored in the file.

    Parameters
    ----------
    fname : str or Path
         Name of the file to read

    Returns
    -------
    surf_temp, time, lats, lons
         Contents of the four datasets, in that order
    """
    dataset_paths = (
        'geophysical/surface_temperature',
        'time',
        'latitude',
        'longitude',
    )
    with h5py.File(fname, 'r') as fid:
        # The generator is consumed by the unpacking, while the file is still open.
        surf_temp, time, lats, lons = (fid[path][()] for path in dataset_paths)
    return surf_temp, time, lats, lons

In [None]:
X, Y, Z, W = get_arrays(fname)

# Labels are padded to a common width so the shapes line up in one column.
for label, arr in (("surf_temp:", X), ("time:", Y), ("latitude:", Z), ("longitude:", W)):
    print(f"Shape of {label:<11}{arr.shape}")

<details><summary><b><font color="green">Click here to access the solution</font></b></summary>
<p>

```python
def get_arrays(fname):
    with h5py.File(fname, 'r') as fid:
        surf_temp = fid['geophysical/surface_temperature'][()]
        time = fid['time'][()]
        lats = fid['latitude'][()]
        lons = fid['longitude'][()]
    return surf_temp, time, lats, lons


X, Y, Z, W = get_arrays(fname)

print(f"Shape of surf_temp: {X.shape}")
print(f"Shape of time:      {Y.shape}")
print(f"Shape of latitude:  {Z.shape}")
print(f"Shape of longitude: {W.shape}")
```
</details>

## Step 3: <font color="green">Read the data files</font>

Write a script that loops over the data files and reads each of them to:
- Gather the time, latitude, longitude and surface temperature
- Use the surface temperature attributes 
- Load the data in a Pandas DataFrame


In [None]:
def convert_dict_dtype(sample_dict):
    '''
    Converts attribute dictionary from NumPy data types 
    to general Python data types

    The dictionary is modified in place and the same object is returned.

    Parameters
    ----------
    sample_dict : dict
         A dictionary of attributes
         
    Returns
    -------
    sample_dict : dict
         The same dictionary, with array values turned into lists (or into
         their single element) and byte strings turned into str, tuple or
         dict values
    '''
    for key, item in sample_dict.items():
        if isinstance(item, np.ndarray):
            # Array -> list; a one-element list collapses to that element.
            item = list(item)

            if len(item) == 1:
                item = item[0]
        elif isinstance(item, np.bytes_):
            # NumPy bytes -> NumPy string -> Python string.
            item = str(item.astype('str'))

            # Text that looks like a tuple or dict literal is parsed into one.
            # ast.literal_eval accepts literals only, so text read from a
            # file cannot run code the way eval() allowed. startswith also
            # copes with an empty string, where item[0] raised IndexError.
            if item.startswith(('(', '{')):
                try:
                    item = ast.literal_eval(item)
                except (ValueError, SyntaxError, TypeError):
                    # Not a valid literal: keep the plain string.
                    pass

        # Only existing keys are reassigned, so updating the dictionary
        # while iterating over it is safe.
        sample_dict[key] = item

    return sample_dict

In [None]:
def get_ds_attrs(ds):
    """
       Return the attributes of a dataset as a dictionary.

       The attributes are copied into a new dictionary, which is then
       passed through convert_dict_dtype.

       Input Parameters:
          - ds: dataset identifier
       Returned value:
          - ds_attrs: a dictionary
    """
    return convert_dict_dtype(dict(ds.attrs))

In [None]:
def get_ds_attribute_value(ds_attrs, attr_name):
    '''
    Obtain the value of a specified attribute in a dataset.
    
    Parameters
    ----------
    ds_attrs : dict
         A dictionary of dataset attributes
    attr_name : str
         Attribute name    
    
    Returns
    -------
    value : float, int, str, list
         Value of the attribute. If attribute not available, None.
    '''
    # A direct dictionary lookup replaces the scan over every item.
    return ds_attrs.get(attr_name)

In [None]:
def restore_data(ds):
    '''
    Read a dataset and replace its fill values with NaN.

    The fill value is taken from the dataset's `_FillValue` attribute.
    When that attribute is absent the lookup yields None, and the values
    are compared against None instead.

    Parameters
    ----------
    ds : h5py dataset identifier

    Returns
    -------
    data : numpy array
         The dataset values, with NaN wherever a value equals the fill value
    '''
    fill_value = get_ds_attribute_value(get_ds_attrs(ds), '_FillValue')

    raw = ds[()]
    is_valid = raw != fill_value

    return np.where(is_valid, raw, np.nan)

__Note that the surface temperature has a `_FillValue` attribute. Any entry equal to that value must be replaced with `NaN`.__

<font color="green">Rewrite the function in Step 2 to restore the surface temperature data</font>

<details><summary><b><font color="green">Click here to access the solution</font></b></summary>
<p>

```python
def get_arrays(fname):
    with h5py.File(fname, 'r') as fid:
        surf_temp = restore_data(fid['geophysical/surface_temperature'])
        time = fid['time'][()]
        lats = fid['latitude'][()]
        lons = fid['longitude'][()]
    return surf_temp, time, lats, lons


X, Y, Z, W = get_arrays(fname)

print(f"Shape of surf_temp: {X.shape}")
print(f"Shape of time:      {Y.shape}")
print(f"Shape of latitude:  {Z.shape}")
print(f"Shape of longitude: {W.shape}")
```
</details>

In [None]:
def get_arrays(fname):
    """
    Read the surface temperature, time, latitude and longitude from one file.

    The surface temperature goes through restore_data; the other three
    datasets are returned exactly as stored.

    Parameters
    ----------
    fname : str or Path
         Name of the file to read

    Returns
    -------
    surf_temp, time, lats, lons
         Contents of the four datasets, in that order
    """
    with h5py.File(fname, 'r') as fid:
        surf_temp = restore_data(fid['geophysical/surface_temperature'])
        # The generator is consumed by the unpacking, while the file is still open.
        time, lats, lons = (fid[path][()] for path in ('time', 'latitude', 'longitude'))
    return surf_temp, time, lats, lons

In [None]:
X, Y, Z, W = get_arrays(fname)

# Labels are padded to a common width so the shapes line up in one column.
for label, arr in (("surf_temp:", X), ("time:", Y), ("latitude:", Z), ("longitude:", W)):
    print(f"Shape of {label:<11}{arr.shape}")

<font color="green">Write your script here:</font>

```python
first_iter = True
for i in range(len(list_files)):
    fname = Path(data_dir) / list_files[i]
    print(f"Reading: {fname}")
    ...
```

In [None]:
# Number of files to read. All of them by default; lower this value for a
# quicker run on the first few files only.
num_files = len(list_files)

# Collect one (surf_temp, time, lats, lons) tuple per file, then join each
# variable once at the end instead of re-concatenating on every iteration.
per_file_arrays = []
for file_name in list_files[:num_files]:
    fname = Path(data_dir) / file_name
    print(f"Reading: {fname}")
    per_file_arrays.append(get_arrays(fname))

# zip(*...) regroups the per-file tuples into one sequence per variable.
surf_temp, time, lats, lons = (
    np.concatenate(parts, axis=0) for parts in zip(*per_file_arrays)
)

<details><summary><b><font color="green">Click here to access the solution</font></b></summary>
<p>

```python
num_files = len(list_files)
first_iter = True
for i in range(num_files):
    fname = Path(data_dir) / list_files[i]
    print(f"Reading: {fname}")
    X, Y, Z, W = get_arrays(fname)
    if first_iter:
        first_iter = False
        surf_temp, time, lats, lons = X, Y, Z, W
    else:
        surf_temp = np.concatenate((surf_temp, X), axis=0)
        time = np.concatenate((time, Y), axis=0)
        lats = np.concatenate((lats, Z), axis=0)
        lons = np.concatenate((lons, W), axis=0)
```
</details>

## Step 4: <font color="green">Create the Pandas DataFrame</font>


Convert the time (GPS unit) to a datetime object:

In [None]:
# Reference date for the time values (seconds since 1993-01-01).
gps_epoch = dt.datetime(1993, 1, 1)
# Constant number of seconds subtracted from every time value.
# NOTE(review): 35 - 19 looks like a leap-second correction — confirm that it
# is the right offset for these files.
offset_seconds = 35 - 19

# One formatted time stamp string per entry of `time`.
Times = np.zeros_like(time, object)
for idx, seconds in enumerate(time):
    moment = gps_epoch + dt.timedelta(seconds=seconds - offset_seconds)
    Times[idx] = moment.strftime("%Y-%m-%d %H:%M:%S.%f")

In [None]:
# One row per observation: position, surface temperature and time stamp string.
omi_columns = {
    "latitude": lats,
    "longitude": lons,
    "surf_temp": surf_temp,
    "t": Times,
}
df_omi = pd.DataFrame(omi_columns)
df_omi

In [None]:
# Summarize the columns of the DataFrame (dtypes and non-null counts).
df_omi.info()

In [None]:
# Wrap the longitudes into the range [0, 360).
df_omi['longitude'] = df_omi['longitude'].mod(360)

## Step 5: <font color="green">Create the MovingPandas trajectory</font>

In [None]:
# Names of the DataFrame columns that hold the position and the time.
traj_columns = {"x": "longitude", "y": "latitude", "t": "t"}

traj_omi = mpd.Trajectory(df_omi, traj_id=1, **traj_columns)

In [None]:
# Display the DataFrame held by the trajectory.
traj_omi.df

## Step 6: <font color="green">Perform analyses and visualization</font>

Timeseries plot:

In [None]:
# Plot surface temperature against the DataFrame index
# (presumably the time stamps — confirm).
traj_omi.df['surf_temp'].plot()
# Rotate the x tick labels by 90 degrees; the trailing ';' hides the returned value.
plt.xticks(rotation=90);

Histogram:

In [None]:
# Histogram of the surface temperature values on a 12 x 8 figure.
traj_omi.df['surf_temp'].plot(kind='hist', figsize=(12,8));

In [None]:
# Static plot of the trajectory with the default settings.
traj_omi.plot();

In [None]:
# Draw the trajectory on a 12 x 10 figure, coloured by surface temperature.
fig, ax = plt.subplots(figsize=(12,10))

traj_omi.plot(
    ax=ax,
    column="surf_temp",
    cmap="jet",
    capstyle='round',
    legend=True,
);

In [None]:
# Interactive plot of the trajectory with the "ESRI" tile background.
traj_omi.hvplot(tiles="ESRI")

In [None]:
# Options for the interactive plot: columns shown on hover and a 300 x 300 frame.
hv_kwargs = {
    "hover_cols": ["latitude", "longitude"],
    "frame_height": 300,
    "frame_width": 300,
}

traj_omi.hvplot(**hv_kwargs)