In [1]:
%pip install simplekml
%pip install geopy
%pip install -U kaleido

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [8]:
%%time
from sliderule import sliderule, icesat2, earthdata
import geopandas as gpd
import pandas as pd
import folium
import numpy as np
from shapely.geometry import Polygon, Point, mapping
import os
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import json
from sklearn.linear_model import LinearRegression
import geopy
import simplekml
from geopy.distance import geodesic
from matplotlib.backends.backend_pdf import PdfPages
import plotly.express as px
from tqdm import tqdm
import warnings
from scipy.stats import linregress
import statsmodels.api as sm
import dask.array as da
from dask import delayed

CPU times: user 58 µs, sys: 0 ns, total: 58 µs
Wall time: 60.6 µs


In [None]:
from dask.distributed import Client
# Start the Dask client, this will use your local machine's resources
client = Client()
print(client)

In [3]:
segment_meter = "20"
cnt = "10"
ats = "10"

site_name = "BONA"
# site_name = "DEJU"
# site_name = "WREF"
# site_name = "RMNP"
# site_name = "TEAK"

boundary_km = "8"

year_folder = 'year/'
day_folder = 'day/'

In [6]:
def compile_dates_for_site(folder, site_name):
    """
    Reads all CSV files for the specified site in a folder, extracts the 'date' column,
    and compiles a unique list of dates.

    Args:
        folder (str): The directory containing the CSV files.

    Returns:
        list: A sorted list of unique dates for the specified site.
    """
    print(site_name)
    all_dates = set()  # Use a set to ensure unique dates

    # Iterate through all files in the folder
    for filename in os.listdir(folder):
        if filename.startswith(site_name) and filename.endswith(".csv"):  # Match site_name in the file name
            filepath = os.path.join(folder, filename)
            try:
                # Read the CSV file into a DataFrame
                df = pd.read_csv(filepath)
                if 'date' in df.columns and 'ground_photon_count' in df.columns and 'canopy_photon_count' in df.columns:
                    # Filter rows where both ground_photon_count and canopy_photon_count are 0
                    filtered_df = df[(df['ground_photon_count'] != 0) | (df['canopy_photon_count'] != 0)]
                    # Add unique dates directly from the 'date' column as strings
                    all_dates.update(filtered_df['date'].dropna().unique())
            except Exception as e:
                print(f"Error processing file {filename}: {e}")

    # Return sorted list of unique dates
    return sorted(all_dates)

site_dates = compile_dates_for_site(year_folder, site_name)
print(site_dates)

BONA
['2019-01-24', '2019-02-12', '2019-02-26', '2019-03-13', '2019-04-25', '2019-05-28', '2019-06-12', '2019-06-26', '2019-07-15', '2019-08-26', '2019-09-11', '2019-09-24', '2019-11-12', '2019-12-11', '2019-12-24', '2020-01-13', '2020-01-22', '2020-02-11', '2020-02-24', '2020-03-11', '2020-04-22', '2020-05-11', '2020-07-12', '2020-08-10', '2020-09-22', '2020-11-09', '2020-11-23', '2020-12-08', '2021-02-08', '2021-02-21', '2021-03-09', '2021-03-22', '2021-06-21', '2021-07-20', '2021-09-06', '2021-10-19', '2021-11-07', '2022-02-20', '2022-03-07', '2022-03-21', '2022-04-19', '2022-05-21', '2022-06-06', '2022-06-19', '2022-08-07', '2022-08-20', '2022-09-05', '2022-09-18', '2022-10-08', '2022-10-17', '2022-11-06', '2022-11-19', '2022-12-05', '2022-12-18', '2023-01-16', '2023-01-20', '2023-01-31', '2023-02-04', '2023-02-18', '2023-03-05', '2023-03-19', '2023-04-17', '2023-04-21', '2023-05-06', '2023-05-20', '2023-07-17', '2023-08-05', '2023-09-16', '2023-10-15', '2023-11-04', '2023-11-13']


In [11]:
def generate_date_time_ranges(dates):
    """
    Generate a dictionary of start and end times for each unique date.

    Args:
        dates (list of str): List of date strings in the format "YYYY-MM-DD".
        
    Returns:
        list of dict: A list of dictionaries where each dictionary represents a time range
                      with 'start_time' and 'end_time' for a specific date.
    """
    time_ranges = []
    for date in dates:
        start_time = f"{date}T00:00:00Z"
        end_time = f"{date}T23:59:59Z"
        
        # Append the date with its start_time and end_time to the time_ranges list
        time_ranges.append({
            "date": date, 
            "start_time": start_time, 
            "end_time": end_time
        })
    return time_ranges

site_time_ranges = generate_date_time_ranges(site_dates)
print(site_name)
# for date in site_time_ranges:
#     print(date)
print(len(site_time_ranges))

BONA
71


In [None]:
%%time
@delayed
def process_time_range(time_range, site, boundary_km, cnt, ats, segment_meter):
    """
    Processes ATL03 data for a specific time range.
    """
    # Load the region dynamically based on the site
    region = sliderule.toregion(f"geojson_files/{site}_buffer_{boundary_km}km.geojson")
    
    # Construct the parameters for the current time range
    parms = {
        "poly": region['poly'],       # Region polygon
        "t0": time_range['start_time'],   # Start time
        "t1": time_range['end_time'],     # End time
        "srt": icesat2.SRT_LAND,           # Surface return type
        "cnf": 0,                    # Confidence level
        "cnt": cnt,                  # Number of photons
        "ats": ats,                  # Along-track spacing
        "len": segment_meter,        # Segment length
        "res": segment_meter,        # Resolution
        "atl08_class": [             # ATL08 classifications
            "atl08_noise",
            "atl08_ground",
            "atl08_canopy",
            "atl08_top_of_canopy",
            "atl08_unclassified"
        ],
        "phoreal": {                 # Phoreal processing settings
            "binsize": 1.0,
            "geoloc": "mean",
            "use_abs_h": True,
            "send_waveform": True
        }
    }

    # Request the ATL03 data using icesat2
    atl03_data = icesat2.atl03sp(parms, keep_id=True)
    
    # If only the 'geometry' column remains, skip this date
    if atl03_data.shape[1] == 1 and 'geometry' in atl03_data.columns:
        print(f"Skipping {time_range['date']} because it only contains geometry data.")
        return None  # Return None for skipped data
    
    columns_to_drop = [
        'region', 'pair', 'segment_dist', 'segment_id', 'cycle', 'track', 'background_rate', 'y_atc', 
        'yapc_score', 'atl03_cnf', 'relief', 'quality_ph'
    ]
    
    # Drop the columns from the DataFrame
    atl03_data = atl03_data.drop(columns=columns_to_drop)
    
    # Return the processed data along with the date key
    return time_range['date'], atl03_data

def process_site(site, time_ranges, boundary_km, cnt, ats, segment_meter):
    """
    Processes a site by requesting ATL03 data for specified time ranges and returns a dictionary of processed data.
    This function now uses Dask for parallel processing.
    """
    # Initialize an empty dictionary to store processed data
    site_data = {}

    # Create a list of delayed tasks for each time range
    tasks = []
    for time_range in time_ranges:
        task = process_time_range(time_range, site, boundary_km, cnt, ats, segment_meter)
        tasks.append(task)
    
    # Compute the results in parallel using Dask
    results = client.compute(tasks, sync=True)  # The 'sync=True' option will block and collect results

    # Store the processed data in the dictionary with the date as the key
    for date, atl03_data in results:
        if atl03_data is not None:  # Skip any None values returned for skipped dates
            site_data[date] = atl03_data

    return site_data

site_data = process_site(site_name, site_time_ranges, boundary_km, cnt, ats, segment_meter)

In [3]:
%%time

# For testing the new moving window functionality

segment_meter = "20"
cnt = "10"
ats = "10"
start_time = "2022-10-01T00:00:00Z"
end_time =   "2023-05-31T23:59:59Z"

site_name = "BONA"
boundary_km = "8"

region = sliderule.toregion(f"geojson_files/{site_name}_buffer_{boundary_km}km.geojson")

parms = {
    "poly": region['poly'],# Polygon defining the spatial region of interest
    "t0": start_time, # Start time of the data collection period (ISO 8601 format)
    "t1": end_time, # End time of the data collection period (ISO 8601 format)
    "srt": icesat2.SRT_LAND, # Surface return type: specifies the data type (e.g., LAND, ICE, etc.)
    "cnf": 0, # Confidence level for photon classification (e.g., 0: all photons)
    "cnt": cnt, # Number of photons required for the analysis (optional)
    "ats": ats, # Along-track spacing for the data (optional, controls density of data)
    "len": segment_meter,# Segment length in meters, used for segmentation 
    "res": segment_meter, # Resolution of the output, in meters
    # List of ATL08 photon classifications to include in the data
    "atl08_class": [
        "atl08_noise",        # Photons classified as noise
        "atl08_ground",       # Ground-level photons
        "atl08_canopy",       # Photons classified as canopy
        "atl08_top_of_canopy",# Photons classified as the top of the canopy
        "atl08_unclassified"  # Unclassified photons
    ],
    # Phoreal settings for processing photon data
    "phoreal": {
        "binsize": 1.0,          # Bin size for photon aggregation (e.g., 1 meter)
        "geoloc": "mean",        # Method for geolocation aggregation (e.g., mean)
        "use_abs_h": True,       # Whether to use absolute heights in calculations
        "send_waveform": True    # Whether to include waveform data in the output
    }
}

CPU times: user 28.4 ms, sys: 4.48 ms, total: 32.9 ms
Wall time: 36.5 ms


In [4]:
%%time
# Retrieve ATL03 data, retaining the extent ID
atl03_data = icesat2.atl03sp(parms, keep_id=True)

CPU times: user 44.3 s, sys: 1.42 s, total: 45.8 s
Wall time: 47.2 s


In [5]:
print (atl03_data.columns.tolist())

['cycle', 'rgt', 'track', 'sc_orient', 'solar_elevation', 'segment_dist', 'segment_id', 'background_rate', 'extent_id', 'pair', 'region', 'atl08_class', 'atl03_cnf', 'height', 'relief', 'yapc_score', 'y_atc', 'landcover', 'snowcover', 'quality_ph', 'x_atc', 'geometry', 'spot']


In [6]:
%%time
# Group by 'extent_id' and 'x_atc' to define each photon group
def get_photons_per_set(df, distance_window=5):
    """
    Group data by 'x_atc' distance window, creating sets based on distance (e.g., 5 meters per window) 
    and considering unique 'extent_id'.
    """
    df = df.sort_values(by=['extent_id', 'x_atc'])  # Ensure sorted by extent and along-track distance
    df['distance_group'] = df.groupby('extent_id')['x_atc'].transform(lambda x: (x // distance_window).astype(int))
    return df

# Apply rolling window (moving average) for ground height data
def moving_window_distance(df, window_size=3, min_photons=2):
    """
    Calculate average ground height using a moving window on 'distance_group', ensuring enough ground photons.
    """
    # Filter for ground photons
    ground_df = df[df['atl08_class'] == 1].copy()  # Ground photons only

    # Apply rolling average calculation for ground heights
    ground_df['avg_ground_height'] = ground_df.groupby(['extent_id', 'distance_group'])['height'].transform(
        lambda x: x.rolling(window=window_size, min_periods=min_photons).mean()
    )

    # Handle groups with insufficient photons
    ground_df['avg_ground_height'] = ground_df.groupby(['extent_id', 'distance_group'])['height'].transform(
        lambda x: np.nan if len(x) < min_photons else x.mean()
    )

    # Fill NaN values using forward and backward fills
    ground_df['avg_ground_height'] = ground_df['avg_ground_height'].ffill().bfill()

    # Merge back to full DataFrame
    df = pd.merge(df, ground_df[['extent_id', 'distance_group', 'avg_ground_height']], 
                  on=['extent_id', 'distance_group'], how='left')
    return df

# Compute relative height by subtracting average ground height
def calculate_relative_height(df):
    """
    Subtract average ground height from each photon to compute relative height.
    """
    df['relative_height'] = df['height'] - df['avg_ground_height']
    df['relative_height'] = df['relative_height'].fillna(0)  # Replace NaNs with 0 for cleaner output
    return df

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 9.78 µs


In [8]:
%%time
# Initialize an empty list to collect processed data
processed_data_list = []

for extent_id in tqdm(atl03_data['extent_id'].unique(), desc="Processing Extents"):
    extent_data = atl03_data[atl03_data['extent_id'] == extent_id]

    # Process segment data
    extent_data = get_photons_per_set(extent_data, distance_window=5)  # 5-meter window
    extent_data = moving_window_distance(extent_data, window_size=3, min_photons=2)
    extent_data = calculate_relative_height(extent_data)

    # Append processed data to the list
    processed_data_list.append(extent_data)

Processing Extents: 100%|██████████| 12784/12784 [03:01<00:00, 70.35it/s]

CPU times: user 3min, sys: 1.55 s, total: 3min 2s
Wall time: 3min 1s





In [18]:
# Group by extent_id and count the number of NaNs in 'relative_height'
na_counts = processed_atl03_data.groupby('extent_id')['relative_height'].apply(lambda x: x.isna().sum())

# Convert to a DataFrame for easier inspection
na_df = na_counts.reset_index(name='na_count')

# Count the total number of non-NaN values for each extent_id
extent_counts = processed_atl03_data.groupby('extent_id')['relative_height'].count().reset_index(name='total_count')

# Merge the two DataFrames to align the counts
merged_df = pd.merge(na_df, extent_counts, on='extent_id')

# Number of extents with all NaN values in 'relative_height'
all_na_extents = merged_df[merged_df['na_count'] == merged_df['total_count']].shape[0]

# Number of extents with some NaN values in 'relative_height'
some_na_extents = merged_df[merged_df['na_count'] > 0].shape[0]

# Total number of extents
total_extents = merged_df.shape[0]

# Display the counts
print(f"Total extents: {total_extents}")
print(f"Extents with all NaN values in 'relative_height': {all_na_extents}")
print(f"Extents with some NaN values in 'relative_height': {some_na_extents}")


Total extents: 772
Extents with all NaN values in 'relative_height': 0
Extents with some NaN values in 'relative_height': 0
