### Atlas 14 ASC Grid Postprocessing Script
Script 2/3 for Atlas 14 Spatial Variance Analysis

Author: William (Bill) Katzenmeyer, P.E., C.F.M. (C.H. Fenstermaker and Associates, LLC) 

Source: https://github.com/billk-FM/HEC-Commander-Tools

#### NOAA Data Source:
https://hdsc.nws.noaa.gov/pub/hdsc/data/tx/

In [None]:
#1 Inputs: watershed polygons, state polygon, and the two example ASC grids used for the general figures
import os

watershed_boundary_file = 'Region4_HUC_Boundaries.geojson'
state_boundary_file = 'State_Boundary.geojson'
asc_file_name_1 = 'LWI_Region4/se50yr06ha/se50yr06ha.asc'
asc_file_name_2 = 'LWI_Region4/tx50yr06ha/tx50yr06ha.asc'

# EPSG code assumed for the ASC grids, followed by the EPSG code that all
# script operations and outputs are reprojected to
asc_file_default_EPSG = "4269"
reproject_to_epsg = "4269"

# Folder holding the combined ASC file datasets (this should come from a previous step on revision)
input_directory = 'LWI_Region4'

# Base folder that is scanned for the per-grid subfolders
base_folder = 'LWI_Region4'

# PNG and CSV outputs are collected in a subfolder of the input directory
output_directory = os.path.join(input_directory, 'Watershed_Statistical_Analysis')

In [None]:
#2 Automatically Import and Install Libraries
import subprocess
import sys

def install_and_import(package_name, import_name=None):
    """
    Import a module, installing its package with pip first if the import fails.

    Parameters:
    package_name (str): Name handed to ``pip install`` when the import fails.
    import_name (str, optional): Dotted module name to import. Defaults to
        package_name.

    Returns:
    module: The imported module (the submodule itself when import_name is dotted).

    Raises:
    subprocess.CalledProcessError: If ``pip install`` exits with a non-zero status.
    ImportError: If the module still cannot be imported after installation.

    Side effect: the top-level package of import_name is bound in this module's
    globals, the same name a plain ``import a.b`` statement would bind.
    """
    import importlib

    if import_name is None:
        import_name = package_name
    try:
        module = importlib.import_module(import_name)
    except ImportError:
        subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])
        # The import system caches directory listings; refresh them so the
        # freshly installed package is found in this same process.
        importlib.invalidate_caches()
        module = importlib.import_module(import_name)

    # Bind on both the "already installed" and "just installed" paths so the
    # function behaves the same either way.
    top_level_name = import_name.split(".")[0]
    globals()[top_level_name] = sys.modules[top_level_name]
    return module

# Installation and import statements.
# The first argument is the name given to "pip install"; the optional second
# argument is the module name to import when it differs from the pip name.
# A dotted module path such as "matplotlib.pyplot" is not an installable
# package name, so only distribution names are passed as the first argument.
# os, json, shutil and pathlib ship with Python; those calls only confirm that
# the modules import.
install_and_import("os")
install_and_import("numpy")
install_and_import("rioxarray")
install_and_import("matplotlib")
install_and_import("geopandas")
install_and_import("pyproj")
install_and_import("json")
install_and_import("shapely")
install_and_import("xarray")
install_and_import("affine")
install_and_import("rasterio")
install_and_import("tqdm")
install_and_import("shutil")
install_and_import("pandas")
install_and_import("pathlib")
install_and_import("ipython", "IPython")

# Import statements
import os
import numpy as np
import rioxarray
import matplotlib.pyplot as plt
import geopandas as gpd
from pyproj import CRS
import json
from shapely.geometry import shape
import xarray as xr
from affine import Affine
from rasterio.enums import Resampling
from tqdm import tqdm
import rasterio
import shutil
from rasterio.transform import from_origin
import pandas as pd
from pathlib import Path
from IPython.display import display
import geopandas as gpd
import rioxarray
import xarray as xr
import matplotlib.pyplot as plt
from shapely.geometry import mapping, box
import pandas as pd
import numpy as np
from rasterio.transform import from_origin
from rasterio.warp import Resampling
import re

In [None]:
# 3 Extract metadata from file name and add to dataframe asc_file_names with columns "filename", "return_interval", "duration", "duration_units", "duration_hours"

def extract_metadata(filename):
    """
    Extract the return interval and duration encoded in a grid file name.

    Only the base name of the path is inspected. The return interval is the
    run of digits immediately before 'yr'; the duration is the two digits
    immediately before 'ha', 'da', or 'ma', which this script treats as
    hours, days, and months respectively.

    Parameters:
    filename (str): Path (or bare name) of the file to parse.

    Returns:
    dict: A dictionary with the keys:
        - filename: The filename exactly as passed in
        - return_interval: The return interval as an integer
        - duration: The duration as an integer, in duration_units
        - duration_units: 'ha', 'da', or 'ma'
        - duration_hours: The duration converted to hours as an integer

    Raises:
    ValueError: If the return interval or duration cannot be found in the base name.
    """
    print("\n-----   Extracting Metadata   -----")

    base_filename = os.path.basename(filename)

    # Return interval: digits directly in front of "yr"
    return_interval_match = re.search(r'(\d+)yr', base_filename)
    if return_interval_match is None:
        raise ValueError(f"Unable to extract return interval from filename: {base_filename}")
    return_interval = int(return_interval_match.group(1))

    # Duration: exactly two digits directly in front of the unit code
    duration_match = re.search(r'(\d{2})(ha|da|ma)', base_filename)
    if duration_match is None:
        raise ValueError(f"Unable to extract duration from filename: {base_filename}")
    duration = int(duration_match.group(1))
    duration_units = duration_match.group(2)

    # Hours per unit; a month is approximated as 30 days.
    # NOTE(review): the 'ma' = months reading is kept from the original code.
    # Confirm against the NOAA file-naming convention that 'ma' is not a
    # minutes code before relying on duration_hours for 'ma' files.
    hours_per_unit = {"ha": 1, "da": 24, "ma": 24 * 30}
    duration_hours = duration * hours_per_unit[duration_units]

    return {
        "filename": filename,
        "return_interval": return_interval,
        "duration": duration,
        "duration_units": duration_units,
        "duration_hours": duration_hours
    }

# The two example grids whose file names are parsed into the metadata table
asc_filenames = [asc_file_name_1, asc_file_name_2]

# Reuse an existing asc_file_names table (e.g. when the cell is re-run); otherwise start an empty one
if 'asc_file_names' not in globals():
    asc_file_names = pd.DataFrame(
        columns=["filename", "return_interval", "duration", "duration_units", "duration_hours"]
    )

# Parse every file name that is not in the table yet
metadata = []
for filename in asc_filenames:
    try:
        if filename not in asc_file_names['filename'].values:
            metadata.append(extract_metadata(filename))
        else:
            print(f"Skipping {filename} as it already exists in the DataFrame.")
    except ValueError as e:
        print(f"Error processing {filename}: {str(e)}")

# Add the newly parsed rows, if there are any
if len(metadata) > 0:
    new_data = pd.DataFrame(metadata)
    asc_file_names = pd.concat([asc_file_names, new_data], ignore_index=True)

print("asc_file_names:")
display(asc_file_names)

# Read the watershed and state boundary GeoJSON files (in that order)
watershed_boundary_gdf, state_boundary_gdf = (
    gpd.read_file(boundary_path)
    for boundary_path in (watershed_boundary_file, state_boundary_file)
)

# Show both tables as loaded, each under its variable name
print("watershed_boundary_gdf")
display(watershed_boundary_gdf)

print("state_boundary_gdf")
display(state_boundary_gdf)

# Bring both boundary layers into the target CRS
target_crs = "EPSG:{}".format(reproject_to_epsg)
watershed_boundary_gdf, state_boundary_gdf = (
    boundary_layer.to_crs(target_crs)
    for boundary_layer in (watershed_boundary_gdf, state_boundary_gdf)
)

# Report the resulting CRS of each layer
print("state_gdf CRS:", state_boundary_gdf.crs)
print("watershed_gdf CRS:", watershed_boundary_gdf.crs)

# Stack state and watershed polygons (state rows first) into a single GeoDataFrame
all_polygons = gpd.GeoDataFrame(
    pd.concat([state_boundary_gdf, watershed_boundary_gdf], ignore_index=True)
)

# Report the CRS of the stacked layer
print("all_polygons CRS:", all_polygons.crs)

### Define and test functions
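The functions below save each figure to a PNG file and then close it without displaying it. Add a `plt.show()` call just before the figure is closed if you want to see plots as they are generated.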

In [None]:
# 4 Define Function to Combine ASC Files

import os
import shutil
import rioxarray
from rasterio.transform import from_origin
from rasterio.enums import Resampling
import matplotlib.pyplot as plt

def _read_prj_text(prj_path, fallback=None):
    """Return the text of a .prj file, or fallback when one is given and the file is missing."""
    if fallback is not None and not os.path.exists(prj_path):
        return fallback
    with open(prj_path, 'r') as prj_file:
        return prj_file.read()


def _merge_rasters_on_extent(asc_file_name_1, asc_file_name_2, crs_1, crs_2, extent, nodata_value):
    """Resample both grids onto one grid covering extent and fill raster 1's no-data cells from raster 2."""
    left, bottom, right, top = extent

    # The context manager closes both source files even if resampling fails
    with rioxarray.open_rasterio(asc_file_name_1) as raster_1, \
            rioxarray.open_rasterio(asc_file_name_2) as raster_2:
        raster_1.rio.set_crs(crs_1, inplace=True)
        raster_2.rio.set_crs(crs_2, inplace=True)

        # Output grid uses raster_1's cell size
        resolution_x, resolution_y = raster_1.rio.resolution()
        cell_width, cell_height = abs(resolution_x), abs(resolution_y)
        width = int((right - left) / cell_width)
        height = int((top - bottom) / cell_height)

        if width <= 0 or height <= 0:
            raise ValueError("Calculated dimensions are not valid. Width and height must be positive.")

        # Upper-left corner of the padded extent anchors the new grid
        new_transform = from_origin(left, top, cell_width, cell_height)

        raster_1_extended = raster_1.rio.reproject(
            raster_1.rio.crs,
            transform=new_transform,
            shape=(height, width),
            resampling=Resampling.nearest
        )
        print("Reprojected raster_1_extended")

        raster_2_aligned = raster_2.rio.reproject_match(raster_1_extended)
        print("Aligned raster_2 to raster_1_extended")

        # Keep raster 1 where it has data; otherwise take the aligned raster 2 value
        return raster_1_extended.where(raster_1_extended != nodata_value, raster_2_aligned)


def combine_asc_files(asc_file_name_1, asc_file_name_2):
    """
    Combines two ASC files into a single raster file, reprojecting and aligning them as necessary.

    The output grid covers the bounds of the module-level ``all_polygons``
    GeoDataFrame padded by 10% on every side, at the first raster's cell size.
    Cells where the first raster equals the no-data sentinel (-9) are filled
    from the second raster. Values are then divided by 1000 (thousandths of an
    inch to inches), except no-data cells, which keep the sentinel value.

    Files written:
    - ``<base_folder>/cb<name>/cb<name>.asc``: the combined raster, where
      ``<name>`` is the first file's base name without its first two characters
    - ``cb<name>.xml`` and ``cb<name>.prj`` in the same folder, copied from the
      files that sit next to the first ASC file
    - a PNG overview figure in ``Watershed_Statistical_Analysis`` under the
      grandparent directory of the first ASC file

    Relies on the module-level names ``all_polygons``, ``base_folder`` and
    ``extract_metadata``.

    Parameters:
    asc_file_name_1 (str): The file path of the first ASC file.
    asc_file_name_2 (str): The file path of the second ASC file.

    Returns:
    tuple: A tuple containing the file path of the combined ASC file, the plot file name, and the merged raster.

    Raises:
    ValueError: If the padded extent yields a non-positive grid size, or the
        first file name cannot be parsed by extract_metadata.
    FileNotFoundError: If the first file's .prj or .xml companion is missing.
    """
    # No-data sentinel the merge step tests for in the source grids
    nodata_value = -9

    print("\n-----   Combining multistate asc files   -----")

    # Companion files share the ASC file's path with a different extension
    path_stem_1 = os.path.splitext(asc_file_name_1)[0]
    path_stem_2 = os.path.splitext(asc_file_name_2)[0]
    asc_prj_name_1 = path_stem_1 + '.prj'
    asc_prj_name_2 = path_stem_2 + '.prj'

    print(f"ASC file 1 PRJ file: {asc_prj_name_1}")
    print(f"ASC file 2 PRJ file: {asc_prj_name_2}")

    # Each raster gets the CRS from its own .prj; raster 2 falls back to
    # raster 1's definition when it has no .prj of its own
    crs_1 = _read_prj_text(asc_prj_name_1)
    crs_2 = _read_prj_text(asc_prj_name_2, fallback=crs_1)

    # Bounds of all polygons, padded by 10% of the width/height on each side
    left, bottom, right, top = all_polygons.total_bounds
    pad_x = (right - left) * 0.1
    pad_y = (top - bottom) * 0.1
    left, right = left - pad_x, right + pad_x
    bottom, top = bottom - pad_y, top + pad_y
    print(f"Combined extent (watersheds +10%): left={left}, bottom={bottom}, right={right}, top={top}")

    merged_raster = _merge_rasters_on_extent(
        asc_file_name_1, asc_file_name_2, crs_1, crs_2, (left, bottom, right, top), nodata_value
    )

    merged_raster_extents = merged_raster.rio.bounds()
    print("merged_raster extent, as calculated after interpolation:", merged_raster_extents)

    # Combined outputs reuse the first file's name with its two-letter prefix replaced by "cb"
    asc_base_file_name = 'cb' + os.path.basename(asc_file_name_1)[2:]
    asc_base_without_ext = os.path.splitext(asc_base_file_name)[0]
    combined_asc_folder_name = os.path.join(base_folder, asc_base_without_ext)
    combined_asc_file_path = os.path.join(combined_asc_folder_name, asc_base_file_name)

    # The .xml and .prj companions are copied next to the combined ASC file
    xml_filename = path_stem_1 + '.xml'
    prj_filename = asc_prj_name_1
    xml_renamed = os.path.join(combined_asc_folder_name, asc_base_without_ext + '.xml')
    prj_renamed = os.path.join(combined_asc_folder_name, asc_base_without_ext + '.prj')

    # Convert thousandths of an inch to inches. No-data cells are left at the
    # sentinel so that later "!= -9" filters still recognise them; scaling
    # them too would turn -9 into -0.009.
    merged_raster = merged_raster.where(merged_raster == nodata_value, merged_raster / 1000)

    metadata = extract_metadata(asc_file_name_1)
    fig, ax = plt.subplots(figsize=(12, 8))
    try:
        # Colour scale starts at 0, so no-data cells draw in the lowest colour
        mesh = merged_raster.plot(ax=ax, cmap='viridis', vmin=0, vmax=float(merged_raster.max()))
        mesh.colorbar.set_label("Total Precipitation (Inches)")
        all_polygons.boundary.plot(ax=ax, edgecolor='red')
        ax.set_xlim(left, right)
        ax.set_ylim(bottom, top)

        ax.set_title(f"Atlas 14 \n Return Interval: {metadata['return_interval']} years, Duration: {metadata['duration']} {metadata['duration_units']}")
        ax.set_xlabel("Longitude, Degrees")
        ax.set_ylabel("Latitude, Degrees")

        os.makedirs(combined_asc_folder_name, exist_ok=True)

        shutil.copy(xml_filename, xml_renamed)
        print(f"Copied {xml_filename} to {xml_renamed}")

        shutil.copy(prj_filename, prj_renamed)
        print(f"Copied {prj_filename} to {prj_renamed}")

        merged_raster.rio.to_raster(combined_asc_file_path)
        print(f"Combined ASC file saved: {combined_asc_file_path}")

        # The figure goes two levels above the first ASC file, in the analysis folder
        input_directory = os.path.dirname(os.path.dirname(asc_file_name_1))
        output_directory = os.path.join(input_directory, 'Watershed_Statistical_Analysis')

        if not os.path.exists(output_directory):
            os.makedirs(output_directory, exist_ok=True)
            print(f"Output directory created: {output_directory}")
        else:
            print(f"Output directory already exists: {output_directory}")

        plot_file_name = os.path.join(output_directory, f"AA - Regional Report Figure cb{os.path.basename(asc_file_name_1)[2:-4]}_plot.png")
        fig.savefig(plot_file_name)
        print(f"Plot saved: {plot_file_name}")
    finally:
        # Release the figure whether or not saving succeeded
        plt.close(fig)

    return combined_asc_file_path, plot_file_name, merged_raster

# Run the combiner once on the two example grids. The returned combined-file path and
# merged raster are the inputs to the per-polygon statistics example further down.
combined_asc_file_path, plot_file_name, merged_raster = combine_asc_files(asc_file_name_1, asc_file_name_2)


In [None]:
# 5 Define Function to Produce Report Plot and Statistics for each polygon in watershed_boundary_file
# 
# This function calculates statistics for a given polygon from a watershed boundary GeoDataFrame
# and generates a plot of the clipped raster data. It computes the maximum, minimum, mean, 
# and range of raster values within the polygon and appends the results to a DataFrame.
#
# INPUTS:
# - combined_asc_file_path: Path to the combined ASC file.
# - merged_raster: The merged raster data to be analyzed.
# - polygon_index: Index of the polygon in the watershed_boundary_gdf to analyze.
# - watershed_boundary_gdf: GeoDataFrame containing the watershed polygons.
#
# OUTPUT:
# - results_dataframe: DataFrame containing the calculated statistics for the polygon.

import os
import matplotlib.pyplot as plt
from shapely.geometry import mapping
import pandas as pd

def calculate_grid_statistics_by_polygon(combined_asc_file_path, merged_raster, polygon_index, watershed_boundary_gdf):
    """Calculate statistics for a specific polygon and generate a plot.

    The raster is clipped to the polygon, negative cells are masked as
    no-data, and the maximum, minimum, mean and relative range
    ((max - min) / max, in percent) of the remaining cells are computed.
    A PNG of the clipped raster with the polygon outline is saved to the
    module-level ``output_directory`` (created if missing).

    Any exception is caught and printed rather than raised. If it happens
    before the statistics are computed, an empty table is returned; if it
    happens while plotting, the already-computed statistics row is returned.

    Relies on the module-level names ``output_directory`` and
    ``extract_metadata``.

    Args:
        combined_asc_file_path (str): Path to the combined ASC file.
        merged_raster (xarray.DataArray): The merged raster data to be analyzed.
        polygon_index (int): Index of the polygon in the watershed_boundary_gdf to analyze.
        watershed_boundary_gdf (GeoDataFrame): GeoDataFrame containing the watershed polygons;
            the polygon label is read from its 'name' column.

    Returns:
        DataFrame: A DataFrame containing the calculated statistics for the polygon.
    """
    result_columns = ['File Name', 'Max (inches)', 'Min (inches)', 'Mean (inches)',
                      'Range (%)', 'polygon_name', 'Results_name']

    # Same columns whether or not a row gets filled in, so callers can stack results safely
    results_dataframe = pd.DataFrame(columns=result_columns)
    print("\n-----   Calculating Statistics for Each Watershed Polygon   -----")

    try:
        import numpy as np

        # Double brackets keep a one-row GeoDataFrame rather than a Series
        polygon = watershed_boundary_gdf.iloc[[polygon_index]]
        polygon_name = polygon['name'].values[0]
        polygon = polygon.set_crs(watershed_boundary_gdf.crs)
        print(f"Polygon {polygon_index} selected from watershed_boundary_gdf")

        # Plot window: polygon bounds padded by 10% of its width/height
        x_min, y_min, x_max, y_max = polygon.geometry.bounds.iloc[0]
        x_buffer = (x_max - x_min) * 0.1
        y_buffer = (y_max - y_min) * 0.1
        expanded_bounds = (x_min - x_buffer, y_min - y_buffer, x_max + x_buffer, y_max + y_buffer)

        # Clip the raster to the polygon (GeoJSON-like mapping of its geometry)
        polygon_geom = [mapping(polygon.geometry.iloc[0])]
        clipped_raster = merged_raster.rio.clip(polygon_geom, watershed_boundary_gdf.crs)

        # The no-data sentinel can arrive as -9 or, after a thousandths-of-an-inch
        # to inches conversion, as -0.009. Precipitation depths are never
        # negative, so masking every negative cell covers both forms.
        clipped_raster = clipped_raster.where(clipped_raster >= 0)

        max_value = clipped_raster.max().values.item()
        min_value = clipped_raster.min().values.item()
        mean_value = clipped_raster.mean().values.item()
        # A zero maximum has no meaningful relative range; report NaN instead
        # of raising ZeroDivisionError and losing the polygon's row.
        if max_value:
            range_percentage = ((max_value - min_value) / max_value) * 100
        else:
            range_percentage = float('nan')
        print(f"Statistics calculated: Max={max_value}, Min={min_value}, Mean={mean_value}, Range%={range_percentage}")

        # Record the row before plotting so a plotting failure does not discard the statistics
        file_name = os.path.basename(combined_asc_file_path)
        results_dataframe = pd.DataFrame([{
            'File Name': file_name,
            'Max (inches)': max_value,
            'Min (inches)': min_value,
            'Mean (inches)': mean_value,
            'Range (%)': range_percentage,
            'polygon_name': polygon_name,
            'Results_name': f"{polygon_name} {file_name}"
        }], columns=result_columns)

        # Return interval and duration for the plot title
        metadata = extract_metadata(combined_asc_file_path)

        # Colour limits rounded outward to whole inches
        max_color_limit = np.ceil(max_value)
        min_color_limit = np.floor(min_value)

        fig, ax = plt.subplots(figsize=(10, 10))
        try:
            clipped_raster.plot(ax=ax, cmap='viridis', label='Precipitation (inches)', vmin=min_color_limit, vmax=max_color_limit)
            polygon.geometry.boundary.plot(ax=ax, color='black', linewidth=2, label='Watershed', zorder=10)
            ax.set_xlim(expanded_bounds[0], expanded_bounds[2])
            ax.set_ylim(expanded_bounds[1], expanded_bounds[3])

            stats_text = f"Max: {max_value:.2f}, Min: {min_value:.2f}, Mean: {mean_value:.2f}, Range: {range_percentage:.2f}%"
            ax.set_title(f"{polygon_name} \n Atlas 14 \n Return Interval: {metadata['return_interval']} years, Duration: {metadata['duration']} {metadata['duration_units']}\n{stats_text}")
            ax.set_xlabel("Longitude, Degrees")
            ax.set_ylabel("Latitude, Degrees")

            fig.tight_layout()

            plot_file_name = f"{polygon_name} {os.path.splitext(file_name)[0]}_plot.png"
            plot_file_path = os.path.join(output_directory, plot_file_name)
            # Make sure the target folder exists when this function is used on its own
            os.makedirs(output_directory, exist_ok=True)
            fig.savefig(plot_file_path)
        finally:
            # Release the figure even when plotting or saving fails
            plt.close(fig)
        print(f"Plot saved: {plot_file_path}\n")

    except Exception as e:
        # Best effort: report the failure and let the caller continue with other polygons
        print(f"Error processing merged raster: {str(e)}")

    return results_dataframe

# Run the per-polygon statistics for every watershed polygon, stacking each
# returned table onto the running results table
results_df = pd.DataFrame()
for polygon_index in range(watershed_boundary_gdf.shape[0]):
    results_dataframe = calculate_grid_statistics_by_polygon(
        combined_asc_file_path,
        merged_raster,
        polygon_index,
        watershed_boundary_gdf,
    )
    results_df = pd.concat((results_df, results_dataframe), ignore_index=True)

print("Final Results DataFrame:")
display(results_df)

### Process all ASC files and Create Maps and Statistics 

In [None]:
#6 Combine every pair of state grids found in base_folder, compute per-polygon figures and
#  statistics for each combined grid, and save the compiled statistics to CSV

print("Starting to process ASC files...")

# Every immediate subfolder of the base folder is a candidate grid folder
subfolders = [f.path for f in os.scandir(base_folder) if f.is_dir()]

# The first two letters of each example file name are the state codes
state_code = os.path.basename(asc_file_name_1)[:2]
print(f"state_code: {state_code}")
second_state_code = os.path.basename(asc_file_name_2)[:2]
print(f"second_state_code: {second_state_code}")

# Only the first state's folders drive the loop; the matching second-state
# folder is derived from each of them
state_code_folders = [folder for folder in subfolders if os.path.basename(folder).startswith(state_code)]
print(f"state_code_folders: {state_code_folders}")

# Statistics for all grids and polygons
results_combined = pd.DataFrame()

for folder in tqdm(state_code_folders, desc="Processing folders"):
    print(f"\n Processing folder: {folder}")
    base_folder_name = os.path.basename(folder)
    print(f"base_folder_name: {base_folder_name}")

    # Sibling folders (second state, combined "cb") live next to this folder
    parent_folder = os.path.dirname(folder)
    print(f"parent_folder: {parent_folder}")

    # Part of the folder name after the two-letter state code
    grid_suffix = base_folder_name[2:]

    second_state_foldername = f"{second_state_code}{grid_suffix}"
    print(f"second_state_foldername: {second_state_foldername}")

    second_state_folder = os.path.join(parent_folder, second_state_foldername)
    print(f"second_state_folder: {second_state_folder}")

    cb_folder = os.path.join(parent_folder, f"cb{grid_suffix}")
    print(f"cb_folder: {cb_folder}")

    print(f"Checking for existence of second_state_folder: {second_state_folder}")
    if not os.path.exists(second_state_folder):
        raise FileNotFoundError(f"second_state_folder does not exist: {second_state_folder}")

    # Each folder holds one ASC file named after the folder
    asc_file_name_1 = os.path.join(folder, f"{base_folder_name}.asc")
    print(f"asc_file_name_1: {asc_file_name_1}")

    asc_file_name_2 = os.path.join(second_state_folder, f"{second_state_foldername}.asc")
    print(f"asc_file_name_2: {asc_file_name_2}")

    # Without both inputs there is nothing to combine. Move on to the next
    # folder, otherwise the statistics below would be recomputed from the
    # previous folder's combined file.
    if not os.path.exists(asc_file_name_1) or not os.path.exists(asc_file_name_2):
        print(f"Error: Input files not found. Skipping...")
        continue

    os.makedirs(cb_folder, exist_ok=True)

    try:
        print("Combining ASC files...")
        combined_asc_file_path, plot_file_name, merged_raster = combine_asc_files(asc_file_name_1, asc_file_name_2)
        print(f"Combined ASC file created: {combined_asc_file_path}")
    except Exception as e:
        print(f"An error occurred while combining ASC files: {e}")
        raise

    # Statistics are computed from the combined file as written to disk
    merged_raster = rioxarray.open_rasterio(combined_asc_file_path)
    try:
        polygon_frames = [
            calculate_grid_statistics_by_polygon(combined_asc_file_path, merged_raster, polygon_index, watershed_boundary_gdf)
            for polygon_index in range(len(watershed_boundary_gdf))
        ]
    finally:
        # Release the file handle even if a polygon fails unexpectedly
        merged_raster.close()
        print("Closed merged_raster")

    # One table per grid, without repeated rows
    if polygon_frames:
        results_df = pd.concat(polygon_frames, ignore_index=True).drop_duplicates()
    else:
        results_df = pd.DataFrame()

    results_combined = pd.concat([results_combined, results_df], ignore_index=True)
    print("results_combined:")
    print(f"Total number of entries in results_combined: {len(results_combined)}, here are a few")
    display(results_combined.head())

# Save the compiled results to CSV in the Watershed_Statistical_Analysis folder (output_directory)
os.makedirs(output_directory, exist_ok=True)
csv_file_path = os.path.join(output_directory, 'merged_raster_statistics.csv')
results_combined.to_csv(csv_file_path, index=False)
print(f"Results saved to: {csv_file_path}")

display(results_combined)

print("ASC file processing completed.")

In [None]:
#7 Split Results by Polygon to provide a CSV for each 
import pandas as pd
import os

# Second name for the in-memory results table. The split_csv_by_polygon call at the
# end of this cell does not use it; that function re-reads the results from the CSV file.
results_data_from_csv = results_combined


def split_csv_by_polygon(csv_file_path):
    """
    Splits the combined results CSV file by polygon and saves each polygon's data to a separate CSV file.

    Parameters:
    csv_file_path (str): The path to the input CSV file containing combined results.
        It must have 'File Name' and 'polygon_name' columns.

    The rows for each polygon are written, ordered by 'File Name', to
    ``output_csv_by_polygon/<polygon_name>.csv`` relative to the current working
    directory, with spaces in the polygon name replaced by underscores. The
    directory is created if it does not exist. Nothing is returned.
    """
    from collections import defaultdict

    data = pd.read_csv(csv_file_path)
    print("Loaded data from CSV file.")

    # Grouping on both keys yields groups sorted by file name and then polygon,
    # so each polygon's list ends up ordered by file name
    polygon_dfs = defaultdict(list)
    for (_file_name, polygon_name), polygon_group in data.groupby(['File Name', 'polygon_name']):
        polygon_dfs[polygon_name].append(polygon_group)

    split_directory = "output_csv_by_polygon"
    os.makedirs(split_directory, exist_ok=True)

    for polygon_name, polygon_group_list in polygon_dfs.items():
        combined_polygon_group = pd.concat(polygon_group_list)
        # Polygon names that are purely numeric are read back from CSV as
        # numbers, so convert to text before building the file name
        safe_name = str(polygon_name).replace(' ', '_')
        output_path = os.path.join(split_directory, f"{safe_name}.csv")
        combined_polygon_group.to_csv(output_path, index=False)
        print(f"Saved CSV for polygon: {polygon_name}")

    # Report the directory the files were actually written to
    print(f"CSV files have been split and saved in: {os.path.abspath(split_directory)}")

# Example usage: split the statistics CSV written by the previous cell.
# To split a different file, point csv_file_path at it first, e.g.
# csv_file_path = r'path_to_your_csv_file.csv'
split_csv_by_polygon(csv_file_path)
