### Atlas 14 ASC Grid Summary Data Analysis Script
Script 3/3 for Atlas 14 Spatial Variance Analysis

Author: William (Bill) Katzenmeyer, P.E., C.F.M. (C.H. Fenstermaker and Associates, LLC) 

Source: https://github.com/billk-FM/HEC-Commander-Tools

In [None]:
# Import Required Libraries 
# Only pandas needs installation, and it should already have been installed by the previous scripts
import matplotlib.pyplot as plt
import ipywidgets as widgets
from IPython.display import display
import pandas as pd
import os
import re

In [None]:
# Path to one CSV file produced by the previous script. This file is used for
# the example output below; later cells iterate over every CSV in its folder.
# The path is built with os.path.join so it resolves on any operating system
# (a literal backslash is only a path separator on Windows).
file_path = os.path.join('output_csv_by_polygon', 'West_Fork_2D.csv')
csv_data = pd.read_csv(file_path)


# The folder that contains file_path; later cells scan it for CSV files
output_csv_by_polygon_folder = os.path.dirname(file_path)
print(f"output_csv_by_polygon_folder: {output_csv_by_polygon_folder}")


# Display the dataframe to understand its structure
display(csv_data)

In [None]:
# Metadata extraction: the function defined below is adapted from the previous
# script so that it works on the CSV dataframe and stores the parsed values as
# new columns of that dataframe.

# Folder holding the per-polygon CSV files (the parent folder of file_path)
output_csv_by_polygon_folder = os.path.split(file_path)[0]
print(f"output_csv_by_polygon_folder: {output_csv_by_polygon_folder}")

def extract_metadata_from_dataframe(dataframe):
    """
    Add return-interval and duration metadata parsed from the 'File Name' column.

    Every file name must contain ``<digits>yr`` (the return interval) and a
    two-digit duration immediately followed by ``ma``, ``ha`` or ``da``
    (minutes, hours or days), for example ``...100yr24ha...``. Only the base
    name of each entry is inspected, so full paths are accepted.

    The dataframe is modified in place and is also returned. Four columns are
    written (existing columns with these names are overwritten):

    * 'Return Interval' - the integer before ``yr``
    * 'Duration'        - the two-digit duration as an integer
    * 'Duration Units'  - ``'ma'``, ``'ha'`` or ``'da'``
    * 'Duration Hours'  - the duration converted to MINUTES

    NOTE: despite its name, 'Duration Hours' holds minutes. The column name is
    kept unchanged because the plotting code in this file (and CSV files that
    were already written) refer to it by that name.

    An empty dataframe is supported: the four columns are added with no rows.

    Parameters:
    dataframe (pd.DataFrame): The dataframe containing a 'File Name' column.

    Returns:
    pd.DataFrame: The same dataframe object, with the metadata columns added.

    Raises:
    KeyError: If the dataframe has no 'File Name' column.
    ValueError: If a file name lacks the return interval or duration pattern.
    """
    print("\n-----   Extracting Metadata   -----")

    metadata_columns = ['Return Interval', 'Duration', 'Duration Units', 'Duration Hours']
    # Number of minutes represented by one unit of each duration suffix
    minutes_per_unit = {'ma': 1, 'ha': 60, 'da': 60 * 24}
    # Compiled once per call rather than once per row
    return_interval_pattern = re.compile(r'(\d+)yr')
    duration_pattern = re.compile(r'(\d{2})(ha|da|ma)')

    def extract_metadata(filename):
        """
        Parse one file name.

        Parameters:
        filename (str): The file name (or path) to parse.

        Returns:
        tuple: (return interval, duration, duration units, duration in minutes)

        Raises:
        ValueError: If either pattern is not found in the base file name.
        """
        base_filename = os.path.basename(filename)

        # Return interval: the digits directly before "yr"
        return_interval_match = return_interval_pattern.search(base_filename)
        if return_interval_match is None:
            raise ValueError(f"Unable to extract return interval from filename: {base_filename}")

        # Duration: two digits directly before "ha", "da" or "ma"
        duration_match = duration_pattern.search(base_filename)
        if duration_match is None:
            raise ValueError(f"Unable to extract duration from filename: {base_filename}")

        return_interval = int(return_interval_match.group(1))
        duration = int(duration_match.group(1))
        duration_units = duration_match.group(2)
        duration_minutes = duration * minutes_per_unit[duration_units]

        return return_interval, duration, duration_units, duration_minutes

    rows = [extract_metadata(filename) for filename in dataframe['File Name']]

    # Naming the columns explicitly keeps the helper frame four columns wide
    # even when there are no rows, so the assignment also works for an empty
    # dataframe instead of failing with a column-count mismatch.
    dataframe[metadata_columns] = pd.DataFrame(rows, index=dataframe.index, columns=metadata_columns)

    return dataframe

# ---- Example file -----------------------------------------------------------
# Annotate the example dataframe, write it back over its source CSV, and show it.
csv_data_with_metadata = extract_metadata_from_dataframe(csv_data)
csv_data_with_metadata.to_csv(file_path, index=False)

print("csv_data_with_metadata")
display(csv_data_with_metadata)

# ---- Every CSV in the folder ------------------------------------------------
# Collect the path of each file in output_csv_by_polygon_folder ending in .csv
csv_files = [
    os.path.join(output_csv_by_polygon_folder, entry)
    for entry in os.listdir(output_csv_by_polygon_folder)
    if entry.endswith('.csv')
]

# Annotated dataframes, one per CSV file, in processing order
dataframes = []

for csv_file in csv_files:
    # Load the current file and report which one is being handled
    csv_data = pd.read_csv(csv_file)
    print(f"csv_data from {csv_file}")

    # Add the metadata columns and overwrite the file with the annotated data
    csv_data_with_metadata = extract_metadata_from_dataframe(csv_data)
    csv_data_with_metadata.to_csv(csv_file, index=False)

    # Keep the annotated dataframe
    dataframes += [csv_data_with_metadata]

# All files have been handled
print("Metadata extraction complete")


In [None]:
# Create HUC-specific plots for each duration and return interval
# For each Return Interval, create a bar chart showing the min, mean, and max for each duration
def plot_min_mean_max_by_duration(dataframe, output_folder, file_name):
    """
    Save one grouped bar chart per return interval for a single watershed.

    For every distinct value in the 'Return Interval' column, the matching rows
    are sorted by 'Duration Hours' and drawn as three bars per row (minimum,
    mean and maximum). The percent difference between maximum and minimum,
    relative to the minimum, is written above each group; when the minimum is
    zero that ratio is undefined and 'n/a' is written instead.

    Each chart is saved as ``<file stem>_<return interval>yr.png`` in
    ``output_folder`` and the figure is closed afterwards, even if drawing or
    saving fails.

    Parameters:
    dataframe (pd.DataFrame): Data with the columns 'Return Interval',
        'Duration Hours', 'Min (inches)', 'Mean (inches)' and 'Max (inches)'.
    output_folder (str): The folder where the plots will be saved.
    file_name (str): The name of the file being processed; its stem is used in
        the plot titles and in the output file names.

    Returns:
    None
    """
    watershed_stem = os.path.splitext(os.path.basename(file_name))[0]
    watershed_label = watershed_stem.replace("_", " ")
    bar_width = 0.25  # Width of each of the three bars in a group

    for return_interval in dataframe['Return Interval'].unique():
        # Rows for the current return interval, ordered by duration
        subset = dataframe[dataframe['Return Interval'] == return_interval]
        subset = subset.sort_values(by='Duration Hours')

        index = range(len(subset))  # One x position per duration
        mean_positions = [i + bar_width for i in index]  # Centre bar of each group

        fig, ax = plt.subplots(figsize=(16, 9))
        try:
            # Grouped bars for Min, Mean, and Max values
            min_bars = ax.bar(index, subset['Min (inches)'], bar_width, label='Min (inches)')
            mean_bars = ax.bar(mean_positions, subset['Mean (inches)'], bar_width, label='Mean (inches)')
            max_bars = ax.bar([i + 2 * bar_width for i in index], subset['Max (inches)'], bar_width, label='Max (inches)')

            # Percent variance from min to max, written above each group
            for i, (min_val, max_val) in enumerate(zip(subset['Min (inches)'], subset['Max (inches)'])):
                if min_val == 0:
                    # Relative variance is undefined for a zero minimum
                    variance_label = 'n/a'
                else:
                    percent_variance = ((max_val - min_val) / min_val) * 100
                    variance_label = f'{percent_variance:.1f}%'
                ax.text(i + bar_width, max_val, variance_label,
                        ha='center', va='bottom', fontweight='bold')

            # Title, axis labels and tick labels
            ax.set_title(f'Atlas 14 Variance within {watershed_label} Watershed \nfor Return Interval: {return_interval} years')
            ax.set_xlabel('Duration (Minutes)')
            ax.set_ylabel('Inches')
            ax.set_xticks(mean_positions)
            ax.set_xticklabels(subset['Duration Hours'])

            # The percent annotations are text, which has no legend handle of
            # its own; an empty, invisible line supplies the fourth entry.
            variance_note = ax.plot([], [], linestyle='none', label='Percent variance from min to max')[0]
            ax.legend(handles=[min_bars, mean_bars, max_bars, variance_note], title='Legend')
            ax.grid(True)  # Add grid lines to the plot

            # Save the plot
            plot_filename = f"{watershed_stem}_{return_interval}yr.png"
            fig.savefig(os.path.join(output_folder, plot_filename))
        finally:
            plt.close(fig)  # Always release the figure's memory
        print(f"Bar chart created for Return Interval: {return_interval} years")

# Paths of every file in output_csv_by_polygon_folder whose name ends in .csv
csv_files = [
    os.path.join(output_csv_by_polygon_folder, entry)
    for entry in os.listdir(output_csv_by_polygon_folder)
    if entry.endswith('.csv')
]

# Produce the per-return-interval bar charts for each watershed CSV in turn
for csv_file in csv_files:
    # Load the current file
    csv_data = pd.read_csv(csv_file)

    print(f"Processing csv_data from {csv_file}")

    # Add the return interval / duration columns used by the plotting function
    csv_data_with_metadata = extract_metadata_from_dataframe(csv_data)

    # Plots go into a folder next to the CSV, named after the CSV without its extension
    output_folder = os.path.join(
        os.path.dirname(csv_file),
        os.path.splitext(os.path.basename(csv_file))[0],
    )
    os.makedirs(output_folder, exist_ok=True)  # No error if the folder already exists

    # Draw and save the bar charts for this file
    plot_min_mean_max_by_duration(csv_data_with_metadata, output_folder, csv_file)

    print(f"Plots created for {csv_file}")

print("Processing complete for all CSV files")


In [None]:
# Plot mean/max by polygon

def plot_min_mean_max_by_polygon(dataframes, output_folder, duration, return_interval):
    """
    Save a grouped bar chart comparing watersheds for one duration and return interval.

    For each dataframe, the first row whose 'Duration Hours' equals ``duration``
    and whose 'Return Interval' equals ``return_interval`` supplies the minimum,
    mean and maximum values (in inches). A dataframe with no matching row is
    reported with a printed warning and shown as a gap instead of stopping the
    whole plot.

    The x tick label of each watershed is the stem of the dataframe's ``name``
    attribute (a file path set by the caller) with underscores replaced by
    spaces. If ``name`` is missing, ``attrs['name']`` is tried, then a generic
    'Watershed <n>' label is used.

    The chart is saved as ``Regional_Plot_<duration>hr_<return_interval>yr.png``
    in ``output_folder`` and the figure is closed afterwards, even on failure.

    Parameters:
    dataframes (list): A list of pandas DataFrames containing rainfall data.
    output_folder (str): The folder where the plot will be saved.
    duration: Value to match against the 'Duration Hours' column. As written by
        extract_metadata_from_dataframe in this file, that column holds the
        duration in minutes, so the title and messages are labelled in minutes.
    return_interval (int): The return interval in years for which the plot is created.
    """
    stat_columns = ['Min (inches)', 'Mean (inches)', 'Max (inches)']
    bar_width = 0.25  # Width of each of the three bars in a group
    index = range(len(dataframes))  # One x position per watershed

    # Filter each dataframe once and pull all three statistics from that row
    stats = {column: [] for column in stat_columns}
    watershed_labels = []
    for position, df in enumerate(dataframes):
        source_name = getattr(df, 'name', None)
        if not isinstance(source_name, str):
            source_name = getattr(df, 'attrs', {}).get('name', f'Watershed {position + 1}')
        label = os.path.splitext(os.path.basename(str(source_name)))[0].replace("_", " ")
        watershed_labels.append(label)

        matches = df[(df['Duration Hours'] == duration) & (df['Return Interval'] == return_interval)]
        if matches.empty:
            print(f"Warning: no data in {label} for Duration: {duration} minutes, Return Interval: {return_interval} years")
        for column in stat_columns:
            # NaN marks a missing value; it is drawn as a gap and gets no text label
            stats[column].append(float('nan') if matches.empty else matches[column].values[0])

    fig, ax = plt.subplots(figsize=(12, 6))
    try:
        for offset, column in enumerate(stat_columns):
            x_positions = [i + offset * bar_width for i in index]
            ax.bar(x_positions, stats[column], bar_width, label=column)

            # Display the actual value above each bar
            for x, value in zip(x_positions, stats[column]):
                if not pd.isna(value):
                    ax.text(x, value, f'{value:.1f}', ha='center', va='bottom', fontsize=8)

        # Title and labels (the duration value is in minutes, see the docstring)
        ax.set_title(f'Atlas 14 Variance Across Watersheds\nfor Duration: {duration} minutes, Return Interval: {return_interval} years')
        ax.set_xlabel('Watersheds')
        ax.set_ylabel('Inches')
        ax.set_xticks([i + bar_width for i in index])
        ax.set_xticklabels(watershed_labels, rotation=45, ha='right')
        ax.legend(title='Legend')
        ax.grid(True)  # Add grid lines to the plot
        fig.tight_layout()  # Adjust layout to prevent clipping of tick-labels

        # Save the plot.
        # NOTE: the "hr" in the file name is kept so existing output names do
        # not change, even though the duration value is in minutes.
        plot_filename = f"Regional_Plot_{duration}hr_{return_interval}yr.png"
        fig.savefig(os.path.join(output_folder, plot_filename))
        print(f"output folder {output_folder}/{plot_filename} written")
    finally:
        plt.close(fig)  # Always release the figure's memory
    print(f"Regional bar chart created for Duration: {duration} minutes, Return Interval: {return_interval} years")

# Define the folder containing the CSV files
# NOTE(review): this machine-specific absolute path replaces the folder that
# earlier cells derived from file_path - edit it to match your environment.
output_csv_by_polygon_folder = r'h:\2202134.00C_LWI_Region4\Technical\Regional Design Storm Models\Atlas 14 Assistant\output_csv_by_polygon'

# List every CSV file in the folder. The list is sorted so the watersheds
# appear in the same order on every run (os.listdir order is arbitrary).
csv_files = sorted(
    os.path.join(output_csv_by_polygon_folder, f)
    for f in os.listdir(output_csv_by_polygon_folder)
    if f.endswith('.csv')
)

# Create a list to store all dataframes
all_dataframes = []

# Loop through each CSV file
for csv_file in csv_files:
    # Read the CSV file into a dataframe
    csv_data = pd.read_csv(csv_file)

    print(f"Processing csv_data from {csv_file}")  # Inform the user about the current file being processed

    # Extract metadata and add it to the dataframe
    csv_data_with_metadata = extract_metadata_from_dataframe(csv_data)

    # Record the source path on the dataframe; the plotting function reads it
    # to label each watershed on the x-axis
    csv_data_with_metadata.name = csv_file

    # Append the dataframe to the list
    all_dataframes.append(csv_data_with_metadata)

# Create output folder for regional plots
regional_output_folder = os.path.join(output_csv_by_polygon_folder, "Regional Plots")
os.makedirs(regional_output_folder, exist_ok=True)  # Create the output folder if it doesn't exist
print(regional_output_folder)

# Durations and return intervals are taken from the first dataframe. With no
# CSV files there is no first dataframe, so both fall back to empty lists and
# the plotting loop below simply does nothing instead of raising IndexError.
if all_dataframes:
    unique_durations = all_dataframes[0]['Duration Hours'].unique()
    unique_return_intervals = all_dataframes[0]['Return Interval'].unique()
else:
    unique_durations = []
    unique_return_intervals = []
    print(f"No CSV files found in {output_csv_by_polygon_folder}; no regional plots were created")

# Create regional plots for each combination of duration and return interval
for duration in unique_durations:
    for return_interval in unique_return_intervals:
        plot_min_mean_max_by_polygon(all_dataframes, regional_output_folder, duration, return_interval)

print("Processing complete for all CSV files")  # Inform the user that processing is complete
