In [4]:
# SOIL STATS TOOL FOR RASMAPPER

# This script helps calculate soil statistics for a given watershed boundary
# This is useful for watershed studies where HEC-RAS is used for infiltration calculations
# The inputs to this script are the watershed boundary and the infiltration layer (tif and hdf) from RASMapper


# USER INPUTS
# Both files are exported from RASMapper; edit the two paths below for your project.
tif_path = r"input_files\gSSURGO_InfiltrationDC.tif"  # infiltration layer raster
shp_path = r"input_files\WF_Boundary_Simple.shp"  # watershed boundary polygon - update this path

# User must also provide the HDF associated with the infiltration .tif
# All inputs to this script are sourced directly from HEC-RAS's RASMapper 
# mukeys are read from the hdf file directly using h5py

# shp_path should contain a single part polygon
# If your boundary is multipart, convert it to a single part polygon first (for example, provide it to the GIS Autonomous Assistant and ask it to do the conversion. Link: https://chat.openai.com/g/g-2mZE2aq07-gis-autonomous-assistant)
# The script does not check projections, so ensure all files are in same projection (this should be true if a consistent projection was used in RASMapper)


#### The next code cell automatically installs (via pip) pandas, geopandas, rasterio, rasterstats, h5py, numpy and any other missing dependencies.
Using Anaconda with Python 3.11 is recommended.

In [5]:
# Required Import Statements and Library Installation
# Missing packages are auto-installed using subprocess (available in most base environments)

# Modules the notebook checks for; each one that cannot be imported is handed
# to install(). The list mixes standard-library and third-party names.
packages = [
    "os",
    "shutil",
    "re",
    "pandas",
    "chardet",
    "geopandas",
    "rasterio",
    "rasterstats",
    "datetime",
    "fnmatch",
    "threading",
    "h5py",
    "numpy",
    "time",
    "tempfile",
]

# Logic to Install Packages
import subprocess 
import sys
def install(package):
    """Install `package` into the environment that runs this kernel.

    pip is tried first, invoked through the current interpreter
    (`sys.executable -m pip`) so the package lands in the kernel's own
    environment. If pip exits with an error, `conda install -y` is tried as a
    fallback.

    Failures are reported with print() instead of being raised, so the
    notebook keeps running and the later `import` statements surface any
    package that is still missing.

    Args:
        package: Name of the package to install (used for both pip and conda).

    Returns:
        None.
    """
    try:
        print("Installing " + package)
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])
        print("Installed " + package + " successfully")
    except subprocess.CalledProcessError:
        try:
            subprocess.check_call(["conda", "install", "-y", package])
        except (subprocess.CalledProcessError, OSError) as ex:
            # OSError covers FileNotFoundError, which check_call raises when no
            # `conda` executable can be found on PATH; without it the fallback
            # would crash the cell instead of reporting the failure.
            print(f"Unable to install {package}: {str(ex)}")
        else:
            print("Installed " + package + " successfully")

def _importable(name):
    """Return True when `name` can be imported in the current kernel."""
    try:
        __import__(name)
    except ImportError:
        return False
    return True


# Only packages that fail to import are passed to install()
for package in packages:
    if not _importable(package):
        install(package)
import os                 
import sys
from pathlib import Path
import pandas as pd
import geopandas as gpd
import rasterio
from rasterio.mask import mask
from rasterstats import zonal_stats
import h5py
import numpy as np
import shutil

In [6]:
#  -----   Main Logic, Infiltration Stats Tool ---- 

# Locate the HDF file
# It is expected next to the raster: same path as tif_path, with the
# extension swapped for .hdf
import os
_tif_stem, _tif_ext = os.path.splitext(tif_path)
hdf_path = f"{_tif_stem}.hdf"

# Function to read the raster map from the HDF file and create a mapping
def read_raster_map(hdf_path):
    """Build the raster-value -> mukey lookup from the 'Raster Map' dataset.

    Args:
        hdf_path: Path to the HDF file that accompanies the infiltration raster.

    Returns:
        dict: for every record in the 'Raster Map' dataset, the first field
        cast to int (key) mapped to the second field decoded from UTF-8 bytes
        (value).
    """
    lookup = {}
    with h5py.File(hdf_path, 'r') as hdf:
        for record in hdf['Raster Map'][:]:
            raster_value = int(record[0])
            lookup[raster_value] = record[1].decode('utf-8')
    return lookup

# Load the watershed boundary polygon(s)
watershed = gpd.read_file(shp_path)

# Read the raster map for mukey mapping (raster value -> mukey)
raster_map = read_raster_map(hdf_path)

# Clip the raster with the watershed polygon.
# The dataset is opened in a `with` block so the file handle on the .tif is
# released as soon as the clipped array, its transform and the nodata value
# have been read; nothing below needs the open dataset.
with rasterio.open(tif_path) as raster:
    out_image, out_transform = mask(raster, watershed.geometry, crop=True)
    nodata = raster.nodatavals[0]  # nodata value of the first band

# Perform zonal statistics on the first band of the clipped raster.
# categorical=True yields, per polygon, a dict of {raster value: pixel count}.
stats = zonal_stats(watershed, out_image[0], affine=out_transform, nodata=nodata, categorical=True)

# Conversion factors
sqm_to_acre = 0.000247105
sqm_to_sqmile = 3.861e-7

# zonal_stats(categorical=True) reports the NUMBER OF PIXELS per raster value,
# not an area. Convert counts to area with the size of one pixel, taken as the
# absolute determinant of the clipped raster's affine transform (this also
# holds for rotated grids). For 1 x 1 pixels the results are unchanged.
# NOTE(review): the *_sqm names and the conversion factors above assume the
# raster CRS uses metres as its linear unit - confirm for your project
# (a CRS in feet would need a different factor).
pixel_area_sqm = abs(out_transform.a * out_transform.e - out_transform.b * out_transform.d)

# Initialize a dictionary for calculating the area of each mukey
mukey_areas = {mukey: 0 for mukey in raster_map.values()}
print("mukey areas")
print(mukey_areas)

# Accumulate area per mukey across all polygons in the watershed layer.
# total_area_sqm also includes raster values that have no entry in raster_map.
total_area_sqm = 0
for stat in stats:
    for raster_val, pixel_count in stat.items():
        area = pixel_count * pixel_area_sqm
        mukey = raster_map.get(raster_val)
        if mukey:
            mukey_areas[mukey] += area
        total_area_sqm += area
        print(f"mukey: {mukey}, area: {area}, total_area_sqm: {total_area_sqm}")


# Create a DataFrame from the mukey_areas dictionary

# Column order of the result table. Passing it explicitly to the DataFrame
# constructor keeps the columns present even when no mukey has any area,
# where concatenating an empty list of frames would raise ValueError.
mukey_columns = ['mukey', 'Percentage', 'Area in Acres', 'Area in Square Miles']

# Collect one plain record per mukey and build the DataFrame once, instead of
# creating a single-row DataFrame per mukey and concatenating them.
records = []
for mukey, area_sqm in mukey_areas.items():
    if area_sqm > 0:  # Only add entries with non-zero area
        percent = (area_sqm / total_area_sqm) * 100 if total_area_sqm != 0 else 0
        records.append({'mukey': mukey,
                        'Percentage': percent,
                        'Area in Acres': area_sqm * sqm_to_acre,
                        'Area in Square Miles': area_sqm * sqm_to_sqmile})

mukey_df = pd.DataFrame(records, columns=mukey_columns)

# Print and save the DataFrame
display(mukey_df)
mukey_df.to_csv(r'final_mukey_statistics.csv', index=False)
print("final_mukey_statistics.csv saved")



mukey areas
{'NoData': 0, '1725696': 0, '1725683': 0, '1725682': 0, '1725727': 0, '1725722': 0, '1725689': 0, '1725723': 0, '1725700': 0, '1725685': 0, '1725684': 0, '1725716': 0, '1725726': 0, '1725745': 0, '1725708': 0, '1725690': 0, '1725704': 0, '1725686': 0, '1725715': 0, '1725714': 0, '1725709': 0, '1725713': 0, '1725719': 0, '1725720': 0, '1725707': 0, '1725693': 0, '1725712': 0, '1725728': 0, '1725717': 0, '1725706': 0, '1725680': 0, '1725695': 0, '1725692': 0, '1725718': 0, '1725705': 0, '1725691': 0, '1725698': 0, '1725701': 0, '1725699': 0, '1725724': 0, '1725702': 0, '1725681': 0, '1725694': 0, '1725711': 0, '1725687': 0, '1725697': 0, '1725688': 0, '1725703': 0, '1413666': 0, '1413695': 0, '1413703': 0, '1413675': 0, '1413688': 0, '1413684': 0, '1413674': 0, '1413672': 0, '1413682': 0, '1413690': 0, '1413673': 0, '1413691': 0, '1413685': 0, '1413686': 0, '1413680': 0, '1413676': 0, '1413696': 0, '1413679': 0, '1725569': 0, '1725529': 0, '1725540': 0, '1725570': 0, '1725536

Unnamed: 0,mukey,Percentage,Area in Acres,Area in Square Miles
0,1725696,19.527739,104.824906,0.163788
1,1725683,9.091972,48.805709,0.076259
2,1725682,0.035860,0.192495,0.000301
3,1725727,1.263283,6.781303,0.010596
4,1725722,0.066334,0.356078,0.000556
...,...,...,...,...
63,1413683,0.110203,0.591569,0.000924
64,1413671,0.867721,4.657929,0.007278
65,1413678,0.012751,0.068448,0.000107
66,1413669,0.295440,1.585920,0.002478


final_mukey_statistics.csv saved


In [7]:
# Keep only the mukeys whose share of the total area is greater than 1%;
# the filtered table is computed once and reused for both outputs below.
major_mukeys = mukey_df.loc[mukey_df['Percentage'] > 1]

print("mukeys with percentage greater than 1%")
display(major_mukeys)

# Combined share of the total area covered by those mukeys
print("sum of percentages for all mukeys that have a percentage greater than 1%")
print(major_mukeys['Percentage'].sum())


mukeys with percentage greater than 1%


Unnamed: 0,mukey,Percentage,Area in Acres,Area in Square Miles
0,1725696,19.527739,104.824906,0.163788
1,1725683,9.091972,48.805709,0.076259
3,1725727,1.263283,6.781303,0.010596
5,1725689,1.51872,8.152488,0.012738
9,1725684,1.216513,6.530244,0.010203
10,1725716,1.504265,8.074897,0.012617
13,1725708,2.918168,15.664727,0.024476
14,1725690,2.084005,11.186938,0.01748
15,1725704,1.992813,10.697423,0.016715
17,1725715,1.208135,6.485271,0.010133


sum of percentages for all mukeys that have a percentage greater than 1%
91.36725288120365
