# core

> Fill in a module description here

In [1]:
#| default_exp core

In [2]:
#| hide
from nbdev.showdoc import *

  import pkg_resources,importlib


In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
import rasterio
import numpy as np
import pandas as pd
import geopandas as gpd

In [5]:
#| hide
from importlib.resources import files

def get_demo_path(filename):
    return files("muir.demo").joinpath(filename)

In [6]:
#| hide
get_demo_path("dem.tif")

Path('/Volumes/creek/muir/muir/demo/dem.tif')

Let's get the stats and area of a LULC raster.

In [7]:
#| export
def get_unique_classes(raster_path):
    with rasterio.open(raster_path) as src:
        raster_data = src.read(1)
        unique_classes = np.unique(raster_data)
    return unique_classes

We can get the unique labels like this:

In [8]:
lulc_bbox_path = get_demo_path("lulc/mapbiomas-30m-2023-bbox.tif")
lulc_classes = get_unique_classes(lulc_bbox_path)
lulc_classes

array([ 0,  3,  5,  9, 11, 12, 15, 20, 21, 23, 24, 25, 29, 30, 31, 32, 33,
       39, 41, 46, 48, 49, 50], dtype=uint8)

In [9]:
#| export
def parse_clr(lulc_clr_path):
    records = []
    with open(lulc_clr_path, "r") as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) < 6:
                continue  # skip malformed lines
            class_ = int(parts[0])
            r, g, b = map(int, parts[1:4])
            class_name = " ".join(parts[5:])
            color_rgb = (r, g, b)
            color_hex = "#{:02x}{:02x}{:02x}".format(r, g, b)
            records.append({
                "class": class_,
                "color_rgb": color_rgb,
                "color_hex": color_hex,
                "class_name": class_name
            })
    df = pd.DataFrame(records)
    return df

We can parse LULC codes like this:

In [10]:
lulc_clr_path = get_demo_path("lulc/mapbiomas-lulc-color-codes.clr")
class_mapping = parse_clr(lulc_clr_path)
print(f"Total rows: {len(class_mapping)}")
class_mapping.head(2)

Total rows: 37


Unnamed: 0,class,color_rgb,color_hex,class_name
0,1,"(31, 141, 73)",#1f8d49,Forest
1,3,"(31, 141, 73)",#1f8d49,Forest Formation


In [11]:
#| export
def load_lulc(lulc_tif_path, lulc_clr_path):
    unique_classes = get_unique_classes(lulc_tif_path)
    class_mapping = parse_clr(lulc_clr_path)
    relevant_classes = class_mapping[class_mapping['class'].isin(unique_classes)].reset_index(drop=True)
    return relevant_classes

In [12]:
lulc_df = load_lulc(lulc_bbox_path, lulc_clr_path)
print(f"Total rows: {len(lulc_df)}")
lulc_df.head(2)

Total rows: 22


Unnamed: 0,class,color_rgb,color_hex,class_name
0,3,"(31, 141, 73)",#1f8d49,Forest Formation
1,5,"(4, 56, 29)",#04381d,Mangrove


Now, we can get some stats for each LULC class.

In [13]:
#| export
def lulc_pixel_stats(raster_path):
    """
    Returns a DataFrame with the number of pixels and area (in square meters) for each unique class in the raster.
    """
    with rasterio.open(raster_path) as src:
        data = src.read(1)
        pixel_area = abs(src.transform.a) * abs(src.transform.e)  # pixel width * pixel height
    classes, counts = np.unique(data, return_counts=True)
    stats = []
    for cls, count in zip(classes, counts):
        area = count * pixel_area
        stats.append({
            'class': cls,
            'pixel_count': count,
            'area_m2': area
        })
    return pd.DataFrame(stats)


In [14]:
pixel_stats = lulc_pixel_stats(lulc_bbox_path)
pixel_stats.head(2)

Unnamed: 0,class,pixel_count,area_m2
0,0,8198013,0.5954
1,3,11060266,0.803278


In [15]:
lulc_stats = pd.merge(lulc_df, pixel_stats, on="class", how="left")
lulc_stats.head(2)

Unnamed: 0,class,color_rgb,color_hex,class_name,pixel_count,area_m2
0,3,"(31, 141, 73)",#1f8d49,Forest Formation,11060266,0.803278
1,5,"(4, 56, 29)",#04381d,Mangrove,110531,0.008028


In [16]:
lulc_stats_path = get_demo_path("lulc/mapbiomas-30m-2023-bbox.csv")
lulc_stats.to_csv(lulc_stats_path, index=False)

## Biophysical Table

In [17]:
from muir import BIOPHYSICAL_DB_COLUMNS

In [18]:
#| export
def make_biophysical_table_template(
    lulc_df: pd.DataFrame,         # DataFrame with LULC classes and names
    columns: list,                 # List of column names for the biophysical values
    output_csv_path: str           # Output CSV file path
    ) -> pd.DataFrame:
    df = lulc_df.copy().reset_index(drop=True)
    table = pd.DataFrame()
    table["lucode"] = df["class"]
    table["description"] = df["class_name"]
    for col in columns:
        table[col] = ""
    table.to_csv(output_csv_path, index=False)
    return table

We can for example create a template for the Sediment Delivery Ratio (SDR) model we can then import to e.g. Excel or Google Sheets.

In [19]:
SDR_db_columns = BIOPHYSICAL_DB_COLUMNS["SDR"]
biophysical_table_csv = get_demo_path("biophysical_table/template_SDR.csv")
db_template = make_biophysical_table_template(
    lulc_df, 
    SDR_db_columns,
    biophysical_table_csv
)
db_template.head(2)

Unnamed: 0,lucode,description,usle_c,usle_p
0,3,Forest Formation,,
1,5,Mangrove,,


### Validate Biophysical Table

Ensure your biophysical table contains the information it needs before running your InVEST model. This can save hours of frustration!

In [62]:
def check_biophysical_table(
    table: pd.DataFrame,
    model: str = None,
    lulc_df: pd.DataFrame = None
) -> None:
    """
    Checks a biophysical table for common issues and prints results with emojis.
    """
    # 1. Missing values
    missing_cells = table.isnull().sum().sum()
    if missing_cells > 0:
        print(f"❌ Missing values: {int(missing_cells)} cells are empty.")
        for col in table.columns:
            n_missing = table[col].isnull().sum()
            if n_missing > 0:
                print(f"   - Column '{col}': {n_missing} missing")
    else:
        print("✅ No missing values.")

    # 2. Missing columns (if model specified)
    if model:
        required_cols = set(BIOPHYSICAL_DB_COLUMNS.get(model, []))
        missing_cols = required_cols - set(table.columns)
        if missing_cols:
            print(f"❌ Missing required columns for model '{model}': {sorted(missing_cols)}")
        else:
            print(f"✅ All required columns for model '{model}' are present.")

    # 3. Missing lucode column (regardless)
    if 'lucode' not in table.columns:
        print("❌ Missing 'lucode' column.")
    else:
        print("✅ 'lucode' column present.")

    # 4. Missing/extra classes (if lulc_df specified)
    if lulc_df is not None and 'class' in lulc_df.columns and 'lucode' in table.columns:
        expected_classes = set(lulc_df['class'])
        table_classes = set(table['lucode'])
        missing_classes = expected_classes - table_classes
        extra_classes = table_classes - expected_classes

        if missing_classes:
            missing_info = [
                f"{cls} ({lulc_df.loc[lulc_df['class'] == cls, 'class_name'].values[0]})"
                for cls in sorted(missing_classes)
            ]
            print(f"❌ Missing LULC classes in table: {missing_info}")
        else:
            print("✅ All expected LULC classes are present.")

        if extra_classes:
            extra_info = []
            for cls in sorted(extra_classes):
                # Try to get class name from table if available, else just show code
                name = None
                if 'description' in table.columns:
                    name_row = table.loc[table['lucode'] == cls, 'description']
                    if not name_row.empty:
                        name = name_row.values[0]
                extra_info.append(f"{cls} ({name})" if name else str(cls))
            print(f"🟡 Extra LULC classes in table: {extra_info}")
        else:
            print("✅ No extra LULC classes in table.")

    # 5. Duplicate rows
    # Check for duplicated entire rows
    if table.duplicated().any():
        dup_rows = table[table.duplicated()].index.tolist()
        print(f"❌ Duplicate rows at indices: {dup_rows}")
    else:
        print("✅ No duplicate rows.")

    # Check for duplicated lulc class (lucode)
    if 'lucode' in table.columns:
        duplicated_lucode = table['lucode'][table['lucode'].duplicated()].unique()
        if len(duplicated_lucode) > 0:
            print(f"❌ Duplicated 'lucode' values: {duplicated_lucode.tolist()}")
        else:
            print("✅ No duplicated 'lucode' values.")

Let's try it out.

In [63]:
biophysical_table_path = get_demo_path("biophysical_table/tests/demo.csv")
demo_table = pd.read_csv(biophysical_table_path)
demo_table.head(2)

Unnamed: 0,lucode,description,usle_c,usle_p
0,3,Forest Formation,0.001,1.0
1,5,Mangrove,0.002,1.0


In [64]:
check_biophysical_table(demo_table, "SDR", lulc_df)

✅ No missing values.
✅ All required columns for model 'SDR' are present.
✅ 'lucode' column present.
✅ All expected LULC classes are present.
✅ No extra LULC classes in table.
✅ No duplicate rows.
✅ No duplicated 'lucode' values.


We can now test different tables.

In [65]:
test_table_path = get_demo_path("biophysical_table/tests/extra_class.csv")
test_table = pd.read_csv(test_table_path, keep_default_na=True)
check_biophysical_table(test_table, "SDR", lulc_df)

✅ No missing values.
✅ All required columns for model 'SDR' are present.
✅ 'lucode' column present.
✅ All expected LULC classes are present.
🟡 Extra LULC classes in table: ['55 (Extra Class)']
✅ No duplicate rows.
✅ No duplicated 'lucode' values.


In [66]:
test_table_path = get_demo_path("biophysical_table/tests/missing_class.csv")
test_table = pd.read_csv(test_table_path, keep_default_na=True)
check_biophysical_table(test_table, "SDR", lulc_df)

✅ No missing values.
✅ All required columns for model 'SDR' are present.
✅ 'lucode' column present.
❌ Missing LULC classes in table: ['15 (Pasture)']
✅ No extra LULC classes in table.
✅ No duplicate rows.
✅ No duplicated 'lucode' values.


In [67]:
test_table_path = get_demo_path("biophysical_table/tests/missing_lucode_column.csv")
test_table = pd.read_csv(test_table_path, keep_default_na=True)
check_biophysical_table(test_table, "SDR", lulc_df)

✅ No missing values.
✅ All required columns for model 'SDR' are present.
❌ Missing 'lucode' column.
✅ No duplicate rows.


In [68]:
test_table_path = get_demo_path("biophysical_table/tests/missing_value.csv")
test_table = pd.read_csv(test_table_path, keep_default_na=True)
check_biophysical_table(test_table, "SDR", lulc_df)

❌ Missing values: 1 cells are empty.
   - Column 'usle_c': 1 missing
✅ All required columns for model 'SDR' are present.
✅ 'lucode' column present.
✅ All expected LULC classes are present.
✅ No extra LULC classes in table.
✅ No duplicate rows.
✅ No duplicated 'lucode' values.


In [69]:
test_table_path = get_demo_path("biophysical_table/tests/missing_values.csv")
test_table = pd.read_csv(test_table_path, keep_default_na=True)
check_biophysical_table(test_table, "SDR", lulc_df)

❌ Missing values: 3 cells are empty.
   - Column 'usle_c': 2 missing
   - Column 'usle_p': 1 missing
✅ All required columns for model 'SDR' are present.
✅ 'lucode' column present.
✅ All expected LULC classes are present.
✅ No extra LULC classes in table.
✅ No duplicate rows.
✅ No duplicated 'lucode' values.


In [70]:
test_table_path = get_demo_path("biophysical_table/tests/duplicate_row.csv")
test_table = pd.read_csv(test_table_path, keep_default_na=True, index_col=False)
check_biophysical_table(test_table, "SDR", lulc_df)

✅ No missing values.
✅ All required columns for model 'SDR' are present.
✅ 'lucode' column present.
✅ All expected LULC classes are present.
✅ No extra LULC classes in table.
✅ No duplicate rows.
❌ Duplicated 'lucode' values: [20]


In [71]:
test_table_path = get_demo_path("biophysical_table/tests/duplicate_row_2.csv")
test_table = pd.read_csv(test_table_path, keep_default_na=True, index_col=False)
check_biophysical_table(test_table, "SDR", lulc_df)

✅ No missing values.
✅ All required columns for model 'SDR' are present.
✅ 'lucode' column present.
✅ All expected LULC classes are present.
✅ No extra LULC classes in table.
❌ Duplicate rows at indices: [11]
❌ Duplicated 'lucode' values: [20]


In [20]:
#| hide
import nbdev; nbdev.nbdev_export()