In [12]:
import os
import json
import pandas as pd
from glob import glob

# Paths (relative, so they resolve against the kernel's current working directory)
# Directory that is searched for "plant_species_hardiness_map_*.html" input files.
html_dir = "01_raw_data/03_hardiness_map"
# Destination CSV for the wrangled table; its parent directory is created before writing.
output_path = "02_wrangled_data/04_general_plant_distribution_map.csv"


In [13]:
# Extract distribution map info
# glob() yields paths in arbitrary, filesystem-dependent order; sort them so
# that re-running the cell always visits the files in the same sequence.
html_files = sorted(glob(os.path.join(html_dir, "plant_species_hardiness_map_*.html")))

records = []
skipped_files = []  # basenames whose ID token could not be parsed as an integer
for file in html_files:
    basename = os.path.basename(file)
    # The plant ID is the text after the last underscore, up to the first dot,
    # e.g. "plant_species_hardiness_map_42.html" -> "42".
    id_token = basename.split("_")[-1].split(".")[0]
    try:
        # Keep the try body minimal: only int() is expected to raise ValueError.
        general_plant_id = int(id_token)
    except ValueError:
        skipped_files.append(basename)  # Skip malformed filenames, but remember them
        continue
    # Always store forward slashes so the CSV is identical across operating systems.
    local_path = os.path.join(html_dir, basename).replace("\\", "/")
    records.append({
        "general_plant_id": general_plant_id,
        "distribution_map": local_path
    })

# Surface skipped files instead of dropping them silently.
if skipped_files:
    print("Skipped", len(skipped_files), "file(s) with a non-integer ID:", skipped_files)

In [14]:
# Create DataFrame and sort
# Declare the columns explicitly: with an empty `records` list a bare
# pd.DataFrame(records) has no columns and sort_values() raises KeyError.
df = pd.DataFrame(records, columns=["general_plant_id", "distribution_map"])
# Nullable integer dtype keeps the IDs integral in the CSV even if a value is missing.
df["general_plant_id"] = pd.to_numeric(df["general_plant_id"], errors="coerce").astype("Int64")
# mergesort is stable, so rows sharing an ID keep their input order.
df = df.sort_values(by="general_plant_id", kind="mergesort").reset_index(drop=True)

# Save to CSV
# os.makedirs("") raises, so only create the parent when the path has one.
output_dir = os.path.dirname(output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
df.to_csv(output_path, index=False)