In [50]:
import os
import json
import pandas as pd
from glob import glob

# --- Input / output locations (relative to the notebook's working directory) ---

# Directory that holds one JSON file per general plant species
json_dir = "01_raw_data/01_species_details"

# CSV listing the threatened plants
csv_path = "01_raw_data/01_threatened-plant-living-collection-plan.csv"

# Destination of the combined main table
output_path = "02_wrangled_data/01_plant_main_table.csv"

In [51]:
# Helpers that flatten each raw record (JSON object or CSV row) into one main-table row
def first_or_none(lst):
    """Return the first element of ``lst``, or ``None``.

    ``None`` is returned when ``lst`` is not a ``list`` instance (tuples,
    strings, ``None`` ... all count as "not a list") or when it is empty.
    """
    if not isinstance(lst, list) or not lst:
        return None
    return lst[0]


# Source cycle label -> wording stored in the "plant_cycle" column.
# The flatten_* functions look labels up with ``cycle_map.get(label, label)``,
# so any label not listed here is kept as-is.
cycle_map = dict(
    [
        ("Perennial", "Every year"),
        ("Annual", "Once a year"),
        ("Biennial", "Every 2 years"),
    ]
)

def flatten_general(data):
    """Flatten one general-plant JSON record into a main-table row.

    Parameters
    ----------
    data : dict
        Parsed JSON object for a single species. Every key is optional;
        absent keys fall back to ``None`` / ``False`` / ``[]``.

    Returns
    -------
    dict
        One row keyed by the main-table column names. ``plant_id`` and
        ``general_plant_id`` both carry the record's ``"id"``;
        ``threatened_plant_id`` is always ``None`` and ``if_threatened`` is
        always ``False``. Every ``if_*`` value is a real ``bool`` and
        ``sun_expose`` is never ``None``.
    """

    def flag(*keys):
        # True when any of the given keys holds a truthy value.
        # The explicit bool() matters: a bare ``a or b`` evaluates to its last
        # operand (possibly None or 0), and dict.get's default is NOT used when
        # the key exists with a JSON null, so un-coerced flags end up as a mix
        # of None / ints / bools in the output column.
        return any(bool(data.get(key)) for key in keys)

    plant_id = data.get("id")
    cycle = data.get("cycle")
    return {
        "plant_id": plant_id,
        "general_plant_id": plant_id,
        "threatened_plant_id": None,
        "common_name": data.get("common_name"),
        # Name fields are expected to be lists; only the first entry is kept.
        "scientific_name": first_or_none(data.get("scientific_name")),
        "other_name": first_or_none(data.get("other_name")),
        "if_threatened": False,
        "if_edible": flag("edible_fruit", "edible_leaf", "cuisine"),
        "if_indoors": flag("indoor"),
        "if_medicinal": flag("medicinal"),
        "if_poisonous": flag("poisonous_to_humans", "poisonous_to_pets"),
        "if_fruits": flag("fruits"),
        "if_flowers": flag("flowers"),
        # ``or []`` also covers a key that is present but null/empty.
        "sun_expose": data.get("sunlight") or [],
        "watering": data.get("watering"),
        # Unknown cycle labels are passed through unchanged.
        "plant_cycle": cycle_map.get(cycle, cycle),
        "growth_rate": data.get("growth_rate"),
    }

def flatten_threatened(row, index_offset, base_id=3001):
    """Flatten one threatened-plant CSV row into a main-table row.

    Parameters
    ----------
    row : mapping (e.g. a ``pandas.Series``)
        Must provide ``"Common Name"``, ``"Species Name"``, ``"Sun"`` and
        ``"Habit"``.
    index_offset : int
        Offset of this row; added to ``base_id`` to form the plant ID.
    base_id : int, default 3001
        ID given to the row with offset 0.
        NOTE(review): 3001 is presumably chosen to sit above every general
        plant ID -- confirm the two ranges cannot overlap.

    Returns
    -------
    dict
        One row keyed by the main-table column names. ``plant_id`` and
        ``threatened_plant_id`` both carry the generated ID,
        ``general_plant_id`` is ``None``, ``if_threatened`` is ``True`` and
        every other ``if_*`` flag is ``False``.
    """
    plant_id = base_id + index_offset
    sun = row["Sun"]
    habit = row["Habit"]
    return {
        "plant_id": plant_id,
        "general_plant_id": None,
        "threatened_plant_id": plant_id,
        "common_name": row["Common Name"],
        "scientific_name": row["Species Name"],
        "other_name": None,
        "if_threatened": True,
        "if_edible": False,
        "if_indoors": False,
        "if_medicinal": False,
        "if_poisonous": False,
        "if_fruits": False,
        "if_flowers": False,
        # Wrap the single value in a list; a missing value becomes [].
        "sun_expose": [sun] if pd.notna(sun) else [],
        "watering": None,
        # Unknown habit labels are passed through unchanged.
        "plant_cycle": cycle_map.get(habit, habit),
        "growth_rate": None,
    }
    


In [52]:
# Read every species-detail JSON file and flatten it into one row dict
json_pattern = os.path.join(json_dir, "plant_species_details_*.json")
json_files = glob(json_pattern)

general_data = []
for json_path in json_files:
    with open(json_path, "r", encoding="utf-8") as handle:
        record = json.load(handle)
    general_data.append(flatten_general(record))

# Read the threatened-plant CSV and flatten it row by row; the DataFrame
# index label of each row is what gets passed on as the ID offset
threatened_df = pd.read_csv(csv_path)
threatened_data = []
for row_label, csv_row in threatened_df.iterrows():
    threatened_data.append(flatten_threatened(csv_row, row_label))


In [53]:
# Stack general and threatened rows into one table, ordered by plant_id
all_rows = general_data + threatened_data
combined_df = (
    pd.DataFrame(all_rows)
    .sort_values(by="plant_id")
    .reset_index(drop=True)
)

# Final column layout: IDs, names, boolean flags, then growing traits
id_cols = ["plant_id", "general_plant_id", "threatened_plant_id"]
name_cols = ["common_name", "scientific_name", "other_name"]
flag_cols = [
    "if_threatened", "if_edible", "if_indoors", "if_medicinal",
    "if_poisonous", "if_fruits", "if_flowers",
]
trait_cols = ["sun_expose", "watering", "plant_cycle", "growth_rate"]
ordered_cols = id_cols + name_cols + flag_cols + trait_cols
combined_df = combined_df[ordered_cols]

# Store the ID columns as nullable Int64: values that cannot be parsed as
# numbers are coerced to missing, and missing IDs stay <NA> rather than
# forcing the column to float
for id_col in id_cols:
    numeric_ids = pd.to_numeric(combined_df[id_col], errors="coerce")
    combined_df[id_col] = numeric_ids.astype("Int64")

# Write the table, creating the output directory first if needed
output_dir = os.path.dirname(output_path)
os.makedirs(output_dir, exist_ok=True)
combined_df.to_csv(output_path, index=False)