In [14]:
import os
import json
import pandas as pd
from glob import glob

# Paths (relative strings, so they resolve against the current working directory)
# Directory globbed for plant_species_details_*.json files
details_dir = "01_raw_data/01_species_details"
# Directory globbed for plant_species_care_guide_*.json files
care_dir = "01_raw_data/02_care_guide"
# Destination of the flattened general-care CSV written at the end of the notebook
output_path = "02_wrangled_data/03_general_care.csv"

In [15]:
# Cycle label mapping: cycle label -> recurrence phrase.
# NOTE(review): no other cell visible in this notebook section reads cycle_map —
# confirm it is still needed.
cycle_map = dict(
    [
        ("Perennial", "Every year"),
        ("Annual", "Once a year"),
        ("Biennial", "Every 2 years"),
    ]
)

In [16]:
# Care guide parser
def parse_care_sections(sections):
    """Pick the watering, sunlight and pruning descriptions out of care-guide sections.

    Parameters
    ----------
    sections : iterable
        Entries are read with ``.get("type")`` and ``.get("description")``.

    Returns
    -------
    dict
        Keys ``watering_guide``, ``sunlight_guide`` and ``pruning_guide``. A value
        stays ``None`` unless a section of the matching type has a truthy
        description; when several sections share a type, the last one wins.
    """
    # (section type, output key) pairs, in the order the output keys are created.
    slots = (
        ("watering", "watering_guide"),
        ("sunlight", "sunlight_guide"),
        ("pruning", "pruning_guide"),
    )
    guide = {slot: None for _, slot in slots}

    for entry in sections:
        kind = entry.get("type")
        text = entry.get("description")
        # Sections lacking a type or a description contribute nothing.
        if not kind or not text:
            continue
        for type_name, slot in slots:
            if kind == type_name:
                guide[slot] = text
                break

    return guide

In [17]:
# Flatten general plant record with care guide
def flatten_general_care(details, care, max_plant_id=3000):
    """Flatten one species-details record and its care guide into a single row.

    Parameters
    ----------
    details : dict
        Parsed species-details JSON. Only read through ``.get``.
    care : dict or None
        Parsed care-guide JSON for the same species. The sections are taken
        from ``care["data"][0]["section"]``; ``None``, a missing/empty ``data``
        list or a null ``section`` all produce empty guides instead of raising.
    max_plant_id : int, optional
        Records whose id is greater than this are skipped (default 3000, the
        cut-off previously hard-coded here).

    Returns
    -------
    dict or None
        The flattened row, or ``None`` when the record has no id or its id is
        above ``max_plant_id``.
    """
    plant_id = details.get("id")
    # A record without an id cannot be keyed, and ``None > max_plant_id`` would
    # raise TypeError, so treat it like any other skipped record.
    if plant_id is None or plant_id > max_plant_id:
        # Original note: "Skip threatened plants".
        # NOTE(review): that ids above the cut-off are threatened plants is not
        # verifiable from this notebook — confirm.
        return None

    # Extract care guide sections, tolerating absent/empty/null levels.
    care_items = care.get("data") if isinstance(care, dict) else None
    first_item = care_items[0] if isinstance(care_items, list) and care_items else {}
    if not isinstance(first_item, dict):
        first_item = {}
    care_sections = first_item.get("section") or []
    guide = parse_care_sections(care_sections)

    # Compose benchmark. ``or {}`` covers a key that is present but null, which
    # the ``.get(key, {})`` default does not.
    benchmark = details.get("watering_general_benchmark") or {}
    benchmark_str = None
    if isinstance(benchmark, dict) and benchmark.get("value") and benchmark.get("unit"):
        benchmark_str = f"At least once {benchmark['value']} {benchmark['unit']}"

    # Compose pruning count. Accept either a list whose first entry is an
    # {"amount", "interval"} dict, or that dict on its own.
    pruning_count = details.get("pruning_count")
    if isinstance(pruning_count, list):
        first_entry = pruning_count[0] if pruning_count else None
    else:
        first_entry = pruning_count
    pruning_str = None
    if isinstance(first_entry, dict) and "amount" in first_entry and "interval" in first_entry:
        pruning_str = f"{first_entry['amount']} times {first_entry['interval']}"

    # Compose flowers detail (only when the plant flowers and a season is given).
    flowers_detail = None
    if details.get("flowers") and details.get("flowering_season"):
        flowers_detail = f"Flowers in {details['flowering_season']}"

    return {
        "general_plant_id": plant_id,
        "watering": details.get("watering"),
        "watering_general_benchmark": benchmark_str,
        "sunlight": details.get("sunlight", []),
        "soil": details.get("soil", []),
        "drought_tolerant": details.get("drought_tolerant", False),
        "salt_tolerant": details.get("salt_tolerant", False),
        "pruning_month": details.get("pruning_month", []),
        "pruning_count": pruning_str,
        "pest_susceptibility": details.get("pest_susceptibility", []),
        "flowers_detail": flowers_detail,
        "harvest_season": details.get("harvest_season"),
        "growth_rate": details.get("growth_rate"),
        "maintenance": details.get("maintenance"),
        "care_level": details.get("care_level"),
        "watering_guide": guide["watering_guide"],
        "sunlight_guide": guide["sunlight_guide"],
        "pruning_guide": guide["pruning_guide"]
    }

In [18]:
# Collect the paths of the species-details and care-guide JSON files.
# glob() yields matches in arbitrary, filesystem-dependent order; sorting makes
# every run process the files in the same sequence (this matters when two care
# files carry the same species_id, because the later one overwrites the earlier).
detail_files = sorted(glob(os.path.join(details_dir, "plant_species_details_*.json")))
care_files = sorted(glob(os.path.join(care_dir, "plant_species_care_guide_*.json")))


In [19]:
# Build care guide lookup by species_id
care_lookup = {}
for file in care_files:
    with open(file, "r", encoding="utf-8") as f:
        care_data = json.load(f)
    # The species_id lives on the first item of "data". A missing, null or
    # empty "data" list (or a non-dict first item) has no id to key on, so the
    # file is skipped instead of raising IndexError/AttributeError.
    care_items = care_data.get("data") if isinstance(care_data, dict) else None
    if not (isinstance(care_items, list) and care_items and isinstance(care_items[0], dict)):
        continue
    species_id = care_items[0].get("species_id")
    if species_id:
        care_lookup[species_id] = care_data

# Flatten and combine: one row per details file, joined to its care guide by id.
flattened_data = []
for file in detail_files:
    with open(file, "r", encoding="utf-8") as f:
        details = json.load(f)
    species_id = details.get("id")
    care = care_lookup.get(species_id)  # None when the species has no care guide
    record = flatten_general_care(details, care)
    # flatten_general_care returns None for records it decides to skip.
    if record:
        flattened_data.append(record)


In [20]:
# Column order of the output table
ordered_cols = [
    "general_plant_id", "watering", "watering_general_benchmark", "sunlight", "soil",
    "drought_tolerant", "salt_tolerant", "pruning_month", "pruning_count",
    "pest_susceptibility", "flowers_detail", "harvest_season", "growth_rate",
    "maintenance", "care_level", "watering_guide", "sunlight_guide", "pruning_guide"
]

# Create DataFrame and sort.
# Passing columns= fixes the schema and order up front, so an empty record list
# still yields a frame with these columns instead of failing in sort_values
# with KeyError('general_plant_id').
df = pd.DataFrame(flattened_data, columns=ordered_cols)
# Coerce ids to nullable integers before sorting so the ordering is numeric;
# values that cannot be parsed become <NA> and sort last.
df["general_plant_id"] = pd.to_numeric(df["general_plant_id"], errors="coerce").astype("Int64")
df = df.sort_values(by="general_plant_id").reset_index(drop=True)


In [21]:
# Save to CSV
output_dir = os.path.dirname(output_path)
# dirname() is "" when output_path is a bare filename, and os.makedirs("")
# raises FileNotFoundError, so only create the directory when there is one.
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
df.to_csv(output_path, index=False)