In [1]:
import os
import json
import pandas as pd
from glob import glob

# Input / output locations (relative to the notebook's working directory).
# output_path: CSV file the flattened table is written to.
# input_dir:   folder scanned for per-species JSON detail files.
output_path = "02_wrangled_data/02_general_description.csv"
input_dir = "01_raw_data/01_species_details"

In [2]:
# Cycle label mapping: raw `cycle` value -> human-readable frequency label.
# Values not listed here are passed through unchanged by the flattener.
cycle_map = {
    "Perennial": "Every year",
    "Annual": "Once a year",
    "Biennial": "Every 2 years"
}

# Records whose id is above this value are skipped by the flattener
# (the original note describes them as threatened plants).
MAX_GENERAL_PLANT_ID = 3000


# Flatten general plant record
def flatten_general_description(data):
    """Flatten one species-detail record into a row of the general-description table.

    Args:
        data: Mapping parsed from a single species-details JSON file.

    Returns:
        A dict keyed by output column name, or ``None`` when the record must be
        skipped: its ``id`` is missing / not a number, or it is greater than
        ``MAX_GENERAL_PLANT_ID``.
    """
    plant_id = data.get("id")
    # A missing or non-numeric id cannot be compared with the threshold below
    # (``None > 3000`` raises TypeError); skip such records instead of crashing
    # the whole load loop.
    if not isinstance(plant_id, (int, float)):
        return None
    if plant_id > MAX_GENERAL_PLANT_ID:
        return None  # Skip threatened plants

    raw_cycle = data.get("cycle")

    return {
        "general_plant_id": plant_id,
        # any() always yields a real bool; a bare `a or b or c` chain returns the
        # last operand, which leaks 0 / None into what should be a boolean column.
        "if_edible": any(data.get(key) for key in ("edible_fruit", "edible_leaf", "cuisine")),
        "if_indoors": data.get("indoor", False),
        "if_medicinal": data.get("medicinal", False),
        "if_poisonous": any(data.get(key) for key in ("poisonous_to_humans", "poisonous_to_pets")),
        "if_fruits": data.get("fruits", False),
        "if_flowers": data.get("flowers", False),
        "plant_type": data.get("type"),
        # Fall back to the raw value when there is no friendly label for it.
        "plant_cycle": cycle_map.get(raw_cycle, raw_cycle),
        "attracts": data.get("attracts", []),
        "propagation": data.get("propagation", []),
        "description": data.get("description")
    }

In [3]:
# Read every species-details JSON file in `input_dir` and keep one flattened
# row per file; files whose record the flattener rejects are left out.
json_files = glob(os.path.join(input_dir, "plant_species_details_*.json"))
flattened_data = []

for json_path in json_files:
    with open(json_path, "r", encoding="utf-8") as handle:
        record = flatten_general_description(json.load(handle))
    if not record:
        # The flattener signalled "skip this record".
        continue
    flattened_data.append(record)

In [4]:
# Build the output table, normalise the id column, sort by id, and write the CSV.
ordered_cols = [
    "general_plant_id", "if_edible", "if_indoors", "if_medicinal", "if_poisonous",
    "if_fruits", "if_flowers", "plant_type", "plant_cycle",
    "attracts", "propagation", "description"
]

# Passing `columns=` fixes the schema and column order up front, so an empty
# `flattened_data` (no matching files, or every record skipped) produces an
# empty table with headers instead of a KeyError in sort_values below.
df = pd.DataFrame(flattened_data, columns=ordered_cols)

# Coerce ids to a nullable integer dtype *before* sorting so the ordering is
# numeric; values that cannot be parsed become <NA>.
df["general_plant_id"] = pd.to_numeric(df["general_plant_id"], errors="coerce").astype("Int64")
df = df.sort_values(by="general_plant_id").reset_index(drop=True)

# os.path.dirname() is "" for a bare filename and os.makedirs("") raises,
# so only create the directory when there is one.
output_dir = os.path.dirname(output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
df.to_csv(output_path, index=False)