In [10]:
import os
import json
import pandas as pd
from glob import glob

# Directory that holds the raw per-species JSON files
input_dir = "01_raw_data/01_species_details"
# Destination of the cleaned, flattened CSV
output_path = "02_wrangled_data/01_species_details_cleaned.csv"


In [11]:
# Collect every per-species JSON file. glob() returns matches in arbitrary,
# filesystem-dependent order, so the paths are sorted to make every run
# process the files in the same sequence.
json_files = sorted(glob(os.path.join(input_dir, "plant_species_details_*.json")))


In [19]:
# Flatten each JSON record
def safe_get(d, key, default=None):
    """Look up ``key`` in ``d``, falling back to ``default``.

    The fallback is returned when ``d`` is not a dict or does not contain
    ``key``. A key that is present is returned as-is, even if its value
    is None.
    """
    if not isinstance(d, dict):
        return default
    if key not in d:
        return default
    return d[key]

def _join_values(value):
    """Render a list-valued JSON field as one comma-separated string.

    Missing/null fields become an empty string, lists and tuples are joined
    with ", " (null entries are skipped), and any other scalar is returned
    as its string form instead of being iterated character by character.
    """
    if value is None:
        return ""
    if isinstance(value, (list, tuple)):
        return ", ".join(str(item) for item in value if item is not None)
    return str(value)


def flatten_plant_data(data):
    """Flatten one species-detail record into a single-level dict.

    Parameters
    ----------
    data : dict
        Parsed JSON object for one plant species.

    Returns
    -------
    dict
        One row for the cleaned table. List-valued fields
        (``scientific_name``, ``other_name``, ``sunlight``) are collapsed
        into comma-separated strings; flag fields default to False when the
        key is absent.
    """
    return {
        "plant_id": data.get("id"),
        "common_name": data.get("common_name"),
        # A key that is present but null (or a bare string) must not crash
        # or be split into characters, hence the helper rather than a raw join.
        "scientific_name": _join_values(data.get("scientific_name")),
        "other_name": _join_values(data.get("other_name")),
        # Always False here; presumably filled in by a later step - TODO confirm.
        "if_threatened": False,
        # Edible if either the fruit or the leaf is flagged edible.
        "if_edible": data.get("edible_fruit", False) or data.get("edible_leaf", False),
        "if_indoors": data.get("indoor", False),
        "if_medicinal": data.get("medicinal", False),
        # Poisonous if flagged for either humans or pets.
        "if_poisonous": data.get("poisonous_to_humans", False) or data.get("poisonous_to_pets", False),
        "if_fruits": data.get("fruits", False),
        "if_flowers": data.get("flowers", False),
        "sun_expose": _join_values(data.get("sunlight")),
        "watering": data.get("watering"),
        "plant_cycle": data.get("cycle"),
        "growth_rate": data.get("growth_rate"),
    }
    


In [20]:
# Read each JSON file and collect its flattened record
flattened_data = []
for json_path in json_files:
    with open(json_path, "r", encoding="utf-8") as handle:
        raw_record = json.load(handle)
    # The payload is fully parsed at this point, so flattening can happen
    # after the file handle has been closed.
    flattened_data.append(flatten_plant_data(raw_record))



In [21]:
# Create DataFrame and sort
if not flattened_data:
    # With no records the frame would have no columns and the sort below
    # would fail with an opaque KeyError on "plant_id"; fail clearly instead.
    raise ValueError(
        "No species records were loaded; check that the input directory "
        "contains plant_species_details_*.json files."
    )

df = (
    pd.DataFrame(flattened_data)
    # mergesort is stable, so rows sharing a plant_id keep their load order
    .sort_values(by="plant_id", kind="mergesort")
    .reset_index(drop=True)
)

# Add foreign keys (both are plain copies of plant_id)
df["general_plant_id"] = df["plant_id"]
df["threatened_plant_id"] = df["plant_id"]

# Reorder columns: identifiers first, then names, flags, and care attributes
ordered_cols = [
    "plant_id", "general_plant_id", "threatened_plant_id",
    "common_name", "scientific_name", "other_name",
    "if_threatened", "if_edible", "if_indoors", "if_medicinal",
    "if_poisonous", "if_fruits", "if_flowers",
    "sun_expose", "watering", "plant_cycle", "growth_rate"
]
df = df[ordered_cols]

# Save to CSV
output_dir = os.path.dirname(output_path)
if output_dir:
    # dirname() is "" for a bare filename, and os.makedirs("") raises,
    # so only create the directory when the path actually has one.
    os.makedirs(output_dir, exist_ok=True)
df.to_csv(output_path, index=False)