In [9]:
import os
import json
import pandas as pd
from glob import glob

# Paths
details_dir = "01_raw_data/01_species_details"
care_dir = "01_raw_data/02_care_guide"
output_path = "02_wrangled_data/Table03_GeneralPlantCareGuideTable.csv"

In [10]:
# Cycle label mapping
cycle_map = {
    "Perennial": "Every year",
    "Annual": "Once a year",
    "Biennial": "Every 2 years"
}

In [11]:
# Care guide parser
def parse_care_sections(sections):
    guide = {"watering_guide": None, "sunlight_guide": None, "pruning_guide": None}
    for section in sections:
        section_type = section.get("type")
        description = section.get("description")
        if section_type and description:
            if section_type == "watering":
                guide["watering_guide"] = description
            elif section_type == "sunlight":
                guide["sunlight_guide"] = description
            elif section_type == "pruning":
                guide["pruning_guide"] = description
    return guide

In [12]:
# Flatten general plant record with care guide
def flatten_general_care(details, care):
    plant_id = details.get("id")
    if plant_id > 3000:
        return None  # Skip threatened plants

    # Extract care guide sections
    care_sections = care.get("data", [{}])[0].get("section", []) if care else []
    guide = parse_care_sections(care_sections)

    # Compose benchmark
    benchmark = details.get("watering_general_benchmark", {})
    benchmark_str = None
    if benchmark.get("value") and benchmark.get("unit"):
        benchmark_str = f"At least once {benchmark['value']} {benchmark['unit']}"

    # Compose pruning count
    pruning_count_list = details.get("pruning_count", [])
    pruning_str = None
    if isinstance(pruning_count_list, list) and pruning_count_list:
        first_entry = pruning_count_list[0]
        if isinstance(first_entry, dict) and "amount" in first_entry and "interval" in first_entry:
            pruning_str = f"{first_entry['amount']} times {first_entry['interval']}"


    # Compose flowers detail
    flowers_detail = None
    if details.get("flowers") and details.get("flowering_season"):
        flowers_detail = f"Flowers in {details['flowering_season']}"

    return {
        "general_plant_id": plant_id,
        
        "watering": details.get("watering"),
        "watering_general_benchmark": benchmark_str,
        # "sunlight": details.get("sunlight", []),
        "sunlight": json.dumps(details.get("sunlight", []), ensure_ascii=False), # Change a little bit to fit MySQL Workbench
        # "soil": details.get("soil", []),
        "soil": json.dumps(details.get("soil", []), ensure_ascii=False), # Change a little bit to fit MySQL Workbench
        "drought_tolerant": details.get("drought_tolerant", False),
        "salt_tolerant": details.get("salt_tolerant", False),
        # "pruning_month": details.get("pruning_month", []),
        "pruning_month": json.dumps(details.get("pruning_month", []), ensure_ascii=False), # Change a little bit to fit MySQL Workbench
        "pruning_count": pruning_str,
        # "pest_susceptibility": details.get("pest_susceptibility", []),
        "pest_susceptibility": json.dumps(details.get("pest_susceptibility", []), ensure_ascii=False), # Change a little bit to fit MySQL Workbench
        "flowers_detail": flowers_detail,
        "harvest_season": details.get("harvest_season"),
        "growth_rate": details.get("growth_rate"),
        "maintenance": details.get("maintenance"),
        "care_level": details.get("care_level"),
        
        "watering_guide": guide["watering_guide"],
        "sunlight_guide": guide["sunlight_guide"],
        "pruning_guide": guide["pruning_guide"]
    }

In [13]:
# Load general plant JSON files
detail_files = glob(os.path.join(details_dir, "plant_species_details_*.json"))
care_files = glob(os.path.join(care_dir, "plant_species_care_guide_*.json"))

In [14]:
# Build care guide lookup by species_id
care_lookup = {}
for file in care_files:
    with open(file, "r", encoding="utf-8") as f:
        care_data = json.load(f)
        species_id = care_data.get("data", [{}])[0].get("species_id")
        if species_id:
            care_lookup[species_id] = care_data

# Flatten and combine
flattened_data = []
for file in detail_files:
    with open(file, "r", encoding="utf-8") as f:
        details = json.load(f)
        species_id = details.get("id")
        care = care_lookup.get(species_id)
        record = flatten_general_care(details, care)
        if record:
            flattened_data.append(record)


In [15]:
# Create DataFrame and sort
df = pd.DataFrame(flattened_data)
df = df.sort_values(by="general_plant_id").reset_index(drop=True)
df["general_plant_id"] = pd.to_numeric(df["general_plant_id"], errors="coerce").astype("Int64")

# Column order
ordered_cols = [
    "general_plant_id", "watering", "watering_general_benchmark", "sunlight", "soil",
    "drought_tolerant", "salt_tolerant", "pruning_month", "pruning_count",
    "pest_susceptibility", "flowers_detail", "harvest_season", "growth_rate",
    "maintenance", "care_level", "watering_guide", "sunlight_guide", "pruning_guide"
]
df = df[ordered_cols]

In [16]:
# Save to CSV
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)

## 执行 LOAD DATA LOCAL INFILE 命令上传到数据库

In [18]:
import mysql.connector
from mysql.connector import Error

# 数据库连接配置
db_config = {
    'host': 'database-plantx.cqz06uycysiz.us-east-1.rds.amazonaws.com',
    'user': 'zihan',
    'password': '2002317Yzh12138.',
    'database': 'FIT5120_PlantX_Database',
    'allow_local_infile': True,
    'use_pure': True  # 使用纯Python实现
}

try:
    # 建立连接
    connection = mysql.connector.connect(**db_config)
    
    if connection.is_connected():
        print("成功连接到 MySQL 服务器")
        
        # 创建游标
        cursor = connection.cursor()
        
        # 构建 LOAD DATA LOCAL INFILE 命令
        # 注意：请将下面的文件路径替换为你实际的CSV文件路径
        load_data_query = """
        LOAD DATA LOCAL INFILE '02_wrangled_data/Table03_GeneralPlantCareGuideTable.csv'
        INTO TABLE Table03_GeneralPlantCareGuideTable
        CHARACTER SET utf8mb4
        FIELDS TERMINATED BY ',' 
        OPTIONALLY ENCLOSED BY '"'
        LINES TERMINATED BY '\\r\\n'
        IGNORE 1 LINES
        (   
            general_plant_id, watering, watering_general_benchmark, sunlight,soil,
            drought_tolerant, salt_tolerant, pruning_month,pruning_count,
            pest_susceptibility, flowers_detail, harvest_season, growth_rate,maintenance,
            care_level, watering_guide, sunlight_guide, pruning_guide
        );
        """
        
        # 执行命令
        cursor.execute(load_data_query)
        connection.commit()  # 提交事务
        
        print(f"数据导入成功！影响了 {cursor.rowcount} 行。")
        
except Error as e:
    print(f"执行过程中发生错误：{e}")
    
finally:
    # 关闭连接
    if connection.is_connected():
        cursor.close()
        connection.close()
        print("MySQL 连接已关闭。")

成功连接到 MySQL 服务器
数据导入成功！影响了 484 行。
MySQL 连接已关闭。


## 验证导入结果

In [20]:
# 在同一个连接会话中，或者在新的连接中执行
try:
    connection = mysql.connector.connect(**db_config)
    cursor = connection.cursor()
    
    cursor.execute("SELECT COUNT(*) FROM Table03_GeneralPlantCareGuideTable")
    row_count = cursor.fetchone()[0]
    print(f"表中现有 {row_count} 行数据")
    
    # 查看前几行数据
    cursor.execute("SELECT * FROM Table03_GeneralPlantCareGuideTable LIMIT 5")
    rows = cursor.fetchall()
    for row in rows:
        print(row)
        
except Error as e:
    print(f"查询过程中发生错误：{e}")
finally:
    if connection.is_connected():
        cursor.close()
        connection.close()

表中现有 494 行数据
(1, 'Frequent', 'At least once "7-10" days', '["full sun"]', '[]', 'False', 'False', '["February", "March", "April"]', '', '[]', '', '', 'High', '', 'Medium', "European Silver Fir 'Abies Alba' should be watered about once a week, depending on the season and weather conditions. During the growing season (spring to mid-summer), water deeply and thoroughly. For the rest of the year, water only when the soil is dry. When watering, ensure that the root system is saturated. Do not over-water, as this can lead to root rot.", "European Silver Fir 'Abies Alba' is a species of coniferous tree that grows best in sunny areas. This species of plant requires a minimum of 6 hours of sunlight per day throughout the growing season. Ideally, it should receive up to 8 hours of direct sunlight in the summer months, and 4-5 hours of sunlight in the winter. It is important to note that European Silver Fir 'Abies Alba' is not tolerant of shade and may be damaged if exposed to too much shade.", '