In [33]:
import pandas as pd
import re

def strip_prefix(value):
    """Drop a leading numeric code such as ``"3 - "`` or ``"2* = "`` from a cell.

    The prefix that is removed consists of optional leading whitespace, one or
    more digits, an optional ``*``, and a ``-`` or ``=`` separator (whitespace
    around the separator is consumed too). The remainder is stripped of
    surrounding whitespace.

    Missing values (anything ``pd.isna`` reports as null) are returned
    unchanged; every other value is converted with ``str`` first, so the
    result is always a string.
    """
    if not pd.isna(value):
        text = str(value)
        without_code = re.sub(r'^\s*\d+\*?\s*[-=]\s*', '', text)
        return without_code.strip()
    return value

# Read the raw collection-plan export.
df = pd.read_csv("01_raw_data/01_threatened-plant-living-collection-plan.csv")

# Raw CSV header -> snake_case column name.
# Several raw headers carry a trailing space, and one contains an embedded
# CRLF; the keys must match the file byte-for-byte or the column is silently
# left un-renamed.
rename_map = dict([
    ("Species Name", "species_name"),
    ("Common Name", "common_name"),
    ("Habit", "habit"),
    ("Soil", "soil"),
    ("Sun", "sun"),
    ("Cultivation Protocols ", "cultivation_protocols"),
    ("Propagation Protocols ", "propagation_protocols"),
    ("Propagation Methods", "propagation_methods"),
    ("Cultivation Requirements Subtotal", "cultivation_requirements_subtotal"),
    ("Germplasm Source", "germplasm_source"),
    ("Conservation Status", "conservation_status"),
    ("Germplasm Origin", "germplasm_origin"),
    ("Provenance", "provenance"),
    ("Additional Conservation\r\nbenefit", "additional_conservation_benefit"),
    ("Conservation Subtotal", "conservation_subtotal"),
    ("Additional Local Benefits Description (i.e., to City of Melbourne)",
     "additional_local_benefits_description_i_e_to_city_of_melbourne"),
    ("Additional Local Benefits Subtotal", "additional_local_benefits_subtotal"),
    ("Total Score", "total_score"),
    ("Horticultural Potential ", "horticultural_potential"),
    ("Total Score Including Hort Potential", "total_score_including_hort_potential"),
    ("Weed Rating", "weed_rating"),
    ("Priority species", "priority_species"),
])

# Apply the mapping; headers absent from the mapping keep their raw name.
df = df.rename(columns=rename_map)

In [34]:
import numpy as np

# Assign a sequential surrogate key (3001, 3002, ...) in file order.
df['threatened_plant_id'] = range(3001, 3001 + len(df))

# The LOAD DATA statements in this notebook declare LINES TERMINATED BY '\r\n',
# while DataFrame.to_csv defaults to os.linesep ('\n' on Linux/macOS). With
# mismatched terminators MySQL sees the whole file as a single line, skips it
# as the header, and imports nothing. Pin CRLF so the exported files load
# regardless of the platform that wrote them.
# NOTE: the `lineterminator` keyword requires pandas >= 1.5.
CSV_LINE_TERMINATOR = '\r\n'

# --- Table06: descriptive attributes -----------------------------------------
# Column order here must match the column list of the Table06 LOAD DATA statement.
table06 = df[['threatened_plant_id',
              'conservation_status',
              'provenance',
              'weed_rating',
              'habit',
              'germplasm_source',
              'additional_conservation_benefit',
              'additional_local_benefits_description_i_e_to_city_of_melbourne',
              'horticultural_potential']].copy()

# Shorten the two long source names to the names used in the exported CSV header.
table06 = table06.rename(columns={
    'additional_conservation_benefit': 'conservation_benefit',
    'additional_local_benefits_description_i_e_to_city_of_melbourne': 'local_benefits_description'
})

# Replace the literal string '0' with NaN; to_csv writes NaN as an empty field.
table06['conservation_benefit'] = table06['conservation_benefit'].replace('0', np.nan)

# Strip the leading numeric code (e.g. "3 - ") from the coded columns.
for col in ['provenance', 'conservation_status', 'conservation_benefit', 'germplasm_source', 'horticultural_potential']:
    table06[col] = table06[col].apply(strip_prefix)

table06.to_csv("02_wrangled_data/Table06_ThreatenedPlantDescriptionTable.csv",
               index=False, lineterminator=CSV_LINE_TERMINATOR)

# --- Table07: care guide -----------------------------------------------------
# Column order here must match the column list of the Table07 LOAD DATA statement.
table07 = df[['threatened_plant_id',
              'soil',
              'sun',
              'propagation_methods',
              'propagation_protocols',
              'cultivation_protocols']].copy()

table07 = table07.rename(columns={
    'propagation_protocols': 'propagation_level',
    'cultivation_protocols': 'cultivation_note'
})

# Strip the leading numeric code from the coded columns.
for col in ['propagation_level', 'cultivation_note']:
    table07[col] = table07[col].apply(strip_prefix)

table07.to_csv("02_wrangled_data/Table07_ThreatenedPlantCareGuideTable.csv",
               index=False, lineterminator=CSV_LINE_TERMINATOR)

## 执行 LOAD DATA LOCAL INFILE 命令上传到数据库 (Table06)

In [35]:
import mysql.connector
from mysql.connector import Error

# Database connection settings.
#
# SECURITY: the password is read from the PLANTX_DB_PASSWORD environment
# variable so that it never lives in the notebook or in version control. The
# credential that used to be embedded here must be treated as compromised and
# rotated. Host, user and database name can be overridden the same way and
# otherwise fall back to the project defaults.
import os
import warnings

db_config = {
    'host': os.environ.get('PLANTX_DB_HOST', 'database-plantx.cqz06uycysiz.us-east-1.rds.amazonaws.com'),
    'user': os.environ.get('PLANTX_DB_USER', 'zihan'),
    'password': os.environ.get('PLANTX_DB_PASSWORD', ''),
    'database': os.environ.get('PLANTX_DB_NAME', 'FIT5120_PlantX_Database'),
    'allow_local_infile': True,  # enables LOAD DATA LOCAL INFILE on the client side
    'use_pure': True  # use the pure-Python connector implementation
}

if not db_config['password']:
    # Fail loudly-but-late: the connection attempt will be rejected by the
    # server, and this warning tells the reader why.
    warnings.warn(
        "PLANTX_DB_PASSWORD is not set; the MySQL connection will be attempted "
        "with an empty password. Export the variable before starting Jupyter."
    )

# Bulk-load the Table06 CSV into MySQL.
#
# Both handles are initialised up front so the `finally` clause is safe even
# when connect() itself raises; otherwise the cleanup would hit an unbound
# (or stale, from an earlier cell) `connection` and mask the real error.
connection = None
cursor = None

try:
    # Open the connection.
    connection = mysql.connector.connect(**db_config)

    if connection.is_connected():
        print("成功连接到 MySQL 服务器")

        # Create a cursor.
        cursor = connection.cursor()

        # Build the LOAD DATA LOCAL INFILE statement.
        # - The CSV path is read on the client side (LOCAL); adjust it if the
        #   file lives elsewhere.
        # - The file must use CRLF line endings to match LINES TERMINATED BY.
        # - IGNORE 1 LINES skips the CSV header row.
        # - The column list must follow the column order of the CSV.
        load_data_query = """
        LOAD DATA LOCAL INFILE '02_wrangled_data/Table06_ThreatenedPlantDescriptionTable.csv'
        INTO TABLE Table06_ThreatenedPlantDescriptionTable
        CHARACTER SET utf8mb4
        FIELDS TERMINATED BY ',' 
        OPTIONALLY ENCLOSED BY '"'
        LINES TERMINATED BY '\\r\\n'
        IGNORE 1 LINES
        (
            threatened_plant_id, conservation_status, provenance, weed_rating, habit, germplasm_source,
            conservation_benefit, local_benefits_description, horticultural_potential
        );
        """

        # Run the statement and commit the transaction.
        cursor.execute(load_data_query)
        connection.commit()

        print(f"数据导入成功！影响了 {cursor.rowcount} 行。")

except Error as e:
    print(f"执行过程中发生错误：{e}")

finally:
    # Release resources only if they were actually acquired.
    if connection is not None and connection.is_connected():
        if cursor is not None:
            cursor.close()
        connection.close()
        print("MySQL 连接已关闭。")

成功连接到 MySQL 服务器
数据导入成功！影响了 195 行。
MySQL 连接已关闭。


## 验证导入结果 (Table06)

In [36]:
# Verify the Table06 import on a fresh connection: print the total row count
# and preview the first five rows.
#
# Both handles are initialised up front so the `finally` clause is safe even
# when connect() itself raises; otherwise the cleanup would hit an unbound
# (or stale, from an earlier cell) `connection` and mask the real error.
connection = None
cursor = None

try:
    connection = mysql.connector.connect(**db_config)
    cursor = connection.cursor()

    cursor.execute("SELECT COUNT(*) FROM Table06_ThreatenedPlantDescriptionTable")
    row_count = cursor.fetchone()[0]
    print(f"表中现有 {row_count} 行数据")

    # Preview the first few rows.
    cursor.execute("SELECT * FROM Table06_ThreatenedPlantDescriptionTable LIMIT 5")
    rows = cursor.fetchall()
    for row in rows:
        print(row)

except Error as e:
    print(f"查询过程中发生错误：{e}")
finally:
    # Release resources only if they were actually acquired.
    if connection is not None and connection.is_connected():
        if cursor is not None:
            cursor.close()
        connection.close()

表中现有 200 行数据
(3001, 'Critically Endangered', 'Indigenous to City of Melbourne', 'Low Risk', 'Annual', 'Currently held limited propagation material', 'Bushfire Recovery', 'Culturally Important to Traditional owners / and or local community', 'Plant with some ornamental attributes (form/flowers/fruit etc.) at various times of the year and could be reasonably incorporated into mixed plantings')
(3002, 'Critically Endangered', 'Indigenous to City of Melbourne', 'Low Risk', 'Annual', 'Currently held in collection', 'Bushfire Recovery', 'Culturally Important to Traditional owners / and or local community', 'Plant with some ornamental attributes (form/flowers/fruit etc.) at various times of the year and could be reasonably incorporated into mixed plantings')
(3003, 'Endangered', 'Indigenous to City of Melbourne', 'Medium Risk', 'Herbaceous Perennial', 'Not held in collection but easily obtained', '', 'Culturally Important to Traditional owners / and or local community, Suited to Green infrast

## 执行 LOAD DATA LOCAL INFILE 命令上传到数据库 (Table07)

In [37]:
# Bulk-load the Table07 CSV into MySQL.
#
# Both handles are initialised up front so the `finally` clause is safe even
# when connect() itself raises; otherwise the cleanup would hit an unbound
# (or stale, from an earlier cell) `connection` and mask the real error.
connection = None
cursor = None

try:
    # Open the connection.
    connection = mysql.connector.connect(**db_config)

    if connection.is_connected():
        print("成功连接到 MySQL 服务器")

        # Create a cursor.
        cursor = connection.cursor()

        # Build the LOAD DATA LOCAL INFILE statement.
        # - The CSV path is read on the client side (LOCAL); adjust it if the
        #   file lives elsewhere.
        # - The file must use CRLF line endings to match LINES TERMINATED BY.
        # - IGNORE 1 LINES skips the CSV header row.
        # - The column list must follow the column order of the CSV.
        load_data_query = """
        LOAD DATA LOCAL INFILE '02_wrangled_data/Table07_ThreatenedPlantCareGuideTable.csv'
        INTO TABLE Table07_ThreatenedPlantCareGuideTable
        CHARACTER SET utf8mb4
        FIELDS TERMINATED BY ',' 
        OPTIONALLY ENCLOSED BY '"'
        LINES TERMINATED BY '\\r\\n'
        IGNORE 1 LINES
        (
            threatened_plant_id, soil, sun, propagation_methods, propagation_level, cultivation_note
        );
        """

        # Run the statement and commit the transaction.
        cursor.execute(load_data_query)
        connection.commit()

        print(f"数据导入成功！影响了 {cursor.rowcount} 行。")

except Error as e:
    print(f"执行过程中发生错误：{e}")

finally:
    # Release resources only if they were actually acquired.
    if connection is not None and connection.is_connected():
        if cursor is not None:
            cursor.close()
        connection.close()
        print("MySQL 连接已关闭。")

成功连接到 MySQL 服务器
数据导入成功！影响了 190 行。
MySQL 连接已关闭。


## 验证导入结果 (Table07)

In [38]:
# Verify the Table07 import on a fresh connection: print the total row count
# and preview the first five rows.
#
# Both handles are initialised up front so the `finally` clause is safe even
# when connect() itself raises; otherwise the cleanup would hit an unbound
# (or stale, from an earlier cell) `connection` and mask the real error.
connection = None
cursor = None

try:
    connection = mysql.connector.connect(**db_config)
    cursor = connection.cursor()

    cursor.execute("SELECT COUNT(*) FROM Table07_ThreatenedPlantCareGuideTable")
    row_count = cursor.fetchone()[0]
    print(f"表中现有 {row_count} 行数据")

    # Preview the first few rows.
    cursor.execute("SELECT * FROM Table07_ThreatenedPlantCareGuideTable LIMIT 5")
    rows = cursor.fetchall()
    for row in rows:
        print(row)

except Error as e:
    print(f"查询过程中发生错误：{e}")
finally:
    # Release resources only if they were actually acquired.
    if connection is not None and connection.is_connected():
        if cursor is not None:
            cursor.close()
        connection.close()

表中现有 200 行数据
(3001, 'Free Draining, Moderately Draining', 'Full Sun', 'Seed/Cuttings', 'Can be propagated - but requires significant time/effort', 'Difficult (but not impossible) to cultivate - short lived or has specific requirements for cultivation such as misting or mycorrhizal associates or pot culture')
(3002, 'Free Draining, Moderately Draining', 'Full Sun', 'Seed', 'Difficult to propagate', 'Difficult (but not impossible) to cultivate - short lived or has specific requirements for cultivation such as misting or mycorrhizal associates or pot culture')
(3003, 'Free Draining', 'Full Sun', 'Seed/Cuttings', 'Easily propagated', 'Can be cultivated with specific growing conditions eg moist well drained soils  - would need to be replaced within 5 years to maintain high quality plant')
(3004, 'Free Draining', 'Full Sun, Part Shade', 'Seed', 'Easily propagated', 'Can be cultivated and is reasonably  tolerant of a range of garden situations/conditions - Longer lived 5+ years')
(3005, 'Free