In [1]:
import os
import json
import pandas as pd
from glob import glob

# Paths (relative to the notebook's working directory)
# Directory scanned for the per-plant hardiness-map HTML files.
html_dir = "01_raw_data/03_hardiness_map"
# Destination CSV: one row per plant, holding its ID and the raw HTML of its map.
output_path = "02_wrangled_data/Table04_GeneralPlantDistributionMapTable.csv"

In [2]:
# Gather every hardiness-map HTML file together with the plant ID encoded in its name.
html_files = glob(os.path.join(html_dir, "plant_species_hardiness_map_*.html"))

records = []
for html_path in html_files:
    basename = os.path.basename(html_path)
    try:
        # The plant ID is the text after the last "_" and before the first "." that follows.
        id_token = basename.rsplit("_", 1)[-1].split(".", 1)[0]
        general_plant_id = int(id_token)

        # Load the page markup; the table stores the HTML itself, not the file path.
        with open(html_path, 'r', encoding='utf-8') as html_file:
            html_content = html_file.read()
    except (ValueError, IndexError) as e:
        # Files whose name does not yield an integer ID (or that fail UTF-8 decoding,
        # which is a ValueError subclass) are reported and skipped.
        print(f"跳过格式错误的文件: {basename}, 错误: {e}")
        continue

    # Reached only when both the ID parse and the read succeeded.
    records.append({
        "general_plant_id": general_plant_id,
        "distribution_map_html": html_content
    })

In [3]:
# Build the table with an explicit schema: with an empty `records` list a bare
# pd.DataFrame(records) has no columns, and sort_values would fail with a KeyError.
df = pd.DataFrame(records, columns=["general_plant_id", "distribution_map_html"])

# Use the nullable integer dtype for the ID, then order rows by it.
df["general_plant_id"] = pd.to_numeric(df["general_plant_id"], errors="coerce").astype("Int64")
df = df.sort_values(by="general_plant_id").reset_index(drop=True)

# Save to CSV. Rows are terminated with CRLF, which is what the later
# LOAD DATA statement declares via LINES TERMINATED BY '\r\n'.
output_dir = os.path.dirname(output_path)
if output_dir:
    # os.makedirs("") raises, so only create a directory when the path has one.
    os.makedirs(output_dir, exist_ok=True)
df.to_csv(output_path, index=False, encoding='utf-8', lineterminator='\r\n')

print(f"成功处理 {len(records)} 个HTML文件，已保存到 {output_path}")

成功处理 593 个HTML文件，已保存到 02_wrangled_data/Table04_GeneralPlantDistributionMapTable.csv


## 验证：使用 Python 创建临时 HTML 文件并打开

In [43]:
import os
import tempfile
import webbrowser
from pathlib import Path

import pandas as pd

# Re-read the CSV from disk so the check exercises the written file itself.
csv_path = "02_wrangled_data/Table04_GeneralPlantDistributionMapTable.csv"
df = pd.read_csv(csv_path)

# Pick the row at positional index 77 (zero-based, i.e. the 78th data row).
sample_row = df.iloc[77]
plant_id = sample_row['general_plant_id']
html_content = sample_row['distribution_map_html']

# delete=False keeps the file on disk after the handle is closed so the browser can read it.
with tempfile.NamedTemporaryFile(mode='w', suffix='.html', delete=False, encoding='utf-8') as f:
    f.write(html_content)
    temp_file = f.name

try:
    print(f"已创建临时文件: {temp_file}")

    # Path.as_uri() yields a well-formed, percent-encoded file URI on every platform;
    # concatenating 'file://' with a Windows path produces a malformed URL.
    webbrowser.open(Path(temp_file).resolve().as_uri())

    # Keep the file until the user has finished looking at it.
    input("按回车键删除临时文件...")
finally:
    # Remove the temp file even if the prompt is interrupted or unavailable.
    os.unlink(temp_file)

已创建临时文件: C:\Users\zyyin1\AppData\Local\Temp\tmpjib8lhke.html


In [4]:
# Verify the CSV row terminator by looking at the header line only.
# Searching the whole file for b'\r\n' can match a CRLF embedded in a quoted HTML
# field and report success for an LF-terminated file; the header row contains just
# the two column names, so its ending reflects the writer's row terminator.
with open(output_path, 'rb') as f:
    # In binary mode readline() stops after the first b'\n' (or at EOF).
    header_line = f.readline()

if header_line.endswith(b'\r\n'):
    print("✅ CSV 文件使用 CRLF (\\r\\n) 行终止符")
elif header_line.endswith(b'\n'):
    print("❌ CSV 文件使用 LF (\\n) 行终止符")
else:
    # Empty file, or a single line without any terminator.
    print("❓ 无法确定行终止符类型")

✅ CSV 文件使用 CRLF (\r\n) 行终止符


In [5]:
# Dump the first 500 bytes of the CSV so the line terminators can be inspected by eye.
with open(output_path, 'rb') as csv_file:
    first_chunk = csv_file.read(500)

print("文件开头部分（十六进制）：")
print(first_chunk.hex())

print("\n文件开头部分（ASCII，不可见字符显示为转义序列）：")
# Swap each CR / LF byte for its two-character escape so it shows up in the printout.
readable = first_chunk
for control_byte, escape_text in ((b'\r', br'\r'), (b'\n', br'\n')):
    readable = readable.replace(control_byte, escape_text)
# errors='ignore' drops a multi-byte character that the 500-byte cut may have split.
print(readable.decode('utf-8', errors='ignore'))

文件开头部分（十六进制）：
67656e6572616c5f706c616e745f69642c646973747269627574696f6e5f6d61705f68746d6c0d0a312c223c21444f43545950452068746d6c3e0a3c68746d6c206c616e673d2222656e22223e0a202020203c686561643e0a0a20202020202020203c212d2d203630202d2d3e0a20202020202020200a20202020202020203c6d657461206e616d653d2222726f626f7473222220636f6e74656e743d22226e6f696e6465782c206e6f666f6c6c6f7722223e0a0a20202020202020203c212d2d204c69766577697265205374796c6573202d2d3e3c7374796c65203e5b776972655c3a6c6f6164696e675d5b776972655c3a6c6f6164696e675d2c205b776972655c3a6c6f6164696e675c2e64656c61795d5b776972655c3a6c6f6164696e675c2e64656c61795d2c205b776972655c3a6c6f6164696e675c2e696e6c696e652d626c6f636b5d5b776972655c3a6c6f6164696e675c2e696e6c696e652d626c6f636b5d2c205b776972655c3a6c6f6164696e675c2e696e6c696e655d5b776972655c3a6c6f6164696e675c2e696e6c696e655d2c205b776972655c3a6c6f6164696e675c2e626c6f636b5d5b776972655c3a6c6f6164696e675c2e626c6f636b5d2c205b776972655c3a6c6f6164696e675c2e666c65785d5b776972655c3a6c6f6164696e675c2e666c65

## 执行 LOAD DATA LOCAL INFILE 命令上传到数据库

In [6]:
import getpass
import os

import mysql.connector
from mysql.connector import Error

# Database connection settings.
# The password is never stored in the notebook: it is read from the
# PLANTX_DB_PASSWORD environment variable, or asked for interactively.
# Host, user and schema can be overridden through the environment as well.
# NOTE(review): a password was previously committed in this cell; treat it as
# leaked and rotate it.
db_config = {
    'host': os.environ.get('PLANTX_DB_HOST', 'database-plantx.cqz06uycysiz.us-east-1.rds.amazonaws.com'),
    'user': os.environ.get('PLANTX_DB_USER', 'zihan'),
    'password': os.environ.get('PLANTX_DB_PASSWORD') or getpass.getpass("请输入 MySQL 密码: "),
    'database': os.environ.get('PLANTX_DB_NAME', 'FIT5120_PlantX_Database'),
    'allow_local_infile': True,  # required for LOAD DATA LOCAL INFILE
    'use_pure': True  # use the pure-Python connector implementation
}

# Bind both names up front so the cleanup in `finally` never touches an unbound
# (or stale, from an earlier cell run) object when connect() itself fails.
connection = None
cursor = None

try:
    # Open the connection
    connection = mysql.connector.connect(**db_config)

    if connection.is_connected():
        print("成功连接到 MySQL 服务器")

        cursor = connection.cursor()

        # Optional: empty the table before loading (left disabled on purpose).
        # truncate_query = "TRUNCATE TABLE Table04_GeneralPlantDistributionMapTable;"
        # cursor.execute(truncate_query)
        # print("已清空表中原有数据。")

        # Bulk-load the CSV. The file path below must point at the CSV written
        # earlier in this notebook. '\\r\\n' reaches MySQL as the escape sequence
        # \r\n, matching the CRLF row terminator the CSV was written with.
        # NOTE(review): with LOCAL and no REPLACE/IGNORE modifier, MySQL documents
        # duplicate-key rows as being skipped rather than raising; that could
        # explain a reported row count lower than the CSV row count — confirm.
        load_data_query = """
        LOAD DATA LOCAL INFILE '02_wrangled_data/Table04_GeneralPlantDistributionMapTable.csv'
        INTO TABLE Table04_GeneralPlantDistributionMapTable
        CHARACTER SET utf8mb4
        FIELDS TERMINATED BY ','
        OPTIONALLY ENCLOSED BY '"'
        LINES TERMINATED BY '\\r\\n'
        IGNORE 1 LINES
        (
            general_plant_id, distribution_map_html
        );
        """

        cursor.execute(load_data_query)
        connection.commit()  # commit the transaction

        print(f"数据导入成功！影响了 {cursor.rowcount} 行。")

except Error as e:
    print(f"执行过程中发生错误：{e}")

finally:
    # Release resources only if they were actually created.
    if connection is not None and connection.is_connected():
        if cursor is not None:
            cursor.close()
        connection.close()
        print("MySQL 连接已关闭。")

成功连接到 MySQL 服务器
数据导入成功！影响了 98 行。
MySQL 连接已关闭。


## 验证导入结果

In [7]:
# Open a new connection with the notebook-level db_config and inspect the table.
# Both names are bound first so that `finally` is safe when connect() fails.
connection = None
cursor = None
try:
    connection = mysql.connector.connect(**db_config)
    cursor = connection.cursor()

    # Total number of rows currently in the table.
    cursor.execute("SELECT COUNT(*) FROM Table04_GeneralPlantDistributionMapTable")
    row_count = cursor.fetchone()[0]
    print(f"表中现有 {row_count} 行数据")

    # Peek at the first few rows.
    cursor.execute("SELECT * FROM Table04_GeneralPlantDistributionMapTable LIMIT 5")
    rows = cursor.fetchall()
    for row in rows:
        print(row)

except Error as e:
    print(f"查询过程中发生错误：{e}")
finally:
    # Release resources only if they were actually created.
    if connection is not None and connection.is_connected():
        if cursor is not None:
            cursor.close()
        connection.close()

表中现有 593 行数据
(1, '<!DOCTYPE html>\n<html lang="en">\n    <head>\n\n        <!-- 60 -->\n        \n        <meta name="robots" content="noindex, nofollow">\n\n        <!-- Livewire Styles --><style >[wire:loading][wire:loading], [wire:loading.delay][wire:loading.delay], [wire:loading.inline-block][wire:loading.inline-block], [wire:loading.inline][wire:loading.inline], [wire:loading.block][wire:loading.block], [wire:loading.flex][wire:loading.flex], [wire:loading.table][wire:loading.table], [wire:loading.grid][wire:loading.grid], [wire:loading.inline-flex][wire:loading.inline-flex] {display: none;}[wire:loading.delay.none][wire:loading.delay.none], [wire:loading.delay.shortest][wire:loading.delay.shortest], [wire:loading.delay.shorter][wire:loading.delay.shorter], [wire:loading.delay.short][wire:loading.delay.short], [wire:loading.delay.default][wire:loading.delay.default], [wire:loading.delay.long][wire:loading.delay.long], [wire:loading.delay.longer][wire:loading.delay.longer], [wire:l

## 验证数据库中 HTML 内容的可复现性  
从数据库提取 HTML 并保存为文件测试。

In [8]:
import mysql.connector
from mysql.connector import Error
import tempfile
import os
import webbrowser

# 数据库连接配置
def test_html_from_database(plant_id):
    """从数据库提取指定植物的 HTML 并在浏览器中测试"""
    try:
        connection = mysql.connector.connect(**db_config)
        cursor = connection.cursor()
        
        # 查询指定植物的 HTML 内容
        query = """
            SELECT 
                distribution_map_html 
            FROM 
                Table04_GeneralPlantDistributionMapTable 
            WHERE 
                general_plant_id = %s
        """
        cursor.execute(query, (plant_id,))
        result = cursor.fetchone()
        
        if result:
            html_content = result[0]
            
            # 创建临时 HTML 文件
            with tempfile.NamedTemporaryFile(mode='w', suffix='.html', delete=False, encoding='utf-8') as f:
                f.write(html_content)
                temp_file = f.name
            
            print(f"已创建临时文件: {temp_file}")
            
            # 在默认浏览器中打开
            webbrowser.open('file://' + os.path.realpath(temp_file))
            
            # 等待用户确认
            input(f"请检查植物 ID {plant_id} 的分布图是否正常显示，按回车继续...")
            
            # 删除临时文件
            os.unlink(temp_file)
            print(f"已删除临时文件: {temp_file}")
        else:
            print(f"未找到植物 ID {plant_id} 的分布图数据")
            
    except Error as e:
        print(f"查询过程中发生错误：{e}")
    finally:
        if connection.is_connected():
            cursor.close()
            connection.close()

# Spot-check a single plant ID; add further calls here to test other IDs.
test_html_from_database(77)

已创建临时文件: C:\Users\zyyin1\AppData\Local\Temp\tmp4pptz27m.html
已删除临时文件: C:\Users\zyyin1\AppData\Local\Temp\tmp4pptz27m.html
