In [4]:
import os
import json
import pandas as pd
from glob import glob

# Paths (relative, so they resolve against the notebook's working directory)
# html_dir: directory searched for the per-plant hardiness-map HTML files.
# output_path: CSV file that receives the assembled distribution-map table.
html_dir = "01_raw_data/03_hardiness_map"
output_path = "02_wrangled_data/Table04_GeneralPlantDistributionMapTable.csv"

In [5]:
# Extract distribution map info: pair each hardiness-map HTML file's content
# with the plant ID encoded in its file name.
html_files = glob(os.path.join(html_dir, "plant_species_hardiness_map_*.html"))

records = []
for html_file in html_files:
    basename = os.path.basename(html_file)
    try:
        # The plant ID is the text after the last "_" and before the first "." that follows.
        id_token = basename.split("_")[-1].split(".")[0]
        general_plant_id = int(id_token)

        # Read the whole HTML document as UTF-8 text.
        with open(html_file, 'r', encoding='utf-8') as handle:
            html_content = handle.read()
    except (ValueError, IndexError) as e:
        # Report the offending file and carry on with the remaining ones.
        print(f"跳过格式错误的文件: {basename}, 错误: {e}")
        continue

    # Store the HTML content itself (not the file path) next to the plant ID.
    records.append({
        "general_plant_id": general_plant_id,
        "distribution_map_html": html_content,
    })

In [6]:
# Build the table and sort it by plant ID.
# The columns are named explicitly so that an empty `records` list still yields
# a frame containing "general_plant_id"; without this, sort_values would raise
# KeyError when no HTML file was parsed.
df = pd.DataFrame(records, columns=["general_plant_id", "distribution_map_html"])
df = df.sort_values(by="general_plant_id").reset_index(drop=True)
# Store the IDs with pandas' nullable integer dtype.
df["general_plant_id"] = pd.to_numeric(df["general_plant_id"], errors="coerce").astype("Int64")

# Save to CSV, creating the destination directory first when the path has one.
# os.makedirs("") raises FileNotFoundError, so a bare file name skips this step.
output_dir = os.path.dirname(output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
df.to_csv(output_path, index=False, encoding='utf-8')

print(f"成功处理 {len(records)} 个HTML文件，已保存到 {output_path}")

成功处理 495 个HTML文件，已保存到 02_wrangled_data/Table04_GeneralPlantDistributionMapTable.csv
