## Cell 1 — 配置（范围、输出目录）

In [1]:
# === Config (single key) ===
# Everything a reader may want to tune lives in this cell.
import os
from getpass import getpass
from pathlib import Path

# The API key must never be committed with the notebook. It is read from the
# PERENUAL_API_KEY environment variable; when that is unset (or empty) the
# user is asked for it through a hidden interactive prompt instead.
API_KEY = (os.environ.get("PERENUAL_API_KEY") or getpass("Perenual API key: ")).strip()

# Explicit check instead of `assert`, which is stripped when Python runs with -O.
if not API_KEY.startswith("sk-"):
    raise ValueError("请在环境变量 PERENUAL_API_KEY 中填写有效的 key")

# Inclusive range of species ids to download.
START_ID = 1
END_ID   = 3000

# Rate limiting and retries.
SLEEP_BETWEEN = 1      # base pause (seconds) after each processed id
MAX_RETRIES   = 5      # retries per id for transient failures
BACKOFF_BASE  = 1.6    # base of the exponential backoff delay

# Output directory and file-name template.
OUT_DIR = Path("01_raw_data/03_hardiness_map")
FILENAME_PATTERN = "plant_species_hardiness_map_{species_id}.html"
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Only the tail of the key is echoed so the full secret never reaches the output.
print(f"Key: ****{API_KEY[-6:]}, 范围: {START_ID}~{END_ID}, 输出目录: {OUT_DIR.resolve()}")

Key: ****b12090, 范围: 1~3000, 输出目录: E:\05_YZH_DS\02_Monash_DS\2025_S2_FIT5120_Industry_Experience_Studio_Project\06_main_project\03_github_submission\03_github_submission\2025-08-SDG13-Plant-X-Website\01_data_wrangling\01_raw_data\03_hardiness_map


## Cell 2 — 工具函数

In [2]:
import time
import requests

def save_html(path: Path, html_text: str):
    """Write ``html_text`` to ``path`` as UTF-8, creating missing parent directories.

    An existing file at ``path`` is overwritten.
    """
    target_dir = path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as handle:
        handle.write(html_text)

def build_filepath(species_id: int) -> Path:
    """Return the output path for one species.

    The file name is FILENAME_PATTERN with ``species_id`` substituted, and the
    result is located directly inside OUT_DIR.
    """
    file_name = FILENAME_PATTERN.format(species_id=species_id)
    return OUT_DIR.joinpath(file_name)

## Cell 3 — 单条抓取函数（含重试、指数退避、429/5xx 处理）

In [3]:
def fetch_species_hardiness_map_html(species_id: int, api_key: str) -> str | None:
    """
    Fetch the hardiness-map HTML for a single species.

    HTTP 429, HTTP 500/502/503/504 and network errors are retried up to
    MAX_RETRIES times, sleeping ``BACKOFF_BASE ** attempt + 0.2 * attempt``
    seconds before each retry.

    Args:
        species_id: Value sent as the ``species_id`` query parameter.
        api_key: Value sent as the ``key`` query parameter. It is masked in
            any network-error message printed by this function.

    Returns:
        * The response body on HTTP 200.
        * An HTML-comment placeholder on HTTP 404, on any other unexpected
          status, or once the retries are exhausted.
        * ``None`` to tell the caller to stop early: a non-200 response
          reports ``x-ratelimit-remaining`` <= 0, or three 429 responses
          arrive back to back.
    """
    base_url = "https://perenual.com/api/hardiness-map"
    params = {"species_id": species_id, "size": "og", "key": api_key}
    attempt = 0
    consecutive_429 = 0

    def backoff_delay(n: int) -> float:
        # Exponential backoff plus a small linear term.
        return (BACKOFF_BASE ** n) + 0.2 * n

    while attempt <= MAX_RETRIES:
        try:
            resp = requests.get(
                base_url,
                params=params,
                headers={"accept": "text/html"},
                timeout=30
            )

            # A successful body is returned first, so that the response which
            # happens to use up the last unit of quota is not thrown away.
            if resp.status_code == 200:
                return resp.text

            remaining = resp.headers.get("x-ratelimit-remaining")
            if remaining is not None:
                try:
                    if int(remaining) <= 0:
                        print(f"[ID {species_id}] x-ratelimit-remaining=0 → 安全早停。")
                        return None
                except ValueError:
                    # Header value is not an integer: ignore it and fall
                    # through to the status-code handling.
                    pass

            if resp.status_code == 404:
                print(f"[ID {species_id}] 404：该物种暂无 hardiness map。")
                return f"<!-- missing hardiness map for species_id={species_id} -->"

            if resp.status_code == 429:
                consecutive_429 += 1
                if consecutive_429 >= 3:
                    print(f"[ID {species_id}] 连续 429 次，推断配额/IP 限制 → 安全早停。")
                    return None
                wait = backoff_delay(attempt)
                print(f"[ID {species_id}] 429 Too Many Requests，等待 {wait:.1f}s 后重试。")
                time.sleep(wait)
                attempt += 1
                continue

            # Anything other than a 429 breaks the run of consecutive 429s.
            consecutive_429 = 0

            if resp.status_code in (500, 502, 503, 504):
                wait = backoff_delay(attempt)
                print(f"[ID {species_id}] {resp.status_code} 服务端错误，等待 {wait:.1f}s 重试。")
                time.sleep(wait)
                attempt += 1
                continue

            print(f"[ID {species_id}] HTTP {resp.status_code}: {resp.text[:200]}")
            return f"<!-- error status={resp.status_code} for species_id={species_id} -->"

        except requests.RequestException as e:
            consecutive_429 = 0
            # The exception text can contain the request URL, whose query
            # string carries the API key; mask it before printing. The guard
            # avoids str.replace("", ...) which would corrupt the message.
            detail = str(e)
            if api_key:
                detail = detail.replace(api_key, "****")
            wait = backoff_delay(attempt)
            print(f"[ID {species_id}] 网络异常 {type(e).__name__}：{detail}，等待 {wait:.1f}s 重试。")
            time.sleep(wait)
            attempt += 1

    print(f"[ID {species_id}] 重试耗尽。")
    return f"<!-- retries exhausted for species_id={species_id} -->"

## Cell 4 — 主循环（断点续跑、跳过已有文件、智能停机）

In [4]:
from tqdm.auto import tqdm
import random

# Main download loop: resumable (existing files are skipped) and stops as soon
# as the fetch function signals a quota / rate-limit condition by returning None.
downloaded = 0
skipped = 0
failed_ids = []

# Placeholders that fetch_species_hardiness_map_html returns for failures which
# may succeed on a later run. They are deliberately NOT written to disk:
# a saved file would make the `fp.exists()` check skip that id on every
# subsequent run, so the failure would never be retried.
# The 404 placeholder is still saved, since the fetch function reports that
# case as "no hardiness map for this species".
RETRYABLE_PLACEHOLDER_PREFIXES = ("<!-- error status=", "<!-- retries exhausted")

for species_id in tqdm(range(START_ID, END_ID + 1), desc="Downloading Hardiness Maps"):
    fp = build_filepath(species_id)
    if fp.exists():
        skipped += 1
        continue

    html_text = fetch_species_hardiness_map_html(species_id, API_KEY)
    if html_text is None:
        print("推断配额/限流 → 安全早停。")
        break

    if html_text.startswith(RETRYABLE_PLACEHOLDER_PREFIXES):
        failed_ids.append(species_id)
    else:
        save_html(fp, html_text)
        downloaded += 1

    # Pause with a little random jitter after every request, failed or not.
    time.sleep(SLEEP_BETWEEN + random.uniform(0.0, 0.6))

print(f"新增下载 {downloaded} 个，跳过(已存在) {skipped} 个。输出目录：{OUT_DIR.resolve()}")
if failed_ids:
    print(f"未保存(下次运行将重试) {len(failed_ids)} 个，ID：{failed_ids}")

  from .autonotebook import tqdm as notebook_tqdm
Downloading Hardiness Maps:   0%|          | 0/3000 [00:00<?, ?it/s]

[ID 298] 429 Too Many Requests，等待 1.0s 后重试。
[ID 298] 429 Too Many Requests，等待 1.8s 后重试。


Downloading Hardiness Maps:  10%|▉         | 297/3000 [00:05<00:53, 51.00it/s]

[ID 298] 连续 429 次，推断配额/IP 限制 → 安全早停。
推断配额/限流 → 安全早停。
新增下载 0 个，跳过(已存在) 297 个。输出目录：E:\05_YZH_DS\02_Monash_DS\2025_S2_FIT5120_Industry_Experience_Studio_Project\06_main_project\03_github_submission\03_github_submission\2025-08-SDG13-Plant-X-Website\01_data_wrangling\01_raw_data\03_hardiness_map





In [5]:
# import shutil
# from google.colab import files

# shutil.make_archive("03_species_hardiness_map", 'zip', "01_raw_data/03_hardiness_map")
# files.download("03_species_hardiness_map.zip")