## Cell 1 — 配置（多 Key、范围、输出目录）

In [None]:
# === Config ===
import os
from pathlib import Path

# API keys.
#   Option A: list them directly in the fallback list below (one string per
#             line, each followed by a comma).
#   Option B: set the PERENUAL_API_KEYS environment variable to a
#             comma-separated list; when it yields at least one non-empty
#             key it takes precedence over the list below.
# NOTE(review): a literal key stored in the notebook is readable by anyone who
# can see this file — prefer option B, and consider rotating the key below.
API_KEYS = [
    key.strip()
    for key in os.environ.get("PERENUAL_API_KEYS", "").split(",")
    if key.strip()
] or [
    "sk-TRjA68b0485ed0d8412092",
]

# Inclusive range of species IDs to download.
START_ID = 1
END_ID   = 3000

# Daily soft cap per key (original note: the free tier allows 100/day, and a
# value below 100 is suggested to stay clear of the limit).
PER_KEY_SOFT_CAP = 100

# Throttle: seconds to pause after each request (raise to 2.0+ if unstable).
SLEEP_BETWEEN = 1

# Retry settings: number of retries and the base of the exponential backoff.
MAX_RETRIES   = 5
BACKOFF_BASE  = 1.6

# Output directory and filename pattern (this task stores HTML files).
OUT_DIR = Path("data/01_raw_datasets/03_species_hardiness_map")
FILENAME_PATTERN = "plant_species_hardiness_map_{species_id}.html"
OUT_DIR.mkdir(parents=True, exist_ok=True)

print(f"使用 {len(API_KEYS)} 个 API Key，ID范围: {START_ID}～{END_ID}，输出目录: {OUT_DIR.resolve()}")


使用 1 个 API Key，ID范围: 1～3000，输出目录: E:\05_YZH_DS\02_Monash_DS\2025_S2_FIT5120_Industry_Experience_Studio_Project\06_main_project\02_asmt_draft\data\species_details


## Cell 2 — Key 轮换管理与工具函数

In [None]:
import time
import json
from typing import Optional, Dict, Any
import requests

class KeyManager:
    """
    Rotate between several API keys while counting how often each was used.

    A key is considered usable while it has not been flagged as exhausted and
    its usage count is still below ``per_key_soft_cap``. Callers can flag a key
    explicitly (for example after seeing ``x-ratelimit-remaining`` reach 0)
    via ``set_exhausted``.

    Attributes:
        keys: the keys, in rotation order.
        n: number of keys.
        per_key_soft_cap: usage count at which a key is treated as used up.
        idx: index into ``keys`` of the key currently in rotation.
        usage: per-key usage counter.
        mark_exhausted: per-key flag; True once the key must not be used.
        consecutive_429_keys: distinct keys that were answered with HTTP 429;
            maintained by callers to infer account/IP-level throttling.
    """

    def __init__(self, keys, per_key_soft_cap: int = 100):
        self.keys = list(keys)
        self.n = len(self.keys)
        self.per_key_soft_cap = per_key_soft_cap
        self.idx = 0
        self.usage = {k: 0 for k in self.keys}
        self.mark_exhausted = {k: False for k in self.keys}
        self.consecutive_429_keys = set()

    def _usable(self, key: str) -> bool:
        """Return True while *key* is neither flagged nor at its soft cap."""
        return not self.mark_exhausted[key] and self.usage[key] < self.per_key_soft_cap

    def current(self) -> Optional[str]:
        """Return the first usable key starting at ``idx``, or None if there is none.

        ``idx`` is advanced past every unusable key that is inspected, so it
        ends up pointing at the returned key.
        """
        for _ in range(self.n):
            k = self.keys[self.idx]
            if self._usable(k):
                return k
            self.idx = (self.idx + 1) % self.n
        return None

    def rotate(self) -> None:
        """Move to the next key in rotation order (no-op when there are no keys)."""
        # Guard the modulo: with an empty key list `% self.n` would raise
        # ZeroDivisionError, while current()/all_exhausted() already cope.
        if self.n:
            self.idx = (self.idx + 1) % self.n

    def add_usage(self, key: str, cnt: int = 1) -> None:
        """Add *cnt* to the usage of *key*; flag it once the soft cap is reached."""
        self.usage[key] += cnt
        if self.usage[key] >= self.per_key_soft_cap:
            self.mark_exhausted[key] = True

    def set_exhausted(self, key: str) -> None:
        """Flag *key* as exhausted regardless of its usage count."""
        self.mark_exhausted[key] = True

    def all_exhausted(self) -> bool:
        """Return True when no key is usable (also True for an empty key list)."""
        return not any(self._usable(k) for k in self.keys)

def save_html(path: Path, html_text: str) -> None:
    """Write *html_text* to *path* as UTF-8 without ever leaving a partial file there.

    The text is first written to a sibling ``<name>.part`` file and then moved
    over *path*. The download loop uses the existence of *path* as its "already
    downloaded" marker, so a write that is interrupted midway must not leave a
    truncated file under the final name.

    Args:
        path: destination file; its parent directory must already exist.
        html_text: content to store.

    Raises:
        OSError: if the temporary file cannot be written or moved into place.
    """
    tmp_path = path.with_name(path.name + ".part")
    try:
        tmp_path.write_text(html_text, encoding="utf-8")
        tmp_path.replace(path)
    finally:
        # After a successful replace the temporary name is gone; this only
        # removes the leftover of a failed or interrupted write.
        if tmp_path.exists():
            tmp_path.unlink()

def build_filepath(species_id: int) -> Path:
    """Return the output path for *species_id*: FILENAME_PATTERN filled in, under OUT_DIR."""
    filename = FILENAME_PATTERN.format(species_id=species_id)
    return OUT_DIR / filename

## Cell 3 — 单条抓取函数（含重试、指数退避、429/5xx 处理）

In [None]:
def _mask_key(key: str) -> str:
    """Return a log-safe form of an API key that shows only its last four characters."""
    return f"...{key[-4:]}" if len(key) > 4 else "****"


def _quota_spent(remaining_header: Optional[str]) -> bool:
    """Return True when an ``x-ratelimit-remaining`` header value parses to <= 0."""
    if remaining_header is None:
        return False
    try:
        return int(remaining_header) <= 0
    except ValueError:
        # An unparsable header carries no usable information; ignore it.
        return False


def _backoff_delay(attempt: int) -> float:
    """Return the wait in seconds before retry number *attempt* (exponential plus linear term)."""
    return (BACKOFF_BASE ** attempt) + 0.2 * attempt


def fetch_species_hardiness_map_html(species_id: int, key_manager: KeyManager) -> Optional[str]:
    """
    Fetch the hardiness-map HTML for one species, rotating keys and retrying.

    Args:
        species_id: value sent as the ``species_id`` query parameter.
        key_manager: supplies the key for each attempt and records usage,
            exhaustion and the keys that were answered with HTTP 429.

    Returns:
        - the response text on HTTP 200;
        - an HTML comment placeholder on HTTP 404;
        - an HTML comment plus the first 200 characters of the body for any
          other status that is not retried;
        - an HTML comment placeholder once more than ``MAX_RETRIES`` retries
          were consumed by 429 / 5xx responses or network errors;
        - None when no usable key is left, or when HTTP 429 was received on
          ``min(key_manager.n, 3)`` distinct keys without a served response
          in between (taken as account/IP-level throttling) — the caller is
          expected to stop.

    NOTE(review): the placeholder strings are ordinary return values, so a
    caller that persists every non-None result will persist them too.
    """
    base_url = "https://perenual.com/api/hardiness-map"
    params_template = {"species_id": species_id, "size": "og"}  # size follows the API example ("og")
    attempt = 0

    while attempt <= MAX_RETRIES:
        key = key_manager.current()
        if key is None:
            print(f"[ID {species_id}] 所有 Key 当日软上限已满或不可用，停止。")
            return None

        try:
            resp = requests.get(
                base_url,
                params={**params_template, "key": key},
                headers={"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"},
                timeout=30
            )

            # Read the remaining quota if the server reports it. The key is
            # flagged right away, but the response itself is still evaluated:
            # presumably the header describes what is left *after* this
            # request, so a 200/404 that came with it is a valid answer.
            quota_spent = _quota_spent(resp.headers.get("x-ratelimit-remaining"))
            if quota_spent:
                print(f"[{_mask_key(key)}] x-ratelimit-remaining=0 -> 标记该 Key 用尽。")
                key_manager.set_exhausted(key)

            status = resp.status_code

            if status == 200:
                key_manager.add_usage(key, 1)
                # A served request ends any streak of 429 responses.
                key_manager.consecutive_429_keys.clear()
                return resp.text

            if status == 404:
                key_manager.add_usage(key, 1)
                key_manager.consecutive_429_keys.clear()
                print(f"[ID {species_id}] 404：该物种可能暂无 hardiness map，保存占位 HTML。")
                return f"<!-- missing hardiness map for species_id={species_id} -->"

            if quota_spent:
                # Not a definitive answer and this key is used up: try the next
                # one. No attempt is consumed; the loop still terminates because
                # every pass through here removes one key from rotation.
                key_manager.rotate()
                continue

            if status == 429:
                key_manager.consecutive_429_keys.add(key)
                if len(key_manager.consecutive_429_keys) >= min(key_manager.n, 3):
                    print(f"[ID {species_id}] 多个不同 key 连续 429，推断为账户/IP 级配额受限——安全停机。")
                    return None
                wait = _backoff_delay(attempt)
                print(f"[ID {species_id}] 429 Too Many Requests，用 key={_mask_key(key)}，等待 {wait:.1f}s 后换 Key 重试。")
                time.sleep(wait)
                key_manager.rotate()
                attempt += 1
                continue

            if status in (500, 502, 503, 504):
                wait = _backoff_delay(attempt)
                print(f"[ID {species_id}] {status} 服务端错误，用 key={_mask_key(key)}，等待 {wait:.1f}s 重试。")
                time.sleep(wait)
                attempt += 1
                continue

            key_manager.add_usage(key, 1)
            print(f"[ID {species_id}] HTTP {status}: {resp.text[:200]}")
            # Unexpected status: still return a fragment of the page as a trace.
            return f"<!-- error status={status} for species_id={species_id} -->\n{resp.text[:200]}"

        except requests.RequestException as e:
            wait = _backoff_delay(attempt)
            # The exception text may quote the request URL, which carries the
            # key as a query parameter — mask it before printing.
            detail = str(e).replace(key, _mask_key(key))
            print(f"[ID {species_id}] 网络异常（{type(e).__name__}）：{detail}。等待 {wait:.1f}s 重试。")
            time.sleep(wait)
            attempt += 1

    print(f"[ID {species_id}] 重试耗尽，放弃。")
    return f"<!-- retries exhausted for species_id={species_id} -->"

## Cell 4 — 主循环（断点续跑、跳过已有文件、智能停机）

In [None]:
from tqdm.auto import tqdm
import random

# Fresh key manager for this run; its 429 bookkeeping starts out empty.
km = KeyManager(API_KEYS, per_key_soft_cap=PER_KEY_SOFT_CAP)
km.consecutive_429_keys.clear()

downloaded = skipped = 0

for species_id in tqdm(range(START_ID, END_ID + 1), desc="Downloading Hardiness Maps"):
    fp = build_filepath(species_id)
    if not fp.exists():
        html_text = fetch_species_hardiness_map_html(species_id, km)
        if html_text is None:
            # None is the fetcher's signal to stop the whole run.
            print("Key 配额可能用尽或受限，已安全停机。")
            break

        save_html(fp, html_text)
        downloaded += 1
        # Fixed pause plus up to 0.6 s of random jitter between requests.
        time.sleep(SLEEP_BETWEEN + random.uniform(0.0, 0.6))
    else:
        # Resume support: a file that already exists is not fetched again.
        skipped += 1

print(f"新增下载 {downloaded} 个，跳过(已存在) {skipped} 个。输出目录：{OUT_DIR.resolve()}")
print("各 Key 使用统计：", km.usage)


In [None]:
import shutil
from google.colab import files

# Pack the output folder into 03_species_hardiness_map.zip
shutil.make_archive(
    base_name="03_species_hardiness_map",
    format="zip",
    root_dir="data/01_raw_datasets/03_species_hardiness_map",
)

# Download the archive to the local computer
files.download("03_species_hardiness_map.zip")