## Cell 1 — 配置（多 Key、范围、输出目录）

In [15]:
# === Config ===

# Option A: list the API keys directly in this cell.
# NOTE(security): hardcoded credentials leak through version control and shared
# notebooks; prefer loading them from an environment variable or a secrets store.
# Every entry needs its own trailing comma: two adjacent string literals are
# silently concatenated by Python into a single (invalid) key.
API_KEYS = [
    "sk-kMpx68b02586bb61112084",
    "sk-3ksU68b046f66e1f912089",
    "sk-hCEF68b0479a9ad5b12090",
    "sk-AAWv68b047ef83b2b12091",
    "sk-TRjA68b0485ed0d8412092",
    "sk-GAw468b062289a99012094",
    "sk-Om0C68b0629c837de12095",
    "sk-ACnv68b0630adff3b12096",
]

from pathlib import Path

# Inclusive range of species IDs to download.
START_ID, END_ID = 1, 3_000

# Daily soft cap per key. The free tier is reportedly 100 requests/day;
# lower this value if a safety margin is wanted.
PER_KEY_SOFT_CAP = 100

# Global throttle: seconds to pause after each request (helps avoid HTTP 429).
SLEEP_BETWEEN = 1

# Retry policy: number of retries and base of the exponential backoff.
MAX_RETRIES, BACKOFF_BASE = 5, 1.6

# Output location and per-species file name template.
FILENAME_PATTERN = "plant_species_details_{species_id}.json"
OUT_DIR = Path("data/01_raw_datasets/01_species_details")
OUT_DIR.mkdir(exist_ok=True, parents=True)

print(f"使用 {len(API_KEYS)} 个 API Key，ID范围: {START_ID}～{END_ID}，输出目录: {OUT_DIR.resolve()}")

使用 1 个 API Key，ID范围: 1～3000，输出目录: E:\05_YZH_DS\02_Monash_DS\2025_S2_FIT5120_Industry_Experience_Studio_Project\06_main_project\02_asmt_draft\data\species_details


## Cell 2 — Key 轮换管理与工具函数

In [16]:
import time
import json
from typing import Optional, Dict, Any
import requests

class KeyManager:
    """
    Round-robin manager for a pool of API keys.

    Tracks how many requests each key has served and which keys have been
    flagged as exhausted, so callers can always ask for the next usable key.
    The class never inspects HTTP responses itself; callers report quota
    problems through `set_exhausted` and successful calls through `add_usage`.

    Attributes:
        keys: the key pool, in rotation order.
        n: number of entries in `keys`.
        per_key_soft_cap: maximum number of requests counted against one key.
        idx: position in `keys` of the key currently in use.
        usage: mapping key -> number of requests recorded via `add_usage`.
        mark_exhausted: mapping key -> True once the key must not be used again.
    """

    def __init__(self, keys, per_key_soft_cap=100):
        """
        Args:
            keys: iterable of API key strings (may be empty).
            per_key_soft_cap: request budget per key before it is retired.
        """
        self.keys = list(keys)
        self.n = len(self.keys)
        self.per_key_soft_cap = per_key_soft_cap
        self.idx = 0
        self.usage = {k: 0 for k in self.keys}
        # Set when the soft cap is reached or the caller reports the key as spent.
        self.mark_exhausted = {k: False for k in self.keys}

    def _is_usable(self, key: str) -> bool:
        """Return True while `key` is neither flagged exhausted nor at its soft cap."""
        return not self.mark_exhausted[key] and self.usage[key] < self.per_key_soft_cap

    def current(self) -> Optional[str]:
        """
        Return the key to use for the next request, or None if no key is usable.

        Starting from `idx`, advances past unusable keys (updating `idx` as it
        goes) and checks each pool position at most once.
        """
        for _ in range(self.n):
            k = self.keys[self.idx]
            if self._is_usable(k):
                return k
            self.idx = (self.idx + 1) % self.n
        return None  # every key is unusable (or the pool is empty)

    def rotate(self):
        """Move to the next pool position; a no-op when the pool is empty."""
        if self.n:  # avoid modulo-by-zero on an empty pool
            self.idx = (self.idx + 1) % self.n

    def add_usage(self, key: str, cnt: int = 1):
        """Record `cnt` requests against `key` and retire it once the soft cap is hit."""
        self.usage[key] += cnt
        if self.usage[key] >= self.per_key_soft_cap:
            # Soft cap reached: never hand this key out again.
            self.mark_exhausted[key] = True

    def set_exhausted(self, key: str):
        """Flag `key` as unusable regardless of its recorded usage."""
        self.mark_exhausted[key] = True

    def all_exhausted(self) -> bool:
        """Return True when no key in the pool is usable (also True for an empty pool)."""
        return not any(self._is_usable(k) for k in self.keys)

def save_json(path: Path, data: Dict[str, Any]):
    """Write `data` to `path` as UTF-8 JSON, indented by two spaces, keeping non-ASCII characters unescaped."""
    serialized = json.dumps(data, indent=2, ensure_ascii=False)
    with path.open("w", encoding="utf-8") as handle:
        handle.write(serialized)

def build_filepath(species_id: int) -> Path:
    """Return the output path for one species: `FILENAME_PATTERN` filled with `species_id`, placed under `OUT_DIR`."""
    filename = FILENAME_PATTERN.format(species_id=species_id)
    return OUT_DIR.joinpath(filename)


## Cell 3 — 单条抓取函数（含重试、指数退避、429/5xx 处理）

In [17]:
def _backoff_delay(attempt: int) -> float:
    """Seconds to wait before the next try: exponential in `attempt` plus a small linear term."""
    return (BACKOFF_BASE ** attempt) + 0.2 * attempt


def _mask_key(key: str) -> str:
    """Shorten an API key for log output so the full secret is never printed."""
    return f"{key[:6]}…"


def fetch_species_details(species_id: int, key_manager: KeyManager) -> Optional[dict]:
    """
    Download the details of one species, rotating keys and retrying on transient errors.

    Args:
        species_id: numeric species ID appended to the details endpoint URL.
        key_manager: supplies the key for each request and records usage/exhaustion.

    Returns:
        - the decoded JSON body on HTTP 200;
        - `{"__missing__": True, "id": species_id}` on HTTP 404;
        - `{"__error_status__": <status>, "id": species_id}` on any other
          non-retryable status;
        - `{"__error_retries_exhausted__": True, "id": species_id}` once more than
          `MAX_RETRIES` retryable failures (429, 5xx, network errors) occurred;
        - `None` when `key_manager` has no usable key left, which callers treat
          as the signal to stop.
    """
    url = f"https://perenual.com/api/v2/species/details/{species_id}"
    attempt = 0

    while attempt <= MAX_RETRIES:
        key = key_manager.current()
        if key is None:
            print(f"[ID {species_id}] 所有 Key 当日软上限已满或不可用，停止。")
            return None
        key_tag = _mask_key(key)  # never log the full secret

        try:
            resp = requests.get(url, params={"key": key}, headers={"accept":"application/json"}, timeout=30)
            status = resp.status_code

            # Read the remaining quota for this key, if the server reports it.
            remaining = resp.headers.get("x-ratelimit-remaining")
            quota_spent = False
            if remaining is not None:
                try:
                    quota_spent = int(remaining) <= 0
                except ValueError:
                    # Unparseable header value: treat it as "no information".
                    quota_spent = False

            if quota_spent:
                print(f"[{key_tag}] x-ratelimit-remaining=0 -> 标记该 Key 用尽。")
                key_manager.set_exhausted(key)
                if status not in (200, 404):
                    # No conclusive answer came back: switch keys right away and
                    # retry without spending a retry attempt.
                    key_manager.rotate()
                    continue
                # A 200/404 that used up the last unit of quota is still a valid
                # answer; fall through and return it instead of re-requesting.

            if status == 200:
                key_manager.add_usage(key, 1)
                return resp.json()

            elif status == 404:
                key_manager.add_usage(key, 1)
                print(f"[ID {species_id}] 404，不存在：保存占位并继续。")
                return {"__missing__": True, "id": species_id}

            elif status == 429:
                # Rate limited: back off, then move on to another key.
                wait = _backoff_delay(attempt)
                print(f"[ID {species_id}] 429 Too Many Requests，用 key={key_tag}，等待 {wait:.1f}s 后换 Key 重试。")
                time.sleep(wait)
                # Not counted as usage; just rotate to the next key.
                key_manager.rotate()
                attempt += 1
                continue

            elif status in (500, 502, 503, 504):
                # Server-side error: back off and retry with the same key.
                wait = _backoff_delay(attempt)
                print(f"[ID {species_id}] {status} 服务端错误，用 key={key_tag}，等待 {wait:.1f}s 重试。")
                time.sleep(wait)
                attempt += 1
                continue

            else:
                # Any other status is treated as final for this ID.
                key_manager.add_usage(key, 1)
                print(f"[ID {species_id}] HTTP {status}: {resp.text[:200]}")
                return {"__error_status__": status, "id": species_id}

        except requests.RequestException as e:
            # Network-level failure (timeout, connection error, ...): back off and retry.
            wait = _backoff_delay(attempt)
            print(f"[ID {species_id}] 网络异常（{type(e).__name__}）：{e}。等待 {wait:.1f}s 重试。")
            time.sleep(wait)
            attempt += 1

    print(f"[ID {species_id}] 重试耗尽，放弃。")
    return {"__error_retries_exhausted__": True, "id": species_id}


## Cell 4 — 主循环（断点续跑、跳过已有文件、智能停机）

In [None]:
from tqdm.auto import tqdm  # 非必须；若未安装可注释掉并删去进度条

km = KeyManager(API_KEYS, per_key_soft_cap=PER_KEY_SOFT_CAP)

downloaded = 0
skipped = 0
failed = 0

# Marker keys that fetch_species_details sets on results describing a failed
# request. Such results must not be written to disk: the resume check below
# skips every ID whose file exists, so a saved failure would never be retried.
_FAILURE_MARKERS = ("__error_status__", "__error_retries_exhausted__")

for species_id in tqdm(range(START_ID, END_ID + 1), desc="Downloading"):
    fp = build_filepath(species_id)
    if fp.exists():
        # Resume support: anything already on disk is left untouched.
        skipped += 1
        continue

    data = fetch_species_details(species_id, km)
    if data is None:
        # No usable key left (quota spent or keys restricted): stop cleanly.
        print("Key 配额可能用尽或受限，已安全停机。")
        break

    if isinstance(data, dict) and any(marker in data for marker in _FAILURE_MARKERS):
        # Leave no file behind so the next run retries this ID.
        failed += 1
    else:
        # Real payloads and 404 placeholders are both persisted.
        save_json(fp, data)
        downloaded += 1
    time.sleep(SLEEP_BETWEEN)

print(f"新增下载 {downloaded} 个，跳过(已存在) {skipped} 个。输出目录：{OUT_DIR.resolve()}")
print(f"失败(未保存，重跑时会重试) {failed} 个。")
# Report usage by pool position rather than by key so no secret lands in the output.
print("各 Key 使用统计：", {f"key#{i}": km.usage[k] for i, k in enumerate(km.keys, start=1)})


In [None]:
import shutil

# Zip the directory the downloader actually writes to (OUT_DIR from the config
# cell) instead of a separately hardcoded path that can drift out of sync.
# The call returns the path of the created archive, "01_species_details.zip".
shutil.make_archive("01_species_details", "zip", root_dir=str(OUT_DIR))

In [None]:
# NOTE(review): `google.colab` is presumably only importable inside a Google Colab
# runtime; skip this cell when running the notebook elsewhere.
from google.colab import files
# Offer the archive for download. The file name is the base name passed to
# shutil.make_archive in the previous cell plus the ".zip" extension.
files.download("01_species_details.zip")