<a href="https://colab.research.google.com/github/ashikita/orcid/blob/main/get-orcid-record-data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
import csv
import time
import json
import math
import argparse
import urllib.parse
from datetime import datetime, timezone
import requests

# Root URL of the ORCID public API (v3.0); the endpoints below are derived from it.
BASE_PUB = "https://pub.orcid.org/v3.0"
SEARCH_ENDPOINT = f"{BASE_PUB}/search/"
EXPANDED_SEARCH_ENDPOINT = f"{BASE_PUB}/expanded-search/"  # not used yet; kept for a future extension
# Module-wide requests session (used by http_get).
SESSION = requests.Session()

# ---- Output columns (the order is significant: keep it as-is) ----
# NOTE: "education-organizasion" is misspelled, but it is the column name that
# is actually written to the CSV header, so it must not be changed here alone.
CSV_COLUMNS = [
    "orcid_id", "flg", "family-name", "given-name", "flg-enroll",
    "current-organization", "current-department-name", "current-role-title",
    "employments", "employment-organization", "educations", "education-organizasion",
    "qualifications", "invited-positions", "distinctions", "memberships", "services",
    "fundings", "research-resources", "works", "peer-reviews", "submission-date", "last-modified-date"
]

# ---- Request headers (explicit Accept, gzip, User-Agent) ----
def build_headers():
    """Build the HTTP request headers for an ORCID API call.

    Returns:
        A new dict with ``Accept``, ``Accept-Encoding`` and ``User-Agent``.
        A bearer ``Authorization`` entry is appended only when the
        ``ORCID_TOKEN`` environment variable holds a non-empty value.
    """
    token = os.getenv("ORCID_TOKEN")
    # An unset or empty token means anonymous access: no Authorization header.
    auth = {"Authorization": f"Bearer {token}"} if token else {}
    return {
        "Accept": "application/orcid+json",  # or application/orcid+xml
        "Accept-Encoding": "gzip",
        "User-Agent": "kyushu-orcid-harvester/1.0 (contact: example@example.org)",
        **auth,
    }

# ---- Rate-limit-aware GET (exponential backoff, honours Retry-After) ----
def http_get(url, max_retries=6, base_delay=1.0, timeout=30):
    """GET *url* through the shared session, retrying on 429 and 5xx answers.

    Args:
        url: Fully built request URL.
        max_retries: Maximum number of attempts (the first request included).
        base_delay: Initial backoff in seconds. It is doubled after every
            backoff sleep and capped at 60 seconds.
        timeout: Per-request timeout in seconds handed to ``SESSION.get``.

    Returns:
        The response object of the first attempt that answered HTTP 200.

    Raises:
        RuntimeError: Immediately for any status other than 200, 429 or 5xx
            (the message carries the status and up to 1000 characters of the
            body), or once every attempt ended with 429/5xx.
    """
    headers = build_headers()
    delay = base_delay
    last_status = None
    for attempt in range(max_retries):
        resp = SESSION.get(url, headers=headers, timeout=timeout)
        status = resp.status_code
        if status == 200:
            return resp

        retryable = status == 429 or 500 <= status < 600
        if not retryable:
            # Reading the body is best-effort only: a decoding problem must
            # not mask the HTTP error that is being reported.
            try:
                text = resp.text[:1000]
            except Exception:
                text = ""
            raise RuntimeError(f"HTTP {status}: {text}")

        last_status = status
        if attempt == max_retries - 1:
            # No attempt is left, so waiting here would only delay the error.
            break

        # Prefer the server-provided delay (whole seconds); otherwise back off
        # exponentially on our own.
        retry_after = (resp.headers.get("Retry-After") or "").strip()
        if retry_after.isdigit():
            time.sleep(int(retry_after))
        else:
            time.sleep(delay)
            delay = min(delay * 2, 60)  # capped at 60 s

    raise RuntimeError(f"Max retries exceeded (last status: {last_status})")

# ---- Search by affiliation-org-name, paging forward from `start` ----
def search_orcids_by_affiliation(org_names, rows=200, start=0, updated_since=None):
    """Return the ORCID iDs whose affiliation matches any of *org_names*.

    The names are OR-ed into one query, e.g.
    ``affiliation-org-name:("Kyushu University" OR "九州大学")``, and the
    search endpoint is paged until all reported results have been read.

    Args:
        org_names: Organization names. ``None``, empty and blank entries are
            ignored.
        rows: Page size requested per search call.
        start: Offset of the first result to fetch.
        updated_since: Optional ``"YYYY-MM-DDTHH:MM:SSZ"`` timestamp. When
            given, ``profile-last-modified-date:[<timestamp> TO NOW]`` is
            AND-ed onto the query.

    Returns:
        Sorted list of unique ORCID iD strings. Empty when *org_names*
        contains no usable name (no request is sent in that case).

    Raises:
        RuntimeError: Propagated from ``http_get`` when a request fails.
    """
    quoted = [f'"{n.strip()}"' for n in (org_names or []) if n and n.strip()]
    if not quoted:
        # "affiliation-org-name:()" is not a meaningful query; searching for
        # no organization simply matches nothing.
        return []

    base_q = f"affiliation-org-name:({' OR '.join(quoted)})"
    if updated_since:
        # e.g. AND profile-last-modified-date:[2025-12-01T00:00:00Z TO NOW]
        base_q += f" AND profile-last-modified-date:[{updated_since} TO NOW]"

    found = set()
    offset = start
    while True:
        # urlencode escapes spaces, quotes and non-ASCII names, so the URL is
        # valid on its own instead of relying on the HTTP client to fix it up.
        query_string = urllib.parse.urlencode(
            {"q": base_q, "rows": rows, "start": offset}
        )
        data = http_get(f"{SEARCH_ENDPOINT}?{query_string}").json() or {}

        # num-found is the total hit count; result is the current page.
        num_found = int(data.get("num-found") or 0)
        results = data.get("result") or []
        for entry in results:
            oid = (((entry.get("orcid-identifier") or {}).get("path")) or "").strip()
            if oid:
                found.add(oid)

        if not results:
            break
        offset += rows
        if offset >= num_found:
            break
        # Short pause between pages to be gentle on the service.
        time.sleep(0.25)

    return sorted(found)

# ---- Detail retrieval: prefer /person and /activities, use /record only if needed ----
def fetch_person(orcid):
    """Fetch the ``/person`` document of *orcid* and return its decoded JSON."""
    endpoint = f"{BASE_PUB}/{orcid}/person"
    response = http_get(endpoint)
    return response.json()

def fetch_activities(orcid):
    """Fetch the ``/activities`` document of *orcid* and return its decoded JSON."""
    endpoint = f"{BASE_PUB}/{orcid}/activities"
    response = http_get(endpoint)
    return response.json()

def fetch_record(orcid):
    """Fetch the full ``/record`` document of *orcid* and return its decoded JSON."""
    endpoint = f"{BASE_PUB}/{orcid}/record"
    response = http_get(endpoint)
    return response.json()

# ---- Small helper for safely reaching into nested dicts ----
def dig(obj, *keys):
    """Follow *keys* through nested dicts and return the value found.

    Returns ``None`` as soon as the current node is not a dict or lacks the
    next key. With no keys, *obj* itself is returned.
    """
    node = obj
    for key in keys:
        if not isinstance(node, dict) or key not in node:
            return None
        node = node[key]
    return node

# ---- Year extraction ("" when missing or not available) ----
def safe_year(d):
    """Return the ``year.value`` of a date dict as a stripped string.

    Yields ``""`` when *d* is not a dict or when the nested value is absent
    or ``None``.
    """
    if isinstance(d, dict):
        year = dig(d, "year", "value")
        if year is not None:
            return str(year).strip()
    return ""

# ---- Guess the current affiliation (employment or education) ----
def pick_current_affiliation(aff_groups, summary_key, org_key="organization"):
    """Pick the most likely current affiliation from a list of affiliation groups.

    An entry without an end year beats any entry with one; among equals the
    larger start year wins, and on a full tie the earliest entry is kept.

    Both layouts of a group are understood:
      * the summary (object or list) stored directly under *summary_key*, and
      * the ORCID API v3.0 layout, where the group holds a ``"summaries"``
        list of ``{summary_key: {...}}`` wrappers.

    Args:
        aff_groups: The ``affiliation-group`` list of one activities section
            (``None`` is treated as empty).
        summary_key: ``"employment-summary"`` or ``"education-summary"``.
        org_key: Key of the organization object inside a summary.

    Returns:
        ``(org_name, dept_name, role_title, start_year, end_year,
        end_year_for_status)``, all strings. Every element is ``""`` when no
        summary is found. The last two elements carry the same value; the
        final one exists for the caller's status decision.
    """
    def as_list(value):
        # A summary may be a single object or a list; normalise to a list.
        if not value:
            return []
        return value if isinstance(value, list) else [value]

    def score(item):
        # (no end year?, start year): tuples compare element by element, so
        # "still running" dominates and the start year only breaks ties.
        end_year = safe_year(item.get("end-date") or {})
        start_year = safe_year(item.get("start-date") or {})
        try:
            start_val = int(start_year) if start_year else -1
        except ValueError:
            start_val = -1
        return (1 if end_year == "" else 0, start_val)

    items = []
    for group in (aff_groups or []):
        if not isinstance(group, dict):
            continue
        # Flat layout: the summary sits directly on the group.
        items.extend(as_list(group.get(summary_key)))
        # API v3.0 layout: group["summaries"][i][summary_key].
        for wrapper in as_list(group.get("summaries")):
            if isinstance(wrapper, dict):
                items.extend(as_list(wrapper.get(summary_key)))
    items = [it for it in items if isinstance(it, dict)]

    # max() returns the first of several equally good entries, matching a
    # "replace only when strictly better" scan.
    best = max(items, key=score, default=None)
    if not best:
        return ("", "", "", "", "", "")

    org = (dig(best, org_key, "name") or "").strip()
    dept = (best.get("department-name") or "").strip()
    role = (best.get("role-title") or "").strip()
    start_year = safe_year(best.get("start-date") or {})
    end_year = safe_year(best.get("end-date") or {})
    # The end year is returned twice: once as data, once for the status check.
    return (org, dept, role, start_year, end_year, end_year)

# ---- Build one record (row) keyed by the CSV column names ----
def build_row(orcid, person, activities, record_fallback=None, today_year=None):
    """Build one CSV row (dict keyed by column name) for an ORCID record.

    Args:
        orcid: ORCID iD, written to ``"orcid_id"``.
        person: Decoded ``/person`` JSON; names are read from it.
        activities: Decoded ``/activities`` JSON (``None`` is treated as
            empty); counts and affiliations are read from it.
        record_fallback: Optional decoded ``/record`` JSON. Its ``history``
            is preferred for the submission / last-modified dates, with
            *person* as the second source.
        today_year: Year used as "now" when deciding whether an affiliation
            is still running. Defaults to the current UTC year.

    Returns:
        A dict holding every output column; all values are strings.
    """
    if today_year is None:
        today_year = datetime.now(timezone.utc).year

    # ---- Names -----------------------------------------------------------
    family = (dig(person, "name", "family-name", "value") or "").strip()
    given = (dig(person, "name", "given-names", "value") or "").strip()
    # flg records whether any name was registered, before placeholders go in.
    flg = "t" if (family or given) else "nil"
    if not family:
        family = "[未登録]"
    if not given:
        given = "[未登録]"

    # ---- Counts per activity section -------------------------------------
    acts = activities or {}

    def groups_of(section, key):
        """Return the list stored at acts[section][key], or [] when absent."""
        return dig(acts, section, key) or []

    emp_groups = groups_of("employments", "affiliation-group")
    edu_groups = groups_of("educations", "affiliation-group")
    employments_count = len(emp_groups)
    educations_count = len(edu_groups)

    qualifications_count = len(groups_of("qualifications", "affiliation-group"))
    invited_positions_count = len(groups_of("invited-positions", "affiliation-group"))
    distinctions_count = len(groups_of("distinctions", "affiliation-group"))
    memberships_count = len(groups_of("memberships", "affiliation-group"))
    services_count = len(groups_of("services", "affiliation-group"))
    fundings_count = len(groups_of("fundings", "group"))
    research_resources_cnt = len(groups_of("research-resources", "group"))
    works_count = len(groups_of("works", "group"))
    peer_reviews_count = len(groups_of("peer-reviews", "group"))

    # ---- Affiliation strings: "org(start-end); ..." -----------------------
    def as_list(value):
        # A summary may be a single object or a list; normalise to a list.
        if not value:
            return []
        return value if isinstance(value, list) else [value]

    def iter_summaries(groups, summary_key):
        """Yield every summary dict of *groups*, in document order."""
        for group in groups:
            if not isinstance(group, dict):
                continue
            # Flat layout: the summary sits directly on the group.
            candidates = list(as_list(group.get(summary_key)))
            # ORCID API v3.0 layout: group["summaries"][i][summary_key].
            for wrapper in as_list(group.get("summaries")):
                if isinstance(wrapper, dict):
                    candidates.extend(as_list(wrapper.get(summary_key)))
            for item in candidates:
                if isinstance(item, dict):
                    yield item

    def list_affiliations(groups, summary_key):
        parts = []
        for item in iter_summaries(groups, summary_key):
            name = (dig(item, "organization", "name") or "").strip()
            if not name:
                continue
            start_year = safe_year(item.get("start-date") or {})
            end_year = safe_year(item.get("end-date") or {})
            # The year range is appended only when at least one year is known.
            if start_year or end_year:
                parts.append(f"{name}({start_year}-{end_year})")
            else:
                parts.append(name)
        return "; ".join(parts)

    emp_list_str = list_affiliations(emp_groups, "employment-summary")
    edu_list_str = list_affiliations(edu_groups, "education-summary")

    # ---- Current affiliation ---------------------------------------------
    cur_org, cur_dept, cur_role, _, _, emp_ey_for_status = \
        pick_current_affiliation(emp_groups, "employment-summary")

    # Educations are consulted only when employment gave no organization.
    if not cur_org and educations_count > 0:
        cur_org, cur_dept, cur_role, _, _, edu_ey_for_status = \
            pick_current_affiliation(edu_groups, "education-summary")
    else:
        edu_ey_for_status = ""

    def still_running(end_year):
        # No end year means open-ended; otherwise compare with "this year".
        return (not end_year) or int(end_year) >= int(today_year)

    # ---- flg-enroll: employed / left / enrolled / graduated / other -------
    flg_enroll = "その他"
    if employments_count == 0 and educations_count == 0:
        if invited_positions_count > 0:
            flg_enroll = "特別任用"
        elif memberships_count > 0:
            flg_enroll = "メンバーシップ"
        elif qualifications_count > 0:
            flg_enroll = "資格取得"
    elif employments_count == 0:
        flg_enroll = "在学中" if still_running(edu_ey_for_status) else "卒業生"
        if not cur_org:
            # Fall back to the first entry of the education string.
            cur_org = edu_list_str.split(";")[0]
    else:
        flg_enroll = "在職中" if still_running(emp_ey_for_status) else "離職中"
        if not cur_org:
            # Fall back to the first entry of the employment string.
            cur_org = emp_list_str.split(";")[0]

    # ---- submission / last-modified ---------------------------------------
    # The history block is looked up in record_fallback first, then in person.
    def history_value(field):
        return (dig(record_fallback, "history", field, "value")
                or dig(person, "history", field, "value")
                or "")

    def to_iso(value):
        # Accepts epoch milliseconds (all digits) or an ISO-like string and
        # returns the YYYY-MM-DD part; "" when empty or not convertible.
        if not value:
            return ""
        text = str(value)
        if text.isdigit():
            try:
                moment = datetime.fromtimestamp(int(text) / 1000, tz=timezone.utc)
            except (ValueError, OverflowError, OSError):
                return ""
            return moment.strftime("%Y-%m-%d")
        return text[:10]

    sub_date_iso = to_iso(history_value("submission-date"))
    last_mod_iso = to_iso(history_value("last-modified-date"))

    return {
        "orcid_id": orcid,
        "flg": flg,
        "family-name": family,
        "given-name": given,
        "flg-enroll": flg_enroll,
        "current-organization": cur_org or "",
        "current-department-name": cur_dept or "",
        "current-role-title": cur_role or "",
        "employments": str(employments_count),
        "employment-organization": emp_list_str,
        "educations": str(educations_count),
        "education-organizasion": edu_list_str,  # NOTE: misspelled key kept as-is on purpose
        "qualifications": str(qualifications_count),
        "invited-positions": str(invited_positions_count),
        "distinctions": str(distinctions_count),
        "memberships": str(memberships_count),
        "services": str(services_count),
        "fundings": str(fundings_count),
        "research-resources": str(research_resources_cnt),
        "works": str(works_count),
        "peer-reviews": str(peer_reviews_count),
        "submission-date": sub_date_iso,
        "last-modified-date": last_mod_iso,
    }

# ---- Load an existing CSV for merging (keyed by orcid_id) ----
def load_existing(path):
    """Load a previously written CSV into a dict keyed by ORCID iD.

    Args:
        path: CSV file path. ``None``, ``""`` or a path that does not exist
            yields an empty result.

    Returns:
        ``{orcid_id: row_dict}``. Rows whose ``orcid_id`` is missing or blank
        are skipped; when an iD occurs more than once the last row wins.
    """
    if not path or not os.path.exists(path):
        return {}

    out = {}
    # "utf-8-sig" drops a leading byte-order mark if one is present. With
    # plain "utf-8" the BOM would become part of the first header name
    # ("\ufefforcid_id"), so no row would ever match the "orcid_id" key.
    # Files without a BOM are read exactly as before.
    with open(path, "r", newline="", encoding="utf-8-sig") as f:
        for row in csv.DictReader(f):
            key = (row.get("orcid_id") or "").strip()
            if key:
                out[key] = row
    return out

def write_csv(path, rows):
    """Write *rows* to *path* as a UTF-8 CSV with the ``CSV_COLUMNS`` header.

    Args:
        path: Output file path; an existing file is overwritten.
        rows: Iterable of dicts keyed by column name. The dicts are not
            modified.

    Columns missing from a row are written as empty strings. Keys that are
    not in ``CSV_COLUMNS`` (for example extra columns carried over from a
    merged existing CSV) are left out instead of aborting the whole write.
    """
    with open(path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(
            f,
            fieldnames=CSV_COLUMNS,
            restval="",              # fill missing columns with ""
            extrasaction="ignore",   # tolerate unknown keys in a row
        )
        writer.writeheader()
        writer.writerows(rows)

# ---- Main routine ----
def main():
    """Command-line entry point: search iDs, fetch details, merge and write the CSV.

    Steps:
      1. Search ORCID iDs by organization name (optionally only those
         modified since ``--since``).
      2. For every iD, either reuse the row from ``--existing`` or fetch
         ``/person`` + ``/activities`` (and ``/record`` when ``/person`` has
         no history) and build a fresh row.
      3. Write all rows, plus existing rows the search did not return, to
         ``--out``.
    """
    ap = argparse.ArgumentParser(description="ORCID Harvester for Kyushu University (CSV output)")
    ap.add_argument("--org", nargs="+", default=["Kyushu University", "九州大学", "Kyusyu University"],
                    help="Affiliation org names for search (space separated)")
    ap.add_argument("--existing", default=None, help="Existing CSV to merge with (optional)")
    ap.add_argument("--since", default=None,
                    help="Only fetch ORCID records modified since this ISO time (e.g., 2025-12-01T00:00:00Z)")
    ap.add_argument("--out", default="orcid_kyushu.csv", help="Output CSV path")
    ap.add_argument("--force-all", action="store_true",
                    help="Ignore existing data and refetch all IDs' details")
    args = ap.parse_args()

    existing = load_existing(args.existing)

    print("[1/3] Searching ORCID iDs by affiliation-org-name ...")
    orcid_ids = search_orcids_by_affiliation(args.org, rows=200, start=0, updated_since=args.since)
    print(f"  -> {len(orcid_ids)} ORCID iDs found")

    result_rows = []
    today_year = datetime.now(timezone.utc).year
    total = len(orcid_ids)

    # Existing rows not yet emitted. Every iD handled below is removed from
    # this dict, so what remains at the end are iDs the search did not return.
    keep = dict(existing)

    print("[2/3] Fetching details (person + activities, fallback=record) ...")
    for idx, oid in enumerate(orcid_ids, 1):
        old_row = existing.get(oid)
        # Fetch when forced, when the iD is new, when the stored row has no
        # last-modified-date, or when --since is in effect: in that mode the
        # search returns only records modified since the given time, so a
        # stored row for such an iD is stale by definition.
        need_fetch = (
            args.force_all
            or bool(args.since)
            or old_row is None
            or not (old_row.get("last-modified-date") or "")
        )

        if not need_fetch:
            # Reuse the stored row unchanged.
            result_rows.append(keep.pop(oid))
            continue

        # Lightweight strategy: /person + /activities; /record is requested
        # only when /person carries no history block.
        try:
            person = fetch_person(oid)
            activities = fetch_activities(oid)
            record_fallback = None
            if not dig(person, "history"):
                record_fallback = fetch_record(oid)
            row = build_row(oid, person, activities, record_fallback=record_fallback, today_year=today_year)
        except Exception as e:
            # Per-record boundary: report, keep the stored row if there is
            # one, and carry on with the next iD.
            print(f"  [{idx}/{total}] {oid} -> ERROR: {e}")
            if oid in keep:
                result_rows.append(keep.pop(oid))
            # Short pause before the next iD to avoid request spikes.
            time.sleep(0.5)
            continue

        # The fresh row replaces the stored one; dropping it from `keep`
        # prevents the stale row from being written a second time below.
        keep.pop(oid, None)
        result_rows.append(row)

        # Light sleep out of consideration for the service.
        time.sleep(0.25)
        if idx % 50 == 0:
            print(f"  ... {idx} processed")

    # Rows still in `keep` belong to iDs that this search did not return.
    # They are preserved for continuity rather than dropped.
    result_rows.extend(keep.values())

    print(f"[3/3] Writing CSV -> {args.out} (rows={len(result_rows)})")
    write_csv(args.out, result_rows)
    print("Done.")

# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()

In [None]:
# Delete the JSON files (*.json) in the current working directory.
!rm *.json