<a href="https://colab.research.google.com/github/ashikita/qir-toolbox/blob/main/json2tsv/json2tsv-4evergreen.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

機関リポジトリへの論文登録のため、JSON形式の論文メタデータをタブ区切りテキストファイル形式に一括変換します。  
タブ区切りテキストファイルはE-Cats Libraryのメタデータ一括登録用フォーマットに準拠しています。

In [None]:
# Extract every ZIP archive found in the working directory.
# The pattern is quoted on purpose: an unquoted *.zip is expanded by the shell,
# and when more than one archive matches, unzip would treat the second and
# later names as members to extract from the first archive. With the quotes,
# unzip expands the wildcard itself and processes each archive in turn.
!unzip '*.zip'

In [None]:
# -*- coding: utf-8 -*-
"""
機能（新スキーマ対応版）:
- 作業ディレクトリ内の *.json をすべて読み込み、論文メタデータをTAB区切りの一括登録用フォーマットに整形して output.txt を生成
- 1論文1行、先頭行に可変列（著者数/所属数/キーワード数）を含むヘッダを自動構築
- かな漢字/アルファベットに基づく xml:lang の簡易判定
- 共著者および1著者複数所属に対応（列ずれを避けるため、各著者位置の最大所属数に合わせて空列を補完）

対象とするJSONスキーマ（旧スキーマはサポート対象外）:
{
  "title": str,
  "authors": [
    {
      "name": str,
      "orcid": str,  # 例: 0000-0002-7005-0797
      "is_corresponding": bool,  # 無視
      "affiliation_numbers": ["1","2", ...],
      "contact_info": {...}  # 無視
    }, ...
  ],
  "affiliations": { "1": "Affiliation A", "2": "Affiliation B", ... },
  "keywords": [str, ...]  # または "keyword"
  "Abstract": str  # または "abstract"
  "publication_info": { "year": "2026", "month": "Jan"|"1"|1|... , "volume": "12", "issue": "04" },
  "page_info": { "start_page": "1", "end_page": "12" }
}

role / date / corresponding author / contact_info は読み飛ばします。
"""

import os
import re
import json
import glob
from typing import List, Dict, Any

# =========================
# 設定ブロック（必要に応じて編集）
# =========================
# Fixed values written into every output row by row_for_item(). The trailing
# comment on each entry names the output column that receives the value.
DEFAULTS = {
    "access_rights": "110",  # /dcterms:accessRights#1
    "rights_label": "Creative Commons Attribution 4.0 International",  # /dc:rights#1
    "rights_lang": "en",  # /dc:rights#1@xml:lang
    "rights_uri": "https://creativecommons.org/licenses/by/4.0/",  # /dc:rights#1@rdf:resource
    "publisher_en": "Transdisciplinary Research and Education Center for Green Technologies, Kyushu University",  # /dc:publisher#1
    "publisher_ja": "九州大学グリーンテクノロジー研究教育センター",  # /dc:publisher#2
    "issued_date_type": "Issued",  # /datacite:date#1@dateType
    "language_three_letter": "eng",  # /dc:language#1
    "contents_type": "1207000000",  # /local:contentsType#1 (journal article)
    "version": "VoR",  # /oaire:version#1
    "peer_reviewed": "refereed",  # /local:peerReviewed#1
    "pissn": "2189-0420",  # /jpcoar:sourceIdentifier#1
    "pissn_type": "PISSN",
    "eissn": "2432-5953",  # /jpcoar:sourceIdentifier#2
    "eissn_type": "eISSN",
    "source_title": "evergreen",  # /local:sourceTitle#1
    "subject_scheme": "Other",  # /jpcoar:subject#n@subjectScheme
    "description_type": "Abstract",  # /datacite:description#1@descriptionType
}

# Glob pattern (relative to the working directory) selecting the input files.
INPUT_GLOB_PATTERN = "*.json"
# Name of the tab-separated file that is produced.
OUTPUT_FILE = "output.txt"
# Text encoding used both for reading the inputs and for writing the output.
ENCODING = "utf-8"

# =========================
# ユーティリティ
# =========================

# Matches one Japanese-script character: hiragana/katakana (U+3040-U+30FF),
# CJK ideographs (U+3400-U+4DBF, U+4E00-U+9FFF) or half-width katakana
# (U+FF66-U+FF9F).
JP_REGEX = re.compile(r"[\u3040-\u30ff\u3400-\u4dbf\u4e00-\u9fff\uff66-\uff9f]")
# Matches one ASCII letter.
EN_REGEX = re.compile(r"[A-Za-z]")
# Shape check for an ORCID iD: four hyphen-separated groups of four characters,
# all digits except that the final character may be an upper-case X.
# Only the format is checked here.
ORCID_REGEX = re.compile(r"^\d{4}-\d{4}-\d{4}-\d{3}[\dX]$")

# Lower-case English month names and abbreviations -> two-digit month number.
MONTH_MAP = {
    "jan": "01", "january": "01",
    "feb": "02", "february": "02",
    "mar": "03", "march": "03",
    "apr": "04", "april": "04",
    "may": "05",
    "jun": "06", "june": "06",
    "jul": "07", "july": "07",
    "aug": "08", "august": "08",
    "sep": "09", "sept": "09", "september": "09",
    "oct": "10", "october": "10",
    "nov": "11", "november": "11",
    "dec": "12", "december": "12"
}

def detect_lang(text: str) -> str:
    """Guess the xml:lang code of *text* with a simple character test.

    Returns "ja" as soon as the text contains any character matched by
    JP_REGEX, otherwise "en" if it contains a character matched by EN_REGEX,
    otherwise "" (which is also the result for empty or None input).
    """
    if text:
        # Japanese is checked first so mixed Japanese/Latin text counts as "ja".
        for pattern, code in ((JP_REGEX, "ja"), (EN_REGEX, "en")):
            if pattern.search(text):
                return code
    return ""

def sanitize_value(v: Any) -> str:
    """Return *v* as a single-line, tab-free string safe for a TSV cell.

    None becomes "". Any other value is converted with str(); every run of
    whitespace (tabs, CR/LF, spaces, ...) is collapsed to one space and the
    result is trimmed at both ends.
    """
    if v is None:
        return ""
    # A single whitespace collapse also covers tabs and line breaks.
    collapsed = re.sub(r"\s+", " ", str(v))
    return collapsed.strip()

def valid_orcid(oid: str) -> bool:
    """Return True when *oid*, ignoring surrounding whitespace, matches ORCID_REGEX.

    Empty or None input yields False.
    """
    candidate = oid.strip() if oid else ""
    return ORCID_REGEX.match(candidate) is not None

def month_to_mm(m) -> str:
    """Normalize a month given in various forms to a zero-padded 'MM' string.

    Accepted inputs:
    - integers, integral floats (1.0) and numeric strings ("1", "01"),
      optionally decorated with non-digit text ("1月", "M04");
    - English month names or abbreviations listed in MONTH_MAP,
      case-insensitive, with an optional trailing period ("Jan.", "Sept.").

    Returns "" for None, empty input, values outside 1-12, and inputs that
    contain more than one group of digits ("1/2", "2026-01"), because such
    inputs are ambiguous.
    """
    if m is None:
        return ""
    # A JSON number such as 1.0 would stringify to "1.0"; reduce it to 1 first
    # so that it is not misread.
    if isinstance(m, float) and m.is_integer():
        m = int(m)
    s = str(m).strip()
    if not s:
        return ""

    # Numeric forms. Each contiguous run of digits is looked at separately;
    # simply deleting all non-digits would glue unrelated groups together
    # ("1.0" -> "10", "1/2" -> "12").
    digit_runs = re.findall(r"\d+", s)
    if digit_runs:
        if len(digit_runs) == 1 and 1 <= int(digit_runs[0]) <= 12:
            return f"{int(digit_runs[0]):02d}"
        return ""

    # Month names (no digits present): case-insensitive, trailing "." ignored.
    return MONTH_MAP.get(s.lower().rstrip("."), "")

def normalize_issue(issue) -> str:
    """Return the issue number as text with leading zeros removed.

    None and blank input give "". A purely numeric value loses its leading
    zeros ("01" -> "1"); an all-zero value becomes "0". Anything else is
    returned trimmed but otherwise unchanged.
    """
    text = "" if issue is None else str(issue).strip()
    if text and re.fullmatch(r"\d+", text):
        # lstrip leaves "" for an all-zero value; fall back to a single "0".
        return text.lstrip("0") or "0"
    return text

def _naive_object_end(txt: str, start: int) -> int:
    """Return the index just past the brace-balanced span opening at txt[start].

    Braces are counted without regard to string literals. This is only used to
    skip over text that already failed to parse as JSON. Returns -1 when the
    span never closes.
    """
    depth = 0
    for i in range(start, len(txt)):
        ch = txt[i]
        if ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                return i + 1
    return -1


def parse_json_file(path: str) -> List[Dict[str, Any]]:
    """Read *path* and return every JSON object (dict) found in it.

    The file is interpreted leniently, trying in order:
    1. a regular JSON document: a single object, or an array (non-dict
       elements of the array are dropped);
    2. JSON Lines: one JSON value per non-blank line (used only if every
       line parses; non-dict lines are dropped);
    3. concatenated objects ("{...}{...}") possibly separated by other text.

    Args:
        path: File to read, decoded with ENCODING.

    Returns:
        The list of dict records; empty when the file is blank or contains
        no parsable object.

    Raises:
        OSError, UnicodeDecodeError: if the file cannot be read or decoded.
    """
    with open(path, "r", encoding=ENCODING) as f:
        # A leading BOM is not whitespace and would make json.loads fail.
        txt = f.read().lstrip("\ufeff").strip()
    if not txt:
        return []

    # 1) Regular JSON document.
    try:
        obj = json.loads(txt)
    except (ValueError, RecursionError):  # JSONDecodeError is a ValueError
        obj = None
    if isinstance(obj, list):
        return [x for x in obj if isinstance(x, dict)]
    if isinstance(obj, dict):
        return [obj]

    # 2) JSON Lines: all-or-nothing, one bad line rejects this interpretation.
    records: List[Dict[str, Any]] = []
    for line in txt.splitlines():
        line = line.strip()
        if not line:
            continue
        try:
            obj = json.loads(line)
        except (ValueError, RecursionError):
            records = []
            break
        if isinstance(obj, dict):
            records.append(obj)
    if records:
        return records

    # 3) Concatenated objects. Each candidate is decoded with the real JSON
    # parser, so braces inside string values cannot confuse the splitting and
    # stray closing braces between objects are simply ignored.
    decoder = json.JSONDecoder()
    out: List[Dict[str, Any]] = []
    pos = 0
    while True:
        start = txt.find("{", pos)
        if start == -1:
            break
        try:
            obj, pos = decoder.raw_decode(txt, start)
        except (ValueError, RecursionError):
            # Malformed object: skip its whole brace-balanced span so nested
            # objects inside it are not reported as records of their own.
            pos = _naive_object_end(txt, start)
            if pos == -1:
                break
            continue
        if isinstance(obj, dict):
            out.append(obj)
    return out

# =========================
# 1次走査: 正規化 & 最大列数の算出
# =========================

# Collect the records of every input file, in file-name order.
json_files = sorted(glob.glob(INPUT_GLOB_PATTERN))
all_records: List[Dict[str, Any]] = []
for fp in json_files:
    try:
        recs = parse_json_file(fp)
    except Exception as exc:
        # Best effort: one unreadable file must not stop the batch, but the
        # user has to be told that its papers are missing from the output.
        print(f"警告: '{fp}' を読み込めなかったためスキップしました（{type(exc).__name__}: {exc}）")
        continue
    if not recs:
        print(f"警告: '{fp}' から論文レコードを取得できませんでした。")
    all_records.extend(recs)

# Normalized records plus the column counts needed to build the header.
normalized: List[Dict[str, Any]] = []
max_creators = 0
max_affils_per_pos: List[int] = []  # per author position: largest affiliation count seen
max_subjects = 0

for rec in all_records:
    title = sanitize_value(rec.get("title", ""))

    # Affiliation lookup table: affiliation number -> affiliation name.
    # Entries with an empty key or an empty name are dropped.
    aff_map: Dict[str, str] = {}
    aff_map_src = rec.get("affiliations")
    if isinstance(aff_map_src, dict):
        for k, v in aff_map_src.items():
            idx = sanitize_value(k)
            name = sanitize_value(v)
            if idx and name:
                aff_map[idx] = name

    # Authors: entries that are not dicts or have no name are skipped.
    authors_norm = []
    authors_raw = rec.get("authors")
    if isinstance(authors_raw, list):
        for a in authors_raw:
            if not isinstance(a, dict):
                continue
            name = sanitize_value(a.get("name"))
            if not name:
                continue
            orcid = sanitize_value(a.get("orcid"))
            aff_nums = a.get("affiliation_numbers") or []
            if not isinstance(aff_nums, list):
                aff_nums = [aff_nums]  # tolerate a single scalar number
            # Resolve names in the order given by affiliation_numbers;
            # numbers without an entry in aff_map are ignored.
            aff_names = [
                aff_map[num]
                for num in map(sanitize_value, aff_nums)
                if num in aff_map
            ]
            authors_norm.append({
                "name": name,
                "name_lang": detect_lang(name),
                "orcid": orcid if valid_orcid(orcid) else "",
                "affiliations": [{"name": an, "lang": detect_lang(an)} for an in aff_names]
            })

    # Subjects (keywords); only list-valued input is used.
    kw = rec.get("keywords") or rec.get("keyword")
    subjects = []
    if isinstance(kw, list):
        for s in kw:
            val = sanitize_value(s)
            if val:
                subjects.append({"value": val, "lang": detect_lang(val)})

    # Abstract
    desc = sanitize_value(rec.get("Abstract") or rec.get("abstract") or "")

    # publication_info: anything that is not a dict is treated as missing, so
    # a malformed record cannot abort the whole run with AttributeError.
    pub = rec.get("publication_info")
    if not isinstance(pub, dict):
        pub = {}
    year = sanitize_value(pub.get("year"))
    month = month_to_mm(pub.get("month"))
    issued = f"{year}-{month}" if (year and month) else ""  # YYYY-MM or empty
    volume = sanitize_value(pub.get("volume"))
    issue = normalize_issue(pub.get("issue"))

    # page_info: same guard as for publication_info.
    page_info = rec.get("page_info")
    if not isinstance(page_info, dict):
        page_info = {}
    page_start = sanitize_value(page_info.get("start_page"))
    page_end = sanitize_value(page_info.get("end_page"))

    normalized.append({
        "title": title,
        "title_lang": detect_lang(title),
        "authors": authors_norm,
        "subjects": subjects,
        "description": desc,
        "description_lang": detect_lang(desc),
        "issued": issued,
        "volume": volume,
        "issue": issue,
        "page_start": page_start,
        "page_end": page_end,
    })

    # Update the maxima that determine how many columns the header needs.
    max_creators = max(max_creators, len(authors_norm))
    if len(authors_norm) > len(max_affils_per_pos):
        max_affils_per_pos.extend([0] * (len(authors_norm) - len(max_affils_per_pos)))
    for i, au in enumerate(authors_norm):
        max_affils_per_pos[i] = max(max_affils_per_pos[i], len(au["affiliations"]))
    max_subjects = max(max_subjects, len(subjects))

# =========================
# ヘッダ構築（指定順）
# =========================

# Header row. The column order here must match the cell order produced by
# row_for_item().

# 1) Title
columns: List[str] = [
    "/dc:title#1",
    "/dc:title#1@xml:lang",
]

# 2) Creators (variable count)
for creator_no in range(1, max_creators + 1):
    columns.extend([
        f"/jpcoar:creator#{creator_no}",  # grouping column, its cells stay empty
        f"/jpcoar:creator#{creator_no}/jpcoar:nameIdentifier#1",
        f"/jpcoar:creator#{creator_no}/jpcoar:nameIdentifier#1@nameIdentifierScheme",
        f"/jpcoar:creator#{creator_no}/jpcoar:nameIdentifier#1@nameIdentifierURI",
        f"/jpcoar:creator#{creator_no}/jpcoar:creatorName#1",
        f"/jpcoar:creator#{creator_no}/jpcoar:creatorName#1@xml:lang",
    ])
    # Affiliations: as many as the largest count seen at this author position.
    if creator_no - 1 < len(max_affils_per_pos):
        aff_slots = max_affils_per_pos[creator_no - 1]
    else:
        aff_slots = 0
    for aff_no in range(1, aff_slots + 1):
        columns.extend([
            f"/jpcoar:creator#{creator_no}/jpcoar:affiliation#{aff_no}",  # grouping column, cells stay empty
            f"/jpcoar:creator#{creator_no}/jpcoar:affiliation#{aff_no}/jpcoar:affiliationName#1",
            f"/jpcoar:creator#{creator_no}/jpcoar:affiliation#{aff_no}/jpcoar:affiliationName#1@xml:lang",
        ])

# 3) Access rights
# NOTE: "/dcterms:accessRights#1@rdf:resource" is currently not emitted.
columns.append("/dcterms:accessRights#1")

# 4) Rights
columns.extend([
    "/dc:rights#1",
    "/dc:rights#1@xml:lang",
    "/dc:rights#1@rdf:resource",
])

# 5) Subjects (variable count)
for subject_no in range(1, max_subjects + 1):
    columns.extend([
        f"/jpcoar:subject#{subject_no}",
        f"/jpcoar:subject#{subject_no}@xml:lang",
        f"/jpcoar:subject#{subject_no}@subjectScheme",
    ])

columns.extend([
    # 6) Abstract
    "/datacite:description#1",
    "/datacite:description#1@xml:lang",
    "/datacite:description#1@descriptionType",
    # 7) Publisher (English / Japanese)
    "/dc:publisher#1",
    "/dc:publisher#1@xml:lang",
    "/dc:publisher#2",
    "/dc:publisher#2@xml:lang",
    # 8) Issue date
    "/datacite:date#1",
    "/datacite:date#1@dateType",
    # 9) Language
    "/dc:language#1",
    # 10) Contents type
    "/local:contentsType#1",
    # 11) Version / peer review
    "/oaire:version#1",
    "/local:peerReviewed#1",
    # 12) Source identifiers, source title, volume, issue, pages
    "/jpcoar:sourceIdentifier#1",
    "/jpcoar:sourceIdentifier#1@identifierType",
    "/jpcoar:sourceIdentifier#2",
    "/jpcoar:sourceIdentifier#2@identifierType",
    "/local:sourceTitle#1",
    "/jpcoar:volume#1",
    "/jpcoar:issue#1",
    "/jpcoar:pageStart#1",
    "/jpcoar:pageEnd#1",
])

# =========================
# 行データ生成
# =========================

def row_for_item(item: Dict[str, Any]) -> List[str]:
    """Build the list of cell values for one normalized record.

    The cells follow the order of the header list ``columns``. Creator,
    affiliation and subject positions the record does not fill are padded
    with empty cells up to max_creators / max_affils_per_pos / max_subjects,
    so every row has the same number of cells as the header. Every cell is
    passed through sanitize_value() before being returned.
    """
    # Title
    cells: List[str] = [item.get("title", ""), item.get("title_lang", "")]

    # Creators, padded to max_creators positions
    creators = item.get("authors", [])
    for pos in range(max_creators):
        aff_slots = max_affils_per_pos[pos] if pos < len(max_affils_per_pos) else 0

        if pos >= len(creators):
            # No author at this position: 6 creator cells + 3 per affiliation slot.
            cells.extend([""] * 6)
            for _ in range(aff_slots):
                cells.extend(["", "", ""])
            continue

        creator = creators[pos]
        orcid = creator.get("orcid") or ""
        has_orcid = valid_orcid(orcid)
        cells.append("")  # creator#i grouping column is always empty
        cells.append(orcid if has_orcid else "")
        cells.append("ORCID" if has_orcid else "")
        cells.append(f"https://orcid.org/{orcid}" if has_orcid else "")
        cells.append(creator.get("name", ""))
        cells.append(creator.get("name_lang", ""))

        # Affiliations, padded to the maximum for this author position
        creator_affs = creator.get("affiliations", [])
        for slot in range(aff_slots):
            if slot < len(creator_affs):
                aff = creator_affs[slot]
                # The first cell is the affiliation#j grouping column (always empty).
                cells.extend(["", aff.get("name", ""), aff.get("lang", "")])
            else:
                cells.extend(["", "", ""])

    # Access rights (the @rdf:resource cell is not emitted)
    cells.append(DEFAULTS["access_rights"])

    # Rights
    cells.extend([
        DEFAULTS["rights_label"],
        DEFAULTS["rights_lang"],
        DEFAULTS["rights_uri"],
    ])

    # Subjects, padded to max_subjects
    keywords = item.get("subjects", [])
    for slot in range(max_subjects):
        if slot < len(keywords):
            subject = keywords[slot]
            cells.extend([
                subject.get("value", ""),
                subject.get("lang", ""),
                DEFAULTS["subject_scheme"],
            ])
        else:
            cells.extend(["", "", ""])

    # Abstract
    cells.extend([
        item.get("description", ""),
        item.get("description_lang", ""),
        DEFAULTS["description_type"],
    ])

    # Publisher (English / Japanese), language detected from the text itself
    for publisher in (DEFAULTS["publisher_en"], DEFAULTS["publisher_ja"]):
        cells.append(publisher)
        cells.append(detect_lang(publisher))

    # Issue date (YYYY-MM, or empty when year or month is missing)
    cells.extend([item.get("issued", ""), DEFAULTS["issued_date_type"]])

    # Language, contents type, version, peer review
    cells.extend([
        DEFAULTS["language_three_letter"],
        DEFAULTS["contents_type"],
        DEFAULTS["version"],
        DEFAULTS["peer_reviewed"],
    ])

    # Source identifiers, source title, volume, issue, pages
    cells.extend([
        DEFAULTS["pissn"],
        DEFAULTS["pissn_type"],
        DEFAULTS["eissn"],
        DEFAULTS["eissn_type"],
        DEFAULTS["source_title"],
        item.get("volume", ""),
        item.get("issue", ""),
        item.get("page_start", ""),
        item.get("page_end", ""),
    ])

    return [sanitize_value(cell) for cell in cells]

# =========================
# 書き出し
# =========================

# newline="" disables newline translation, so lines end with "\n" on every platform.
with open(OUTPUT_FILE, "w", encoding=ENCODING, newline="") as fw:
    # Header line first, then one tab-joined line per record.
    fw.write("\t".join(columns) + "\n")
    fw.writelines("\t".join(row_for_item(record)) + "\n" for record in normalized)

# Completion message with the number of records written.
print(f"処理が完了しました。{len(normalized)}件のレコードを'{OUTPUT_FILE}'に出力しました。")

In [None]:
# Delete the JSON files in the working directory.
!rm *.json