<a href="https://colab.research.google.com/github/ashikita/qir-toolbox/blob/main/json2tsv-4qir.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# -*- coding: utf-8 -*-
"""
機能:
- 作業ディレクトリ内の *.json をすべて読み込み、論文メタデータをTAB区切りの一括登録用フォーマットに整形して output.txt を生成
- 1論文1行、先頭行に可変列を含むヘッダを出力
- かな漢字/アルファベットに基づき xml:lang を簡易判定
- 共著者と所属の可変数に対応し、列ずれが起こらないように最大数に合わせてTABを補完
- 指示に沿った初期値は設定ブロックで変更可能
"""

import os
import re
import json
import glob
from typing import List, Dict, Any, Tuple

# =========================
# Configuration block (edit as needed)
# =========================
# Fixed values written into every output row. The trailing comment on each
# entry names the output column that receives the value.
DEFAULTS = {
    "access_rights": "110",  # /dcterms:accessRights#1
    "rights_label": "Creative Commons Attribution 4.0 International",  # /dc:rights#1
    "rights_lang": "en",  # /dc:rights#1@xml:lang
    "rights_uri": "https://creativecommons.org/licenses/by/4.0/",  # /dc:rights#1@rdf:resource
    "publisher_en": "Transdisciplinary Research and Education Center for Green Technologies, Kyushu University",  # /dc:publisher#1
    "publisher_ja": "九州大学グリーンテクノロジー研究教育センター",  # /dc:publisher#2
    "issued_date": "2026-01",  # /datacite:date#1
    "issued_date_type": "Issued",  # /datacite:date#1@dateType
    "language_three_letter": "eng",  # /dc:language#1
    "contents_type": "1207000000",  # /local:contentsType#1 (journal article)
    "version": "VoR",  # /oaire:version#1
    "peer_reviewed": "refereed",  # /local:peerReviewed#1
    "pissn": "2189-0420",  # /jpcoar:sourceIdentifier#1
    "pissn_type": "PISSN",  # /jpcoar:sourceIdentifier#1@identifierType
    "eissn": "2432-5953",  # /jpcoar:sourceIdentifier#2
    # NOTE(review): the print-ISSN type above is all caps while this one is
    # mixed case - confirm the importer accepts the "eISSN" spelling.
    "eissn_type": "eISSN",  # /jpcoar:sourceIdentifier#2@identifierType
    "source_title": "evergreen",  # /local:sourceTitle#1
    "volume": "12",  # /jpcoar:volume#1
    "issue": "4",  # /jpcoar:issue#1
    "subject_scheme": "Other",  # /jpcoar:subject#n@subjectScheme
    "description_type": "Abstract",  # /datacite:description#1@descriptionType
}

# Glob pattern (resolved against the current working directory) selecting the input files.
INPUT_GLOB_PATTERN = "*.json"
# Path of the tab-separated file that is (over)written with the result.
OUTPUT_FILE = "output.txt"
# Text encoding used both for reading the inputs and for writing the output.
ENCODING = "utf-8"

# =========================
# ユーティリティ
# =========================

# Hiragana/katakana, CJK ideographs (incl. extension A) and half-width katakana.
JP_REGEX = re.compile(r"[\u3040-\u30ff\u3400-\u4dbf\u4e00-\u9fff\uff66-\uff9f]")
# Any ASCII letter.
EN_REGEX = re.compile(r"[A-Za-z]")
# Bare ORCID iD: four hyphen-separated groups of four, last character a digit or X.
ORCID_REGEX = re.compile(r"^\d{4}-\d{4}-\d{4}-\d{3}[\dX]$")

def detect_lang(text: str) -> str:
    """Return a rough ``xml:lang`` code for *text*.

    The patterns are tried in priority order, so a string that mixes kana/kanji
    with ASCII letters is reported as Japanese.

    Returns:
        ``"ja"`` if any kana or kanji is present, otherwise ``"en"`` if any
        ASCII letter is present, otherwise ``""`` (also for empty/``None`` input).
    """
    if text:
        for pattern, code in ((JP_REGEX, "ja"), (EN_REGEX, "en")):
            if pattern.search(text):
                return code
    return ""

def sanitize_value(v: Any) -> str:
    """Turn *v* into a single-line, tab-free cell value.

    ``None`` becomes the empty string; anything else is converted with ``str``.
    Every run of whitespace (tabs, CR/LF, multiple spaces, ...) is collapsed to
    one space and leading/trailing whitespace is removed, so the result can be
    written into a TAB-separated row without breaking the column layout.
    """
    if v is None:
        return ""
    return re.sub(r"\s+", " ", str(v)).strip()

def valid_orcid(oid: str) -> bool:
    """Return True when *oid*, ignoring surrounding whitespace, matches ``ORCID_REGEX``.

    Empty or ``None`` input is reported as invalid.
    """
    if not oid:
        return False
    return ORCID_REGEX.match(oid.strip()) is not None

def split_affiliation_indices(raw_idx) -> List[str]:
    """Normalize an affiliation-index field into a list of non-empty strings.

    Accepts the shapes seen in the input: a scalar such as ``'1'``, a delimited
    string such as ``'1,2'`` or ``'1; 2'`` (comma, semicolon or whitespace
    separated), or a list such as ``[1, 2]``. ``None`` yields an empty list.
    """
    if raw_idx is None:
        return []
    if isinstance(raw_idx, list):
        tokens = (str(entry).strip() for entry in raw_idx)
    else:
        tokens = (piece.strip() for piece in re.split(r"[,\s;]+", str(raw_idx)))
    return [tok for tok in tokens if tok]

def build_orcid_map(orcid_list: Any) -> Dict[str, str]:
    """Build an ``{author_name: orcid_id}`` lookup from a record's ``orcid`` array.

    Args:
        orcid_list: Value of the record's ``orcid`` field. Anything that is not
            a list yields an empty mapping.

    Returns:
        Mapping from sanitized author name to ORCID iD. Entries that are not
        dicts, that have no ``author_name``, or whose ``orcid_id`` fails
        ``valid_orcid`` are skipped. If a name occurs more than once, the last
        valid entry wins.
    """
    m: Dict[str, str] = {}
    if not isinstance(orcid_list, list):
        return m
    for item in orcid_list:
        # Explicit shape check instead of a blanket ``except Exception``:
        # malformed entries are still skipped, but genuine programming errors
        # in the helpers are no longer silently swallowed.
        if not isinstance(item, dict):
            continue
        name = sanitize_value(item.get("author_name"))
        oid = sanitize_value(item.get("orcid_id"))
        if name and valid_orcid(oid):
            m[name] = oid
    return m

def build_affiliation_map(aff_list: Any) -> Dict[str, str]:
    """Build an ``{affiliation_index: affiliation_name}`` lookup from a record.

    Args:
        aff_list: Value of the record's ``affiliation`` field. Anything that is
            not a list yields an empty mapping.

    Returns:
        Mapping from the sanitized index (always a string, so ``1`` and ``"1"``
        share a key) to the sanitized name. Entries that are not dicts or that
        lack either value are skipped. If an index occurs more than once, the
        last entry wins.
    """
    m: Dict[str, str] = {}
    if not isinstance(aff_list, list):
        return m
    for item in aff_list:
        # Explicit shape check instead of a blanket ``except Exception``:
        # malformed entries are still skipped, but genuine programming errors
        # in the helpers are no longer silently swallowed.
        if not isinstance(item, dict):
            continue
        idx = sanitize_value(item.get("affiliation_index"))
        name = sanitize_value(item.get("affiliation_name"))
        if idx and name:
            m[idx] = name
    return m

def _split_top_level_objects(txt: str) -> List[str]:
    """Return the substrings of *txt* that span balanced top-level ``{...}`` groups.

    Braces inside JSON string literals are ignored, so a value such as ``"}"``
    does not end an object early. Text between top-level objects (whitespace,
    commas, stray closing braces) is skipped.
    """
    parts: List[str] = []
    depth = 0
    start = 0
    in_string = False
    escaped = False
    for i, ch in enumerate(txt):
        if in_string:
            if ch == "\n":
                # A raw newline is illegal inside a JSON string, so the string
                # is malformed; stop treating the rest of the file as string
                # content so that later objects can still be recovered.
                in_string = False
                escaped = False
            elif escaped:
                escaped = False
            elif ch == "\\":
                escaped = True
            elif ch == '"':
                in_string = False
        elif ch == '"':
            # Quotes only delimit strings inside an object; a stray quote
            # between objects must not hide the braces that follow it.
            in_string = depth > 0
        elif ch == "{":
            if depth == 0:
                start = i
            depth += 1
        elif ch == "}" and depth > 0:
            # ``depth > 0`` keeps an unmatched "}" from driving the counter
            # negative, which would make every later object unrecoverable.
            depth -= 1
            if depth == 0:
                parts.append(txt[start:i + 1])
    return parts

def parse_json_file(path: str) -> List[Dict[str, Any]]:
    """Read *path* (decoded with ``ENCODING``) and return the JSON objects it holds.

    The content is interpreted leniently; the first strategy that yields
    records wins:

    1. the whole file as one JSON document (a single object, or an array from
       which only the object elements are kept);
    2. JSON Lines - one document per non-blank line, accepted only if every
       such line parses;
    3. concatenated objects (``{...}{...}``), split by string-aware brace
       balancing; chunks that still fail to parse are skipped.

    Args:
        path: Path of the file to read.

    Returns:
        The list of dict records found (possibly empty).

    Raises:
        OSError: The file cannot be opened or read.
        UnicodeDecodeError: The file is not valid text in ``ENCODING``.
    """
    # Failures of json.loads that mean "not parseable this way": syntax errors
    # (JSONDecodeError is a ValueError) and pathologically deep nesting.
    parse_errors = (ValueError, RecursionError)

    with open(path, "r", encoding=ENCODING) as f:
        # A byte-order mark is not whitespace for str.strip() and would make
        # json.loads reject an otherwise valid document.
        txt = f.read().lstrip("\ufeff").strip()
    if not txt:
        return []

    # 1) Whole file as a single JSON document.
    try:
        obj = json.loads(txt)
    except parse_errors:
        obj = None
    if isinstance(obj, list):
        return [x for x in obj if isinstance(x, dict)]
    if isinstance(obj, dict):
        return [obj]

    # 2) JSON Lines: all-or-nothing, one unparseable line rejects the strategy.
    records: List[Dict[str, Any]] = []
    for line in txt.splitlines():
        line = line.strip()
        if not line:
            continue
        try:
            obj = json.loads(line)
        except parse_errors:
            records = []
            break
        if isinstance(obj, dict):
            records.append(obj)
    if records:
        return records

    # 3) Concatenated objects: best effort, chunk by chunk.
    out: List[Dict[str, Any]] = []
    for chunk in _split_top_level_objects(txt):
        try:
            obj = json.loads(chunk)
        except parse_errors:
            # Give up on this chunk only (e.g. a missing comma inside it).
            continue
        if isinstance(obj, dict):
            out.append(obj)
    return out

# =========================
# データ一次走査: 最大列数の算出
# =========================

# Gather the input files (sorted so that the output row order is reproducible)
# and pool the records parsed from each of them. When nothing matches the
# pattern, all_records stays empty and only the header line is written later.
json_files = sorted(glob.glob(INPUT_GLOB_PATTERN))
all_records: List[Dict[str, Any]] = []

for fp in json_files:
    try:
        recs = parse_json_file(fp)
    except Exception as exc:
        # Best effort at the top level: one unreadable file must not abort the
        # whole run, but the user is told which file was left out and why.
        print(f"警告: '{fp}' を読み込めなかったためスキップしました ({exc})")
        continue
    all_records.extend(recs)

# Normalize every raw record into the internal shape used by the writer and,
# in the same pass, measure how many variable-width columns the header needs.
normalized: List[Dict[str, Any]] = []
max_creators = 0
# max_affils_per_pos[p] is the largest affiliation count seen for the author
# at 0-based position p across all records.
max_affils_per_pos: List[int] = []
max_subjects = 0

for rec in all_records:
    title = sanitize_value(rec.get("title", ""))

    # Per-record lookup tables for ORCID iDs and affiliation names.
    orcid_map = build_orcid_map(rec.get("orcid"))
    aff_map = build_affiliation_map(rec.get("affiliation"))

    # Authors: only dict entries that carry a non-empty name are kept.
    authors_raw = rec.get("author", []) or rec.get("authors", [])
    authors_norm = []
    for a in (authors_raw if isinstance(authors_raw, list) else []):
        if not isinstance(a, dict):
            continue
        name = sanitize_value(a.get("author_name") or a.get("name"))
        if not name:
            continue
        # Resolve the author's affiliation indices to names; unknown indices
        # resolve to "" and are dropped.
        resolved = (
            sanitize_value(aff_map.get(idx, ""))
            for idx in split_affiliation_indices(a.get("affiliation_index"))
        )
        authors_norm.append({
            "name": name,
            "name_lang": detect_lang(name),
            "orcid": sanitize_value(orcid_map.get(name, "")),
            "affiliations": [
                {"name": an, "lang": detect_lang(an)} for an in resolved if an
            ],
        })

    # Subjects (keywords): only list-valued fields are used; blanks are dropped.
    kw = rec.get("keyword") or rec.get("keywords")
    cleaned_keywords = [sanitize_value(s) for s in kw] if isinstance(kw, list) else []
    subjects = [
        {"value": val, "lang": detect_lang(val)} for val in cleaned_keywords if val
    ]

    # Abstract
    desc = sanitize_value(rec.get("Abstract") or rec.get("abstract") or "")

    normalized.append({
        "title": title,
        "title_lang": detect_lang(title),
        "authors": authors_norm,
        "subjects": subjects,
        "description": desc,
        "description_lang": detect_lang(desc),
    })

    # Update the running maxima that determine the header width.
    max_creators = max(max_creators, len(authors_norm))
    for pos, au in enumerate(authors_norm):
        # Positions are visited in order, so the list grows by at most one.
        if pos >= len(max_affils_per_pos):
            max_affils_per_pos.append(0)
        max_affils_per_pos[pos] = max(
            max_affils_per_pos[pos], len(au.get("affiliations", []))
        )
    max_subjects = max(max_subjects, len(subjects))

# =========================
# ヘッダ構築
# =========================

columns: List[str] = [
    # 1) Title
    "/dc:title#1",
    "/dc:title#1@xml:lang",
]

# 2) Creators (variable width: one group per author position)
for i in range(1, max_creators + 1):
    columns.extend([
        f"/jpcoar:creator#{i}",  # container column; its cells are left empty
        f"/jpcoar:creator#{i}/jpcoar:nameIdentifier#1",
        f"/jpcoar:creator#{i}/jpcoar:nameIdentifier#1@nameIdentifierScheme",
        f"/jpcoar:creator#{i}/jpcoar:nameIdentifier#1@nameIdentifierURI",
        f"/jpcoar:creator#{i}/jpcoar:creatorName#1",
        f"/jpcoar:creator#{i}/jpcoar:creatorName#1@xml:lang",
    ])
    # Affiliation groups: as many as the widest record needs at this position.
    max_aff = max_affils_per_pos[i - 1] if i <= len(max_affils_per_pos) else 0
    for j in range(1, max_aff + 1):
        columns.extend([
            f"/jpcoar:creator#{i}/jpcoar:affiliation#{j}",  # container column; left empty
            f"/jpcoar:creator#{i}/jpcoar:affiliation#{j}/jpcoar:affiliationName#1",
            f"/jpcoar:creator#{i}/jpcoar:affiliation#{j}/jpcoar:affiliationName#1@xml:lang",
        ])

columns.extend([
    # 3) Access rights
    "/dcterms:accessRights#1",
    "/dcterms:accessRights#1@rdf:resource",
    # 4) Rights statement
    "/dc:rights#1",
    "/dc:rights#1@xml:lang",
    "/dc:rights#1@rdf:resource",
])

# 5) Subjects (variable width)
for k in range(1, max_subjects + 1):
    columns.extend([
        f"/jpcoar:subject#{k}",
        f"/jpcoar:subject#{k}@xml:lang",
        f"/jpcoar:subject#{k}@subjectScheme",
    ])

columns.extend([
    # 6) Abstract
    "/datacite:description#1",
    "/datacite:description#1@xml:lang",
    "/datacite:description#1@descriptionType",
    # 7) Publisher (English / Japanese)
    "/dc:publisher#1",
    "/dc:publisher#1@xml:lang",
    "/dc:publisher#2",
    "/dc:publisher#2@xml:lang",
    # 8) Date (Issued)
    "/datacite:date#1",
    "/datacite:date#1@dateType",
    # 9) Language
    "/dc:language#1",
    # 10) Contents type
    "/local:contentsType#1",
    # 11) Version / peer review
    "/oaire:version#1",
    "/local:peerReviewed#1",
    # 12) Source identifiers and bibliographic details
    "/jpcoar:sourceIdentifier#1",
    "/jpcoar:sourceIdentifier#1@identifierType",
    "/jpcoar:sourceIdentifier#2",
    "/jpcoar:sourceIdentifier#2@identifierType",
    "/local:sourceTitle#1",
    "/jpcoar:volume#1",
    "/jpcoar:issue#1",
])

# =========================
# 行データ生成
# =========================

def row_for_item(item: Dict[str, Any]) -> List[str]:
    """Render one normalized record as the list of cell values for its output row.

    Cells are emitted in the same order as the header in ``columns``. The
    variable-width parts (creators, affiliations per creator position,
    subjects) are padded with empty cells up to the module-level maxima
    ``max_creators``, ``max_affils_per_pos`` and ``max_subjects`` so that every
    row has the same number of cells. Fixed values come from ``DEFAULTS``.

    Args:
        item: A record from ``normalized``.

    Returns:
        The row's cells, each passed through ``sanitize_value``.
    """
    # Title
    row: List[str] = [item.get("title", ""), item.get("title_lang", "")]

    # Creators
    authors = item.get("authors", [])
    for i in range(max_creators):
        max_aff = max_affils_per_pos[i] if i < len(max_affils_per_pos) else 0
        if i >= len(authors):
            # No author at this position: blank out the six creator cells and
            # the three cells of every affiliation slot.
            row.extend([""] * (6 + 3 * max_aff))
            continue

        au = authors[i]
        oid = au.get("orcid") or ""
        if valid_orcid(oid):
            identifier_cells = [oid, "ORCID", f"https://orcid.org/{oid}"]
        else:
            identifier_cells = ["", "", ""]
        row.append("")  # creator container column (always empty)
        row.extend(identifier_cells)
        row.extend([au.get("name", ""), au.get("name_lang", "")])

        # Affiliations: real entries first, then padding up to the slot count.
        affs = au.get("affiliations", [])
        for af in affs[:max_aff]:
            # The first cell is the affiliation container column (always empty).
            row.extend(["", af.get("name", ""), af.get("lang", "")])
        row.extend([""] * (3 * max(0, max_aff - len(affs))))

    row.extend([
        # Access rights (the @rdf:resource cell is left empty)
        DEFAULTS["access_rights"],
        "",
        # Rights statement
        DEFAULTS["rights_label"],
        DEFAULTS["rights_lang"],
        DEFAULTS["rights_uri"],
    ])

    # Subjects: real entries first, then padding up to the slot count.
    subs = item.get("subjects", [])
    for s in subs[:max_subjects]:
        row.extend([s.get("value", ""), s.get("lang", ""), DEFAULTS["subject_scheme"]])
    row.extend([""] * (3 * max(0, max_subjects - len(subs))))

    # Publisher language tags are derived from the configured names themselves.
    pub1 = DEFAULTS["publisher_en"]
    pub2 = DEFAULTS["publisher_ja"]

    row.extend([
        # Abstract
        item.get("description", ""),
        item.get("description_lang", ""),
        DEFAULTS["description_type"],
        # Publisher (English / Japanese)
        pub1,
        detect_lang(pub1),
        pub2,
        detect_lang(pub2),
        # Date
        DEFAULTS["issued_date"],
        DEFAULTS["issued_date_type"],
        # Language
        DEFAULTS["language_three_letter"],
        # Contents type
        DEFAULTS["contents_type"],
        # Version / peer review
        DEFAULTS["version"],
        DEFAULTS["peer_reviewed"],
        # Source identifiers and bibliographic details
        DEFAULTS["pissn"],
        DEFAULTS["pissn_type"],
        DEFAULTS["eissn"],
        DEFAULTS["eissn_type"],
        DEFAULTS["source_title"],
        DEFAULTS["volume"],
        DEFAULTS["issue"],
    ])

    return [sanitize_value(x) for x in row]

# =========================
# 書き出し
# =========================

# newline="" disables newline translation, so rows end in a bare "\n" on every platform.
with open(OUTPUT_FILE, "w", encoding=ENCODING, newline="") as fw:
    # Header line first, then one TAB-separated line per normalized record.
    fw.write("\t".join(columns) + "\n")
    fw.writelines("\t".join(row_for_item(it)) + "\n" for it in normalized)

print(f"処理が完了しました。{len(normalized)}件のレコードを'{OUTPUT_FILE}'に出力しました。")