In [1]:
import json
from glob import glob
import re

In [2]:
# Collect every item JSON file below ../data/items.
# `**` only descends into nested directories when recursive=True (without it,
# it behaves like a plain `*`); sorting makes the processing order deterministic.
files = sorted(glob('../data/items/**/*.json', recursive=True))

In [3]:
# End-anchored regexes describing a trailing season / part marker in a title.
possible_season_endding_patterns = [
    # Number (or ordinal / FINAL) followed by a season keyword,
    # e.g. "第2期", "2nd Season", "(FINAL SEASON)".
    r'([\(（]?(第)?\s?(\d+|[一二三四五六七八九十]+|FINAL|1st|2nd|3rd|\d+th)\s?(SEASON|Season|season|シリーズ|シーズン|クール|期|章|季|部)[\)）]?$)',
    # Season keyword followed by an Arabic or Roman number,
    # e.g. "Season 2", "Part II", "シーズン3".
    r'([\(（]?(Season|SEASON|season|Part|part|PART|Volume|シーズン)\s?(\d+|[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+|II|III|IV|V|VI|VII|VIII|IX|X)[\)）]?$)',
]

def trim_season_mark(title: str) -> str:
    """Return ``title`` with a trailing season/part marker removed.

    Each regex in ``possible_season_endding_patterns`` is applied once, in
    list order, and surrounding whitespace is stripped after every
    substitution. Titles without a recognised trailing marker come back
    unchanged apart from the whitespace stripping.
    """
    trimmed = title
    for season_regex in possible_season_endding_patterns:
        trimmed = re.compile(season_regex).sub('', trimmed)
        trimmed = trimmed.strip()
    return trimmed

In [4]:
# Load the title -> TMDB id mappings produced by earlier matching steps.
# JSON is read explicitly as UTF-8 so the result does not depend on the
# platform's default encoding (titles may contain non-ASCII text).
with open('matched_data.json', encoding='utf-8') as f:
    matched_bangumis: dict[str, str] = json.load(f)['title_to_tmdb']
with open('agentic_matched.json', encoding='utf-8') as f:
    # NOTE: values in this file may be null (None); those are skipped below.
    agentic_matched_bangumis: dict[str, str] = json.load(f)

# Overlay the second mapping on top of the first; entries without an id
# must not overwrite (or add) anything.
for trimmed_title, tmdb_id in agentic_matched_bangumis.items():
    if tmdb_id is not None:
        matched_bangumis[trimmed_title] = tmdb_id

In [5]:
# Attach the matched TMDB id to every item in each data file and write the
# file back in place. Files are read and written as UTF-8 explicitly because
# the output keeps non-ASCII characters verbatim (ensure_ascii=False).
for fn in files:
    with open(fn, 'r', encoding='utf-8') as f:
        bangumis = json.load(f)

    for item in bangumis:
        title = item['title']
        trimmed_title = trim_season_mark(title)
        # Prefer an exact-title match, then fall back to the season-trimmed title.
        tmdb_id = matched_bangumis.get(title) or matched_bangumis.get(trimmed_title)
        if not tmdb_id:
            # No usable id (missing, null or empty): leave the item untouched.
            continue

        # Drop any existing tmdb entry first so re-running this cell replaces
        # the id instead of appending a duplicate.
        sites = [site for site in item.get('sites', []) if site['site'] != 'tmdb']
        sites.append({
            "site": "tmdb",
            "id": tmdb_id,
        })
        item['sites'] = sites

    with open(fn, 'w', encoding='utf-8') as f:
        json.dump(bangumis, f, ensure_ascii=False, indent=2)

In [7]:
# Run the project's npm `build` script. A later cell reads ../dist/data.json,
# which this step presumably regenerates from the item files — TODO confirm.
!npm run build


> bangumi-data@0.3.132 build
> node script/build.js

done


In [10]:
# Report how many items in the built bundle carry a TMDB site entry.
with open('../dist/data.json', 'r', encoding='utf-8') as f:
    dist_data = json.load(f)['items']

# Count items (not site entries): an item with several tmdb entries must
# still count once, since the message reports "matched items".
n_matched = sum(
    any(site['site'] == 'tmdb' for site in item.get('sites', []))
    for item in dist_data
)

# Guard against an empty bundle so the report cannot raise ZeroDivisionError.
ratio = n_matched / len(dist_data) if dist_data else 0.0

print(f'Total matched items in dist/data.json: {n_matched}/{len(dist_data)}, ratio: {ratio:.2%}')

Total matched items in dist/data.json: 7914/8292, ratio: 95.44%
