In [4]:
!pip install tqdm requests

import json
import os
import re

import requests
from tqdm import tqdm



In [5]:
# Load the previously matched title -> TMDB id pairs and split them by media type.
# The ids are strings prefixed with the media type ("tv/..." or "movie/...").
# The file is opened as UTF-8 explicitly: titles are non-ASCII and the platform
# default encoding is not guaranteed to be UTF-8.
with open('matched_data.json', encoding='utf-8') as f:
    data = json.load(f)

matched_title_to_tmdb_id = data['title_to_tmdb']

# TV entries only; these are the ones that can be narrowed down to a season later.
trimmed_title_to_tmdb_id_tv = {
    title: tmdb_id
    for title, tmdb_id in matched_title_to_tmdb_id.items()
    if tmdb_id.startswith('tv/')
}

# Movie entries only.
trimmed_title_to_tmdb_id_movie = {
    title: tmdb_id
    for title, tmdb_id in matched_title_to_tmdb_id.items()
    if tmdb_id.startswith('movie/')
}

In [14]:
# Load the full item list and index it by title for direct lookup.
# Opened as UTF-8 explicitly so the result does not depend on the platform's
# default text encoding (the titles are non-ASCII).
with open('../dist/data.json', encoding='utf-8') as f:
    bangumis = json.load(f)['items']

# If two items share a title, the later one in the list wins.
title_to_bangimi = {
    b['title']: b for b in bangumis
}

In [7]:
# Regexes that match a season marker sitting at the very end of a title.
possible_season_endding_patterns = [
    # <number> followed by a season word, e.g. "第2期", "2nd Season", "（第二季）".
    r'([\(（]?(第)?\s?(\d+|[一二三四五六七八九十]+|FINAL|1st|2nd|3rd|\d+th)\s?(SEASON|Season|season|シリーズ|シーズン|クール|期|章|季|部)[\)）]?$)',
    # A season word followed by <number>, e.g. "Season 2", "Part II", "(シーズン3)".
    r'([\(（]?(Season|SEASON|season|Part|part|PART|Volume|シーズン)\s?(\d+|[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+|II|III|IV|V|VI|VII|VIII|IX|X)[\)）]?$)',
]


def trim_season_mark(title: str) -> str:
    """Strip a trailing season marker from ``title``.

    Every pattern in ``possible_season_endding_patterns`` is applied in order,
    and surrounding whitespace is stripped after each pass, so a title that
    carries one marker of each kind loses both.

    Args:
        title: The raw title.

    Returns:
        The title without its trailing season marker(s). A title with no
        marker comes back unchanged apart from whitespace stripping.
    """
    trimmed = title
    for season_regex in possible_season_endding_patterns:
        without_marker = re.sub(season_regex, '', trimmed)
        trimmed = without_marker.strip()
    return trimmed

In [None]:
# Map every full title to a TV id by looking up its season-trimmed form.
# Titles whose trimmed form has no TV match are simply left out.
title_to_tmdb_id_tv = {}
for bangumi in bangumis:
    title = bangumi['title']
    trimmed_title = trim_season_mark(title)
    if trimmed_title not in trimmed_title_to_tmdb_id_tv:
        # No TV match for this show; nothing to record.
        continue
    title_to_tmdb_id_tv[title] = trimmed_title_to_tmdb_id_tv[trimmed_title]

In [10]:
# The API key is read from the TMDB_API_KEY environment variable so that the
# credential is not stored in (and shared with) the notebook itself.
# NOTE(review): a key that was previously committed in this cell should be
# treated as exposed and rotated.
TMDB_API_KEY = os.environ.get('TMDB_API_KEY', '')


def tmdb_tv_detail(id):
    """Fetch the TMDB detail record of a TV show, in Japanese (ja-JP).

    Args:
        id: Numeric TMDB TV id (without the "tv/" prefix).

    Returns:
        The decoded JSON response body. The HTTP status is not checked, so an
        API error payload is returned as-is rather than raised.

    Raises:
        RuntimeError: If no API key is configured.
        requests.exceptions.RequestException: On network failure or timeout.
    """
    if not TMDB_API_KEY:
        raise RuntimeError(
            'TMDB_API_KEY is not set; export it in the environment before running this notebook.'
        )
    url = f'https://api.themoviedb.org/3/tv/{id}'
    query = {'api_key': TMDB_API_KEY, 'language': 'ja-JP'}
    # Without a timeout a single stalled connection would block the whole
    # batch run indefinitely.
    response = requests.get(url, params=query, timeout=30)
    return json.loads(response.text)

In [11]:
def tmdb_season_detail(tv_id, season_number):
    """Fetch the TMDB detail record of one season of a TV show, in Japanese (ja-JP).

    Args:
        tv_id: Numeric TMDB TV id (without the "tv/" prefix).
        season_number: Season number within the show.

    Returns:
        The decoded JSON response body. The HTTP status is not checked, so an
        API error payload is returned as-is rather than raised.

    Raises:
        requests.exceptions.RequestException: On network failure or timeout.
    """
    url = f'https://api.themoviedb.org/3/tv/{tv_id}/season/{season_number}'
    query = {'api_key': TMDB_API_KEY, 'language': 'ja-JP'}
    # Without a timeout a single stalled connection would block the caller
    # indefinitely.
    response = requests.get(url, params=query, timeout=30)
    return json.loads(response.text)

In [12]:
def is_special(raw_title: str):
    """Tell whether a title ends with one of the known "special" suffixes.

    Args:
        raw_title: The untrimmed title.

    Returns:
        True if the title ends with 特別編, 特典映像, SPECIAL, SPECIAL EDITION
        or OVA (case-sensitive), otherwise False.
    """
    special_suffix_patterns = (
        r'.*特別編$',
        r'.*特典映像$',
        r'.*SPECIAL$',
        r'.*SPECIAL EDITION$',
        r'.*OVA$',
    )
    for suffix_pattern in special_suffix_patterns:
        if re.match(suffix_pattern, raw_title):
            return True
    return False

In [None]:
# Narrow each matched TV title down to a concrete TMDB season where possible.
# A title whose show has exactly one season entry is rewritten in place to
# "<tmdb_id>/season/<n>"; every other title is collected in non_single_season
# (title -> raw TMDB detail payload) for later handling.
# Only values of existing keys are reassigned while iterating, so the dict
# never changes size during the loop.
progress = tqdm(title_to_tmdb_id_tv.items())

n_single_season = 0
non_single_season = {}

for title, tmdb_id in progress:
    skip_conditions = [
        # Already resolved to a season (e.g. by an earlier run of this cell).
        '/season/' in tmdb_id,
        title in non_single_season,
    ]
    if any(skip_conditions):
        continue

    number_id = re.sub("^tv/", "", tmdb_id)
    detail = tmdb_tv_detail(number_id)
    # `or []` also covers an explicit null, not just a missing key.
    seasons = detail.get('seasons') or []
    seasons_exclude_specials = [s for s in seasons if s.get('season_number', 0) != 0]
    special_seasons = [s for s in seasons if s.get('season_number', 0) == 0]

    # Look the item up by the title being processed. Reading the `bangumi`
    # variable left over from an earlier cell's loop would always yield the
    # last item's air date instead of this one's.
    air_date = title_to_bangimi.get(title, {}).get('air_date', '')

    if len(seasons) == 1:
        # Use the season's real number rather than assuming 1: the only season
        # may be numbered differently (e.g. a specials-only show has season 0).
        season_number = seasons[0].get('season_number', 1)
        title_to_tmdb_id_tv[title] = f"{tmdb_id}/season/{season_number}"
        n_single_season += 1
    # TODO: unfinished draft for matching specials by air date.
    # elif len(special_seasons) == 1 and is_special(title):
    #     special_season = special_seasons[0]
    #     special_season_detail = tmdb_season_detail(number_id, 0)
    #     special_episode_air_date = [ep.get('air_date', '') for ep in special_season_detail.get('episodes', [])]
    #     index = special_episode_air_date.index(air_date)
    else:
        non_single_season[title] = detail

    progress.set_postfix({'single_season': n_single_season, "processed": title})

  0%|          | 0/5097 [00:00<?, ?it/s]

100%|██████████| 5097/5097 [41:23<00:00,  2.05it/s, single_season=2464, processed=藤本タツキ17-26]                                                                                                                                    


In [None]:
# Persist the season matches together with the titles that still need handling.
# ensure_ascii=False writes non-ASCII titles verbatim, so the file is opened as
# UTF-8 explicitly; with a non-UTF-8 platform default the write could fail with
# UnicodeEncodeError or produce a file that is not valid JSON text.
with open('matched_season.json', 'w', encoding='utf-8') as f:
    json.dump({
        "title_to_season_id": title_to_tmdb_id_tv,
        "non_single_season": non_single_season,
    }, f, ensure_ascii=False, indent=2)