<a href="https://colab.research.google.com/github/YUMA-NAGAO/Algorithm/blob/main/%E3%83%80%E3%82%A6%E3%83%B3%E3%83%AD%E3%83%BC%E3%83%89Youtube%E3%81%A8%E5%AD%97%E5%B9%95.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# 1. Install the required packages (-q: quiet output, -U: upgrade to the latest release)
!pip install -q -U yt-dlp youtube-transcript-api

# 2. モジュールのインポート
import yt_dlp
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
from urllib.parse import urlparse, parse_qs
import os
import sys
import shutil
import json
import re

In [6]:
# URL of the YouTube video to download and to fetch subtitles for.
VIDEO_URL    = "https://www.youtube.com/watch?v=pfrBPp_btAQ"

In [7]:
# Directory that receives the downloaded video and the subtitle text file.
DOWNLOAD_DIR = "/content/downloads"
# Create it up front; exist_ok=True keeps re-running this cell from raising.
os.makedirs(DOWNLOAD_DIR, exist_ok=True)

def check_ffmpeg():
    """Print where ffmpeg and ffprobe resolve on PATH (for logging only).

    Each tool is reported as ``<name>: <path>``; a tool that is not found
    is reported as ``<name>: None``.
    """
    for tool in ("ffmpeg", "ffprobe"):
        print(f"{tool}: {shutil.which(tool)}")

def get_video_id(url_or_id: str) -> str:
    """Extract the YouTube video ID from a URL, or pass a bare ID through.

    Supported inputs:
      - ``https://www.youtube.com/watch?v=<id>`` (any extra query parameters)
      - ``https://youtu.be/<id>`` (with or without a query string)
      - ``https://www.youtube.com/shorts/<id>``, ``/embed/<id>``,
        ``/live/<id>`` and ``/v/<id>``
      - the same URLs written without a scheme (``youtu.be/<id>``)
      - anything that mentions neither ``youtube.com`` nor ``youtu.be`` is
        treated as an ID already and returned with surrounding whitespace
        stripped

    Args:
        url_or_id: A YouTube URL or a bare video ID.

    Returns:
        The video ID, or an empty string when the input is a YouTube URL
        from which no ID can be extracted.
    """
    candidate = url_or_id.strip()
    if "youtube.com" not in candidate and "youtu.be" not in candidate:
        return candidate

    parsed = urlparse(candidate)
    if not parsed.netloc:
        # Without a scheme urlparse puts the host into the path, so retry
        # with an explicit scheme to get host and path separated properly.
        parsed = urlparse("https://" + candidate.lstrip("/"))

    host = (parsed.hostname or "").lower()
    segments = [part for part in parsed.path.split("/") if part]

    # Short links carry the ID as the first path segment.
    if host == "youtu.be" or host.endswith(".youtu.be"):
        return segments[0] if segments else ""

    # Regular watch URLs carry the ID in the "v" query parameter.
    from_query = parse_qs(parsed.query).get("v", [""])[0]
    if from_query:
        return from_query

    # Shorts / embed / live / legacy "/v/" URLs carry the ID in the path.
    if len(segments) >= 2 and segments[0] in ("shorts", "embed", "live", "v"):
        return segments[1]
    return ""

def get_subtitle(video_id: str) -> str | None:
    """Fetch Japanese subtitles, preferring manual over auto-generated ones.

    The lookup is best-effort: any failure while listing or fetching
    transcripts (disabled subtitles, no Japanese track, network or API
    errors) results in ``None`` rather than an exception.

    Args:
        video_id: The YouTube video ID.

    Returns:
        The subtitle lines joined by newlines, prefixed with ``【手動字幕】``
        (manually created) or ``【自動字幕】`` (auto-generated), or ``None``
        when no Japanese subtitles could be obtained.
    """

    def _snippet_text(snippet) -> str:
        # Older youtube-transcript-api releases yield plain dicts, while
        # 1.x releases yield snippet objects exposing a ``text`` attribute.
        # Supporting both keeps this working under ``pip install -U``.
        if isinstance(snippet, dict):
            return snippet.get("text", "")
        return getattr(snippet, "text", "")

    # List the available transcripts once and reuse the result for both
    # lookups below.
    try:
        if hasattr(YouTubeTranscriptApi, "list"):
            # Instance-based API of youtube-transcript-api 1.x.
            transcript_list = YouTubeTranscriptApi().list(video_id)
        else:
            # Static API of the pre-1.0 releases.
            transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
    except Exception:
        # Deliberately broad: TranscriptsDisabled / NoTranscriptFound and
        # transport errors are all treated as "no subtitles available".
        return None

    # 1) Manually created Japanese subtitles. find_transcript may hand back
    #    an auto-generated track, hence the explicit is_generated check.
    try:
        manual = transcript_list.find_transcript(["ja"])
        if not manual.is_generated:
            entries = manual.fetch()
            return "【手動字幕】\n" + "\n".join(_snippet_text(e) for e in entries)
    except Exception:
        # Fall through to the auto-generated track.
        pass

    # 2) Auto-generated Japanese subtitles.
    try:
        generated = transcript_list.find_generated_transcript(["ja"])
        entries = generated.fetch()
        return "【自動字幕】\n" + "\n".join(_snippet_text(e) for e in entries)
    except Exception:
        return None

def _sanitize_title_for_fs(title: str, max_len: int = 100) -> str:
    """ファイル名として安全化＆最大100文字に制限"""
    title = re.sub(r'[\\/:*?"<>|\x00-\x1f]', "_", title)
    title = re.sub(r"\s+", " ", title).strip(" .")
    if len(title) > max_len:
        title = title[:max_len].rstrip(" .")
    return title or "video"

def download_youtube(url: str, out_dir: str = DOWNLOAD_DIR):
    """
    Download a video as MP4, choosing the quality by the video's length.

    - 10 minutes or shorter: no resolution cap (4K is fine if available)
    - longer than 10 minutes: capped at 1080p

    The output file is ``<out_dir>/<sanitized title>.mp4``. Errors raised
    by yt-dlp during metadata extraction or download are not caught here.

    Args:
        url: URL of the video to download.
        out_dir: Directory that receives the downloaded file.
    """

    # Fetch the metadata only (no download) to learn title and duration.
    info_opts = {"quiet": True, "noplaylist": True}
    with yt_dlp.YoutubeDL(info_opts) as ydl:
        info = ydl.extract_info(url, download=False)

    # `or` also covers keys that are present but None, which would
    # otherwise break the sanitizer and the `duration > 600` comparison.
    raw_title = info.get("title") or "video"
    safe_title = _sanitize_title_for_fs(raw_title, max_len=100)
    duration = info.get("duration") or 0

    # Pick the format selector.
    if duration > 600:  # longer than 10 minutes -> 1080p or lower
        # The fallbacks keep the 1080p cap but relax the codec/container
        # constraints so that videos without an avc1/mp4a pair still work.
        format_str = (
            "bestvideo[height<=1080][ext=mp4][vcodec^=avc1]"
            "+bestaudio[ext=m4a][acodec^=mp4a]/"
            "best[height<=1080][ext=mp4]/best[height<=1080]"
        )
    else:  # 10 minutes or shorter -> no cap (4K allowed)
        format_str = (
            "bestvideo[ext=mp4][vcodec^=avc1]+bestaudio[ext=m4a][acodec^=mp4a]/"
            "best[ext=mp4]/best"
        )

    # The output template is %-formatted by yt-dlp, so a literal "%" in the
    # directory or title must be doubled to avoid corrupting the template.
    literal_stem = os.path.join(out_dir, safe_title).replace("%", "%%")

    # Download.
    ydl_opts = {
        "format": format_str,
        "merge_output_format": "mp4",
        "outtmpl": f"{literal_stem}.%(ext)s",
        "restrictfilenames": False,
        "noplaylist": True,
        "quiet": False,
        "postprocessor_args": ["-movflags", "+faststart"],
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        print(f"▶ yt-dlp version: {yt_dlp.version.__version__}")
        print(f"▶ Duration: {duration} sec → format: {format_str}")
        ydl.download([url])

def probe_media(filepath: str):
    """Print ffprobe's stream/format report for a media file.

    Runs ``ffprobe -show_streams -show_format`` with JSON output and prints
    the pretty-printed result, truncated to the first 5000 characters.
    Does nothing when ffprobe is not on PATH or the file does not exist.
    A failing ffprobe run or unparsable output is reported on stdout
    instead of being raised.

    Args:
        filepath: Path of the media file to inspect.
    """
    ffprobe = shutil.which("ffprobe")
    if not ffprobe or not os.path.exists(filepath):
        return
    import subprocess

    # Pass the arguments as a list so the path reaches ffprobe verbatim.
    # Building a command string and re-splitting it breaks on file names
    # that contain quotes or backslashes.
    cmd = [
        ffprobe,
        "-v", "error",
        "-print_format", "json",
        "-show_streams",
        "-show_format",
        filepath,
    ]
    try:
        out = subprocess.check_output(cmd)
        info = json.loads(out.decode("utf-8", errors="ignore"))
        print("=== ffprobe ===")
        print(json.dumps(info, ensure_ascii=False, indent=2)[:5000])
    except (OSError, subprocess.CalledProcessError, ValueError) as e:
        # ValueError covers json.JSONDecodeError.
        print("ffprobe error:", e)

if __name__ == "__main__":
    # Log which ffmpeg / ffprobe binaries are available.
    check_ffmpeg()

    print(f"▶ Downloading video to {DOWNLOAD_DIR} …")
    download_youtube(VIDEO_URL)

    # Find the most recently modified mp4 in the download directory.
    mp4s = [name for name in os.listdir(DOWNLOAD_DIR) if name.lower().endswith(".mp4")]
    mp4s.sort(
        key=lambda name: os.path.getmtime(os.path.join(DOWNLOAD_DIR, name)),
        reverse=True,
    )
    if not mp4s:
        print("⚠ mp4が見つかりません。format指定を見直してください。")
    else:
        newest = os.path.join(DOWNLOAD_DIR, mp4s[0])
        print("✔ Latest file:", newest, os.path.getsize(newest), "bytes")
        probe_media(newest)

    # Fetch the Japanese subtitles and store them next to the video.
    video_id = get_video_id(VIDEO_URL)
    print(f"▶ Fetching subtitles for video ID: {video_id} …")
    subtitle_text = get_subtitle(video_id)
    if not subtitle_text:
        print("⚠ 日本語字幕が見つかりませんでした。")
    else:
        subtitle_path = os.path.join(DOWNLOAD_DIR, f"{video_id}_subtitle.txt")
        with open(subtitle_path, "w", encoding="utf-8") as f:
            f.write(subtitle_text)
        print(f"✔ Subtitles saved to: {subtitle_path}")


ffmpeg: /usr/bin/ffmpeg
ffprobe: /usr/bin/ffprobe
▶︎ Downloading video to /content/downloads …
▶︎ yt-dlp version: 2025.09.05
[youtube] Extracting URL: https://www.youtube.com/watch?v=pfrBPp_btAQ
[youtube] pfrBPp_btAQ: Downloading webpage
[youtube] pfrBPp_btAQ: Downloading tv simply player API JSON
[youtube] pfrBPp_btAQ: Downloading tv client config
[youtube] pfrBPp_btAQ: Downloading tv player API JSON
[info] pfrBPp_btAQ: Downloading 1 format(s): 137+140
[download] Sleeping 4.00 seconds as required by the site...
[download] Destination: /content/downloads/【MV】この世界は噓でできている／高嶺のなでしこ【HoneyWorks】.f137.mp4
[download] 100% of   97.64MiB in 00:00:02 at 47.18MiB/s  
[download] Destination: /content/downloads/【MV】この世界は噓でできている／高嶺のなでしこ【HoneyWorks】.f140.m4a
[download] 100% of    4.59MiB in 00:00:00 at 24.14MiB/s  
[Merger] Merging formats into "/content/downloads/【MV】この世界は噓でできている／高嶺のなでしこ【HoneyWorks】.mp4"
Deleting original file /content/downloads/【MV】この世界は噓でできている／高嶺のなでしこ【HoneyWorks】.f137.mp4 (pass -k