### 准备好 YoutubeUGC 数据 (目前只用了 Lecture 系列)

#### 1. 原始序列是 yuv, 先转成 mkv, 然后做场景分割

In [1]:
size_map = {
    "2160P": "3840x2160",
    "1080P": "1920x1080",
    "720P": "1280x720",
    "540P": "960x540",
    "432P": "768x432",
    "360P": "640x360",
}
fps = 30

In [7]:
""" 1.1 yuv -> mkv """
import os, re
from tqdm import tqdm

yuv_root = "/hdd/YoutubeUGC/origseq/yuv"
mkv_root = "/hdd/YoutubeUGC/origseq/mkv"
os.makedirs(mkv_root, exist_ok=True)

for seq in tqdm(list(filter(lambda x: x.endswith(".yuv"), os.listdir(yuv_root)))):
    src_path = os.path.join(yuv_root, seq)      # Lecture-003a_1080P.yuv
    dst_path = os.path.join(mkv_root, seq.replace(".yuv", ".mkv"))
    
    seq_name = seq.split("_")[0]
    seq_size = re.search(r"(\d+P)", seq)[1]
    
    # YoutubeUGC 的 fps 统一用 30 (因为都是 600 帧, 30 seconds)
    cmd = (f"ffmpeg -y -f rawvideo -pix_fmt yuv420p -video_size {size_map[seq_size]} -r {fps} "
           f"-i {src_path} -c:v libx265 -x265-params log-level=0 -crf 0 -loglevel error {dst_path} &")
    
    os.system(cmd)

100%|██████████| 80/80 [00:00<00:00, 1494.20it/s]


In [8]:
""" 1.2 划分场景 """
import os
from glob import glob
from tqdm import tqdm
from utils import splitScene

mkv_seqs_root = "/hdd/YoutubeUGC/origseq/mkv"
mkv_scenes_root = "/hdd/YoutubeUGC/scenes/mkv/1080P"
os.makedirs(mkv_scenes_root, exist_ok=True)

seqs = sorted(glob(os.path.join(mkv_seqs_root, "*.mkv")))
for seq in tqdm(seqs):
    splitScene(seq, mkv_scenes_root, threshold=15.0)

  0%|          | 0/80 [00:00<?, ?it/s]VideoManager is deprecated and will be removed.

  Detected: 0 | Progress:   0%|          | 0/599 [00:00<?, ?frames/s][A
  Detected: 0 | Progress:   0%|          | 1/599 [00:00<01:16,  7.77frames/s][A
  Detected: 0 | Progress:   3%|▎         | 15/599 [00:00<00:07, 75.54frames/s][A
  Detected: 0 | Progress:   5%|▍         | 28/599 [00:00<00:05, 96.43frames/s][A
  Detected: 0 | Progress:   7%|▋         | 42/599 [00:00<00:05, 103.32frames/s][A
  Detected: 0 | Progress:  10%|▉         | 59/599 [00:00<00:04, 117.63frames/s][A
  Detected: 0 | Progress:  12%|█▏        | 74/599 [00:00<00:04, 120.32frames/s][A
  Detected: 0 | Progress:  15%|█▍        | 88/599 [00:00<00:04, 125.68frames/s][A
  Detected: 0 | Progress:  17%|█▋        | 102/599 [00:00<00:04, 120.05frames/s][A
  Detected: 0 | Progress:  19%|█▉        | 116/599 [00:01<00:03, 125.54frames/s][A
  Detected: 0 | Progress:  22%|██▏       | 130/599 [00:01<00:03, 122.58frames/s][A
  Detected:

#### 2. 把各个场景的 mkv 转码到多种分辨率 (size_map)

In [17]:
""" 2.1 把原始分辨率的 mkv 转到多种分辨率 """
import os, time
from tqdm import tqdm
from utils import countJobs

mkv_root = "/hdd/YoutubeUGC/scenes/mkv"
orig_size = "1080P"
dst_sizes = ["720P", "540P", "360P"]

orig_size_dir = os.path.join(mkv_root, orig_size)
orig_size_seqs = list(filter(lambda x: x.endswith(".mkv"), os.listdir(orig_size_dir)))

for dst_size in dst_sizes:
    dst_size_dir = os.path.join(mkv_root, dst_size)
    os.makedirs(dst_size_dir, exist_ok=True)
    
    for orig_size_seq in tqdm(orig_size_seqs):
        src_path = os.path.join(orig_size_dir, orig_size_seq)
        dst_path = os.path.join(dst_size_dir, orig_size_seq.replace(orig_size, dst_size))
        
        origW, origH = size_map[orig_size].split("x")[0], size_map[orig_size].split("x")[1]
        scaleW, scaleH = size_map[dst_size].split("x")[0], size_map[dst_size].split("x")[1]
        
        cmd = f"ffmpeg -y -i {src_path} -vf scale={scaleW}:{scaleH} -c:v libx265 -crf 0 -max_muxing_queue_size 4096 {dst_path} &"
        os.system(cmd)
        
        while countJobs("ffmpeg") > 200:
            time.sleep(0.5)

100%|██████████| 298/298 [00:48<00:00,  6.13it/s]
100%|██████████| 298/298 [01:03<00:00,  4.68it/s]
100%|██████████| 298/298 [00:58<00:00,  5.11it/s]


In [20]:
""" 2.2 把所有 mkv 转回 yuv, 便于后续转码 """
import os, re
from tqdm import tqdm

mkv_root = "/hdd/YoutubeUGC/scenes/mkv"
yuv_root = "/hdd/YoutubeUGC/scenes/yuv"
os.makedirs(yuv_root, exist_ok=True)

sizes = ["1080P", "720P", "540P", "360P"]
mkv_dirs = [os.path.join(mkv_root, x) for x in sizes]

for size in sizes:
    mkv_dir = os.path.join(mkv_root, size)
    yuv_dir = os.path.join(yuv_root, size)
    os.makedirs(yuv_dir, exist_ok=True)

    for seq in tqdm(list(filter(lambda x: x.endswith(".mkv"), os.listdir(mkv_dir)))):
        src_path = os.path.join(mkv_dir, seq)
        dst_path = os.path.join(yuv_dir, seq.replace(".mkv", ".yuv"))
        
        seq_name = seq.split("_")[0]
        seq_size = re.search(r"(\d+P)", seq)[1]
        
        cmd = f"ffmpeg -i {src_path} -video_size {seq_size} -pix_fmt yuv420p -loglevel error {dst_path} &"
        os.system(cmd)
        
        while countJobs("ffmpeg") > 200:
            time.sleep(0.5)

100%|██████████| 298/298 [00:15<00:00, 19.17it/s]
100%|██████████| 298/298 [03:27<00:00,  1.43it/s]
100%|██████████| 298/298 [04:34<00:00,  1.08it/s]
100%|██████████| 298/298 [03:44<00:00,  1.33it/s]
