This script fine-tunes the SmolVLM2 model on the real/synthetic video classification task. First, we import all the relevant libraries.

In [None]:
from datasets import load_dataset
import os
import coremltools as ct
import requests
import zipfile
import tarfile
import shutil
import subprocess
import modelscope

import argparse, os, random, tarfile, zipfile, subprocess, shutil
from pathlib import Path
from typing import List
import csv, requests
from tqdm import tqdm
import torch
from torch.utils.data import Dataset, DataLoader
import albumentations as A
import numpy as np
import decord


Then we load the GenVideo dataset (500 GB), which contains over one million diverse AI-generated and real videos. The generated videos come from state-of-the-art generative video models and exhibit the temporal artefacts that are crucial for video-based AI detection.

In [None]:
# Make sure the dataset root directory exists before anything is written to it.
Path("D:/GenVideo").mkdir(parents=True, exist_ok=True)

In [None]:
# Base URL; download_subset() appends an archive file name to it to form each
# Gen-Video download URL.
GENV_BASE = "https://www.modelscope.cn/datasets/cccnju/Gen-Video/resolve/master/"
# Text file fetched by download_subset(); every line of it is treated as the
# URL of one Kinetics-400 file to download.
K400_TXT   = "https://s3.amazonaws.com/kinetics/400/train/k400_train_path.txt"

# Archive file names to fetch from GENV_BASE. download_subset() iterates only
# over the keys; the float values are not read by any code visible here
# (presumably archive sizes in GB — TODO confirm).
GENV_FILES = {
    # real: MSR-VTT (entry currently disabled)
    #"GenVideo-Val.zip": 13.93,
    # fake: AI generators
    "OpenSora.tar.gz": 31.72,
    "Latte.split.aa": 42.95, "Latte.split.ab": 15.85,
    "Pika.split.aa": 42.95,  "Pika.split.ab": 15.62,
    "DynamicCrafter.split.aa": 42.95, "DynamicCrafter.split.ab": 33.80,
    "SEINE.tar.gz": 9.03,
    "SD.split.ah": 24.96,
    "I2VGEN_XL.split.aj":13.92,
    "SVD.split.ac": 9.38

}
# Youku real splits: files are named f"{YOUKU_PREFIX}.split.{suffix}".
YOUKU_PREFIX = "Youku_1M_10s"
# Despite the name, this is the list of split suffixes to download, not a count.
YOUKU_COUNT = [
    "aa"
]

#K400_PARTS = [f"train/part_{i:03}.tar.gz" for i in range(4)] # first four parts only


def _dl(url: str, dst: Path):
    """Download ``url`` to ``dst`` with a progress bar, skipping existing files.

    The payload is streamed into a hidden temporary file next to ``dst`` and
    only renamed to ``dst`` once the transfer has finished. An interrupted
    download therefore never leaves a truncated file that a later run would
    mistake for a complete one.

    Args:
        url: Address of the file to fetch.
        dst: Final location of the downloaded file; parent directories are
            created as needed.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeouts.
    """
    dst.parent.mkdir(parents=True, exist_ok=True)
    if dst.exists():
        return

    # Leading dot keeps the partial file out of globs such as "<stem>.split.*"
    # and "part_*.tar.gz" that are used elsewhere to pick up finished downloads.
    tmp = dst.with_name(f".{dst.name}.part")
    try:
        # The timeout bounds connection setup and each wait for more data,
        # not the duration of the whole transfer.
        with requests.get(url, stream=True, timeout=60) as resp:
            resp.raise_for_status()
            total = int(resp.headers.get("content-length", 0))
            with open(tmp, "wb") as f, tqdm(
                total=total, unit="B", unit_scale=True, desc=dst.name
            ) as bar:
                for chunk in resp.iter_content(chunk_size=33_554_432):  # 32 MiB
                    if chunk:  # ignore empty chunks instead of ending early
                        f.write(chunk)
                        bar.update(len(chunk))
    except BaseException:
        # Also covers Ctrl-C: drop the partial file, then re-raise unchanged.
        if tmp.exists():
            tmp.unlink()
        raise
    tmp.replace(dst)

def download_subset(root: Path):
    """Download the selected real and AI-generated archives into ``root``.

    Fetches, in order: every file listed in the Kinetics-400 URL list
    (``K400_TXT``), the Youku split pieces named by ``YOUKU_PREFIX`` and
    ``YOUKU_COUNT``, and every archive named in ``GENV_FILES``. Files that
    already exist in ``root`` are skipped by ``_dl``.

    Args:
        root: Directory that receives all downloaded files; created if missing.

    Raises:
        requests.HTTPError: If the Kinetics URL list or any file request
            returns an error status.
    """
    root.mkdir(parents=True, exist_ok=True)

    # 1) Full Kinetics-400 train parts
    listing = requests.get(K400_TXT, timeout=60)
    # Fail here rather than iterating over the lines of an error page as URLs.
    listing.raise_for_status()
    for line in listing.text.splitlines():
        url = line.strip()
        if not url:
            continue  # tolerate blank lines in the list
        _dl(url, root / Path(url).name)

    # 2) Youku-1M splits
    for suf in YOUKU_COUNT:
        fn = f"{YOUKU_PREFIX}.split.{suf}"
        _dl(GENV_BASE + fn, root / fn)

    # 3) AI archives
    for fn in GENV_FILES:
        _dl(GENV_BASE + fn, root / fn)

def recombine_splits(pieces, tgt: Path, chunk_size=4_194_304):
    """Concatenate the files in ``pieces``, in the given order, into ``tgt``.

    ``tgt`` is created or overwritten. Data is copied in buffers of
    ``chunk_size`` bytes so whole pieces are never held in memory.
    """
    with open(tgt, mode="wb") as merged:
        for piece_path in pieces:
            with open(piece_path, mode="rb") as piece:
                shutil.copyfileobj(piece, merged, chunk_size)


def _extract(path: Path, out: Path):
    """Unpack the archive at ``path`` into the directory ``out``.

    ``.zip`` files are extracted with ``zipfile``. Everything else is read as
    a gzip-compressed tar stream. Tar extraction is best effort: a member that
    fails to extract is reported and skipped, and an unreadable archive is
    reported and skipped as a whole, so one bad download does not abort the
    run.

    Args:
        path: Archive to unpack.
        out: Destination directory; created if missing.
    """
    out.mkdir(parents=True, exist_ok=True)
    if path.suffix == ".zip":
        with zipfile.ZipFile(path) as z:
            z.extractall(out)
        return

    # The archives come from the network, so refuse members that would land
    # outside ``out`` (absolute paths, "../", escaping links). The ``filter``
    # argument only exists on interpreters that ship tarfile extraction
    # filters, hence the feature check.
    extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}
    try:
        # streaming mode
        with tarfile.open(path, "r|gz") as stream:
            for member in stream:
                try:
                    stream.extract(member, out, **extract_kwargs)
                except Exception as e:
                    # skip any corrupt (or rejected) member
                    print(f"Skipping bad {member.name}: {e}")
    except (tarfile.ReadError, EOFError, OSError) as e:
        # skip the entire archive if it's unreadable
        print(f"Skipping corrupt archive {path.name}: {e}")


def prepare_folders(root: Path):
    """Unpack every downloaded archive in ``root`` into ``real/`` and ``fake/``.

    Resulting layout:
        root/real/...           "Real" content of the MSR-VTT bundle (if present)
        root/real/Kinetics/     contents of every ``part_*.tar.gz``
        root/real/Youku/        recombined ``Youku_1M_10s.split.*`` pieces
        root/fake/...           "Fake" content of the MSR-VTT bundle (if present)
        root/fake/<source>/     one folder per AI-generator archive

    Archives that are missing from ``root`` are silently skipped.

    Args:
        root: Directory holding the downloaded archives; the ``real`` and
            ``fake`` folders are created inside it.
    """
    fake_root = root / "fake"
    real_root = root / "real"
    fake_root.mkdir(exist_ok=True, parents=True)
    real_root.mkdir(exist_ok=True, parents=True)

    # 1) Handle MSR-VTT bundle
    msrzip = root / "GenVideo-Val.zip"
    if msrzip.exists():
        tmp = root / "_tmp_msr"
        if tmp.exists():
            shutil.rmtree(tmp)  # leftovers from an earlier, interrupted run
        tmp.mkdir()
        _extract(msrzip, tmp)
        # The first extracted entry is taken to be the bundle's single
        # top-level folder holding "Real" and "Fake". Guard against an
        # extraction that produced nothing instead of raising StopIteration.
        inner = next(tmp.iterdir(), None)
        if inner is None:
            print(f"Skipping empty archive {msrzip.name}")
        else:
            for cls, dest in [("Real", real_root), ("Fake", fake_root)]:
                src = inner / cls
                if src.exists():
                    for child in src.iterdir():
                        shutil.move(str(child), str(dest / child.name))
        shutil.rmtree(tmp)

    # 2) Extract full Kinetics-400 parts
    kin_dest = real_root / "Kinetics"
    kin_dest.mkdir(exist_ok=True)
    for tar in sorted(root.glob("part_*.tar.gz")):
        _extract(tar, kin_dest)

    # 3) Extract Youku-1M
    yk_parts = sorted(root.glob("Youku_1M_10s.split.*"))
    if yk_parts:
        yk_tar = root / "Youku_1M_10s.tar.gz"
        recombine_splits(yk_parts, yk_tar)
        _extract(yk_tar, real_root / "Youku")

    # 4) Extract chosen fake sources
    # NOTE(review): SD, I2VGEN_XL and SVD each list a single piece with a
    # non-initial suffix — confirm a lone piece forms an extractable archive.
    fake_sources = [
        "OpenSora.tar.gz",
        "Latte.split.aa", "Latte.split.ab",
        "Pika.split.aa", "Pika.split.ab",
        "DynamicCrafter.split.aa", "DynamicCrafter.split.ab",
        "SEINE.tar.gz",
        "SD.split.ah",
        "I2VGEN_XL.split.aj",
        "SVD.split.ac",
    ]

    # Several entries can share one stem (e.g. Latte.split.aa / .ab). Each
    # source is handled once, because the glob below already gathers all of
    # its pieces; repeating it would recombine and extract the archive again.
    done = set()
    for fn in fake_sources:
        stem = fn.split('.')[0]
        if stem in done:
            continue
        done.add(stem)

        path = root / fn
        # recombine if it's a split
        if ".split." in fn:
            parts = sorted(root.glob(f"{stem}.split.*"))
            if parts:
                tar_target = root / f"{stem}.tar.gz"
                recombine_splits(parts, tar_target)
                path = tar_target
        # extract into fake/<stem>/
        if path.exists():
            _extract(path, fake_root / stem)


def make_csv(root: Path, seed: int = 42):
    """Write balanced train/val/test CSV splits of the clips under ``root``.

    Collects every ``*.mp4`` below ``root/real`` (label 0) and ``root/fake``
    (label 1), samples the same number of clips from each class, shuffles
    them and writes an 80/10/10 split to ``train.csv``, ``val.csv`` and
    ``test.csv`` inside ``root``. Each file has a ``path,label`` header
    followed by one row per clip, with the path in POSIX form.

    Args:
        root: Dataset directory containing the ``real`` and ``fake`` folders.
        seed: Seed for sampling and shuffling. With the same files and seed
            the splits are identical on every run.
    """
    # rglob order depends on the filesystem; sorting first is what makes the
    # seed actually reproduce the same splits everywhere.
    real_paths = sorted((root / "real").rglob("*.mp4"))
    fake_paths = sorted((root / "fake").rglob("*.mp4"))

    # determine balanced count
    per_class = min(len(real_paths), len(fake_paths))

    # Private generator: leaves the global ``random`` state untouched.
    rng = random.Random(seed)

    # The label is attached according to the folder the clip was collected
    # from. Searching the path for a "real" component would mislabel every
    # fake clip whenever ``root`` itself sits below a directory named "real".
    labelled = [(p, 0) for p in rng.sample(real_paths, per_class)]
    labelled += [(p, 1) for p in rng.sample(fake_paths, per_class)]
    rng.shuffle(labelled)

    # split 80/10/10
    n = len(labelled)
    train_end = int(0.8 * n)
    val_end = int(0.9 * n)
    splits = {
        "train": labelled[:train_end],
        "val": labelled[train_end:val_end],
        "test": labelled[val_end:],
    }

    # write CSVs
    for name, items in splits.items():
        csv_path = root / f"{name}.csv"
        # Explicit UTF-8 so clip names with non-ASCII characters can be
        # written regardless of the platform's default encoding.
        with open(csv_path, "w", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)
            writer.writerow(["path", "label"])
            writer.writerows((p.as_posix(), label) for p, label in items)

    summary = ", ".join(f"{name} ({len(items)})" for name, items in splits.items())
    print(f"Wrote splits: {summary}")


Transforms and loaders

In [None]:
# Dataset preparation driver: runs the extraction and CSV-splitting steps on
# the GenVideo root directory, printing a banner before each step.
root = Path("D:/GenVideo")
print("→ Downloading...")
# The download step is disabled here, so the banner above prints without any
# download taking place; uncomment the call to fetch the archives into root.
#download_subset(root)
print("→ Extracting...")
prepare_folders(root)  # unpack archives into root/real and root/fake
print("→ Splitting...")
make_csv(root)  # write train.csv, val.csv and test.csv under root
print("✓  Done")