This script fine-tunes the SmolVLM2 model on a real-vs-synthetic video classification task. First, we import all the relevant libraries.

In [None]:
%pip install albumentations

from datasets import load_dataset
import os
import coremltools as ct
import requests
import zipfile
import tarfile
from pathlib import Path
import subprocess
import modelscope

import argparse, os, random, tarfile, zipfile, subprocess, shutil
from pathlib import Path
from typing import List
import csv, requests
from tqdm import tqdm
import torch
from torch.utils.data import Dataset, DataLoader
import albumentations as A
import numpy as np
import decord

Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'd:\Course\Computer Science\CS5983\seethrough\venv\Scripts\python.exe -m pip install --upgrade pip' command.


Then we load the GenVideo dataset (about 500 GB), which contains over one million diverse AI-generated and real videos. The generated videos come from state-of-the-art generative video models and exhibit temporal artefacts that are crucial for video-based AI detection.

In [None]:
# Make sure the dataset root exists before anything is downloaded into it.
Path("D:/GenVideo").mkdir(parents=True, exist_ok=True)

DatasetNotFoundError: Dataset 'cccnju/Gen-Video' doesn't exist on the Hub or cannot be accessed.

In [None]:
# Base URL every GenVideo shard name below is appended to.
GENV_BASE = "https://www.modelscope.cn/datasets/cccnju/Gen-Video/resolve/master/"
# Text file whose lines are read as Kinetics-400 archive URLs by download_subset().
K400_TXT = "https://s3.amazonaws.com/kinetics/400/train/k400_train_path.txt"

# Shard file name -> number recorded next to it (presumably the size in GB;
# only the keys are used by the download code in this file).
GENV_FILES = {
    # real
    "GenVideo-Val.zip": 13.93,
    # AI
    "OpenSora.tar.gz": 31.72,
    "Latte.split.aa": 42.95,
    "Latte.split.ab": 15.85,
    "Pika.split.aa": 42.95,
    "Pika.split.ab": 15.62,
    "DynamicCrafter.split.aa": 42.95,
    "DynamicCrafter.split.ab": 33.80,
}

# Relative names of the first four Kinetics parts.
K400_PARTS = [
    f"train/part_{i:03}.tar.gz"
    for i in range(4)
]


def _dl(url: str, dst: Path):
    """Download *url* to *dst* unless *dst* already exists.

    The response body is streamed in 1 MiB chunks into a hidden sibling temp
    file, which is renamed onto *dst* only once the transfer has finished.
    An interrupted download therefore never leaves a truncated file at *dst*
    that a later run would mistake for a complete one.

    Args:
        url: Address of the file to fetch.
        dst: Target path; its parent directory is created if missing.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection failures or timeouts.
    """
    dst.parent.mkdir(parents=True, exist_ok=True)
    if dst.exists():
        return

    # The leading dot keeps a leftover temp file from matching the
    # "<stem>.split.*" / "part_*.tar.gz" globs used when archives are unpacked.
    tmp = dst.with_name(f".{dst.name}.part")
    try:
        # A timeout stops a stalled connection from hanging the notebook forever.
        with requests.get(url, stream=True, timeout=60) as r:
            r.raise_for_status()
            # Unknown length -> None so tqdm shows a plain counter, not "x/0".
            total = int(r.headers.get("content-length", 0)) or None
            with open(tmp, "wb") as f, tqdm(
                total=total, unit="B", unit_scale=True, desc=dst.name
            ) as p:
                for chunk in r.iter_content(chunk_size=1048576):
                    f.write(chunk)
                    p.update(len(chunk))
        tmp.replace(dst)
    finally:
        # After a successful rename the temp file is gone; otherwise drop it.
        if tmp.exists():
            tmp.unlink()


def download_subset(root: Path):
    """Download the GenVideo shards and a small Kinetics-400 slice into *root*.

    Every file named in ``GENV_FILES`` is fetched from ``GENV_BASE``. The
    Kinetics part is taken from the first four non-empty lines of the path
    list at ``K400_TXT``, each of which is treated as a download URL. Files
    that already exist are skipped by ``_dl``.

    Args:
        root: Destination folder; created if missing.

    Raises:
        requests.HTTPError: If the Kinetics path list cannot be retrieved
            (errors from the individual downloads propagate from ``_dl``).
    """
    root.mkdir(parents=True, exist_ok=True)

    # 1-a) GenVideo shards
    for fname in GENV_FILES:
        _dl(GENV_BASE + fname, root / fname)

    # 1-b) minimal Kinetics-400 subset
    resp = requests.get(K400_TXT, timeout=60)
    # Without this check an HTTP error page would be parsed as a list of URLs.
    resp.raise_for_status()
    urls = [ln.strip() for ln in resp.text.splitlines() if ln.strip()]
    for url in urls[:4]:  # first four tar files only
        _dl(url, root / Path(url).name)

def _extract(path: Path, out_dir: Path):
    """Unpack a ``.zip`` or ``.tar.gz`` archive into *out_dir*.

    Files with any other suffix are ignored and nothing is created.

    Args:
        path: Archive to unpack.
        out_dir: Directory the members are extracted into.

    Raises:
        tarfile.TarError: If a tar member is rejected by the extraction filter
            (for example a path that escapes *out_dir*) or the archive is corrupt.
        zipfile.BadZipFile: If a zip archive is corrupt.
    """
    if path.suffix == ".zip":
        with zipfile.ZipFile(path) as z:
            z.extractall(out_dir)
    elif path.suffixes[-2:] == [".tar", ".gz"]:
        with tarfile.open(path, "r:gz") as t:
            # The archives come from the network: where the interpreter
            # supports extraction filters, refuse members that would write
            # outside out_dir or create special files.
            if hasattr(tarfile, "data_filter"):
                t.extractall(out_dir, filter="data")
            else:
                t.extractall(out_dir)

def prepare_folders(root: Path):
    """Unpack the downloaded archives into ``root/fake`` and ``root/real``.

    Steps:
      1. For Latte, Pika and DynamicCrafter, concatenate the ``<stem>.split.*``
         pieces (in sorted order) into ``<stem>.tar.gz`` and extract it to
         ``fake/<stem>``.
      2. Extract ``OpenSora.tar.gz`` to ``fake/OpenSora`` and
         ``GenVideo-Val.zip`` to ``real/MSRVTT``.
      3. Extract every ``part_*.tar.gz`` in *root* to ``real/Kinetics``.

    Args:
        root: Folder holding the downloaded archives.

    Raises:
        FileNotFoundError: If ``OpenSora.tar.gz`` or ``GenVideo-Val.zip`` is
            missing from *root*.
    """
    fake, real = root / "fake", root / "real"
    fake.mkdir(parents=True, exist_ok=True)
    real.mkdir(parents=True, exist_ok=True)

    # combine split pieces first
    for stem in ("Latte", "Pika", "DynamicCrafter"):
        pieces = sorted(root.glob(f"{stem}.split.*"))
        if not pieces:
            continue
        tgt = root / f"{stem}.tar.gz"
        if not tgt.exists():
            # Assemble under a temp name and rename at the end, so an
            # interrupted run cannot leave a truncated archive that the
            # exists() check above would accept next time.
            tmp = tgt.with_name(tgt.name + ".part")
            with open(tmp, "wb") as w:
                for piece in pieces:
                    # Pieces are tens of GB: stream them rather than read()
                    # the whole file into memory, and close each handle.
                    with open(piece, "rb") as r:
                        shutil.copyfileobj(r, w, 16 * 1024 * 1024)
            tmp.replace(tgt)
        _extract(tgt, fake / stem)

    # single archives
    _extract(root / "OpenSora.tar.gz", fake / "OpenSora")
    _extract(root / "GenVideo-Val.zip", real / "MSRVTT")

    # Kinetics tars land directly under real/Kinetics
    k_dir = real / "Kinetics"
    k_dir.mkdir(exist_ok=True)
    for tar in sorted(root.glob("part_*.tar.gz")):
        _extract(tar, k_dir)

def make_csv(root: Path, seed=42):
    """Write ``train.csv``, ``val.csv`` and ``test.csv`` (80/10/10) under *root*.

    All ``*.mp4`` files below ``root/real`` and ``root/fake`` are collected,
    shuffled with *seed* and split. Each CSV has a header ``path,label`` where
    ``label`` is 0 for videos under ``root/real`` and 1 otherwise.

    Note: this reseeds the module-level ``random`` generator with *seed*.

    Args:
        root: Dataset folder containing the ``real`` and ``fake`` trees.
        seed: Seed for the shuffle.

    Returns:
        Dict mapping ``"train"``/``"val"``/``"test"`` to the list of video
        paths assigned to that split.
    """
    # rglob order depends on the filesystem; sort so the same seed yields the
    # same split everywhere.
    vids: List[Path] = sorted((root / "real").rglob("*.mp4")) + \
                       sorted((root / "fake").rglob("*.mp4"))
    random.seed(seed)
    random.shuffle(vids)

    n = len(vids)
    cut_train, cut_val = int(.8 * n), int(.9 * n)
    splits = {
        "train": vids[:cut_train],
        "val"  : vids[cut_train:cut_val],
        "test" : vids[cut_val:],
    }
    for split, paths in splits.items():
        # utf-8 matches what pandas.read_csv assumes by default when reading.
        with open(root / f"{split}.csv", "w", newline="", encoding="utf-8") as f:
            w = csv.writer(f)
            w.writerow(["path", "label"])
            for p in paths:
                # Decide the label from the top-level folder below root only;
                # a directory called "real" elsewhere in the path (e.g. inside
                # root itself) must not flip the label.
                top = p.relative_to(root).parts[0]
                w.writerow([p.as_posix(), int(top != "real")])
    return splits

Transforms and loaders

In [None]:

# Per-channel normalisation constants shared by both pipelines
# (presumably the ImageNet statistics - confirm against the backbone in use).
_NORM_MEAN = (.485, .456, .406)
_NORM_STD = (.229, .224, .225)

# Training pipeline: random crop/flip/noise/blur, then normalisation.
# NOTE(review): newer Albumentations releases appear to expect
# RandomResizedCrop(size=(h, w), ...) instead of positional height/width -
# confirm against the installed version.
TFM_TRAIN = A.Compose(
    [
        A.RandomResizedCrop(224, 224, scale=(.6, 1.0)),
        A.HorizontalFlip(p=.5),
        A.GaussNoise(p=.3),
        A.GaussianBlur(blur_limit=(3, 5), p=.3),
        A.Normalize(mean=_NORM_MEAN, std=_NORM_STD),
    ]
)

# Validation pipeline: fixed resize, then the same normalisation.
TFM_VAL = A.Compose(
    [
        A.Resize(224, 224),
        A.Normalize(mean=_NORM_MEAN, std=_NORM_STD),
    ]
)

class BalancedVideoDataset(Dataset):
    """Video dataset that exposes an equal number of label-1 and label-0 rows.

    Rows come from a CSV with ``path`` and ``label`` columns. ``balance()``
    picks ``min(#label1, #label0)`` rows of each class at random; it runs
    once in ``__init__`` and must be called again by the training loop to
    draw a fresh subset for a new epoch.

    Args:
        csv_file: CSV file with ``path`` and ``label`` columns.
        train: Use ``TFM_TRAIN`` when true, ``TFM_VAL`` otherwise.
        n_frames: Number of evenly spaced frames sampled per video.
    """

    def __init__(self, csv_file: Path, train=True, n_frames=8):
        import pandas as pd
        self.df = pd.read_csv(csv_file)
        self.train = train
        self.n_frames = n_frames
        # Stored for reference; not read elsewhere in this class.
        self.mean = (.485, .456, .406)
        self.std = (.229, .224, .225)

        # Row indices per class, from which balanced subsets are drawn.
        self.pos = self.df[self.df.label == 1].index.tolist()
        self.neg = self.df[self.df.label == 0].index.tolist()
        self.balance()

    def balance(self):
        """Redraw ``self.indices`` with equally many rows from each class.

        Uses the module-level ``random`` generator and shuffles ``self.pos``
        and ``self.neg`` in place.
        """
        k = min(len(self.pos), len(self.neg))
        random.shuffle(self.pos)
        random.shuffle(self.neg)
        self.indices = self.pos[:k] + self.neg[:k]
        random.shuffle(self.indices)

    def __len__(self):
        """Return the size of the current balanced subset."""
        return len(self.indices)

    def __getitem__(self, idx):
        """Return ``(frames, label)`` for position *idx* of the balanced subset.

        ``frames`` stacks the transformed, channel-first frames (expected to be
        ``(n_frames, 3, 224, 224)`` with the transforms defined in this file -
        TODO confirm); ``label`` is a float32 tensor of shape ``(1,)``.

        Raises:
            ValueError: If the video reports zero frames.
        """
        row = self.df.loc[self.indices[idx]]
        vr = decord.VideoReader(row.path)
        tot = len(vr)
        if tot == 0:
            # linspace(0, -1, ...) would otherwise yield negative frame indices.
            raise ValueError(f"video has no decodable frames: {row.path}")
        idxs = np.linspace(0, tot - 1, self.n_frames, dtype=int)
        frames = vr.get_batch(idxs).asnumpy()  # (n, H, W, 3) uint8
        aug = TFM_TRAIN if self.train else TFM_VAL
        # NOTE(review): the pipeline is invoked once per frame, so random
        # transforms presumably draw different parameters for each frame of the
        # same clip - confirm this is intended for temporal models.
        frames = np.stack([aug(image=f)["image"].transpose(2, 0, 1) for f in frames])
        label = torch.tensor([row.label], dtype=torch.float32)
        return frames, label


def make_loaders(root: Path, batch=8, workers=4):
    """Build the training and validation ``DataLoader`` pair for *root*.

    Reads ``root/train.csv`` and ``root/val.csv``. The training loader
    shuffles and drops the last incomplete batch; the validation loader does
    neither.

    Args:
        root: Folder containing the split CSV files.
        batch: Batch size for both loaders.
        workers: Number of worker processes for both loaders.

    Returns:
        ``(train_loader, val_loader)``.
    """
    shared = {"batch_size": batch, "num_workers": workers}
    training_set = BalancedVideoDataset(root / "train.csv", train=True)
    validation_set = BalancedVideoDataset(root / "val.csv", train=False)
    return (
        DataLoader(training_set, shuffle=True, drop_last=True, **shared),
        DataLoader(validation_set, shuffle=False, **shared),
    )


In [None]:

# End-to-end data preparation: fetch the archives, unpack them, write the split CSVs.
root = Path("D:/GenVideo")
for banner, step in (
    ("→ Downloading...", download_subset),
    ("→ Extracting...", prepare_folders),
    ("→ Splitting...", make_csv),
):
    print(banner)
    step(root)
print("✓  Done")

