In [1]:
# Analyze how many folders are inside "./DFDC/test/frames" and how many images are in each folder.
import os
import sys
import glob


frames_dir = '../data/DFDC/test/frames'

if not os.path.isdir(frames_dir):
    print(f"The directory {frames_dir} does not exist.")
    sys.exit(1)

folders = glob.glob(os.path.join(frames_dir, '*'))
print(f"Found {len(folders)} folders in {frames_dir}.")

for folder in folders:
    if os.path.isdir(folder):
        images = glob.glob(os.path.join(folder, '*.png'))  # Assuming images are in JPG format
        print(f"Folder: {os.path.basename(folder)}, Images: {len(images)}")

Found 4704 folders in ../data/DFDC/test/frames.
Folder: lvlangwjtw, Images: 32
Folder: gzyvtzlxvc, Images: 32
Folder: abzjyreetb, Images: 32
Folder: vghdtydtqx, Images: 32
Folder: byqtjaxjvy, Images: 32
Folder: xtplsbjrmz, Images: 32
Folder: qlvkranrau, Images: 31
Folder: wpxjryhvlp, Images: 30
Folder: iockopbjaw, Images: 32
Folder: ykjvguyxkr, Images: 32
Folder: gbvgtnetwa, Images: 32
Folder: optrilxlpz, Images: 30
Folder: berivhgeil, Images: 32
Folder: pbdtngzuov, Images: 32
Folder: fgxnfexcmd, Images: 32
Folder: dixmnqidvn, Images: 14
Folder: extyitlbzu, Images: 32
Folder: cyarbaoqfj, Images: 31
Folder: pzmlzeacau, Images: 32
Folder: sgltnomzxa, Images: 32
Folder: lwiirvdzal, Images: 2
Folder: wpugiszidm, Images: 32
Folder: ggvlkoujlu, Images: 32
Folder: qddokaqunw, Images: 32
Folder: hpguuapbbs, Images: 13
Folder: xktecgnpgc, Images: 31
Folder: nlyfavxkln, Images: 32
Folder: isimzrcdst, Images: 32
Folder: zffenlgire, Images: 4
Folder: nwpjsjarju, Images: 32
Folder: dihhcehqig, Imag

In [2]:
# Open DFDC/test/metadata.json

metadata_file = os.path.join(frames_dir, '..', 'metadata.json')

# Analyze the metadata file
if os.path.isfile(metadata_file):
    with open(metadata_file, 'r') as f:
        metadata = f.read()
    print(f"Metadata file {metadata_file} found. Size: {len(metadata)} bytes.")
    
# Create df from metadata
import json
metadata_dict = json.loads(metadata)

Metadata file ../data/DFDC/test/frames/../metadata.json found. Size: 1091075 bytes.


In [3]:
import pandas as pd

# Convert metadata to DataFrame
df = pd.DataFrame(metadata_dict)
df = df.transpose()  # Transpose to have each folder as a row

In [4]:
df.reset_index(inplace=True)

In [5]:
df

Unnamed: 0,index,augmentations,is_fake
0,aalscayrfi.mp4,{'augmenter': {'framerate_change': {'fps': 20}}},0
1,aalyqplqns.mp4,{'augmenter': {'quality_level_change': {'quali...,0
2,aamrozxzsq.mp4,"{'distractor': {'imgs': {'num_imgs': 2, 'rando...",1
3,aaoqanfmgd.mp4,{'augmenter': {'no_audio': {}}},1
4,aaqkmjtoby.mp4,{'augmenter': {'noise': {'level': 40}}},1
...,...,...,...
4995,zzoqoqsqtf.mp4,"{'augmenter': {'contrast': {'level': 2.1}}, 'd...",1
4996,zzrglepohd.mp4,{},1
4997,zztotvpkjc.mp4,{'augmenter': {'greyscale': {}}},0
4998,zztsycpujv.mp4,{'augmenter': {'greyscale': {}}},1


In [6]:
# Set random = 42 and split folders in train, val and test (70%, 15%, 15%)
import numpy as np
np.random.seed(42)

# Shuffle the DataFrame
df = df.sample(frac=1).reset_index(drop=True)

# Split the DataFrame into train, val, and test sets
train_size = int(0.7 * len(df))
val_size = int(0.15 * len(df))
train_df = df[:train_size]
val_df = df[train_size:train_size + val_size]
test_df = df[train_size + val_size:]

In [7]:
for index, row in train_df.iterrows():
        folder_name = row['index'].strip('.mp4')
        is_fake = row['is_fake']
        print(f"Train Folder: {folder_name}, Is Fake: {is_fake}")
        break

Train Folder: hudcvqvyq, Is Fake: 1


In [8]:
import os
import shutil
import glob
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor
import pandas as pd  # per tipizzare i DataFrame

# -------------------------------------------------------------------
#--- 1) Definisci le directory globali (modifica con i tuoi percorsi) 
# -------------------------------------------------------------------

# Directory di output principale:
output_dir = '../data_frames/DFDC'  # ← MODIFICA se serve
os.makedirs(output_dir, exist_ok=True)

# -------------------------------------------------------------------
#--- 2) Funzione worker che copia TUTTE le immagini di una singola cartella
# -------------------------------------------------------------------

def _copy_one_folder(folder_name: str, is_fake: int, split_name: str) -> None:
    """
    Worker function: prende il nome della cartella (video), la label is_fake,
    e il nome della split ('train'/'val'/'test'), e copia tutti i .png da:
        frames_dir/<folder_name>/*.png
    in
        output_dir/<split_name>/(REAL|FAKE)/
    rinominando ogni file in <folder_name>_<original_name>.png.

    Se la cartella non esiste, non fa nulla (ignora).
    """
    # Costruisco i percorsi:
    src_folder = os.path.join(frames_dir, folder_name)
    class_sub = 'FAKE' if is_fake else 'REAL'
    dest_folder = os.path.join(output_dir, split_name, class_sub, folder_name)
    os.makedirs(dest_folder, exist_ok=True)

    # Controllo che la cartella esista
    if not os.path.isdir(src_folder):
        # Se non esiste, skip semplice
        return

    # Trovo tutti i PNG nella cartella (se hai anche JPG, aggiungi un secondo glob e fai un loop aggiuntivo)
    images = glob.glob(os.path.join(src_folder, '*.png'))
    for img_path in images:
        image_name = os.path.basename(img_path)
        dest_image = os.path.join(dest_folder, f"{folder_name}_{image_name}")
        shutil.copy(img_path, dest_image)
    # Non ritorniamo nulla (None) in caso di successo

# -------------------------------------------------------------------
#--- 3) copy_images paralellizzata
# -------------------------------------------------------------------

def copy_images(df: pd.DataFrame, split_name: str, max_workers: int = None):
    """
    Parallel copy delle immagini. 
    - df: DataFrame che contiene almeno le colonne ['index', 'is_fake'].
      df['index'] contiene nomi di file .mp4 (es. "video123.mp4"), 
      df['is_fake'] è 0/1.
    - split_name: 'train' o 'val' o 'test'.
    - max_workers: numero massimo di processi (default = n° CPU).
    """
    # Preparo le liste da mappare sul pool:
    # 1) folder_names: rimuovo l'estensione .mp4 in modo sicuro
    folder_names = []
    is_fake_flags = []
    for _, row in df.iterrows():
        base_name, ext = os.path.splitext(row['index'])
        folder_names.append(base_name)
        is_fake_flags.append(int(row['is_fake']))

    n_items = len(folder_names)
    if n_items == 0:
        return  # niente da copiare

    # Se non è specificato, uso tutti i CPU disponibili
    if max_workers is None:
        max_workers = os.cpu_count() or 1

    # ---------------------------------------------------------------------------------------------------
    # Nota su tqdm + Executor.map:
    #   - Executor.map restituisce un iteratore di ritorni da ciascun worker.
    #   - Utilizziamo tqdm(...) su quell’iteratore, specificando total=n_items
    #   - In questo modo la barra progredisce man mano che i worker terminano una cartella.
    # ---------------------------------------------------------------------------------------------------

    # Avvio del pool di processi
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        # executor.map accetta come argomenti paralleli 3 liste:
        #   executor.map(func, list1, list2, list3) equivale a starmap su ogni tripletta
        # Qui stiamo chiamando _copy_one_folder(folder_name, is_fake, split_name)
        # poiché split_name è sempre uguale, passiamo una lista con lo stesso valore ripetuto.
        split_names = [split_name] * n_items

        # Esegui map parallelo e monitoralo con tqdm
        for _ in tqdm(
            executor.map(_copy_one_folder, folder_names, is_fake_flags, split_names),
            total=n_items,
            desc=f"Copy ({split_name})"
        ):
            # Non ci interessa il valore di ritorno, basta iterare per avanzare la barra
            pass

# -------------------------------------------------------------------
#--- 4) Esempio di utilizzo
# -------------------------------------------------------------------

# Immaginiamo di avere tre DataFrame: train_df, val_df, test_df,
# ciascuno con le colonne ['index', 'is_fake'].

copy_images(train_df, 'train')
copy_images(val_df,   'val')
copy_images(test_df,  'test')


Copy (train): 100%|██████████| 3500/3500 [00:14<00:00, 242.82it/s]
Copy (val): 100%|██████████| 750/750 [00:03<00:00, 189.33it/s]
Copy (test): 100%|██████████| 750/750 [00:03<00:00, 221.54it/s]


In [9]:
# Analyze for each real/fake folder inside train/val/test the number of images
for split in ['train', 'val', 'test']:
    for class_sub in ['REAL', 'FAKE']:
        folder_path = os.path.join(output_dir, split, class_sub)
        if os.path.isdir(folder_path):
            folders = glob.glob(os.path.join(folder_path, '*'))
            print(f"Found {len(folders)} folders in {folder_path}.")
            total=0
            for folder in folders:
                if os.path.isdir(folder):
                    images = glob.glob(os.path.join(folder, '*.png'))
                    total += len(images)
                    # print(f"Folder: {os.path.basename(folder)}, Images: {len(images)}")
            print(f"Total images in {folder_path}: {total}")

Found 1747 folders in ../data_frames/DFDC/train/REAL.


Total images in ../data_frames/DFDC/train/REAL: 44029
Found 1753 folders in ../data_frames/DFDC/train/FAKE.
Total images in ../data_frames/DFDC/train/FAKE: 48332
Found 380 folders in ../data_frames/DFDC/val/REAL.
Total images in ../data_frames/DFDC/val/REAL: 10003
Found 370 folders in ../data_frames/DFDC/val/FAKE.
Total images in ../data_frames/DFDC/val/FAKE: 10067
Found 373 folders in ../data_frames/DFDC/test/REAL.
Total images in ../data_frames/DFDC/test/REAL: 9233
Found 377 folders in ../data_frames/DFDC/test/FAKE.
Total images in ../data_frames/DFDC/test/FAKE: 10452
